Skip to content

Commit 4bec311

Browse files
belieferyaooqinn
authored andcommitted
[SPARK-44342][SQL] Replace SQLContext with SparkSession for GenTPCDSData
### What changes were proposed in this pull request? The `SQLContext` is an old API for Spark SQL. But `GenTPCDSData` still use it directly. ### Why are the changes needed? Avoid use the legacy API for `GenTPCDSData`. ### Does this PR introduce _any_ user-facing change? 'No'. Just update the benchmark utils. ### How was this patch tested? Manual test by running: `build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir ... --location ... --scaleFactor 1"` Closes #41900 from beliefer/SPARK-44342. Authored-by: Jiaan Geng <[email protected]> Signed-off-by: Kent Yao <[email protected]>
1 parent 71703d6 commit 4bec311

File tree

1 file changed

+8
-9
lines changed

1 file changed

+8
-9
lines changed

sql/core/src/test/scala/org/apache/spark/sql/GenTPCDSData.scala

+8-9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import org.apache.spark.SparkContext
2727
import org.apache.spark.internal.Logging
2828
import org.apache.spark.rdd.RDD
2929
import org.apache.spark.sql.functions.{col, rpad}
30+
import org.apache.spark.sql.internal.SQLConf
3031
import org.apache.spark.sql.types.{CharType, StringType, StructField, StructType, VarcharType}
3132

3233
// The classes in this file are basically moved from https://github.com/databricks/spark-sql-perf
@@ -120,7 +121,7 @@ class Dsdgen(dsdgenDir: String) extends Serializable {
120121
}
121122
}
122123

123-
class TPCDSTables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int)
124+
class TPCDSTables(spark: SparkSession, dsdgenDir: String, scaleFactor: Int)
124125
extends TPCDSSchema with Logging with Serializable {
125126

126127
private val dataGenerator = new Dsdgen(dsdgenDir)
@@ -138,7 +139,7 @@ class TPCDSTables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int)
138139

139140
private def df(numPartition: Int) = {
140141
val generatedData = dataGenerator.generate(
141-
sqlContext.sparkContext, name, numPartition, scaleFactor)
142+
spark.sparkContext, name, numPartition, scaleFactor)
142143
val rows = generatedData.mapPartitions { iter =>
143144
iter.map { l =>
144145
val values = l.split("\\|", -1).dropRight(1).map { v =>
@@ -154,7 +155,7 @@ class TPCDSTables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int)
154155
}
155156

156157
val stringData =
157-
sqlContext.createDataFrame(
158+
spark.createDataFrame(
158159
rows,
159160
StructType(schema.fields.map(f => StructField(f.name, StringType))))
160161

@@ -210,7 +211,7 @@ class TPCDSTables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int)
210211
|DISTRIBUTE BY
211212
| $partitionColumnString
212213
""".stripMargin
213-
val grouped = sqlContext.sql(query)
214+
val grouped = spark.sql(query)
214215
logInfo(s"Pre-clustering with partitioning columns with query $query.")
215216
grouped.write
216217
} else {
@@ -223,9 +224,7 @@ class TPCDSTables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int)
223224
// datagen speed files will be truncated to maxRecordsPerFile value, so the final
224225
// result will be the same.
225226
val numRows = data.count
226-
val maxRecordPerFile = Try {
227-
sqlContext.getConf("spark.sql.files.maxRecordsPerFile").toInt
228-
}.getOrElse(0)
227+
val maxRecordPerFile = spark.conf.get(SQLConf.MAX_RECORDS_PER_FILE)
229228

230229
if (maxRecordPerFile > 0 && numRows > maxRecordPerFile) {
231230
val numFiles = (numRows.toDouble/maxRecordPerFile).ceil.toInt
@@ -244,7 +243,7 @@ class TPCDSTables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int)
244243
}
245244
logInfo(s"Generating table $name in database to $location with save mode $mode.")
246245
writer.save(location)
247-
sqlContext.dropTempTable(tempTableName)
246+
spark.catalog.dropTempView(tempTableName)
248247
}
249248
}
250249

@@ -429,7 +428,7 @@ object GenTPCDSData {
429428
.getOrCreate()
430429

431430
val tables = new TPCDSTables(
432-
spark.sqlContext,
431+
spark,
433432
dsdgenDir = config.dsdgenDir,
434433
scaleFactor = config.scaleFactor)
435434

0 commit comments

Comments
 (0)