17
17
18
18
package org .apache .spark .sql .catalyst .util
19
19
20
+ import scala .collection .mutable .ArrayBuffer
20
21
import scala .util .Random
21
22
22
23
import org .apache .spark .SparkFunSuite
@@ -54,25 +55,51 @@ class QuantileSummariesSuite extends SparkFunSuite {
54
55
summary
55
56
}
56
57
57
- private def checkQuantile (quant : Double , data : Seq [Double ], summary : QuantileSummaries ): Unit = {
58
/**
 * Asserts that `approx` is an acceptable approximation of the requested `percentile`
 * of `data`, given the relative error reported by `summary`.
 *
 * The rank of `approx` inside `data` must lie within
 * `[floor((percentile - relativeError) * n), ceil((percentile + relativeError) * n)]`,
 * where `n` is `data.size`.
 *
 * @param approx     value returned by the summary for `percentile`
 * @param percentile the requested percentile
 * @param data       the raw input the summary was built from; must be non-empty
 * @param summary    the summary under test (only its `relativeError` is read here)
 */
private def validateQuantileApproximation(
    approx: Double,
    percentile: Double,
    data: Seq[Double],
    summary: QuantileSummaries): Unit = {
  assert(data.nonEmpty)

  val n = data.size
  val eps = summary.relativeError

  // Count of elements not greater than `approx`, i.e. the last position `approx` could occupy.
  val countAtMost = data.count(_ <= approx)
  // Count of elements strictly smaller than `approx`, i.e. the position just before its first
  // occurrence.
  val countBelow = data.count(_ < approx)

  // When the chosen value is repeated in the input, e.g. 2 in (1,2,2,2,2,2,3) for the 50%
  // quantile, using the last position as the rank would be misleading. The rank is therefore
  // taken as the (rounded-up) midpoint of the two counts.
  val rank = math.ceil((countBelow + countAtMost) / 2.0)

  // Tolerated rank window derived from the summary's relative error.
  val lower = math.floor((percentile - eps) * n)
  val upper = math.ceil((percentile + eps) * n)

  val msg =
    s"$rank not in [$lower $upper], requested percentile: $percentile, approx returned: $approx"
  assert(rank >= lower, msg)
  assert(rank <= upper, msg)
}
79
+
80
/**
 * Queries `summary` for a single `percentile` and validates the answer against `data`.
 *
 * For empty `data` no validation of a value is possible; instead the summary is expected
 * to return an empty result.
 */
private def checkQuantile(
    percentile: Double,
    data: Seq[Double],
    summary: QuantileSummaries): Unit = {
  if (data.isEmpty) {
    // Nothing was inserted, so the query must not produce a value.
    assert(summary.query(percentile).isEmpty)
  } else {
    val approx = summary.query(percentile).get
    validateQuantileApproximation(approx, percentile, data, summary)
  }
}
91
+
92
/**
 * Queries `summary` for several `percentiles` at once and validates every returned value
 * against `data`.
 *
 * For empty `data` the summary is expected to return an empty result.
 *
 * @param percentiles the requested percentiles, in the order they are passed to the query
 * @param data        the raw input the summary was built from
 * @param summary     the summary under test
 */
private def checkQuantiles(
    percentiles: Seq[Double],
    data: Seq[Double],
    summary: QuantileSummaries): Unit = {
  if (data.nonEmpty) {
    val approx = summary.query(percentiles).get
    // `zip` silently truncates to the shorter side, so without this check a summary that
    // returned too few (or too many) values would leave some percentiles unverified.
    assert(
      approx.size == percentiles.size,
      s"expected ${percentiles.size} quantiles but got ${approx.size}")
    for ((q, a) <- percentiles.zip(approx)) {
      validateQuantileApproximation(a, q, data, summary)
    }
  } else {
    assert(summary.query(percentiles).isEmpty)
  }
}
78
105
@@ -98,6 +125,8 @@ class QuantileSummariesSuite extends SparkFunSuite {
98
125
checkQuantile(0.5 , data, s)
99
126
checkQuantile(0.1 , data, s)
100
127
checkQuantile(0.001 , data, s)
128
+ checkQuantiles(Seq (0.001 , 0.1 , 0.5 , 0.9 , 0.9999 ), data, s)
129
+ checkQuantiles(Seq (0.9999 , 0.9 , 0.5 , 0.1 , 0.001 ), data, s)
101
130
}
102
131
103
132
test(s " Some quantile values with epsi= $epsi and seq= $seq_name, compression= $compression " +
@@ -109,6 +138,8 @@ class QuantileSummariesSuite extends SparkFunSuite {
109
138
checkQuantile(0.5 , data, s)
110
139
checkQuantile(0.1 , data, s)
111
140
checkQuantile(0.001 , data, s)
141
+ checkQuantiles(Seq (0.001 , 0.1 , 0.5 , 0.9 , 0.9999 ), data, s)
142
+ checkQuantiles(Seq (0.9999 , 0.9 , 0.5 , 0.1 , 0.001 ), data, s)
112
143
}
113
144
114
145
test(s " Tests on empty data with epsi= $epsi and seq= $seq_name, compression= $compression" ) {
@@ -121,6 +152,8 @@ class QuantileSummariesSuite extends SparkFunSuite {
121
152
checkQuantile(0.5 , emptyData, s)
122
153
checkQuantile(0.1 , emptyData, s)
123
154
checkQuantile(0.001 , emptyData, s)
155
+ checkQuantiles(Seq (0.001 , 0.1 , 0.5 , 0.9 , 0.9999 ), emptyData, s)
156
+ checkQuantiles(Seq (0.9999 , 0.9 , 0.5 , 0.1 , 0.001 ), emptyData, s)
124
157
}
125
158
}
126
159
@@ -149,6 +182,8 @@ class QuantileSummariesSuite extends SparkFunSuite {
149
182
checkQuantile(0.5 , data, s)
150
183
checkQuantile(0.1 , data, s)
151
184
checkQuantile(0.001 , data, s)
185
+ checkQuantiles(Seq (0.001 , 0.1 , 0.5 , 0.9 , 0.9999 ), data, s)
186
+ checkQuantiles(Seq (0.9999 , 0.9 , 0.5 , 0.1 , 0.001 ), data, s)
152
187
}
153
188
154
189
val (data11, data12) = {
@@ -168,6 +203,8 @@ class QuantileSummariesSuite extends SparkFunSuite {
168
203
checkQuantile(0.5 , data, s)
169
204
checkQuantile(0.1 , data, s)
170
205
checkQuantile(0.001 , data, s)
206
+ checkQuantiles(Seq (0.001 , 0.1 , 0.5 , 0.9 , 0.9999 ), data, s)
207
+ checkQuantiles(Seq (0.9999 , 0.9 , 0.5 , 0.1 , 0.001 ), data, s)
171
208
}
172
209
173
210
// length of data21 is 4 * length of data22
@@ -181,10 +218,14 @@ class QuantileSummariesSuite extends SparkFunSuite {
181
218
val s2 = buildSummary(data22, epsi, compression)
182
219
val s = s1.merge(s2)
183
220
// Check all quantiles
221
+ val percentiles = ArrayBuffer [Double ]()
184
222
for (queryRank <- 1 to n) {
185
- val queryQuantile = queryRank.toDouble / n.toDouble
186
- checkQuantile(queryQuantile, data, s)
223
+ val percentile = queryRank.toDouble / n.toDouble
224
+ checkQuantile(percentile, data, s)
225
+ percentiles += percentile
187
226
}
227
+ checkQuantiles(percentiles.toSeq, data, s)
228
+ checkQuantiles(percentiles.reverse.toSeq, data, s)
188
229
}
189
230
}
190
231
}
0 commit comments