Skip to content

Commit ceaee2c

Browse files
xinrong-meng authored and himadripal committed
[SPARK-49792][PYTHON][BUILD] Upgrade to numpy 2 for building and testing Spark branches
### What changes were proposed in this pull request?

Upgrade numpy to 2.1.0 for building and testing Spark branches.

Failed tests are categorized into the following groups:

- Most of the test failures fixed are related to pandas-dev/pandas#59838 (comment).
- Replaced np.mat with np.asmatrix.
- TODO: SPARK-49793

### Why are the changes needed?

Ensure compatibility with newer NumPy, which is utilized by Pandas (on Spark).

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#48180 from xinrong-meng/np_upgrade.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 04874fe commit ceaee2c

File tree

12 files changed

+107
-99
lines changed

12 files changed

+107
-99
lines changed

dev/infra/Dockerfile

+3-3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image"
2424
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
2525
LABEL org.opencontainers.image.version=""
2626

27-
ENV FULL_REFRESH_DATE 20241002
27+
ENV FULL_REFRESH_DATE 20241007
2828

2929
ENV DEBIAN_FRONTEND noninteractive
3030
ENV DEBCONF_NONINTERACTIVE_SEEN true
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.9 && \
9191
ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.9 && \
9292
ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
9393
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
94-
RUN pypy3 -m pip install 'numpy==1.26.4' 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
94+
RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
9595

9696

97-
ARG BASIC_PIP_PKGS="numpy==1.26.4 pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
97+
ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
9898
# Python deps for Spark Connect
9999
ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4 graphviz==0.20.3"
100100

python/pyspark/ml/classification.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -699,7 +699,7 @@ class LinearSVC(
699699
>>> model_path = temp_path + "/svm_model"
700700
>>> model.save(model_path)
701701
>>> model2 = LinearSVCModel.load(model_path)
702-
>>> model.coefficients[0] == model2.coefficients[0]
702+
>>> bool(model.coefficients[0] == model2.coefficients[0])
703703
True
704704
>>> model.intercept == model2.intercept
705705
True
@@ -1210,7 +1210,7 @@ class LogisticRegression(
12101210
>>> model_path = temp_path + "/lr_model"
12111211
>>> blorModel.save(model_path)
12121212
>>> model2 = LogisticRegressionModel.load(model_path)
1213-
>>> blorModel.coefficients[0] == model2.coefficients[0]
1213+
>>> bool(blorModel.coefficients[0] == model2.coefficients[0])
12141214
True
12151215
>>> blorModel.intercept == model2.intercept
12161216
True
@@ -2038,9 +2038,9 @@ class RandomForestClassifier(
20382038
>>> result = model.transform(test0).head()
20392039
>>> result.prediction
20402040
0.0
2041-
>>> numpy.argmax(result.probability)
2041+
>>> int(numpy.argmax(result.probability))
20422042
0
2043-
>>> numpy.argmax(result.newRawPrediction)
2043+
>>> int(numpy.argmax(result.newRawPrediction))
20442044
0
20452045
>>> result.leafId
20462046
DenseVector([0.0, 0.0, 0.0])

python/pyspark/ml/regression.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ class LinearRegression(
266266
True
267267
>>> abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
268268
True
269-
>>> abs(model.coefficients[0] - 1.0) < 0.001
269+
>>> bool(abs(model.coefficients[0] - 1.0) < 0.001)
270270
True
271271
>>> abs(model.intercept - 0.0) < 0.001
272272
True
@@ -283,11 +283,11 @@ class LinearRegression(
283283
>>> model_path = temp_path + "/lr_model"
284284
>>> model.save(model_path)
285285
>>> model2 = LinearRegressionModel.load(model_path)
286-
>>> model.coefficients[0] == model2.coefficients[0]
286+
>>> bool(model.coefficients[0] == model2.coefficients[0])
287287
True
288-
>>> model.intercept == model2.intercept
288+
>>> bool(model.intercept == model2.intercept)
289289
True
290-
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
290+
>>> bool(model.transform(test0).take(1) == model2.transform(test0).take(1))
291291
True
292292
>>> model.numFeatures
293293
1
@@ -2542,7 +2542,7 @@ class GeneralizedLinearRegression(
25422542
>>> model2 = GeneralizedLinearRegressionModel.load(model_path)
25432543
>>> model.intercept == model2.intercept
25442544
True
2545-
>>> model.coefficients[0] == model2.coefficients[0]
2545+
>>> bool(model.coefficients[0] == model2.coefficients[0])
25462546
True
25472547
>>> model.transform(df).take(1) == model2.transform(df).take(1)
25482548
True

python/pyspark/ml/tests/test_functions.py

+5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import numpy as np
2020

21+
from pyspark.loose_version import LooseVersion
2122
from pyspark.ml.functions import predict_batch_udf
2223
from pyspark.sql.functions import array, struct, col
2324
from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StructType, StructField, FloatType
@@ -193,6 +194,10 @@ def predict(inputs):
193194
batch_sizes = preds["preds"].to_numpy()
194195
self.assertTrue(all(batch_sizes <= batch_size))
195196

197+
# TODO(SPARK-49793): enable the test below
198+
@unittest.skipIf(
199+
LooseVersion(np.__version__) >= LooseVersion("2"), "Caching does not work with numpy 2"
200+
)
196201
def test_caching(self):
197202
def make_predict_fn():
198203
# emulate loading a model, this should only be invoked once (per worker process)

python/pyspark/ml/tuning.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ class CrossValidator(
706706
>>> cvModel = cv.fit(dataset)
707707
>>> cvModel.getNumFolds()
708708
3
709-
>>> cvModel.avgMetrics[0]
709+
>>> float(cvModel.avgMetrics[0])
710710
0.5
711711
>>> path = tempfile.mkdtemp()
712712
>>> model_path = path + "/model"

python/pyspark/mllib/classification.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -172,9 +172,9 @@ class LogisticRegressionModel(LinearClassificationModel):
172172
>>> path = tempfile.mkdtemp()
173173
>>> lrm.save(sc, path)
174174
>>> sameModel = LogisticRegressionModel.load(sc, path)
175-
>>> sameModel.predict(numpy.array([0.0, 1.0]))
175+
>>> int(sameModel.predict(numpy.array([0.0, 1.0])))
176176
1
177-
>>> sameModel.predict(SparseVector(2, {0: 1.0}))
177+
>>> int(sameModel.predict(SparseVector(2, {0: 1.0})))
178178
0
179179
>>> from shutil import rmtree
180180
>>> try:
@@ -555,7 +555,7 @@ class SVMModel(LinearClassificationModel):
555555
>>> svm.predict(sc.parallelize([[1.0]])).collect()
556556
[1]
557557
>>> svm.clearThreshold()
558-
>>> svm.predict(numpy.array([1.0]))
558+
>>> float(svm.predict(numpy.array([1.0])))
559559
1.44...
560560
561561
>>> sparse_data = [
@@ -573,9 +573,9 @@ class SVMModel(LinearClassificationModel):
573573
>>> path = tempfile.mkdtemp()
574574
>>> svm.save(sc, path)
575575
>>> sameModel = SVMModel.load(sc, path)
576-
>>> sameModel.predict(SparseVector(2, {1: 1.0}))
576+
>>> int(sameModel.predict(SparseVector(2, {1: 1.0})))
577577
1
578-
>>> sameModel.predict(SparseVector(2, {0: -1.0}))
578+
>>> int(sameModel.predict(SparseVector(2, {0: -1.0})))
579579
0
580580
>>> from shutil import rmtree
581581
>>> try:
@@ -756,27 +756,30 @@ class NaiveBayesModel(Saveable, Loader["NaiveBayesModel"]):
756756
... LabeledPoint(1.0, [1.0, 0.0]),
757757
... ]
758758
>>> model = NaiveBayes.train(sc.parallelize(data))
759-
>>> model.predict(numpy.array([0.0, 1.0]))
759+
>>> float(model.predict(numpy.array([0.0, 1.0])))
760760
0.0
761-
>>> model.predict(numpy.array([1.0, 0.0]))
761+
>>> float(model.predict(numpy.array([1.0, 0.0])))
762762
1.0
763-
>>> model.predict(sc.parallelize([[1.0, 0.0]])).collect()
763+
>>> list(map(float, model.predict(sc.parallelize([[1.0, 0.0]])).collect()))
764764
[1.0]
765765
>>> sparse_data = [
766766
... LabeledPoint(0.0, SparseVector(2, {1: 0.0})),
767767
... LabeledPoint(0.0, SparseVector(2, {1: 1.0})),
768768
... LabeledPoint(1.0, SparseVector(2, {0: 1.0}))
769769
... ]
770770
>>> model = NaiveBayes.train(sc.parallelize(sparse_data))
771-
>>> model.predict(SparseVector(2, {1: 1.0}))
771+
>>> float(model.predict(SparseVector(2, {1: 1.0})))
772772
0.0
773-
>>> model.predict(SparseVector(2, {0: 1.0}))
773+
>>> float(model.predict(SparseVector(2, {0: 1.0})))
774774
1.0
775775
>>> import os, tempfile
776776
>>> path = tempfile.mkdtemp()
777777
>>> model.save(sc, path)
778778
>>> sameModel = NaiveBayesModel.load(sc, path)
779-
>>> sameModel.predict(SparseVector(2, {0: 1.0})) == model.predict(SparseVector(2, {0: 1.0}))
779+
>>> bool((
780+
... sameModel.predict(SparseVector(2, {0: 1.0})) ==
781+
... model.predict(SparseVector(2, {0: 1.0}))
782+
... ))
780783
True
781784
>>> from shutil import rmtree
782785
>>> try:

python/pyspark/mllib/feature.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -554,9 +554,9 @@ class PCA:
554554
... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
555555
>>> model = PCA(2).fit(sc.parallelize(data))
556556
>>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
557-
>>> pcArray[0]
557+
>>> float(pcArray[0])
558558
1.648...
559-
>>> pcArray[1]
559+
>>> float(pcArray[1])
560560
-4.013...
561561
"""
562562

python/pyspark/mllib/random.py

+21-21
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,9 @@ def normalRDD(
134134
>>> stats = x.stats()
135135
>>> stats.count()
136136
1000
137-
>>> abs(stats.mean() - 0.0) < 0.1
137+
>>> bool(abs(stats.mean() - 0.0) < 0.1)
138138
True
139-
>>> abs(stats.stdev() - 1.0) < 0.1
139+
>>> bool(abs(stats.stdev() - 1.0) < 0.1)
140140
True
141141
"""
142142
return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
@@ -186,10 +186,10 @@ def logNormalRDD(
186186
>>> stats = x.stats()
187187
>>> stats.count()
188188
1000
189-
>>> abs(stats.mean() - expMean) < 0.5
189+
>>> bool(abs(stats.mean() - expMean) < 0.5)
190190
True
191191
>>> from math import sqrt
192-
>>> abs(stats.stdev() - expStd) < 0.5
192+
>>> bool(abs(stats.stdev() - expStd) < 0.5)
193193
True
194194
"""
195195
return callMLlibFunc(
@@ -238,7 +238,7 @@ def poissonRDD(
238238
>>> abs(stats.mean() - mean) < 0.5
239239
True
240240
>>> from math import sqrt
241-
>>> abs(stats.stdev() - sqrt(mean)) < 0.5
241+
>>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
242242
True
243243
"""
244244
return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -285,7 +285,7 @@ def exponentialRDD(
285285
>>> abs(stats.mean() - mean) < 0.5
286286
True
287287
>>> from math import sqrt
288-
>>> abs(stats.stdev() - sqrt(mean)) < 0.5
288+
>>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
289289
True
290290
"""
291291
return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -336,9 +336,9 @@ def gammaRDD(
336336
>>> stats = x.stats()
337337
>>> stats.count()
338338
1000
339-
>>> abs(stats.mean() - expMean) < 0.5
339+
>>> bool(abs(stats.mean() - expMean) < 0.5)
340340
True
341-
>>> abs(stats.stdev() - expStd) < 0.5
341+
>>> bool(abs(stats.stdev() - expStd) < 0.5)
342342
True
343343
"""
344344
return callMLlibFunc(
@@ -384,7 +384,7 @@ def uniformVectorRDD(
384384
>>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
385385
>>> mat.shape
386386
(10, 10)
387-
>>> mat.max() <= 1.0 and mat.min() >= 0.0
387+
>>> bool(mat.max() <= 1.0 and mat.min() >= 0.0)
388388
True
389389
>>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
390390
4
@@ -430,9 +430,9 @@ def normalVectorRDD(
430430
>>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
431431
>>> mat.shape
432432
(100, 100)
433-
>>> abs(mat.mean() - 0.0) < 0.1
433+
>>> bool(abs(mat.mean() - 0.0) < 0.1)
434434
True
435-
>>> abs(mat.std() - 1.0) < 0.1
435+
>>> bool(abs(mat.std() - 1.0) < 0.1)
436436
True
437437
"""
438438
return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed)
@@ -488,9 +488,9 @@ def logNormalVectorRDD(
488488
>>> mat = np.matrix(m)
489489
>>> mat.shape
490490
(100, 100)
491-
>>> abs(mat.mean() - expMean) < 0.1
491+
>>> bool(abs(mat.mean() - expMean) < 0.1)
492492
True
493-
>>> abs(mat.std() - expStd) < 0.1
493+
>>> bool(abs(mat.std() - expStd) < 0.1)
494494
True
495495
"""
496496
return callMLlibFunc(
@@ -545,13 +545,13 @@ def poissonVectorRDD(
545545
>>> import numpy as np
546546
>>> mean = 100.0
547547
>>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
548-
>>> mat = np.mat(rdd.collect())
548+
>>> mat = np.asmatrix(rdd.collect())
549549
>>> mat.shape
550550
(100, 100)
551-
>>> abs(mat.mean() - mean) < 0.5
551+
>>> bool(abs(mat.mean() - mean) < 0.5)
552552
True
553553
>>> from math import sqrt
554-
>>> abs(mat.std() - sqrt(mean)) < 0.5
554+
>>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
555555
True
556556
"""
557557
return callMLlibFunc(
@@ -599,13 +599,13 @@ def exponentialVectorRDD(
599599
>>> import numpy as np
600600
>>> mean = 0.5
601601
>>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
602-
>>> mat = np.mat(rdd.collect())
602+
>>> mat = np.asmatrix(rdd.collect())
603603
>>> mat.shape
604604
(100, 100)
605-
>>> abs(mat.mean() - mean) < 0.5
605+
>>> bool(abs(mat.mean() - mean) < 0.5)
606606
True
607607
>>> from math import sqrt
608-
>>> abs(mat.std() - sqrt(mean)) < 0.5
608+
>>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
609609
True
610610
"""
611611
return callMLlibFunc(
@@ -662,9 +662,9 @@ def gammaVectorRDD(
662662
>>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
663663
>>> mat.shape
664664
(100, 100)
665-
>>> abs(mat.mean() - expMean) < 0.1
665+
>>> bool(abs(mat.mean() - expMean) < 0.1)
666666
True
667-
>>> abs(mat.std() - expStd) < 0.1
667+
>>> bool(abs(mat.std() - expStd) < 0.1)
668668
True
669669
"""
670670
return callMLlibFunc(

0 commit comments

Comments
 (0)