[SPARK-49792][PYTHON][BUILD] Upgrade to numpy 2 for building and testing Spark branches

xinrong-meng · himadripal · commit ceaee2c9fef8 · 2024-10-19T12:10:46.000-07:00
### What changes were proposed in this pull request? Upgrade numpy to 2.1.0 for building and testing Spark branches. Failed tests are categorized into the following groups: - Most of test failures fixed are related to pandas-dev/pandas#59838 (comment). - Replaced np.mat with np.asmatrix. - TODO: SPARK-49793 ### Why are the changes needed? Ensure compatibility with newer NumPy, which is utilized by Pandas (on Spark). ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#48180 from xinrong-meng/np_upgrade. Authored-by: Xinrong Meng <xinrong@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image"
 # Overwrite this label to avoid exposing the underlying Ubuntu OS version label
 LABEL org.opencontainers.image.version=""
 
-ENV FULL_REFRESH_DATE 20241002
+ENV FULL_REFRESH_DATE 20241007
 
 ENV DEBIAN_FRONTEND noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN true
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.9 && \
     ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.9 && \
     ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
-RUN pypy3 -m pip install 'numpy==1.26.4' 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
+RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
 
 
-ARG BASIC_PIP_PKGS="numpy==1.26.4 pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
 # Python deps for Spark Connect
 ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4 graphviz==0.20.3"
 
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -699,7 +699,7 @@ class LinearSVC(
     >>> model_path = temp_path + "/svm_model"
     >>> model.save(model_path)
     >>> model2 = LinearSVCModel.load(model_path)
-    >>> model.coefficients[0] == model2.coefficients[0]
+    >>> bool(model.coefficients[0] == model2.coefficients[0])
     True
     >>> model.intercept == model2.intercept
     True
@@ -1210,7 +1210,7 @@ class LogisticRegression(
     >>> model_path = temp_path + "/lr_model"
     >>> blorModel.save(model_path)
     >>> model2 = LogisticRegressionModel.load(model_path)
-    >>> blorModel.coefficients[0] == model2.coefficients[0]
+    >>> bool(blorModel.coefficients[0] == model2.coefficients[0])
     True
     >>> blorModel.intercept == model2.intercept
     True
@@ -2038,9 +2038,9 @@ class RandomForestClassifier(
     >>> result = model.transform(test0).head()
     >>> result.prediction
     0.0
-    >>> numpy.argmax(result.probability)
+    >>> int(numpy.argmax(result.probability))
     0
-    >>> numpy.argmax(result.newRawPrediction)
+    >>> int(numpy.argmax(result.newRawPrediction))
     0
     >>> result.leafId
     DenseVector([0.0, 0.0, 0.0])
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
@@ -266,7 +266,7 @@ class LinearRegression(
     True
     >>> abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
     True
-    >>> abs(model.coefficients[0] - 1.0) < 0.001
+    >>> bool(abs(model.coefficients[0] - 1.0) < 0.001)
     True
     >>> abs(model.intercept - 0.0) < 0.001
     True
@@ -283,11 +283,11 @@ class LinearRegression(
     >>> model_path = temp_path + "/lr_model"
     >>> model.save(model_path)
     >>> model2 = LinearRegressionModel.load(model_path)
-    >>> model.coefficients[0] == model2.coefficients[0]
+    >>> bool(model.coefficients[0] == model2.coefficients[0])
     True
-    >>> model.intercept == model2.intercept
+    >>> bool(model.intercept == model2.intercept)
     True
-    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    >>> bool(model.transform(test0).take(1) == model2.transform(test0).take(1))
     True
     >>> model.numFeatures
     1
@@ -2542,7 +2542,7 @@ class GeneralizedLinearRegression(
     >>> model2 = GeneralizedLinearRegressionModel.load(model_path)
     >>> model.intercept == model2.intercept
     True
-    >>> model.coefficients[0] == model2.coefficients[0]
+    >>> bool(model.coefficients[0] == model2.coefficients[0])
     True
     >>> model.transform(df).take(1) == model2.transform(df).take(1)
     True
diff --git a/python/pyspark/ml/tests/test_functions.py b/python/pyspark/ml/tests/test_functions.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 
+from pyspark.loose_version import LooseVersion
 from pyspark.ml.functions import predict_batch_udf
 from pyspark.sql.functions import array, struct, col
 from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StructType, StructField, FloatType
@@ -193,6 +194,10 @@ def predict(inputs):
         batch_sizes = preds["preds"].to_numpy()
         self.assertTrue(all(batch_sizes <= batch_size))
 
+    # TODO(SPARK-49793): enable the test below
+    @unittest.skipIf(
+        LooseVersion(np.__version__) >= LooseVersion("2"), "Caching does not work with numpy 2"
+    )
     def test_caching(self):
         def make_predict_fn():
             # emulate loading a model, this should only be invoked once (per worker process)
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
@@ -706,7 +706,7 @@ class CrossValidator(
     >>> cvModel = cv.fit(dataset)
     >>> cvModel.getNumFolds()
     3
-    >>> cvModel.avgMetrics[0]
+    >>> float(cvModel.avgMetrics[0])
     0.5
     >>> path = tempfile.mkdtemp()
     >>> model_path = path + "/model"
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
@@ -172,9 +172,9 @@ class LogisticRegressionModel(LinearClassificationModel):
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
     >>> sameModel = LogisticRegressionModel.load(sc, path)
-    >>> sameModel.predict(numpy.array([0.0, 1.0]))
+    >>> int(sameModel.predict(numpy.array([0.0, 1.0])))
     1
-    >>> sameModel.predict(SparseVector(2, {0: 1.0}))
+    >>> int(sameModel.predict(SparseVector(2, {0: 1.0})))
     0
     >>> from shutil import rmtree
     >>> try:
@@ -555,7 +555,7 @@ class SVMModel(LinearClassificationModel):
     >>> svm.predict(sc.parallelize([[1.0]])).collect()
     [1]
     >>> svm.clearThreshold()
-    >>> svm.predict(numpy.array([1.0]))
+    >>> float(svm.predict(numpy.array([1.0])))
     1.44...
 
     >>> sparse_data = [
@@ -573,9 +573,9 @@ class SVMModel(LinearClassificationModel):
     >>> path = tempfile.mkdtemp()
     >>> svm.save(sc, path)
     >>> sameModel = SVMModel.load(sc, path)
-    >>> sameModel.predict(SparseVector(2, {1: 1.0}))
+    >>> int(sameModel.predict(SparseVector(2, {1: 1.0})))
     1
-    >>> sameModel.predict(SparseVector(2, {0: -1.0}))
+    >>> int(sameModel.predict(SparseVector(2, {0: -1.0})))
     0
     >>> from shutil import rmtree
     >>> try:
@@ -756,27 +756,30 @@ class NaiveBayesModel(Saveable, Loader["NaiveBayesModel"]):
     ...     LabeledPoint(1.0, [1.0, 0.0]),
     ... ]
     >>> model = NaiveBayes.train(sc.parallelize(data))
-    >>> model.predict(numpy.array([0.0, 1.0]))
+    >>> float(model.predict(numpy.array([0.0, 1.0])))
     0.0
-    >>> model.predict(numpy.array([1.0, 0.0]))
+    >>> float(model.predict(numpy.array([1.0, 0.0])))
     1.0
-    >>> model.predict(sc.parallelize([[1.0, 0.0]])).collect()
+    >>> list(map(float, model.predict(sc.parallelize([[1.0, 0.0]])).collect()))
     [1.0]
     >>> sparse_data = [
     ...     LabeledPoint(0.0, SparseVector(2, {1: 0.0})),
     ...     LabeledPoint(0.0, SparseVector(2, {1: 1.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {0: 1.0}))
     ... ]
     >>> model = NaiveBayes.train(sc.parallelize(sparse_data))
-    >>> model.predict(SparseVector(2, {1: 1.0}))
+    >>> float(model.predict(SparseVector(2, {1: 1.0})))
     0.0
-    >>> model.predict(SparseVector(2, {0: 1.0}))
+    >>> float(model.predict(SparseVector(2, {0: 1.0})))
     1.0
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> model.save(sc, path)
     >>> sameModel = NaiveBayesModel.load(sc, path)
-    >>> sameModel.predict(SparseVector(2, {0: 1.0})) == model.predict(SparseVector(2, {0: 1.0}))
+    >>> bool((
+    ...     sameModel.predict(SparseVector(2, {0: 1.0})) ==
+    ...     model.predict(SparseVector(2, {0: 1.0}))
+    ... ))
     True
     >>> from shutil import rmtree
     >>> try:
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
@@ -554,9 +554,9 @@ class PCA:
     ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
     >>> model = PCA(2).fit(sc.parallelize(data))
     >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
-    >>> pcArray[0]
+    >>> float(pcArray[0])
     1.648...
-    >>> pcArray[1]
+    >>> float(pcArray[1])
     -4.013...
     """
 
diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py
@@ -134,9 +134,9 @@ def normalRDD(
         >>> stats = x.stats()
         >>> stats.count()
         1000
-        >>> abs(stats.mean() - 0.0) < 0.1
+        >>> bool(abs(stats.mean() - 0.0) < 0.1)
         True
-        >>> abs(stats.stdev() - 1.0) < 0.1
+        >>> bool(abs(stats.stdev() - 1.0) < 0.1)
         True
         """
         return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
@@ -186,10 +186,10 @@ def logNormalRDD(
         >>> stats = x.stats()
         >>> stats.count()
         1000
-        >>> abs(stats.mean() - expMean) < 0.5
+        >>> bool(abs(stats.mean() - expMean) < 0.5)
         True
         >>> from math import sqrt
-        >>> abs(stats.stdev() - expStd) < 0.5
+        >>> bool(abs(stats.stdev() - expStd) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -238,7 +238,7 @@ def poissonRDD(
         >>> abs(stats.mean() - mean) < 0.5
         True
         >>> from math import sqrt
-        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
+        >>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -285,7 +285,7 @@ def exponentialRDD(
         >>> abs(stats.mean() - mean) < 0.5
         True
         >>> from math import sqrt
-        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
+        >>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -336,9 +336,9 @@ def gammaRDD(
         >>> stats = x.stats()
         >>> stats.count()
         1000
-        >>> abs(stats.mean() - expMean) < 0.5
+        >>> bool(abs(stats.mean() - expMean) < 0.5)
         True
-        >>> abs(stats.stdev() - expStd) < 0.5
+        >>> bool(abs(stats.stdev() - expStd) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -384,7 +384,7 @@ def uniformVectorRDD(
         >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
         >>> mat.shape
         (10, 10)
-        >>> mat.max() <= 1.0 and mat.min() >= 0.0
+        >>> bool(mat.max() <= 1.0 and mat.min() >= 0.0)
         True
         >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
         4
@@ -430,9 +430,9 @@ def normalVectorRDD(
         >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - 0.0) < 0.1
+        >>> bool(abs(mat.mean() - 0.0) < 0.1)
         True
-        >>> abs(mat.std() - 1.0) < 0.1
+        >>> bool(abs(mat.std() - 1.0) < 0.1)
         True
         """
         return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed)
@@ -488,9 +488,9 @@ def logNormalVectorRDD(
         >>> mat = np.matrix(m)
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - expMean) < 0.1
+        >>> bool(abs(mat.mean() - expMean) < 0.1)
         True
-        >>> abs(mat.std() - expStd) < 0.1
+        >>> bool(abs(mat.std() - expStd) < 0.1)
         True
         """
         return callMLlibFunc(
@@ -545,13 +545,13 @@ def poissonVectorRDD(
         >>> import numpy as np
         >>> mean = 100.0
         >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
-        >>> mat = np.mat(rdd.collect())
+        >>> mat = np.asmatrix(rdd.collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - mean) < 0.5
+        >>> bool(abs(mat.mean() - mean) < 0.5)
         True
         >>> from math import sqrt
-        >>> abs(mat.std() - sqrt(mean)) < 0.5
+        >>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -599,13 +599,13 @@ def exponentialVectorRDD(
         >>> import numpy as np
         >>> mean = 0.5
         >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
-        >>> mat = np.mat(rdd.collect())
+        >>> mat = np.asmatrix(rdd.collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - mean) < 0.5
+        >>> bool(abs(mat.mean() - mean) < 0.5)
         True
         >>> from math import sqrt
-        >>> abs(mat.std() - sqrt(mean)) < 0.5
+        >>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -662,9 +662,9 @@ def gammaVectorRDD(
         >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - expMean) < 0.1
+        >>> bool(abs(mat.mean() - expMean) < 0.1)
         True
-        >>> abs(mat.std() - expStd) < 0.1
+        >>> bool(abs(mat.std() - expStd) < 0.1)
         True
         """
         return callMLlibFunc(
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py

Original file line number	Diff line number	Diff line change
`@@ -706,7 +706,7 @@ class CrossValidator(`
`706`	`706`	`>>> cvModel = cv.fit(dataset)`
`707`	`707`	`>>> cvModel.getNumFolds()`
`708`	`708`	`3`
`709`		`- >>> cvModel.avgMetrics[0]`
	`709`	`+ >>> float(cvModel.avgMetrics[0])`
`710`	`710`	`0.5`
`711`	`711`	`>>> path = tempfile.mkdtemp()`
`712`	`712`	`>>> model_path = path + "/model"`