Skip to content

Commit cb075b5

Browse files
ueshinHyukjinKwon
authored andcommitted
[SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the incompatible behavior of pandas 1.3
Disable tests failed by the incompatible behavior of pandas 1.3. Pandas 1.3 has been released. There are some behavior changes and we should follow it, but it's not ready yet. No. Disabled some tests related to the behavior change. Closes #33598 from ueshin/issues/SPARK-36367/disable_tests. Authored-by: Takuya UESHIN <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]> (cherry picked from commit 8cb9cf3) Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent c25f1e4 commit cb075b5

11 files changed

+222
-105
lines changed

.github/workflows/build_and_test.yml

+1-3
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ jobs:
149149
name: "Build modules: ${{ matrix.modules }}"
150150
runs-on: ubuntu-20.04
151151
container:
152-
image: dongjoon/apache-spark-github-action-image:20210602
152+
image: dongjoon/apache-spark-github-action-image:20210730
153153
strategy:
154154
fail-fast: false
155155
matrix:
@@ -227,8 +227,6 @@ jobs:
227227
# Run the tests.
228228
- name: Run tests
229229
run: |
230-
# TODO(SPARK-36345): Install mlflow>=1.0 and sklearn in Python 3.9 of the base image
231-
python3.9 -m pip install 'mlflow>=1.0' sklearn
232230
export PATH=$PATH:$HOME/miniconda/bin
233231
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
234232
- name: Upload test results to report

python/pyspark/pandas/groupby.py

+8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"""
2121

2222
from abc import ABCMeta, abstractmethod
23+
import builtins
2324
import sys
2425
import inspect
2526
from collections import OrderedDict, namedtuple
@@ -43,6 +44,7 @@
4344
TYPE_CHECKING,
4445
)
4546

47+
import numpy as np
4648
import pandas as pd
4749
from pandas.api.types import is_hashable, is_list_like
4850

@@ -102,6 +104,12 @@
102104
# to keep it the same as pandas
103105
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
104106

107+
_builtin_table = {
108+
builtins.sum: np.sum,
109+
builtins.max: np.max,
110+
builtins.min: np.min,
111+
} # type: Dict[Callable, Callable]
112+
105113

106114
class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
107115
"""

python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,12 @@ def test_astype(self):
190190
self.assert_eq(pser.astype(str), psser.astype(str))
191191
self.assert_eq(pser.astype(bool), psser.astype(bool))
192192
self.assert_eq(pser.astype("category"), psser.astype("category"))
193+
193194
cat_type = CategoricalDtype(categories=[3, 1, 2])
194-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
195+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
196+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
197+
pass
198+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
195199
self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
196200
else:
197201
self.assert_eq(pd.Series(data).astype(cat_type), psser.astype(cat_type))

python/pyspark/pandas/tests/indexes/test_base.py

+44-32
Original file line numberDiff line numberDiff line change
@@ -1478,25 +1478,30 @@ def test_union(self):
14781478
psidx2 = ps.from_pandas(pidx2)
14791479

14801480
self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
1481-
self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
14821481
self.assert_eq(
14831482
psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True
14841483
)
1485-
self.assert_eq(
1486-
psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1487-
pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1488-
almost=True,
1489-
)
14901484
self.assert_eq(
14911485
psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])),
14921486
pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])),
14931487
almost=True,
14941488
)
1495-
self.assert_eq(
1496-
psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1497-
pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1498-
almost=True,
1499-
)
1489+
1490+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
1491+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
1492+
pass
1493+
else:
1494+
self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
1495+
self.assert_eq(
1496+
psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1497+
pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1498+
almost=True,
1499+
)
1500+
self.assert_eq(
1501+
psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1502+
pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1503+
almost=True,
1504+
)
15001505

15011506
# MultiIndex
15021507
pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")])
@@ -1508,30 +1513,37 @@ def test_union(self):
15081513
psmidx3 = ps.from_pandas(pmidx3)
15091514
psmidx4 = ps.from_pandas(pmidx4)
15101515

1511-
self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
1512-
self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
1513-
self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
1514-
self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
1515-
self.assert_eq(
1516-
psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1517-
pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1518-
)
1519-
self.assert_eq(
1520-
psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1521-
pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1522-
)
1523-
self.assert_eq(
1524-
psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1525-
pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1526-
)
1527-
self.assert_eq(
1528-
psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1529-
pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1530-
)
1516+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
1517+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
1518+
pass
1519+
else:
1520+
self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
1521+
self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
1522+
self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
1523+
self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
1524+
self.assert_eq(
1525+
psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1526+
pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1527+
)
1528+
self.assert_eq(
1529+
psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1530+
pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1531+
)
1532+
self.assert_eq(
1533+
psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1534+
pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1535+
)
1536+
self.assert_eq(
1537+
psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1538+
pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1539+
)
15311540

1541+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
1542+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
1543+
pass
15321544
# Testing if the result is correct after sort=False.
15331545
# The `sort` argument is added in pandas 0.24.
1534-
if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
1546+
elif LooseVersion(pd.__version__) >= LooseVersion("0.24"):
15351547
self.assert_eq(
15361548
psmidx1.union(psmidx2, sort=False).sort_values(),
15371549
pmidx1.union(pmidx2, sort=False).sort_values(),

python/pyspark/pandas/tests/indexes/test_category.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,10 @@ def test_astype(self):
176176

177177
self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))
178178

179-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
179+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
180+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
181+
pass
182+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
180183
self.assert_eq(
181184
kcidx.astype(CategoricalDtype(["b", "c", "a"])),
182185
pcidx.astype(CategoricalDtype(["b", "c", "a"])),

python/pyspark/pandas/tests/test_categorical.py

+68-14
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,11 @@ def test_categories_setter(self):
7373

7474
pser.cat.categories = ["z", "y", "x"]
7575
psser.cat.categories = ["z", "y", "x"]
76-
self.assert_eq(pser, psser)
76+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
77+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
78+
pass
79+
else:
80+
self.assert_eq(pser, psser)
7781
self.assert_eq(pdf, psdf)
7882

7983
with self.assertRaises(ValueError):
@@ -91,7 +95,11 @@ def test_add_categories(self):
9195

9296
pser.cat.add_categories(4, inplace=True)
9397
psser.cat.add_categories(4, inplace=True)
94-
self.assert_eq(pser, psser)
98+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
99+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
100+
pass
101+
else:
102+
self.assert_eq(pser, psser)
95103
self.assert_eq(pdf, psdf)
96104

97105
self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
@@ -115,7 +123,11 @@ def test_remove_categories(self):
115123

116124
pser.cat.remove_categories(2, inplace=True)
117125
psser.cat.remove_categories(2, inplace=True)
118-
self.assert_eq(pser, psser)
126+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
127+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
128+
pass
129+
else:
130+
self.assert_eq(pser, psser)
119131
self.assert_eq(pdf, psdf)
120132

121133
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
@@ -138,7 +150,11 @@ def test_remove_unused_categories(self):
138150

139151
pser.cat.remove_unused_categories(inplace=True)
140152
psser.cat.remove_unused_categories(inplace=True)
141-
self.assert_eq(pser, psser)
153+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
154+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
155+
pass
156+
else:
157+
self.assert_eq(pser, psser)
142158
self.assert_eq(pdf, psdf)
143159

144160
def test_reorder_categories(self):
@@ -164,12 +180,20 @@ def test_reorder_categories(self):
164180

165181
pser.cat.reorder_categories([1, 2, 3], inplace=True)
166182
psser.cat.reorder_categories([1, 2, 3], inplace=True)
167-
self.assert_eq(pser, psser)
183+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
184+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
185+
pass
186+
else:
187+
self.assert_eq(pser, psser)
168188
self.assert_eq(pdf, psdf)
169189

170190
pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
171191
psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
172-
self.assert_eq(pser, psser)
192+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
193+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
194+
pass
195+
else:
196+
self.assert_eq(pser, psser)
173197
self.assert_eq(pdf, psdf)
174198

175199
self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2]))
@@ -189,7 +213,11 @@ def test_as_ordered_unordered(self):
189213

190214
pser.cat.as_ordered(inplace=True)
191215
psser.cat.as_ordered(inplace=True)
192-
self.assert_eq(pser, psser)
216+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
217+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
218+
pass
219+
else:
220+
self.assert_eq(pser, psser)
193221
self.assert_eq(pdf, psdf)
194222

195223
# as_unordered
@@ -215,7 +243,10 @@ def test_astype(self):
215243

216244
self.assert_eq(kcser.astype("category"), pcser.astype("category"))
217245

218-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
246+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
247+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
248+
pass
249+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
219250
self.assert_eq(
220251
kcser.astype(CategoricalDtype(["b", "c", "a"])),
221252
pcser.astype(CategoricalDtype(["b", "c", "a"])),
@@ -419,7 +450,10 @@ def identity(x) -> ps.Series[psdf.b.dtype]: # type: ignore
419450
def astype(x) -> ps.Series[dtype]:
420451
return x.astype(dtype)
421452

422-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
453+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
454+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
455+
pass
456+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
423457
self.assert_eq(
424458
psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
425459
pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
@@ -637,17 +671,29 @@ def test_rename_categories(self):
637671

638672
pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
639673
psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
640-
self.assert_eq(pser, psser)
674+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
675+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
676+
pass
677+
else:
678+
self.assert_eq(pser, psser)
641679
self.assert_eq(pdf, psdf)
642680

643681
pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
644682
psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
645-
self.assert_eq(pser, psser)
683+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
684+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
685+
pass
686+
else:
687+
self.assert_eq(pser, psser)
646688
self.assert_eq(pdf, psdf)
647689

648690
pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
649691
psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
650-
self.assert_eq(pser, psser)
692+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
693+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
694+
pass
695+
else:
696+
self.assert_eq(pser, psser)
651697
self.assert_eq(pdf, psdf)
652698

653699
self.assertRaisesRegex(
@@ -717,12 +763,20 @@ def test_set_categories(self):
717763
pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
718764
psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
719765
)
720-
self.assert_eq(pser, psser)
766+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
767+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
768+
pass
769+
else:
770+
self.assert_eq(pser, psser)
721771
self.assert_eq(pdf, psdf)
722772

723773
pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
724774
psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
725-
self.assert_eq(pser, psser)
775+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
776+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
777+
pass
778+
else:
779+
self.assert_eq(pser, psser)
726780
self.assert_eq(pdf, psdf)
727781

728782
self.assertRaisesRegex(

0 commit comments

Comments
 (0)