Skip to content

Commit 8cb9cf3

Browse files
ueshin authored and HyukjinKwon committed
[SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the incompatible behavior of pandas 1.3
### What changes were proposed in this pull request?
Disable the tests that fail because of incompatible behavior changes in pandas 1.3.
### Why are the changes needed?
Pandas 1.3 has been released. It contains some behavior changes that we should follow, but that work is not ready yet.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Disabled some tests related to the behavior changes.
Closes #33598 from ueshin/issues/SPARK-36367/disable_tests.
Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent 63517eb commit 8cb9cf3

12 files changed

+227
-114
lines changed

.github/workflows/build_and_test.yml

+1-3
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ jobs:
199199
name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }}"
200200
runs-on: ubuntu-20.04
201201
container:
202-
image: dongjoon/apache-spark-github-action-image:20210602
202+
image: dongjoon/apache-spark-github-action-image:20210730
203203
strategy:
204204
fail-fast: false
205205
matrix:
@@ -266,8 +266,6 @@ jobs:
266266
- name: Run tests
267267
env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
268268
run: |
269-
# TODO(SPARK-36345): Install mlflow>=1.0 and sklearn in Python 3.9 of the base image
270-
python3.9 -m pip install 'mlflow>=1.0' sklearn
271269
# TODO(SPARK-36361): Install coverage in Python 3.9 and PyPy 3 in the base image
272270
python3.9 -m pip install coverage
273271
pypy3 -m pip install coverage

python/pyspark/pandas/groupby.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"""
2121

2222
from abc import ABCMeta, abstractmethod
23+
import builtins
2324
import sys
2425
import inspect
2526
from collections import OrderedDict, namedtuple
@@ -43,6 +44,7 @@
4344
TYPE_CHECKING,
4445
)
4546

47+
import numpy as np
4648
import pandas as pd
4749
from pandas.api.types import is_hashable, is_list_like
4850

@@ -95,6 +97,12 @@
9597
# to keep it the same as pandas
9698
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
9799

100+
_builtin_table = {
101+
builtins.sum: np.sum,
102+
builtins.max: np.max,
103+
builtins.min: np.min,
104+
} # type: Dict[Callable, Callable]
105+
98106

99107
class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
100108
"""
@@ -1143,8 +1151,6 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
11431151
1 52
11441152
Name: B, dtype: int64
11451153
"""
1146-
from pandas.core.base import SelectionMixin
1147-
11481154
if not isinstance(func, Callable): # type: ignore
11491155
raise TypeError("%s object is not callable" % type(func).__name__)
11501156

@@ -1171,9 +1177,9 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
11711177

11721178
if is_series_groupby:
11731179
name = psdf.columns[-1]
1174-
pandas_apply = SelectionMixin._builtin_table.get(func, func)
1180+
pandas_apply = _builtin_table.get(func, func)
11751181
else:
1176-
f = SelectionMixin._builtin_table.get(func, func)
1182+
f = _builtin_table.get(func, func)
11771183

11781184
def pandas_apply(pdf: pd.DataFrame, *a: Any, **k: Any) -> Any:
11791185
return f(pdf.drop(groupkey_names, axis=1), *a, **k)
@@ -1346,8 +1352,6 @@ def filter(self, func: Callable[[FrameLike], FrameLike]) -> FrameLike:
13461352
5 6
13471353
Name: B, dtype: int64
13481354
"""
1349-
from pandas.core.base import SelectionMixin
1350-
13511355
if not isinstance(func, Callable): # type: ignore
13521356
raise TypeError("%s object is not callable" % type(func).__name__)
13531357

@@ -1378,7 +1382,7 @@ def pandas_filter(pdf: pd.DataFrame) -> pd.DataFrame:
13781382
return pd.DataFrame(pdf.groupby(groupkey_names)[pdf.columns[-1]].filter(func))
13791383

13801384
else:
1381-
f = SelectionMixin._builtin_table.get(func, func)
1385+
f = _builtin_table.get(func, func)
13821386

13831387
def wrapped_func(pdf: pd.DataFrame) -> pd.DataFrame:
13841388
return f(pdf.drop(groupkey_names, axis=1))

python/pyspark/pandas/series.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import inspect
2424
import sys
2525
from collections.abc import Mapping
26-
from functools import partial, wraps, reduce
26+
from functools import partial, reduce
2727
from typing import (
2828
Any,
2929
Callable,
@@ -3164,7 +3164,7 @@ def apply(self, func: Callable, args: Sequence[Any] = (), **kwds: Any) -> "Serie
31643164
# Falls back to schema inference if it fails to get signature.
31653165
should_infer_schema = True
31663166

3167-
apply_each = wraps(func)(lambda s: s.apply(func, args=args, **kwds))
3167+
apply_each = lambda s: s.apply(func, args=args, **kwds)
31683168

31693169
if should_infer_schema:
31703170
return self.pandas_on_spark._transform_batch(apply_each, None)

python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,12 @@ def test_astype(self):
190190
self.assert_eq(pser.astype(str), psser.astype(str))
191191
self.assert_eq(pser.astype(bool), psser.astype(bool))
192192
self.assert_eq(pser.astype("category"), psser.astype("category"))
193+
193194
cat_type = CategoricalDtype(categories=[3, 1, 2])
194-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
195+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
196+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
197+
pass
198+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
195199
self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
196200
else:
197201
self.assert_eq(pd.Series(data).astype(cat_type), psser.astype(cat_type))

python/pyspark/pandas/tests/indexes/test_base.py

+44-32
Original file line numberDiff line numberDiff line change
@@ -1518,25 +1518,30 @@ def test_union(self):
15181518
psidx2 = ps.from_pandas(pidx2)
15191519

15201520
self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
1521-
self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
15221521
self.assert_eq(
15231522
psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True
15241523
)
1525-
self.assert_eq(
1526-
psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1527-
pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1528-
almost=True,
1529-
)
15301524
self.assert_eq(
15311525
psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])),
15321526
pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])),
15331527
almost=True,
15341528
)
1535-
self.assert_eq(
1536-
psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1537-
pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1538-
almost=True,
1539-
)
1529+
1530+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
1531+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
1532+
pass
1533+
else:
1534+
self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
1535+
self.assert_eq(
1536+
psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1537+
pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
1538+
almost=True,
1539+
)
1540+
self.assert_eq(
1541+
psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1542+
pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
1543+
almost=True,
1544+
)
15401545

15411546
# MultiIndex
15421547
pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")])
@@ -1548,30 +1553,37 @@ def test_union(self):
15481553
psmidx3 = ps.from_pandas(pmidx3)
15491554
psmidx4 = ps.from_pandas(pmidx4)
15501555

1551-
self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
1552-
self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
1553-
self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
1554-
self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
1555-
self.assert_eq(
1556-
psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1557-
pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1558-
)
1559-
self.assert_eq(
1560-
psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1561-
pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1562-
)
1563-
self.assert_eq(
1564-
psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1565-
pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1566-
)
1567-
self.assert_eq(
1568-
psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1569-
pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1570-
)
1556+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
1557+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
1558+
pass
1559+
else:
1560+
self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
1561+
self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
1562+
self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
1563+
self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
1564+
self.assert_eq(
1565+
psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1566+
pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
1567+
)
1568+
self.assert_eq(
1569+
psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1570+
pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
1571+
)
1572+
self.assert_eq(
1573+
psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1574+
pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
1575+
)
1576+
self.assert_eq(
1577+
psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1578+
pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
1579+
)
15711580

1581+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
1582+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
1583+
pass
15721584
# Testing if the result is correct after sort=False.
15731585
# The `sort` argument is added in pandas 0.24.
1574-
if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
1586+
elif LooseVersion(pd.__version__) >= LooseVersion("0.24"):
15751587
self.assert_eq(
15761588
psmidx1.union(psmidx2, sort=False).sort_values(),
15771589
pmidx1.union(pmidx2, sort=False).sort_values(),

python/pyspark/pandas/tests/indexes/test_category.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,10 @@ def test_astype(self):
176176

177177
self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))
178178

179-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
179+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
180+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
181+
pass
182+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
180183
self.assert_eq(
181184
kcidx.astype(CategoricalDtype(["b", "c", "a"])),
182185
pcidx.astype(CategoricalDtype(["b", "c", "a"])),

python/pyspark/pandas/tests/test_categorical.py

+68-14
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,11 @@ def test_categories_setter(self):
7373

7474
pser.cat.categories = ["z", "y", "x"]
7575
psser.cat.categories = ["z", "y", "x"]
76-
self.assert_eq(pser, psser)
76+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
77+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
78+
pass
79+
else:
80+
self.assert_eq(pser, psser)
7781
self.assert_eq(pdf, psdf)
7882

7983
with self.assertRaises(ValueError):
@@ -91,7 +95,11 @@ def test_add_categories(self):
9195

9296
pser.cat.add_categories(4, inplace=True)
9397
psser.cat.add_categories(4, inplace=True)
94-
self.assert_eq(pser, psser)
98+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
99+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
100+
pass
101+
else:
102+
self.assert_eq(pser, psser)
95103
self.assert_eq(pdf, psdf)
96104

97105
self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
@@ -115,7 +123,11 @@ def test_remove_categories(self):
115123

116124
pser.cat.remove_categories(2, inplace=True)
117125
psser.cat.remove_categories(2, inplace=True)
118-
self.assert_eq(pser, psser)
126+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
127+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
128+
pass
129+
else:
130+
self.assert_eq(pser, psser)
119131
self.assert_eq(pdf, psdf)
120132

121133
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
@@ -138,7 +150,11 @@ def test_remove_unused_categories(self):
138150

139151
pser.cat.remove_unused_categories(inplace=True)
140152
psser.cat.remove_unused_categories(inplace=True)
141-
self.assert_eq(pser, psser)
153+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
154+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
155+
pass
156+
else:
157+
self.assert_eq(pser, psser)
142158
self.assert_eq(pdf, psdf)
143159

144160
def test_reorder_categories(self):
@@ -164,12 +180,20 @@ def test_reorder_categories(self):
164180

165181
pser.cat.reorder_categories([1, 2, 3], inplace=True)
166182
psser.cat.reorder_categories([1, 2, 3], inplace=True)
167-
self.assert_eq(pser, psser)
183+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
184+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
185+
pass
186+
else:
187+
self.assert_eq(pser, psser)
168188
self.assert_eq(pdf, psdf)
169189

170190
pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
171191
psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
172-
self.assert_eq(pser, psser)
192+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
193+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
194+
pass
195+
else:
196+
self.assert_eq(pser, psser)
173197
self.assert_eq(pdf, psdf)
174198

175199
self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2]))
@@ -189,7 +213,11 @@ def test_as_ordered_unordered(self):
189213

190214
pser.cat.as_ordered(inplace=True)
191215
psser.cat.as_ordered(inplace=True)
192-
self.assert_eq(pser, psser)
216+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
217+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
218+
pass
219+
else:
220+
self.assert_eq(pser, psser)
193221
self.assert_eq(pdf, psdf)
194222

195223
# as_unordered
@@ -215,7 +243,10 @@ def test_astype(self):
215243

216244
self.assert_eq(kcser.astype("category"), pcser.astype("category"))
217245

218-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
246+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
247+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
248+
pass
249+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
219250
self.assert_eq(
220251
kcser.astype(CategoricalDtype(["b", "c", "a"])),
221252
pcser.astype(CategoricalDtype(["b", "c", "a"])),
@@ -419,7 +450,10 @@ def identity(x) -> ps.Series[psdf.b.dtype]: # type: ignore
419450
def astype(x) -> ps.Series[dtype]:
420451
return x.astype(dtype)
421452

422-
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
453+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
454+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
455+
pass
456+
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
423457
self.assert_eq(
424458
psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
425459
pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
@@ -637,17 +671,29 @@ def test_rename_categories(self):
637671

638672
pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
639673
psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
640-
self.assert_eq(pser, psser)
674+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
675+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
676+
pass
677+
else:
678+
self.assert_eq(pser, psser)
641679
self.assert_eq(pdf, psdf)
642680

643681
pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
644682
psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
645-
self.assert_eq(pser, psser)
683+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
684+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
685+
pass
686+
else:
687+
self.assert_eq(pser, psser)
646688
self.assert_eq(pdf, psdf)
647689

648690
pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
649691
psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
650-
self.assert_eq(pser, psser)
692+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
693+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
694+
pass
695+
else:
696+
self.assert_eq(pser, psser)
651697
self.assert_eq(pdf, psdf)
652698

653699
self.assertRaisesRegex(
@@ -717,12 +763,20 @@ def test_set_categories(self):
717763
pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
718764
psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
719765
)
720-
self.assert_eq(pser, psser)
766+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
767+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
768+
pass
769+
else:
770+
self.assert_eq(pser, psser)
721771
self.assert_eq(pdf, psdf)
722772

723773
pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
724774
psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
725-
self.assert_eq(pser, psser)
775+
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
776+
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
777+
pass
778+
else:
779+
self.assert_eq(pser, psser)
726780
self.assert_eq(pdf, psdf)
727781

728782
self.assertRaisesRegex(

0 commit comments

Comments
 (0)