
Commit 14fadcd

itholic authored and vpolet committed
[SPARK-44289][SPARK-43874][SPARK-43869][SPARK-43607][PS] Support indexer_between_time for pandas 2.0.0 & enabling more tests
### What changes were proposed in this pull request?

This PR proposes to support `DatetimeIndex.indexer_between_time` for pandas 2.0.0 and above. See pandas-dev/pandas#43248 for more detail. This PR also enables a bunch of tests for `Series`, `Index` and `GroupBy`.

### Why are the changes needed?

To match the behavior of the latest pandas.

### Does this PR introduce _any_ user-facing change?

`DatetimeIndex.indexer_between_time` now has the same behavior as the latest pandas.

### How was this patch tested?

Enabling and updating the existing UTs and doctests.

Closes apache#42533 from itholic/enable-many-tests.

Authored-by: itholic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent a1c5f58 commit 14fadcd
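For context on the API change driving this commit: pandas 2.0 removed `between_time`'s `include_start`/`include_end` arguments in favor of a single `inclusive` keyword (pandas-dev/pandas#43248). A minimal sketch of the old and new call styles in plain pandas:

```python
import pandas as pd

idx = pd.date_range("2000-01-01", periods=3, freq="T")
df = pd.DataFrame({"x": [0, 1, 2]}, index=idx)

# pandas < 2.0 (removed in 2.0):
#   df.between_time("00:01", "00:02", include_start=True, include_end=False)
# pandas >= 2.0 equivalent:
print(df.between_time("00:01", "00:02", inclusive="left"))  # keeps only the 00:01 row
```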

File tree

11 files changed (+58, -142 lines)

python/pyspark/pandas/groupby.py (+4, -3)

@@ -4165,6 +4165,7 @@ def value_counts(
 
         Examples
         --------
+        >>> import numpy as np
         >>> df = ps.DataFrame({'A': [1, 2, 2, 3, 3, 3],
         ...                    'B': [1, 1, 2, 3, 3, np.nan]},
         ...                   columns=['A', 'B'])
@@ -4183,7 +4184,7 @@ def value_counts(
         2  1.0    1
            2.0    1
         3  3.0    2
-        Name: B, dtype: int64
+        Name: count, dtype: int64
 
         Don't include counts of NaN when dropna is False.
 
@@ -4195,7 +4196,7 @@ def value_counts(
            2.0    1
         3  3.0    2
            NaN    1
-        Name: B, dtype: int64
+        Name: count, dtype: int64
         """
         warnings.warn(
             "The resulting Series will have a fixed name of 'count' from 4.0.0.",
@@ -4232,7 +4233,7 @@ def value_counts(
                 psser._internal.data_fields[0].copy(name=name)
                 for psser, name in zip(groupkeys, groupkey_names)
             ],
-            column_labels=[self._agg_columns[0]._column_label],
+            column_labels=[("count",)],
             data_spark_columns=[scol_for(sdf, agg_column)],
         )
         return first_series(DataFrame(internal))
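The docstring and `column_labels` changes above track a pandas 2.0 behavior change: `value_counts` now names the resulting Series `count` instead of reusing the counted column's name. A minimal sketch of the upstream behavior being matched, in plain pandas (2.0 or later assumed):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, np.nan]})
counts = df.groupby("A")["B"].value_counts()
print(counts.name)  # 'count' on pandas >= 2.0; the column name 'B' on older pandas
```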

python/pyspark/pandas/indexes/datetimes.py (+14, -6)

@@ -730,24 +730,32 @@ def indexer_between_time(
 
         Examples
         --------
-        >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")  # doctest: +SKIP
-        >>> psidx  # doctest: +SKIP
+        >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
+        >>> psidx
         DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                        '2000-01-01 00:02:00'],
                       dtype='datetime64[ns]', freq=None)
 
-        >>> psidx.indexer_between_time("00:01", "00:02").sort_values()  # doctest: +SKIP
+        >>> psidx.indexer_between_time("00:01", "00:02").sort_values()
         Index([1, 2], dtype='int64')
 
-        >>> psidx.indexer_between_time("00:01", "00:02", include_end=False)  # doctest: +SKIP
+        >>> psidx.indexer_between_time("00:01", "00:02", include_end=False)
         Index([1], dtype='int64')
 
-        >>> psidx.indexer_between_time("00:01", "00:02", include_start=False)  # doctest: +SKIP
+        >>> psidx.indexer_between_time("00:01", "00:02", include_start=False)
         Index([2], dtype='int64')
         """
 
         def pandas_between_time(pdf) -> ps.DataFrame[int]:  # type: ignore[no-untyped-def]
-            return pdf.between_time(start_time, end_time, include_start, include_end)
+            if include_start and include_end:
+                inclusive = "both"
+            elif not include_start and not include_end:
+                inclusive = "neither"
+            elif include_start and not include_end:
+                inclusive = "left"
+            elif not include_start and include_end:
+                inclusive = "right"
+            return pdf.between_time(start_time, end_time, inclusive=inclusive)
 
         psdf = self.to_frame()[[]]
         id_column_name = verify_temp_column_name(psdf, "__id_column__")
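The rewritten `pandas_between_time` helper translates the two booleans into pandas 2.0's `inclusive` keyword. The same truth table can be stated as a lookup; `to_inclusive` below is a hypothetical standalone helper for illustration, not part of the patch:

```python
import pandas as pd

def to_inclusive(include_start: bool, include_end: bool) -> str:
    # Same mapping as the if/elif chain in pandas_between_time above.
    return {
        (True, True): "both",
        (True, False): "left",
        (False, True): "right",
        (False, False): "neither",
    }[(include_start, include_end)]

idx = pd.date_range("2000-01-01", periods=3, freq="T")
df = pd.DataFrame({"x": [0, 1, 2]}, index=idx)
# include_start=False, include_end=True maps to inclusive="right",
# so only the 00:02 row survives.
print(df.between_time("00:01", "00:02", inclusive=to_inclusive(False, True)))
```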

python/pyspark/pandas/tests/computation/test_cov.py (+9, -42)

@@ -28,10 +28,6 @@
 
 
 class FrameCovMixin:
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43809): Enable DataFrameSlowTests.test_cov for pandas 2.0.0.",
-    )
     def test_cov(self):
         # SPARK-36396: Implement DataFrame.cov
 
@@ -66,12 +62,8 @@ def test_cov(self):
         self.assert_eq(pdf.cov(min_periods=5), psdf.cov(min_periods=5))
 
         # extension dtype
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
-            numeric_dtypes = ["Int8", "Int16", "Int32", "Int64", "Float32", "Float64", "float"]
-            boolean_dtypes = ["boolean", "bool"]
-        else:
-            numeric_dtypes = ["Int8", "Int16", "Int32", "Int64", "float"]
-            boolean_dtypes = ["boolean", "bool"]
+        numeric_dtypes = ["Int8", "Int16", "Int32", "Int64", "Float32", "Float64", "float"]
+        boolean_dtypes = ["boolean", "bool"]
 
         sers = [pd.Series([1, 2, 3, None], dtype=dtype) for dtype in numeric_dtypes]
         sers += [pd.Series([True, False, True, None], dtype=dtype) for dtype in boolean_dtypes]
@@ -81,44 +73,19 @@ def test_cov(self):
         pdf.columns = [dtype for dtype in numeric_dtypes + boolean_dtypes] + ["decimal"]
         psdf = ps.from_pandas(pdf)
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
-            self.assert_eq(pdf.cov(), psdf.cov(), almost=True)
-            self.assert_eq(pdf.cov(min_periods=3), psdf.cov(min_periods=3), almost=True)
-            self.assert_eq(pdf.cov(min_periods=4), psdf.cov(min_periods=4))
-        else:
-            test_types = [
-                "Int8",
-                "Int16",
-                "Int32",
-                "Int64",
-                "float",
-                "boolean",
-                "bool",
-            ]
-            expected = pd.DataFrame(
-                data=[
-                    [1.0, 1.0, 1.0, 1.0, 1.0, 0.0000000, 0.0000000],
-                    [1.0, 1.0, 1.0, 1.0, 1.0, 0.0000000, 0.0000000],
-                    [1.0, 1.0, 1.0, 1.0, 1.0, 0.0000000, 0.0000000],
-                    [1.0, 1.0, 1.0, 1.0, 1.0, 0.0000000, 0.0000000],
-                    [1.0, 1.0, 1.0, 1.0, 1.0, 0.0000000, 0.0000000],
-                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333, 0.3333333],
-                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333, 0.3333333],
-                ],
-                index=test_types,
-                columns=test_types,
-            )
-            self.assert_eq(expected, psdf.cov(), almost=True)
+        self.assert_eq(pdf.cov(numeric_only=True), psdf.cov(), almost=True)
 
         # string column
         pdf = pd.DataFrame(
             [(1, 2, "a", 1), (0, 3, "b", 1), (2, 0, "c", 9), (1, 1, "d", 1)],
             columns=["a", "b", "c", "d"],
         )
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.cov(), psdf.cov(), almost=True)
-        self.assert_eq(pdf.cov(min_periods=4), psdf.cov(min_periods=4), almost=True)
-        self.assert_eq(pdf.cov(min_periods=5), psdf.cov(min_periods=5))
+        self.assert_eq(pdf.cov(numeric_only=True), psdf.cov(), almost=True)
+        self.assert_eq(
+            pdf.cov(numeric_only=True, min_periods=4), psdf.cov(min_periods=4), almost=True
+        )
+        self.assert_eq(pdf.cov(numeric_only=True, min_periods=5), psdf.cov(min_periods=5))
 
         # nan
         np.random.seed(42)
@@ -132,7 +99,7 @@ def test_cov(self):
         # return empty DataFrame
         pdf = pd.DataFrame([("1", "2"), ("0", "3"), ("2", "0"), ("1", "1")], columns=["a", "b"])
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.cov(), psdf.cov())
+        self.assert_eq(pdf.cov(numeric_only=True), psdf.cov())
 
 
 class FrameCovTests(FrameCovMixin, ComparisonTestBase, SQLTestUtils):
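The `numeric_only=True` arguments added to the pandas side reflect another 2.0 change: `DataFrame.cov` no longer silently drops non-numeric columns and instead raises unless they are excluded explicitly. A minimal sketch, assuming pandas 2.0:

```python
import pandas as pd

pdf = pd.DataFrame({"a": [1, 0, 2, 1], "b": [2, 3, 0, 1], "c": ["x", "y", "z", "w"]})

# pandas < 2.0 silently dropped column "c"; pandas >= 2.0 raises on it,
# so the non-numeric column must be excluded explicitly:
print(pdf.cov(numeric_only=True))
```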

python/pyspark/pandas/tests/data_type_ops/test_date_ops.py (+8, -12)

@@ -63,25 +63,25 @@ def test_add(self):
         for psser in self.pssers:
             self.assertRaises(TypeError, lambda: self.psser + psser)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43571): Enable DateOpsTests.test_sub for pandas 2.0.0.",
-    )
     def test_sub(self):
         self.assertRaises(TypeError, lambda: self.psser - "x")
         self.assertRaises(TypeError, lambda: self.psser - 1)
         self.assert_eq(
-            (self.pser - self.some_date).dt.days,
+            (self.pser - self.some_date).apply(lambda x: x.days),
             self.psser - self.some_date,
         )
         pdf, psdf = self.pdf, self.psdf
         for col in self.df_cols:
             if col == "date":
-                self.assert_eq((pdf["date"] - pdf[col]).dt.days, psdf["date"] - psdf[col])
+                self.assert_eq(
+                    (pdf["date"] - pdf[col]).apply(lambda x: x.days), psdf["date"] - psdf[col]
+                )
             else:
                 self.assertRaises(TypeError, lambda: psdf["date"] - psdf[col])
         pdf, psdf = self.date_pdf, self.date_psdf
-        self.assert_eq((pdf["this"] - pdf["that"]).dt.days, psdf["this"] - psdf["that"])
+        self.assert_eq(
+            (pdf["this"] - pdf["that"]).apply(lambda x: x.days), psdf["this"] - psdf["that"]
+        )
 
     def test_mul(self):
         self.assertRaises(TypeError, lambda: self.psser * "x")
@@ -128,15 +128,11 @@ def test_radd(self):
         self.assertRaises(TypeError, lambda: 1 + self.psser)
         self.assertRaises(TypeError, lambda: self.some_date + self.psser)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43570): Enable DateOpsTests.test_rsub for pandas 2.0.0.",
-    )
     def test_rsub(self):
         self.assertRaises(TypeError, lambda: "x" - self.psser)
         self.assertRaises(TypeError, lambda: 1 - self.psser)
         self.assert_eq(
-            (self.some_date - self.pser).dt.days,
+            (self.some_date - self.pser).apply(lambda x: x.days),
             self.some_date - self.psser,
         )
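The switch from `.dt.days` to `.apply(lambda x: x.days)` is needed because these columns hold `datetime.date` objects: under pandas 2.0 their difference keeps object dtype (a Series of `datetime.timedelta` values) rather than being coerced to `timedelta64`, so the `.dt` accessor is unavailable. Extracting `.days` per element works on either representation. A minimal sketch in plain pandas:

```python
import datetime
import pandas as pd

this = pd.Series([datetime.date(2022, 1, 3), datetime.date(2022, 2, 1)])
that = pd.Series([datetime.date(2022, 1, 1), datetime.date(2022, 1, 1)])

diff = this - that                   # elementwise datetime.timedelta values
days = diff.apply(lambda x: x.days)  # .days exists on timedelta and Timedelta alike
print(days.tolist())                 # [2, 31]
```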

python/pyspark/pandas/tests/groupby/test_aggregate.py (+2, -10)

@@ -40,10 +40,6 @@ def pdf(self):
     def psdf(self):
         return ps.from_pandas(self.pdf)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-44289): Enable GroupbyAggregateTests.test_aggregate for pandas 2.0.0.",
-    )
     def test_aggregate(self):
         pdf = pd.DataFrame(
             {"A": [1, 1, 2, 2], "B": [1, 2, 3, 4], "C": [0.362, 0.227, 1.267, -0.562]}
@@ -173,12 +169,8 @@ def sort(df):
         stats_psdf = psdf.groupby(10).agg({20: ["min", "max"], 30: "sum"})
         stats_pdf = pdf.groupby(10).agg({20: ["min", "max"], 30: "sum"})
         self.assert_eq(
-            stats_psdf.sort_values(by=[(20, "min"), (20, "max"), (30, "sum")]).reset_index(
-                drop=True
-            ),
-            stats_pdf.sort_values(by=[(20, "min"), (20, "max"), (30, "sum")]).reset_index(
-                drop=True
-            ),
+            stats_psdf.reset_index(drop=True),
+            stats_pdf.reset_index(drop=True),
         )
 
     def test_aggregate_func_str_list(self):

python/pyspark/pandas/tests/groupby/test_apply_func.py (+3, -8)

@@ -42,10 +42,6 @@ def pdf(self):
     def psdf(self):
         return ps.from_pandas(self.pdf)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43708): Enable GroupByTests.test_apply " "for pandas 2.0.0.",
-    )
     def test_apply(self):
         pdf = pd.DataFrame(
             {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 2, 3, 5, 8], "c": [1, 4, 9, 16, 25, 36]},
@@ -87,14 +83,17 @@ def test_apply(self):
         self.assert_eq(
             psdf.groupby(psdf.b // 5).apply(lambda x: x + x.min()).sort_index(),
             pdf.groupby(pdf.b // 5).apply(lambda x: x + x.min()).sort_index(),
+            almost=True,
         )
         self.assert_eq(
             psdf.groupby(psdf.b // 5)["a"].apply(lambda x: x + x.min()).sort_index(),
             pdf.groupby(pdf.b // 5)["a"].apply(lambda x: x + x.min()).sort_index(),
+            almost=True,
         )
         self.assert_eq(
             psdf.groupby(psdf.b // 5)[["a"]].apply(lambda x: x + x.min()).sort_index(),
             pdf.groupby(pdf.b // 5)[["a"]].apply(lambda x: x + x.min()).sort_index(),
+            almost=True,
         )
         self.assert_eq(
             psdf.groupby(psdf.b // 5)[["a"]].apply(len).sort_index(),
@@ -139,10 +138,6 @@ def test_apply(self):
             pdf.groupby([("x", "a"), ("x", "b")]).apply(len).sort_index(),
         )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43706): Enable GroupByTests.test_apply_without_shortcut " "for pandas 2.0.0.",
-    )
     def test_apply_without_shortcut(self):
         with option_context("compute.shortcut_limit", 0):
             self.test_apply()
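The added `almost=True` flags loosen `assert_eq`, pandas-on-Spark's test comparator, since pandas 2.0's groupby-apply output can differ from the pandas-on-Spark result in exact dtype or minor numeric detail. Outside these test utilities, a roughly analogous plain-pandas idiom is:

```python
import pandas as pd
from pandas.testing import assert_series_equal

left = pd.Series([1.0, 2.0, 3.0])
right = pd.Series([1, 2, 3])

# check_exact=False compares numbers within a tolerance and check_dtype=False
# tolerates int-vs-float differences, much like assert_eq(..., almost=True).
assert_series_equal(left, right, check_exact=False, check_dtype=False)
```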

python/pyspark/pandas/tests/groupby/test_groupby.py (+3, -4)

@@ -769,10 +769,6 @@ def test_unique(self):
         for act, exp in zip(actual, expect):
             self.assertTrue(sorted(act) == sorted(exp))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43444): Enable GroupBySlowTests.test_value_counts for pandas 2.0.0.",
-    )
     def test_value_counts(self):
         pdf = pd.DataFrame(
             {"A": [np.nan, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, np.nan]}, columns=["A", "B"]
@@ -785,6 +781,7 @@ def test_value_counts(self):
         self.assert_eq(
             psdf.groupby("A")["B"].value_counts(dropna=False).sort_index(),
             pdf.groupby("A")["B"].value_counts(dropna=False).sort_index(),
+            almost=True,
         )
         self.assert_eq(
             psdf.groupby("A", dropna=False)["B"].value_counts(dropna=False).sort_index(),
@@ -804,6 +801,7 @@ def test_value_counts(self):
             pdf.groupby("A")["B"]
             .value_counts(sort=True, ascending=False, dropna=False)
             .sort_index(),
+            almost=True,
         )
         self.assert_eq(
             psdf.groupby("A")["B"]
@@ -812,6 +810,7 @@ def test_value_counts(self):
             pdf.groupby("A")["B"]
             .value_counts(sort=True, ascending=True, dropna=False)
             .sort_index(),
+            almost=True,
         )
         self.assert_eq(
             psdf.B.rename().groupby(psdf.A).value_counts().sort_index(),

python/pyspark/pandas/tests/indexes/test_base.py (-24)

@@ -1577,10 +1577,6 @@ def test_asof(self):
         psmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")])
         self.assertRaises(NotImplementedError, lambda: psmidx.asof(("a", "b")))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43608): Enable IndexesTests.test_union for pandas 2.0.0.",
-    )
     def test_union(self):
         # Index
         pidx1 = pd.Index([1, 2, 3, 4])
@@ -1593,13 +1589,6 @@ def test_union(self):
         self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
         self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
         self.assert_eq(psidx1.union(psidx3), pidx1.union(pidx3))
-        # Deprecated case, but adding to track if pandas stop supporting union
-        # as a set operation. It should work fine until stop supporting anyway.
-        # No longer supported from pandas 2.0.0.
-        if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-            self.assert_eq(psidx1 | psidx2, ps.Index([3, 4], dtype="int64"))
-        else:
-            self.assert_eq(pidx1 | pidx2, psidx1 | psidx2)
 
         self.assert_eq(psidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True)
         self.assert_eq(psidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True)
@@ -1904,10 +1893,6 @@ def test_hasnans(self):
         psmidx = ps.Index([("a", 1), ("b", 2)])
         self.assertRaises(NotImplementedError, lambda: psmidx.hasnans())
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43607): Enable IndexesTests.test_intersection for pandas 2.0.0.",
-    )
     def test_intersection(self):
         pidx = pd.Index([1, 2, 3, 4], name="Koalas")
         psidx = ps.from_pandas(pidx)
@@ -1919,15 +1904,6 @@ def test_intersection(self):
         self.assert_eq(
             (pidx + 1).intersection(pidx_other), (psidx + 1).intersection(psidx_other).sort_values()
         )
-        # Deprecated case, but adding to track if pandas stop supporting intersection
-        # as a set operation. It should work fine until stop supporting anyway.
-        # No longer supported from pandas 2.0.0.
-        if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-            self.assert_eq(
-                (psidx & psidx_other).sort_values(), ps.Index([3, 1, 7, 1], dtype="int64")
-            )
-        else:
-            self.assert_eq(pidx & pidx_other, (psidx & psidx_other).sort_values())
 
         pidx_other_different_name = pd.Index([3, 4, 5, 6], name="Databricks")
         psidx_other_different_name = ps.from_pandas(pidx_other_different_name)
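The deleted branches exercised `|` and `&` on `Index` objects. pandas 2.0 turned those operators into elementwise logical/bitwise operations, so set semantics now require the explicit methods, as the retained assertions already do. A minimal sketch of the difference in plain pandas:

```python
import pandas as pd

idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index([3, 4, 5, 6])

print(idx1.union(idx2))         # Index([1, 2, 3, 4, 5, 6], dtype='int64')
print(idx1.intersection(idx2))  # Index([3, 4], dtype='int64')

# On pandas >= 2.0 this is elementwise bitwise OR, not a set union:
print(idx1 | idx2)              # Index([3, 6, 7, 6], dtype='int64')
```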

python/pyspark/pandas/tests/indexes/test_datetime.py (-5)

@@ -161,11 +161,6 @@ def test_strftime(self):
             psidx.strftime(date_format="%B %d, %Y"), pidx.strftime(date_format="%B %d, %Y")
         )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43644): Enable DatetimeIndexTests.test_indexer_between_time "
-        "for pandas 2.0.0.",
-    )
     def test_indexer_between_time(self):
         for psidx, pidx in self.idx_pairs:
             self.assert_eq(

python/pyspark/pandas/tests/indexes/test_reindex.py (+4, -5)

@@ -39,10 +39,6 @@ def df_pair(self):
         psdf = ps.from_pandas(pdf)
         return pdf, psdf
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43811): Enable DataFrameTests.test_reindex for pandas 2.0.0.",
-    )
     def test_reindex(self):
         index = pd.Index(["A", "B", "C", "D", "E"])
         columns = pd.Index(["numbers"])
@@ -64,9 +60,12 @@ def test_reindex(self):
             psdf.reindex(["A", "B", "C"], columns=["numbers", "2", "3"]).sort_index(),
         )
 
+        # We manually test this due to the bug in pandas.
+        expected_result = ps.DataFrame([1.0, 2.0, 3.0], index=ps.Index(["A", "B", "C"]))
+        expected_result.columns = pd.Index(["numbers"], name="cols")
         self.assert_eq(
-            pdf.reindex(["A", "B", "C"], index=["numbers", "2", "3"]).sort_index(),
             psdf.reindex(["A", "B", "C"], index=["numbers", "2", "3"]).sort_index(),
+            expected_result,
         )
 
         self.assert_eq(
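For reference, the basic `reindex` contract these assertions cover: labels absent from the original index come back as NaN rows. A minimal sketch in plain pandas (the hard-coded `expected_result` above stands in for the pandas-side comparison because of the pandas bug noted in the diff comment):

```python
import pandas as pd

pdf = pd.DataFrame({"numbers": [1.0, 2.0, 3.0]},
                   index=pd.Index(["A", "B", "C"], name="index"))

# "D" is not in the original index, so its row is filled with NaN.
print(pdf.reindex(["A", "B", "D"]))
```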
