Skip to content

Commit b4cb45e

Browse files
authored
Matching behavior to pandas 1.1.4 (#1881)
Since pandas 1.1.4 has been released, this change upgrades the pandas version used in CI. From pandas 1.1.4, `MultiIndex.is_monotonic_increasing` and `MultiIndex.is_monotonic_decreasing` return `False` if the `MultiIndex` includes a null value; before pandas 1.1.4, the null value was treated as the smallest value. Refer to pandas-dev/pandas#37220 for more detail. Therefore, the affected behaviors are fixed to match, and more related tests are added. [What’s new in pandas 1.1.4](https://pandas.pydata.org/docs/whatsnew/v1.1.4.html)
1 parent 674cdd2 commit b4cb45e

File tree

3 files changed

+124
-52
lines changed

3 files changed

+124
-52
lines changed

.github/workflows/master.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ jobs:
107107
pyarrow-version: 0.15.1
108108
- python-version: 3.8
109109
spark-version: 3.0.1
110-
pandas-version: 1.1.3
110+
pandas-version: 1.1.4
111111
pyarrow-version: 1.0.1
112112
default-index-type: 'distributed-sequence'
113113
env:

databricks/koalas/indexes.py

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,7 @@
4242
import pyspark
4343
from pyspark import sql as spark
4444
from pyspark.sql import functions as F, Window
45-
from pyspark.sql.types import (
46-
BooleanType,
47-
DataType,
48-
NumericType,
49-
StringType,
50-
TimestampType,
51-
IntegralType,
52-
)
45+
from pyspark.sql.types import TimestampType, IntegralType, DataType
5346

5447
from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
5548
from databricks.koalas.config import get_option, option_context
@@ -60,10 +53,7 @@
6053
from databricks.koalas.series import Series, first_series
6154
from databricks.koalas.spark.accessors import SparkIndexMethods
6255
from databricks.koalas.utils import (
63-
compare_allow_null,
6456
compare_disallow_null,
65-
compare_null_first,
66-
compare_null_last,
6757
default_session,
6858
is_name_like_tuple,
6959
is_name_like_value,
@@ -2680,10 +2670,7 @@ def levshape(self) -> Tuple[int, ...]:
26802670

26812671
@staticmethod
26822672
def _comparator_for_monotonic_increasing(data_type):
2683-
if isinstance(data_type, BooleanType):
2684-
return compare_allow_null
2685-
else:
2686-
return compare_null_last
2673+
return compare_disallow_null
26872674

26882675
def _is_monotonic(self, order):
26892676
if order == "increasing":
@@ -2697,15 +2684,19 @@ def _is_monotonic_increasing(self):
26972684
prev = F.lag(scol, 1).over(window)
26982685

26992686
cond = F.lit(True)
2687+
has_not_null = F.lit(True)
27002688
for field in self.spark.data_type[::-1]:
27012689
left = scol.getField(field.name)
27022690
right = prev.getField(field.name)
27032691
compare = MultiIndex._comparator_for_monotonic_increasing(field.dataType)
2692+
# Since pandas 1.1.4, a MultiIndex with a null value at any level is never monotonic.
2693+
# Therefore, we must check `has_not_null` across all the levels.
2694+
has_not_null = has_not_null & left.isNotNull()
27042695
cond = F.when(left.eqNullSafe(right), cond).otherwise(
27052696
compare(left, right, spark.Column.__gt__)
27062697
)
27072698

2708-
cond = prev.isNull() | cond
2699+
cond = has_not_null & (prev.isNull() | cond)
27092700

27102701
internal = InternalFrame(
27112702
spark_frame=self._internal.spark_frame.select(
@@ -2719,30 +2710,27 @@ def _is_monotonic_increasing(self):
27192710

27202711
@staticmethod
27212712
def _comparator_for_monotonic_decreasing(data_type):
2722-
if isinstance(data_type, StringType):
2723-
return compare_disallow_null
2724-
elif isinstance(data_type, BooleanType):
2725-
return compare_allow_null
2726-
elif isinstance(data_type, NumericType):
2727-
return compare_null_last
2728-
else:
2729-
return compare_null_first
2713+
return compare_disallow_null
27302714

27312715
def _is_monotonic_decreasing(self):
27322716
scol = self.spark.column
27332717
window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
27342718
prev = F.lag(scol, 1).over(window)
27352719

27362720
cond = F.lit(True)
2721+
has_not_null = F.lit(True)
27372722
for field in self.spark.data_type[::-1]:
27382723
left = scol.getField(field.name)
27392724
right = prev.getField(field.name)
27402725
compare = MultiIndex._comparator_for_monotonic_decreasing(field.dataType)
2726+
# Since pandas 1.1.4, a MultiIndex with a null value at any level is never monotonic.
2727+
# Therefore, we must check `has_not_null` across all the levels.
2728+
has_not_null = has_not_null & left.isNotNull()
27412729
cond = F.when(left.eqNullSafe(right), cond).otherwise(
27422730
compare(left, right, spark.Column.__lt__)
27432731
)
27442732

2745-
cond = prev.isNull() | cond
2733+
cond = has_not_null & (prev.isNull() | cond)
27462734

27472735
internal = InternalFrame(
27482736
spark_frame=self._internal.spark_frame.select(

databricks/koalas/tests/test_indexes.py

Lines changed: 110 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -964,32 +964,36 @@ def test_monotonic(self):
964964
datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")])
965965
datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")])
966966

967-
# None type tests (None type is treated as the smallest value)
968-
datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
969-
datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
970-
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
971-
datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
972-
datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
973-
# The datas below cannot be an arguments for `MultiIndex.from_tuples` in pandas >= 1.1.0.
974-
# Refer https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for detail.
975-
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
976-
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
977-
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
978-
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])
967+
# boolean type tests
968+
datas.append([(True, True), (True, True)])
969+
datas.append([(True, True), (True, False)])
970+
datas.append([(True, False), (True, True)])
971+
datas.append([(False, True), (False, True)])
972+
datas.append([(False, True), (False, False)])
973+
datas.append([(False, False), (False, True)])
974+
datas.append([(True, True), (False, True)])
975+
datas.append([(True, True), (False, False)])
976+
datas.append([(True, False), (False, True)])
977+
datas.append([(False, True), (True, True)])
978+
datas.append([(False, True), (True, False)])
979+
datas.append([(False, False), (True, True)])
979980

980981
# duplicated index value tests
981982
datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
982983
datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")])
983-
datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
984-
datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
985-
datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
986-
datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])
987984

988985
# more depth tests
989986
datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")])
990987
datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")])
988+
989+
# None type tests (an index including None is never monotonic in pandas >= 1.1.4)
990+
# Refer to https://github.com/pandas-dev/pandas/issues/37220
991+
datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
992+
datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
993+
datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
994+
datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
995+
datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])
991996
datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")])
992-
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])
993997

994998
for data in datas:
995999
with self.subTest(data=data):
@@ -998,25 +1002,105 @@ def test_monotonic(self):
9981002
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
9991003
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
10001004

1001-
# The datas below are showing different result depends on pandas version.
1002-
# Because the behavior of handling null values is changed in pandas >= 1.0.0.
1005+
# The data below return different results depending on the pandas version,
1006+
# because the handling of null values changed in pandas 1.1.4.
1007+
# Since Koalas follows the latest pandas, all of them should return `False`.
10031008
datas = []
1004-
datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
1009+
datas.append([(1, 100), (2, 200), (3, None), (4, 400), (5, 500)])
1010+
datas.append([(1, None), (2, 200), (3, 300), (4, 400), (5, 500)])
1011+
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, None)])
1012+
datas.append([(False, None), (True, True)])
1013+
datas.append([(None, False), (True, True)])
1014+
datas.append([(False, False), (True, None)])
1015+
datas.append([(False, False), (None, True)])
1016+
datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
1017+
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])
10051018
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
1006-
datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
10071019
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
1020+
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
1021+
datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
1022+
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
1023+
datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
1024+
datas.append([(True, None), (True, True)])
1025+
datas.append([(None, True), (True, True)])
1026+
datas.append([(True, True), (None, True)])
1027+
datas.append([(True, True), (True, None)])
1028+
datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
1029+
datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
10081030
datas.append([("x", "d"), ("y", None), ("y", "c"), ("z", "a")])
10091031
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", "q"), ("z", "a", "r")])
10101032

10111033
for data in datas:
10121034
with self.subTest(data=data):
10131035
pmidx = pd.MultiIndex.from_tuples(data)
10141036
kmidx = ks.from_pandas(pmidx)
1015-
expected_increasing_result = pmidx.is_monotonic_increasing
1016-
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
1017-
expected_increasing_result = not expected_increasing_result
1018-
self.assert_eq(kmidx.is_monotonic_increasing, expected_increasing_result)
1019-
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1037+
if LooseVersion(pd.__version__) < LooseVersion("1.1.4"):
1038+
self.assert_eq(kmidx.is_monotonic_increasing, False)
1039+
self.assert_eq(kmidx.is_monotonic_decreasing, False)
1040+
else:
1041+
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1042+
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1043+
1044+
# The data below are tested in another way since they cannot be passed as arguments to
1045+
# `MultiIndex.from_tuples` in pandas >= 1.1.0.
1046+
# Refer to https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for details.
1047+
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
1048+
pmidx = pd.MultiIndex.from_tuples(
1049+
[(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
1050+
)
1051+
kmidx = ks.from_pandas(pmidx)
1052+
self.assert_eq(kmidx.is_monotonic_increasing, False)
1053+
self.assert_eq(kmidx.is_monotonic_decreasing, False)
1054+
1055+
pmidx = pd.MultiIndex.from_tuples(
1056+
[(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
1057+
)
1058+
kmidx = ks.from_pandas(pmidx)
1059+
self.assert_eq(kmidx.is_monotonic_increasing, False)
1060+
self.assert_eq(kmidx.is_monotonic_decreasing, False)
1061+
1062+
pmidx = pd.MultiIndex.from_tuples(
1063+
[(None, None), (None, None), (None, None), (None, None), (None, None)]
1064+
)
1065+
kmidx = ks.from_pandas(pmidx)
1066+
self.assert_eq(kmidx.is_monotonic_increasing, False)
1067+
self.assert_eq(kmidx.is_monotonic_decreasing, False)
1068+
1069+
pmidx = pd.MultiIndex.from_tuples([(None, None)])
1070+
kmidx = ks.from_pandas(pmidx)
1071+
self.assert_eq(kmidx.is_monotonic_increasing, False)
1072+
self.assert_eq(kmidx.is_monotonic_decreasing, False)
1073+
else:
1074+
# [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
1075+
kdf = ks.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]})
1076+
kdf["b"] = None
1077+
kmidx = kdf.set_index(["a", "b"]).index
1078+
pmidx = kmidx.to_pandas()
1079+
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1080+
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1081+
# [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
1082+
kdf = ks.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]})
1083+
kdf["a"] = None
1084+
kmidx = kdf.set_index(["a", "b"]).index
1085+
pmidx = kmidx.to_pandas()
1086+
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1087+
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1088+
# [(None, None), (None, None), (None, None), (None, None), (None, None)]
1089+
kdf = ks.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]})
1090+
kdf["a"] = None
1091+
kdf["b"] = None
1092+
kmidx = kdf.set_index(["a", "b"]).index
1093+
pmidx = kmidx.to_pandas()
1094+
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1095+
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
1096+
# [(None, None)]
1097+
kdf = ks.DataFrame({"a": [1], "b": [1]})
1098+
kdf["a"] = None
1099+
kdf["b"] = None
1100+
kmidx = kdf.set_index(["a", "b"]).index
1101+
pmidx = kmidx.to_pandas()
1102+
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
1103+
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
10201104

10211105
def test_difference(self):
10221106
# Index

0 commit comments

Comments
 (0)