Matching behavior to pandas 1.1.4 (#1881)

itholic · web-flow · commit b4cb45eec4a5 · 2020-11-10T19:06:33.000-08:00
Now that pandas 1.1.4 has been released, this change upgrades the pandas version used in CI. As of pandas 1.1.4, `MultiIndex.is_monotonic_increasing` and `MultiIndex.is_monotonic_decreasing` return `False` if the `MultiIndex` contains a null value; before pandas 1.1.4, a null value was treated as the smallest value. Refer to pandas-dev/pandas#37220 for more detail. The corresponding Koalas behavior is updated to match, and more related tests are added. [What’s new in pandas 1.1.4](https://pandas.pydata.org/docs/whatsnew/v1.1.4.html)
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
@@ -107,7 +107,7 @@ jobs:
             pyarrow-version: 0.15.1
           - python-version: 3.8
             spark-version: 3.0.1
-            pandas-version: 1.1.3
+            pandas-version: 1.1.4
             pyarrow-version: 1.0.1
             default-index-type: 'distributed-sequence'
     env:
diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py
@@ -42,14 +42,7 @@
 import pyspark
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Window
-from pyspark.sql.types import (
-    BooleanType,
-    DataType,
-    NumericType,
-    StringType,
-    TimestampType,
-    IntegralType,
-)
+from pyspark.sql.types import TimestampType, IntegralType, DataType
 
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
 from databricks.koalas.config import get_option, option_context
@@ -60,10 +53,7 @@
 from databricks.koalas.series import Series, first_series
 from databricks.koalas.spark.accessors import SparkIndexMethods
 from databricks.koalas.utils import (
-    compare_allow_null,
     compare_disallow_null,
-    compare_null_first,
-    compare_null_last,
     default_session,
     is_name_like_tuple,
     is_name_like_value,
@@ -2680,10 +2670,7 @@ def levshape(self) -> Tuple[int, ...]:
 
     @staticmethod
     def _comparator_for_monotonic_increasing(data_type):
-        if isinstance(data_type, BooleanType):
-            return compare_allow_null
-        else:
-            return compare_null_last
+        return compare_disallow_null
 
     def _is_monotonic(self, order):
         if order == "increasing":
@@ -2697,15 +2684,19 @@ def _is_monotonic_increasing(self):
         prev = F.lag(scol, 1).over(window)
 
         cond = F.lit(True)
+        has_not_null = F.lit(True)
         for field in self.spark.data_type[::-1]:
             left = scol.getField(field.name)
             right = prev.getField(field.name)
             compare = MultiIndex._comparator_for_monotonic_increasing(field.dataType)
+            # Since pandas 1.1.4, a null value at any level of a MultiIndex makes it
+            # non-monotonic, so accumulate `has_not_null` across all levels.
+            has_not_null = has_not_null & left.isNotNull()
             cond = F.when(left.eqNullSafe(right), cond).otherwise(
                 compare(left, right, spark.Column.__gt__)
             )
 
-        cond = prev.isNull() | cond
+        cond = has_not_null & (prev.isNull() | cond)
 
         internal = InternalFrame(
             spark_frame=self._internal.spark_frame.select(
@@ -2719,30 +2710,27 @@ def _is_monotonic_increasing(self):
 
     @staticmethod
     def _comparator_for_monotonic_decreasing(data_type):
-        if isinstance(data_type, StringType):
-            return compare_disallow_null
-        elif isinstance(data_type, BooleanType):
-            return compare_allow_null
-        elif isinstance(data_type, NumericType):
-            return compare_null_last
-        else:
-            return compare_null_first
+        return compare_disallow_null
 
     def _is_monotonic_decreasing(self):
         scol = self.spark.column
         window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
         prev = F.lag(scol, 1).over(window)
 
         cond = F.lit(True)
+        has_not_null = F.lit(True)
         for field in self.spark.data_type[::-1]:
             left = scol.getField(field.name)
             right = prev.getField(field.name)
             compare = MultiIndex._comparator_for_monotonic_decreasing(field.dataType)
+            # Since pandas 1.1.4, a null value at any level of a MultiIndex makes it
+            # non-monotonic, so accumulate `has_not_null` across all levels.
+            has_not_null = has_not_null & left.isNotNull()
             cond = F.when(left.eqNullSafe(right), cond).otherwise(
                 compare(left, right, spark.Column.__lt__)
             )
 
-        cond = prev.isNull() | cond
+        cond = has_not_null & (prev.isNull() | cond)
 
         internal = InternalFrame(
             spark_frame=self._internal.spark_frame.select(
diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
@@ -964,32 +964,36 @@ def test_monotonic(self):
         datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")])
         datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")])
 
-        # None type tests (None type is treated as the smallest value)
-        datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
-        datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
-        datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
-        datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
-        datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
-        # The datas below cannot be an arguments for `MultiIndex.from_tuples` in pandas >= 1.1.0.
-        # Refer https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for detail.
-        if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
-            datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
-            datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
-            datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])
+        # boolean type tests
+        datas.append([(True, True), (True, True)])
+        datas.append([(True, True), (True, False)])
+        datas.append([(True, False), (True, True)])
+        datas.append([(False, True), (False, True)])
+        datas.append([(False, True), (False, False)])
+        datas.append([(False, False), (False, True)])
+        datas.append([(True, True), (False, True)])
+        datas.append([(True, True), (False, False)])
+        datas.append([(True, False), (False, True)])
+        datas.append([(False, True), (True, True)])
+        datas.append([(False, True), (True, False)])
+        datas.append([(False, False), (True, True)])
 
         # duplicated index value tests
         datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
         datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")])
-        datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
-        datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
-        datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
-        datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])
 
         # more depth tests
         datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")])
         datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")])
+
+        # None type tests (an index containing None is not monotonic in pandas >= 1.1.4)
+        # Refer to https://github.com/pandas-dev/pandas/issues/37220
+        datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
+        datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
+        datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
+        datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
+        datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])
         datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")])
-        datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])
 
         for data in datas:
             with self.subTest(data=data):
@@ -998,25 +1002,105 @@ def test_monotonic(self):
                 self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
                 self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
 
-        # The datas below are showing different result depends on pandas version.
-        # Because the behavior of handling null values is changed in pandas >= 1.0.0.
+        # The datas below return different results depending on the pandas version,
+        # because the handling of null values changed in pandas 1.1.4.
+        # Since Koalas follows the latest pandas, all of them should return `False`.
         datas = []
-        datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
+        datas.append([(1, 100), (2, 200), (3, None), (4, 400), (5, 500)])
+        datas.append([(1, None), (2, 200), (3, 300), (4, 400), (5, 500)])
+        datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, None)])
+        datas.append([(False, None), (True, True)])
+        datas.append([(None, False), (True, True)])
+        datas.append([(False, False), (True, None)])
+        datas.append([(False, False), (None, True)])
+        datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
+        datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])
         datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
-        datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
         datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
+        datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
+        datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
+        datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
+        datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
+        datas.append([(True, None), (True, True)])
+        datas.append([(None, True), (True, True)])
+        datas.append([(True, True), (None, True)])
+        datas.append([(True, True), (True, None)])
+        datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
+        datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
         datas.append([("x", "d"), ("y", None), ("y", "c"), ("z", "a")])
         datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", "q"), ("z", "a", "r")])
 
         for data in datas:
             with self.subTest(data=data):
                 pmidx = pd.MultiIndex.from_tuples(data)
                 kmidx = ks.from_pandas(pmidx)
-                expected_increasing_result = pmidx.is_monotonic_increasing
-                if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
-                    expected_increasing_result = not expected_increasing_result
-                self.assert_eq(kmidx.is_monotonic_increasing, expected_increasing_result)
-                self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
+                if LooseVersion(pd.__version__) < LooseVersion("1.1.4"):
+                    self.assert_eq(kmidx.is_monotonic_increasing, False)
+                    self.assert_eq(kmidx.is_monotonic_decreasing, False)
+                else:
+                    self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
+                    self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
+
+        # The datas below are tested differently since they cannot be passed to
+        # `MultiIndex.from_tuples` in pandas >= 1.1.0.
+        # See https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for details.
+        if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
+            pmidx = pd.MultiIndex.from_tuples(
+                [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
+            )
+            kmidx = ks.from_pandas(pmidx)
+            self.assert_eq(kmidx.is_monotonic_increasing, False)
+            self.assert_eq(kmidx.is_monotonic_decreasing, False)
+
+            pmidx = pd.MultiIndex.from_tuples(
+                [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
+            )
+            kmidx = ks.from_pandas(pmidx)
+            self.assert_eq(kmidx.is_monotonic_increasing, False)
+            self.assert_eq(kmidx.is_monotonic_decreasing, False)
+
+            pmidx = pd.MultiIndex.from_tuples(
+                [(None, None), (None, None), (None, None), (None, None), (None, None)]
+            )
+            kmidx = ks.from_pandas(pmidx)
+            self.assert_eq(kmidx.is_monotonic_increasing, False)
+            self.assert_eq(kmidx.is_monotonic_decreasing, False)
+
+            pmidx = pd.MultiIndex.from_tuples([(None, None)])
+            kmidx = ks.from_pandas(pmidx)
+            self.assert_eq(kmidx.is_monotonic_increasing, False)
+            self.assert_eq(kmidx.is_monotonic_decreasing, False)
+        else:
+            # [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
+            kdf = ks.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]})
+            kdf["b"] = None
+            kmidx = kdf.set_index(["a", "b"]).index
+            pmidx = kmidx.to_pandas()
+            self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
+            self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
+            # [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
+            kdf = ks.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]})
+            kdf["a"] = None
+            kmidx = kdf.set_index(["a", "b"]).index
+            pmidx = kmidx.to_pandas()
+            self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
+            self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
+            # [(None, None), (None, None), (None, None), (None, None), (None, None)]
+            kdf = ks.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]})
+            kdf["a"] = None
+            kdf["b"] = None
+            kmidx = kdf.set_index(["a", "b"]).index
+            pmidx = kmidx.to_pandas()
+            self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
+            self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
+            # [(None, None)]
+            kdf = ks.DataFrame({"a": [1], "b": [1]})
+            kdf["a"] = None
+            kdf["b"] = None
+            kmidx = kdf.set_index(["a", "b"]).index
+            pmidx = kmidx.to_pandas()
+            self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
+            self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
 
     def test_difference(self):
         # Index