BUG: sorting with large float and multiple columns incorrect

uweschmitt · jreback · commit 6bea8275e504 · 2016-12-23T16:03:47.000-05:00
closes #14922. Having the `int` equivalent of `NaT` in an `int64` column caused incorrect sorting because this special value was treated as a missing value, which is only correct for `datetime64` columns. Author: Uwe <uwe.schmitt@id.ethz.ch> Closes #14944 from uweschmitt/fix-gh-14922 and squashes the following commits: c244438 [Uwe] further cleanup tests 4f28026 [Uwe] fixed typo in whatsnew/v0.20.0.txt 60cca5d [Uwe] add fix of GH14922 to release notes for 0.20.0 04dcbe8 [Uwe] further test cleanup 21e610c [Uwe] extended tests + minor cleanup 358a31e [Uwe] Merge branch 'fix-gh-14922' of github.com:uweschmitt/pandas into fix-gh-14922 03699c6 [Uwe] Fix GH 14922 1afdbb8 [Uwe] Fix GH 14922
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -283,6 +283,7 @@ Bug Fixes
 - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
 - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
 - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
+- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains the integer equivalent of ``NaT`` (:issue:`14922`)
 
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -349,7 +349,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
 
     table = hash_klass(size_hint or len(vals))
     uniques = vec_klass()
-    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)
+    check_nulls = not is_integer_dtype(values)
+    labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)
 
     labels = _ensure_platform_int(labels)
 
diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py
@@ -6,7 +6,7 @@
 
 from pandas.compat import lrange
 from pandas import (DataFrame, Series, MultiIndex, Timestamp,
-                    date_range)
+                    date_range, NaT)
 
 from pandas.util.testing import (assert_series_equal,
                                  assert_frame_equal,
@@ -491,3 +491,49 @@ def test_frame_column_inplace_sort_exception(self):
 
         cp = s.copy()
         cp.sort_values()  # it works!
+
+    def test_sort_nat_values_in_int_column(self):
+
+        # GH 14922: "sorting with large float and multiple columns incorrect"
+
+        # The cause was that the int64 equivalent of NaT was treated as "na",
+        # which is only correct for datetime64 columns.
+
+        int_values = (2, int(NaT))
+        float_values = (2.0, -1.797693e308)
+
+        df = DataFrame(dict(int=int_values, float=float_values),
+                       columns=["int", "float"])
+
+        df_reversed = DataFrame(dict(int=int_values[::-1],
+                                     float=float_values[::-1]),
+                                columns=["int", "float"],
+                                index=[1, 0])
+
+        # NaT is not a "na" for int64 columns, so na_position must not
+        # influence the result:
+        df_sorted = df.sort_values(["int", "float"], na_position="last")
+        assert_frame_equal(df_sorted, df_reversed)
+
+        df_sorted = df.sort_values(["int", "float"], na_position="first")
+        assert_frame_equal(df_sorted, df_reversed)
+
+        # reverse sorting order
+        df_sorted = df.sort_values(["int", "float"], ascending=False)
+        assert_frame_equal(df_sorted, df)
+
+        # and now check if NaT is still considered as "na" for datetime64
+        # columns:
+        df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
+                            float=float_values), columns=["datetime", "float"])
+
+        df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
+                                     float=float_values[::-1]),
+                                columns=["datetime", "float"],
+                                index=[1, 0])
+
+        df_sorted = df.sort_values(["datetime", "float"], na_position="first")
+        assert_frame_equal(df_sorted, df_reversed)
+
+        df_sorted = df.sort_values(["datetime", "float"], na_position="last")
+        assert_frame_equal(df_sorted, df_reversed)