From a9921ba8aaae46db718c877100aee3cc39d07198 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 21 Jan 2020 11:57:10 -0600 Subject: [PATCH 1/5] COMPAT: Argsort position matches NumPy Closes https://github.com/pandas-dev/pandas/issues/29884 --- doc/source/whatsnew/v1.0.0.rst | 38 ++++++++++++++++++++++ pandas/core/indexes/datetimelike.py | 8 ++--- pandas/tests/indexes/datetimes/test_ops.py | 9 +++-- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ec6ad38bbc7cf..2f6d502901af1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -246,6 +246,44 @@ source, you should no longer need to install Cython into your build environment Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_100.api_breaking.nat_sort: + +Changed sort position for ``NaT`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:attr:`NaT` will now sort at the *end* rather than the beginning in sorting functions (:issue:`29884`). +This matches the behavior in NumPy 1.18 and newer, which makes the ``NaT`` behavior consistent +with other missing values like :attr:`numpy.nan`. + +.. ipython:: python + + values = pd.Index(['2001', 'NaT', '2000'], dtype='datetime64[ns]') + +*pandas 0.25.x* + +.. code-block:: python + + >>> values.sort_values() + DatetimeIndex(['NaT', '2000-01-01', '2001-01-01'], dtype='datetime64[ns]', freq=None) + + >>> values.argsort() + array([1, 2, 0]) + + +*pandas 1.0.0* + +.. ipython:: python + + values.sort_values() + values.argsort() + +This affects all sorting functions on indexes, Series, DataFrames, and arrays. + + +.. note:: + + This change was made between pandas 1.0.0rc0 and pandas 1.0.0. + .. _whatsnew_100.api_breaking.MultiIndex._names: Avoid using names from ``MultiIndex.levels`` diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1bfec9fbad0ed..d18a4c9086dfe 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -180,6 +180,9 @@ def map(self, mapper, na_action=None): except Exception: return self.astype(object).map(mapper) + def argsort(self, *args, **kwargs) -> np.ndarray: + return np.argsort(self._data) + def sort_values(self, return_indexer=False, ascending=True): """ Return sorted copy of Index. @@ -191,10 +194,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 - # because the treatment of NaT has been changed to put NaT last - # instead of first. - sorted_values = np.sort(self.asi8) + sorted_values = np.sort(self._ndarray_values) freq = self.freq if freq is not None and not is_period_dtype(self): diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index ecd4ace705e9e..1f59cf428748d 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -247,7 +247,7 @@ def test_order_with_freq(self, idx): ), ( [pd.NaT, "2011-01-03", "2011-01-05", "2011-01-02", pd.NaT], - [pd.NaT, pd.NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + ["2011-01-02", "2011-01-03", "2011-01-05", pd.NaT, pd.NaT], ), ], ) @@ -269,14 +269,17 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) ordered, indexer = index.sort_values(return_indexer=True) tm.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2]) + if index.isna().any(): + exp = np.array([3, 1, 2, 0, 4]) + else: + exp = np.array([0, 4, 3, 1, 2]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None ordered, indexer = index.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0]) + exp = exp[::-1] tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None From ee6b06884f4827f2cd6ef26d38cce5ff77f28d92 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 12:06:41 -0600 Subject: [PATCH 2/5] move to arrays --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/indexes/datetimelike.py | 6 +++--- pandas/tests/arrays/test_datetimelike.py | 8 ++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 70637026c278d..d9b0ee0fc0ba0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -710,7 +710,7 @@ def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) def _values_for_argsort(self): - return self._data + return self._data.view("M8[ns]") # ------------------------------------------------------------------ # Additional array methods diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d18a4c9086dfe..e718a58a6af1e 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -180,9 +180,6 @@ def map(self, mapper, na_action=None): except Exception: return self.astype(object).map(mapper) - def argsort(self, *args, **kwargs) -> np.ndarray: - return np.argsort(self._data) - def sort_values(self, return_indexer=False, ascending=True): """ Return sorted copy of Index. @@ -224,6 +221,9 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): self, indices, axis, allow_fill, fill_value, **kwargs ) + def argsort(self, *args, **kwargs) -> np.ndarray: + return np.argsort(self._data, *args, **kwargs) + @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): if isinstance(value, str): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 87b825c8c27bd..329311643c7f8 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -795,3 +795,11 @@ def test_to_numpy_extra(array): assert result[0] == result[1] tm.assert_equal(array, original) + + +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]", "Period[D]"]) +def test_argsort(dtype): + a = pd.array([2001, pd.NaT, 2000], dtype=dtype) + result = a.argsort() + expected = np.array([2, 0, 1]) + tm.assert_numpy_array_equal(result, expected) From f13c8abe7899e03f671a0a8e2ba263fee2d98816 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 12:12:15 -0600 Subject: [PATCH 3/5] wip --- pandas/core/indexes/datetimelike.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e718a58a6af1e..475400b1b7ec2 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -191,7 +191,8 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self._ndarray_values) + values = self._data + sorted_values = values[values.argsort()] freq = self.freq if freq is not None and not is_period_dtype(self): From c387d85d0eaff79f82c2d3c4f78cd5157d045f8c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 12:14:31 -0600 Subject: [PATCH 4/5] fixup --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 475400b1b7ec2..f1601c4535630 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -192,7 +192,7 @@ def sort_values(self, return_indexer=False, ascending=True): return sorted_index, _as else: values = self._data - sorted_values = values[values.argsort()] + sorted_values = values[values.argsort()].asi8 freq = self.freq if freq is not None and not is_period_dtype(self): From 3b01363eaf64c9836f4b2aca956452fd2240d5d2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 12:27:05 -0600 Subject: [PATCH 5/5] double-view --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f1601c4535630..ea89fb38e8977 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -192,7 +192,7 @@ def sort_values(self, return_indexer=False, ascending=True): return sorted_index, _as else: values = self._data - sorted_values = values[values.argsort()].asi8 + sorted_values = np.sort(values.view("M8[ns]")).view("i8") freq = self.freq if freq is not None and not is_period_dtype(self):