From 5db7e01cc50632306b43efe4346d23642d365922 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 11:40:02 +0100 Subject: [PATCH 01/16] TST: Add failing test for min/max on empty categorical. --- pandas/tests/reductions/test_reductions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index d66472b1c2054..2846d51b22d56 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1060,6 +1060,14 @@ def test_min_max_skipna(self, skipna): assert np.isnan(_min) assert np.isnan(_max) + def test_min_max_empty(self): + cat = Series( + Categorical([], categories=list("ABC"), ordered=True) + ) + + assert isna(cat.min()) + assert isna(cat.max()) + class TestSeriesMode: # Note: the name TestSeriesMode indicates these tests From 9c36841bf81abbca683b210b63d70235e7ddd376 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 11:42:58 +0100 Subject: [PATCH 02/16] BUG: min/max on empty categorical crashed --- pandas/core/arrays/categorical.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7170aec3820a8..41eb19d155004 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2126,6 +2126,10 @@ def min(self, skipna=True): min : the minimum of this `Categorical` """ self.check_for_ordered("min") + + if len(self._codes) == 0: + return np.nan + good = self._codes != -1 if not good.all(): if skipna: @@ -2153,6 +2157,10 @@ def max(self, skipna=True): max : the maximum of this `Categorical` """ self.check_for_ordered("max") + + if len(self._codes) == 0: + return np.nan + good = self._codes != -1 if not good.all(): if skipna: From 4d5df3a94b4b17e6b7aa1189d081ee051aaaf42c Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 12:25:25 +0100 Subject: [PATCH 03/16] DOC: Added whatsnew. --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 29060a93923eb..6609a3c1914e2 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -667,6 +667,7 @@ Categorical same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) +- Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception. Datetimelike From d788332e0953991ec88cdc6a09b1152824221982 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 12:27:46 +0100 Subject: [PATCH 04/16] CLN: Black test_reductions. --- pandas/tests/reductions/test_reductions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 2846d51b22d56..a73ad1a09d103 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1061,9 +1061,7 @@ def test_min_max_skipna(self, skipna): assert np.isnan(_max) def test_min_max_empty(self): - cat = Series( - Categorical([], categories=list("ABC"), ordered=True) - ) + cat = Series(Categorical([], categories=list("ABC"), ordered=True)) assert isna(cat.min()) assert isna(cat.max()) From 3ec816c0f53c7c0ba257569421606d73eeeab9e0 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 14:29:28 +0100 Subject: [PATCH 05/16] Added .. versionchanged and correct NA value for dtype. --- pandas/core/arrays/categorical.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 41eb19d155004..c84159fd1cdee 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -39,7 +39,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_hashable -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna from pandas._typing import ArrayLike, Dtype, Ordered from pandas.core import ops @@ -2116,6 +2116,10 @@ def min(self, skipna=True): Only ordered `Categoricals` have a minimum! + .. versionchanged:: 1.0.0 + + Returns an NA value on empty arrays + Raises ------ TypeError @@ -2128,7 +2132,7 @@ def min(self, skipna=True): self.check_for_ordered("min") if len(self._codes) == 0: - return np.nan + return na_value_for_dtype(self.dtype) good = self._codes != -1 if not good.all(): @@ -2147,6 +2151,10 @@ def max(self, skipna=True): Only ordered `Categoricals` have a maximum! + .. versionchanged:: 1.0.0 + + Returns an NA value on empty arrays + Raises ------ TypeError @@ -2159,7 +2167,7 @@ def max(self, skipna=True): self.check_for_ordered("max") if len(self._codes) == 0: - return np.nan + return na_value_for_dtype(self.dtype) good = self._codes != -1 if not good.all(): From 51f21d56a2a32afb06de78339e251bab11e61ad3 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 14:56:24 +0100 Subject: [PATCH 06/16] DOC: Add issue number to whatsnew. --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6609a3c1914e2..d4f58ec3086b0 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -667,7 +667,7 @@ Categorical same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) -- Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception. +- Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception (:issue:`30227`) Datetimelike From 8dee71e2aa9a83a7e8f75a95f67009ee5c16ad2a Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 15:00:01 +0100 Subject: [PATCH 07/16] CLN: Cleaner empty checks. --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c84159fd1cdee..ba0cfad20c748 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2131,7 +2131,7 @@ def min(self, skipna=True): """ self.check_for_ordered("min") - if len(self._codes) == 0: + if not len(self._codes): return na_value_for_dtype(self.dtype) good = self._codes != -1 @@ -2166,7 +2166,7 @@ def max(self, skipna=True): """ self.check_for_ordered("max") - if len(self._codes) == 0: + if not len(self._codes): return na_value_for_dtype(self.dtype) good = self._codes != -1 From c3f0d77b3474de6fad2a16fda24f1bdb839c8996 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 15:09:34 +0100 Subject: [PATCH 08/16] CLN: Move and clean up test_min_max_empty. --- pandas/tests/arrays/categorical/test_analytics.py | 9 +++++++++ pandas/tests/reductions/test_reductions.py | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 637a47eba0597..226033e9aebb1 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -35,6 +35,15 @@ def test_min_max(self): assert _min == "d" assert _max == "a" + def test_min_max_empty(self): + cat = Categorical([], categories=list("ABC"), ordered=True) + + result = cat.min() + assert isinstance(result, float) and np.isnan(result) + + result = cat.max() + assert isinstance(result, float) and np.isnan(result) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_with_nan(self, skipna): # GH 25303 diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index a73ad1a09d103..d66472b1c2054 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1060,12 +1060,6 @@ def test_min_max_skipna(self, skipna): assert np.isnan(_min) assert np.isnan(_max) - def test_min_max_empty(self): - cat = Series(Categorical([], categories=list("ABC"), ordered=True)) - - assert isna(cat.min()) - assert isna(cat.max()) - class TestSeriesMode: # Note: the name TestSeriesMode indicates these tests From f0efeeb05b362f1a27a9943c525bca8779b0ffaa Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 15:13:36 +0100 Subject: [PATCH 09/16] DOC: Add issue number comment to test. --- pandas/tests/arrays/categorical/test_analytics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 226033e9aebb1..3a7da69bc3edc 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -36,6 +36,7 @@ def test_min_max(self): assert _max == "a" def test_min_max_empty(self): + # GH 30227 cat = Categorical([], categories=list("ABC"), ordered=True) result = cat.min() From 1888238169933dc41ba2ff4fd9f9fd8cc283e1a0 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 15:37:40 +0100 Subject: [PATCH 10/16] TST: Parametrize test case on different categorical dtypes. --- .../tests/arrays/categorical/test_analytics.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 3a7da69bc3edc..febf37a0d0b48 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -5,7 +5,7 @@ from pandas.compat import PYPY -from pandas import Categorical, Index, Series +from pandas import Categorical, Index, Series, date_range, NaT from pandas.api.types import is_scalar import pandas.util.testing as tm @@ -35,15 +35,23 @@ def test_min_max(self): assert _min == "d" assert _max == "a" - def test_min_max_empty(self): + @pytest.mark.parametrize( + "categories,expected", + [ + (list("ABC"), np.NaN), + ([1, 2, 3], np.NaN), + (Series(date_range("2020-01-01", periods=3), dtype="category"), NaT), + ], + ) + def test_min_max_empty(self, categories, expected): # GH 30227 cat = Categorical([], categories=list("ABC"), ordered=True) result = cat.min() - assert isinstance(result, float) and np.isnan(result) + assert result is expected result = cat.max() - assert isinstance(result, float) and np.isnan(result) + assert result is expected @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_with_nan(self, skipna): From c74b8d095d64992b0ce808c95a1956a593159ad2 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Thu, 12 Dec 2019 15:54:25 +0100 Subject: [PATCH 11/16] TST: Split and rename tests. --- pandas/tests/arrays/categorical/test_analytics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index febf37a0d0b48..bcce9e3df5932 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -11,8 +11,7 @@ class TestCategoricalAnalytics: - def test_min_max(self): - + def test_min_max_not_ordered(self): # unordered cats have no min/max cat = Categorical(["a", "b", "c", "d"], ordered=False) msg = "Categorical is not ordered for operation {}" @@ -21,6 +20,7 @@ def test_min_max(self): with pytest.raises(TypeError, match=msg.format("max")): cat.max() + def test_min_max_ordered(self): cat = Categorical(["a", "b", "c", "d"], ordered=True) _min = cat.min() _max = cat.max() @@ -43,7 +43,7 @@ def test_min_max(self): (Series(date_range("2020-01-01", periods=3), dtype="category"), NaT), ], ) - def test_min_max_empty(self, categories, expected): + def test_min_max_ordered_empty(self, categories, expected): # GH 30227 cat = Categorical([], categories=list("ABC"), ordered=True) From 34ad68d1bd47b83724f8304869331c11ff5760c8 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 16 Dec 2019 12:57:25 +0100 Subject: [PATCH 12/16] TST; Fix test on datetime categorical. --- pandas/core/arrays/categorical.py | 4 ++-- pandas/tests/arrays/categorical/test_analytics.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ba0cfad20c748..5f98e66691331 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2132,7 +2132,7 @@ def min(self, skipna=True): self.check_for_ordered("min") if not len(self._codes): - return na_value_for_dtype(self.dtype) + return self.dtype.na_value good = self._codes != -1 if not good.all(): @@ -2167,7 +2167,7 @@ def max(self, skipna=True): self.check_for_ordered("max") if not len(self._codes): - return na_value_for_dtype(self.dtype) + return self.dtype.na_value good = self._codes != -1 if not good.all(): diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index bcce9e3df5932..c0e823d75fdb6 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -40,7 +40,7 @@ def test_min_max_ordered(self): [ (list("ABC"), np.NaN), ([1, 2, 3], np.NaN), - (Series(date_range("2020-01-01", periods=3), dtype="category"), NaT), + (Series(date_range("2020-01-01", periods=3), dtype="category"), np.NaN), ], ) def test_min_max_ordered_empty(self, categories, expected): From d8346ee1db49ac3da86696e7f15f86ac8053a66d Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 16 Dec 2019 13:24:27 +0100 Subject: [PATCH 13/16] CLN: Fixed unused import. --- pandas/tests/arrays/categorical/test_analytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c0e823d75fdb6..91c5121f55430 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -5,7 +5,7 @@ from pandas.compat import PYPY -from pandas import Categorical, Index, Series, date_range, NaT +from pandas import Categorical, Index, Series, date_range from pandas.api.types import is_scalar import pandas.util.testing as tm From 4dca2d94e5c49f67998933e1d57821656aedccef Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 16 Dec 2019 15:09:21 +0100 Subject: [PATCH 14/16] CLN: Parametrize tests over min,max --- .../arrays/categorical/test_analytics.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 91c5121f55430..6b066b385fe0a 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -11,14 +11,15 @@ class TestCategoricalAnalytics: - def test_min_max_not_ordered(self): + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_not_ordered_raises(self, aggregation): # unordered cats have no min/max cat = Categorical(["a", "b", "c", "d"], ordered=False) msg = "Categorical is not ordered for operation {}" - with pytest.raises(TypeError, match=msg.format("min")): - cat.min() - with pytest.raises(TypeError, match=msg.format("max")): - cat.max() + agg_func = getattr(cat, aggregation) + + with pytest.raises(TypeError, match=msg.format(aggregation)): + agg_func() def test_min_max_ordered(self): cat = Categorical(["a", "b", "c", "d"], ordered=True) @@ -43,14 +44,13 @@ def test_min_max_ordered(self): (Series(date_range("2020-01-01", periods=3), dtype="category"), np.NaN), ], ) - def test_min_max_ordered_empty(self, categories, expected): + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_ordered_empty(self, categories, expected, aggregation): # GH 30227 cat = Categorical([], categories=list("ABC"), ordered=True) - result = cat.min() - assert result is expected - - result = cat.max() + agg_func = getattr(cat, aggregation) + result = agg_func() assert result is expected @pytest.mark.parametrize("skipna", [True, False]) From 38bab518f628cbf6adac0e7e2f868e96a6b482e8 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 16 Dec 2019 16:42:09 +0100 Subject: [PATCH 15/16] CLN: Remove unused import. --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5f98e66691331..761d9907609c3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -39,7 +39,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_hashable -from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna +from pandas.core.dtypes.missing import isna, notna from pandas._typing import ArrayLike, Dtype, Ordered from pandas.core import ops From 890b27869585ad45c27dd235a43c5c58b01356ff Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 17 Dec 2019 15:32:01 +0100 Subject: [PATCH 16/16] TST: xfail datetime categorical tests. --- pandas/tests/arrays/categorical/test_analytics.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 6b066b385fe0a..3c3d9d7a44aec 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -5,7 +5,7 @@ from pandas.compat import PYPY -from pandas import Categorical, Index, Series, date_range +from pandas import Categorical, Index, NaT, Series, date_range from pandas.api.types import is_scalar import pandas.util.testing as tm @@ -41,7 +41,13 @@ def test_min_max_ordered(self): [ (list("ABC"), np.NaN), ([1, 2, 3], np.NaN), - (Series(date_range("2020-01-01", periods=3), dtype="category"), np.NaN), + pytest.param( + Series(date_range("2020-01-01", periods=3), dtype="category"), + NaT, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/29962" + ), + ), ], ) @pytest.mark.parametrize("aggregation", ["min", "max"])