From a470f02cce4efd66d0ba4b81657d9bce9334c9d8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 6 Apr 2020 18:55:03 -0500 Subject: [PATCH 01/12] ENH: Implement StringArray.min / max --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/string_.py | 16 ++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 12 ++++++++++++ 3 files changed, 29 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8a7db87b75d7b..89629aea38a55 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -354,6 +354,7 @@ Strings ^^^^^^^ - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). +- Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. (:issue:`31746`) - diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index dbca8e74f5e1b..5390a5c3a2603 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -13,6 +13,7 @@ from pandas import compat from pandas.core import ops +from pandas.core.array_algos import masked_reductions from pandas.core.arrays import IntegerArray, PandasArray from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -282,8 +283,23 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna, **kwargs) + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def min(self, axis=None, out=None, keepdims=False, skipna=True): + result = masked_reductions.min( + values=self.to_numpy(na_value=np.nan), mask=self.isna(), skipna=skipna + ) + return result + + def max(self, axis=None, out=None, keepdims=False, skipna=True): + result = masked_reductions.max( + values=self.to_numpy(na_value=np.nan), mask=self.isna(), skipna=skipna + ) + return result + def value_counts(self, dropna=False): from pandas import value_counts diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index fe770eed84b62..93a4fd2f048d4 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -230,6 +230,18 @@ def test_reduce(skipna): assert result == "abc" +@pytest.mark.parametrize("method", ["min", "max"]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_min_max(method, skipna): + arr = pd.Series(["a", "b", "c", None], dtype="string") + result = getattr(arr, method)(skipna=skipna) + if skipna: + expected = "a" if method == "min" else "c" + assert result == expected + else: + assert result is pd.NA + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna): From a50a2c49629a1eab3fccb131246d68bd341009af Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 6 Apr 2020 21:51:19 -0500 Subject: [PATCH 02/12] Skip --- pandas/tests/extension/base/reduce.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 6f433d659575a..af7d382d198ae 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -25,6 +25,13 @@ class BaseNoReduceTests(BaseReduceTests): @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + if isinstance(data, pd.arrays.StringArray) and all_numeric_reductions in [ + "min", + "max", + ]: + # Tested in pandas/tests/arrays/string_/test_string.py + pytest.skip("These reductions are implemented") + op_name = all_numeric_reductions s = pd.Series(data) From 6e8f0c530093e390fc75a485905d156b80762cf3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 7 Apr 2020 08:12:47 -0500 Subject: [PATCH 03/12] Update --- pandas/core/arrays/string_.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5390a5c3a2603..2c51ef67055ef 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,6 +4,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype @@ -288,15 +289,17 @@ def _reduce(self, name, skipna=True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - def min(self, axis=None, out=None, keepdims=False, skipna=True): + def min(self, skipna=True, **kwargs): + nv.validate_min((), kwargs) result = masked_reductions.min( - values=self.to_numpy(na_value=np.nan), mask=self.isna(), skipna=skipna + values=self.to_numpy(), mask=self.isna(), skipna=skipna ) return result - def max(self, axis=None, out=None, keepdims=False, skipna=True): + def max(self, skipna=True, **kwargs): + nv.validate_max((), kwargs) result = masked_reductions.max( - values=self.to_numpy(na_value=np.nan), mask=self.isna(), skipna=skipna + values=self.to_numpy(), mask=self.isna(), skipna=skipna ) return result From 34f8d5fa8f91043c5b08184e2e039551e9547a0a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 7 Apr 2020 15:48:14 -0500 Subject: [PATCH 04/12] Add numpy tests --- pandas/tests/arrays/string_/test_string.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 93a4fd2f048d4..15a95223a191f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -242,6 +242,14 @@ def test_min_max(method, skipna): assert result is pd.NA +@pytest.mark.parametrize("method", ["min", "max"]) +def test_min_max_numpy(method): + arr = pd.Series(["a", "b", "c", None], dtype="string") + result = getattr(np, method)(arr) + expected = "a" if method == "min" else "c" + assert result == expected + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna): From fa13cec0df48994e5bb14cd545c75dc59bcd9c78 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 8 Apr 2020 09:18:25 -0500 Subject: [PATCH 05/12] No kwargs --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2c51ef67055ef..3ba13e83addac 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -285,7 +285,7 @@ def astype(self, dtype, copy=True): def _reduce(self, name, skipna=True, **kwargs): if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, **kwargs) + return getattr(self, name)(skipna=skipna) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") From d804e4b64ad2dfc182014e75f4b4cb95ea459c91 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 8 Apr 2020 11:34:03 -0500 Subject: [PATCH 06/12] Type --- pandas/core/arrays/string_.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3ba13e83addac..0e108f258f246 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,6 +4,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas._typing import Scalar from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype @@ -289,14 +290,14 @@ def _reduce(self, name, skipna=True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - def min(self, skipna=True, **kwargs): + def min(self, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) return result - def max(self, skipna=True, **kwargs): + def max(self, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) result = masked_reductions.max( values=self.to_numpy(), mask=self.isna(), skipna=skipna From 5cc91e0f79cd56b5e0468799094c7ffdc9344a87 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 8 Apr 2020 13:56:44 -0500 Subject: [PATCH 07/12] Move to PandasArray --- pandas/core/arrays/numpy_.py | 20 ++++++++++++++------ pandas/core/arrays/string_.py | 17 ----------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 3058e1d6073f3..e9950e0edaffb 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -5,6 +5,7 @@ from numpy.lib.mixins import NDArrayOperatorsMixin from pandas._libs import lib +from pandas._typing import Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs @@ -17,6 +18,7 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer @@ -349,13 +351,19 @@ def all(self, axis=None, out=None, keepdims=False, skipna=True): nv.validate_all((), dict(out=out, keepdims=keepdims)) return nanops.nanall(self._ndarray, axis=axis, skipna=skipna) - def min(self, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_min((), dict(out=out, keepdims=keepdims)) - return nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + def min(self, skipna: bool = True, **kwargs) -> Scalar: + nv.validate_min((), kwargs) + result = masked_reductions.min( + values=self.to_numpy(), mask=self.isna(), skipna=skipna + ) + return result - def max(self, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_max((), dict(out=out, keepdims=keepdims)) - return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + def max(self, skipna: bool = True, **kwargs) -> Scalar: + nv.validate_max((), kwargs) + result = masked_reductions.max( + values=self.to_numpy(), mask=self.isna(), skipna=skipna + ) + return result def sum( self, diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0e108f258f246..51bbe182a002b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,8 +4,6 @@ import numpy as np from pandas._libs import lib, missing as libmissing -from pandas._typing import Scalar -from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype @@ -15,7 +13,6 @@ from pandas import compat from pandas.core import ops -from pandas.core.array_algos import masked_reductions from pandas.core.arrays import IntegerArray, PandasArray from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -290,20 +287,6 @@ def _reduce(self, name, skipna=True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - def min(self, skipna: bool = True, **kwargs) -> Scalar: - nv.validate_min((), kwargs) - result = masked_reductions.min( - values=self.to_numpy(), mask=self.isna(), skipna=skipna - ) - return result - - def max(self, skipna: bool = True, **kwargs) -> Scalar: - nv.validate_max((), kwargs) - result = masked_reductions.max( - values=self.to_numpy(), mask=self.isna(), skipna=skipna - ) - return result - def value_counts(self, dropna=False): from pandas import value_counts From 0c98b0856611eb98e79d6324f872d127ad36153e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 8 Apr 2020 13:58:45 -0500 Subject: [PATCH 08/12] Return None --- pandas/tests/extension/base/reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index af7d382d198ae..51ee0e30dd0b4 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -30,7 +30,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): "max", ]: # Tested in pandas/tests/arrays/string_/test_string.py - pytest.skip("These reductions are implemented") + return None op_name = all_numeric_reductions s = pd.Series(data) From 3613f659f00d64f55382ab56afe7def1f1ba03ef Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 13:57:59 -0500 Subject: [PATCH 09/12] Lint --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 75c935cdf2e60..80573f32b936e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,7 +8,7 @@ from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg From 208835c4ac5b4256a753c8397deece26bc158983 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 24 Apr 2020 10:23:37 -0500 Subject: [PATCH 10/12] Move test --- pandas/tests/extension/base/reduce.py | 7 ------- pandas/tests/extension/test_string.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 51ee0e30dd0b4..6f433d659575a 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -25,13 +25,6 @@ class BaseNoReduceTests(BaseReduceTests): @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - if isinstance(data, pd.arrays.StringArray) and all_numeric_reductions in [ - "min", - "max", - ]: - # Tested in pandas/tests/arrays/string_/test_string.py - return None - op_name = all_numeric_reductions s = pd.Series(data) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 86aed671f1b88..27a157d2127f6 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -77,7 +77,16 @@ class TestMissing(base.BaseMissingTests): class TestNoReduce(base.BaseNoReduceTests): - pass + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + + if op_name in ["min", "max"]: + return None + + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) class TestMethods(base.BaseMethodsTests): From 91971257812b42649c5b47a158afa07903a9b93f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 24 Apr 2020 12:13:32 -0500 Subject: [PATCH 11/12] Parametrize test --- pandas/tests/arrays/string_/test_string.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 15a95223a191f..eb89798a1ad96 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -243,8 +243,14 @@ def test_min_max(method, skipna): @pytest.mark.parametrize("method", ["min", "max"]) -def test_min_max_numpy(method): - arr = pd.Series(["a", "b", "c", None], dtype="string") +@pytest.mark.parametrize( + "arr", + [ + pd.Series(["a", "b", "c", None], dtype="string"), + pd.array(["a", "b", "c", None], dtype="string"), + ], +) +def test_min_max_numpy(method, arr): result = getattr(np, method)(arr) expected = "a" if method == "min" else "c" assert result == expected From 3fcd2005b2f38bcc3b46d150d6f99d995f7fba36 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 24 Apr 2020 14:49:46 -0500 Subject: [PATCH 12/12] Edit validator --- pandas/compat/numpy/function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 260cc69187d38..e988f5d97f7ee 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -218,7 +218,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") -MINMAX_DEFAULTS = dict(out=None, keepdims=False) +MINMAX_DEFAULTS = dict(axis=None, out=None, keepdims=False) validate_min = CompatValidator( MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 )