From 4c04e30d5c543870dbccb1c8a1a4c51e8759b621 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sun, 2 Jul 2023 10:04:12 -0400 Subject: [PATCH 01/16] Support uncertainties as EA Dtype In parallel with Pint and Pint-Pandas, changes to support uncertainties as an EA Dtype. See https://github.com/hgrecco/pint/pull/1615 and https://github.com/hgrecco/pint-pandas/pull/140 Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/arrays/masked.py | 3 ++- pandas/core/groupby/groupby.py | 5 ++++- pandas/core/groupby/ops.py | 2 +- pandas/tests/extension/test_boolean.py | 10 +++++----- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 15c485cbb1499..15f80b5501029 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -870,7 +870,8 @@ def take( # we only fill where the indexer is null # not existing missing values # TODO(jreback) what if we have a non-na float as a fill value? 
- if allow_fill and notna(fill_value): + # NaN with uncertainties is scalar but does not register as `isna`, so use fact that NaN != NaN + if allow_fill and notna(fill_value) and fill_value==fill_value: fill_mask = np.asarray(indexer) == -1 result[fill_mask] = fill_value mask = mask ^ fill_mask diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 678ab7444bb58..9d10440061537 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3200,7 +3200,10 @@ def first(x: Series): """Helper function for first item that isn't NA.""" arr = x.array[notna(x.array)] if not len(arr): - return np.nan + nan_arr = x.array[isna(x.array)] + if not len(nan_arr): + return np.nan + return nan_arr[0] return arr[0] if isinstance(obj, DataFrame): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f0e4484f69f8d..8b7f8e1aee571 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -587,7 +587,7 @@ def nkeys(self) -> int: def get_iterator( self, data: NDFrameT, axis: AxisInt = 0 - ) -> Iterator[tuple[Hashable, NDFrameT]]: + ) -> Iterator[tuple[Hashable, NDFrameT]]: # Doesn't work with non-hashable EA types """ Groupby iterator diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index c9fa28a507745..18341b23237fe 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -28,7 +28,7 @@ def make_data(): - return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + return [True, False] * 4 + [pd.NA] + [True, False] * 44 + [pd.NA] + [True, False] @pytest.fixture @@ -48,7 +48,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return pd.array([np.nan, True], dtype=dtype) + return pd.array([pd.NA, True], dtype=dtype) @pytest.fixture @@ -58,7 +58,7 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return pd.array([True, np.nan, False], 
dtype=dtype) + return pd.array([True, pd.NA, False], dtype=dtype) @pytest.fixture @@ -76,7 +76,7 @@ def na_value(): def data_for_grouping(dtype): b = True a = False - na = np.nan + na = pd.NA return pd.array([b, b, na, na, a, a, b], dtype=dtype) @@ -147,7 +147,7 @@ def _check_op(self, obj, op, other, op_name, exc=NotImplementedError): expected = expected.astype("Float64") if op_name == "__rpow__": # for rpow, combine does not propagate NaN - expected[result.isna()] = np.nan + expected[result.isna()] = pd.NA self.assert_equal(result, expected) else: with pytest.raises(exc): From a165278a6176597395a512e3f44b83abb82b9a0e Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sun, 2 Jul 2023 10:15:23 -0400 Subject: [PATCH 02/16] Update masked.py Make black and ruff happy. Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/arrays/masked.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 15f80b5501029..0e5ac9ad2757b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -870,8 +870,9 @@ def take( # we only fill where the indexer is null # not existing missing values # TODO(jreback) what if we have a non-na float as a fill value? 
- # NaN with uncertainties is scalar but does not register as `isna`, so use fact that NaN != NaN - if allow_fill and notna(fill_value) and fill_value==fill_value: + # NaN with uncertainties is scalar but does not register as `isna`, + # so use fact that NaN != NaN + if allow_fill and notna(fill_value) and fill_value == fill_value: fill_mask = np.asarray(indexer) == -1 result[fill_mask] = fill_value mask = mask ^ fill_mask From 0b833aaecac9d5324a7de48cc9ce3d4210c9390e Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sun, 2 Jul 2023 17:03:53 -0400 Subject: [PATCH 03/16] Update test_json.py Somewhere along the line, https://github.com/pandas-dev/pandas/issues/39098 was fixed. Remove the XFAIL mark so we can celebrate one more success! Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/tests/extension/json/test_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 0920e70142446..ac63fd2411400 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -356,7 +356,7 @@ def test_groupby_extension_no_sort(self): """ super().test_groupby_extension_no_sort() - @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + # GH#39098 is now fixed, so we don't need to XFAIL def test_groupby_agg_extension(self, data_for_grouping): super().test_groupby_agg_extension(data_for_grouping) From 2a1913bcda29d52630d7a2a46d63390dc33e82db Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sun, 2 Jul 2023 17:47:19 -0400 Subject: [PATCH 04/16] Update test_json.py Fix pylint complaint about useless parent delegation. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/tests/extension/json/test_json.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index ac63fd2411400..85fa439f42ced 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -356,9 +356,10 @@ def test_groupby_extension_no_sort(self): """ super().test_groupby_extension_no_sort() - # GH#39098 is now fixed, so we don't need to XFAIL - def test_groupby_agg_extension(self, data_for_grouping): - super().test_groupby_agg_extension(data_for_grouping) + # GH#39098 is now fixed, so we don't need to XFAIL, nor subclass this test + # @pytest.mark.xpass(reason="GH#39098: Converts agg result to object") + # def test_groupby_agg_extension(self, data_for_grouping): + # super().test_groupby_agg_extension(data_for_grouping) class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): From 7d8ba79bfe29318a5ec305f6d1dd30c319677c2f Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 12 Jul 2023 07:21:13 -0400 Subject: [PATCH 05/16] Update v2.1.0.rst Documentation updated. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ebbdbcb0f61f5..e722ba6fda5c0 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -90,6 +90,7 @@ Other enhancements - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). +- :meth:`arrays.BaseMaskedArray.take` handle non-na float as fill value (triggered by `ufloat` NaN from `uncertainties` package) (:pull:`53970`) - Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) - Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) @@ -107,6 +108,7 @@ Other enhancements - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) +- :meth:`GroupBy.first` if series contains only NA values (which might be NaN), return the first NA value, else return np.nan (:pull:`53970`) - :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) - :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) - :meth:`SeriesGroupby.agg` and 
:meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) @@ -552,6 +554,7 @@ Other - Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. It now keeps the original dtype (:issue:`52384`) - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) +- Change tests/extension/test_boolean.py to use pd.NA instead of np.nan (following similar patterns in ../test_integer.py and ../test_float.py), updating :func:`make_data`, :func:`data_missing`, :func:`data_missing_for_sorting`, :func:`data_for_grouping`, :func:`_check_op` (:pull:`53970`) .. ***DO NOT USE THIS SECTION*** From 8ada66255513a2d3bd03b5027a5a875d9770d5d3 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 12 Jul 2023 07:35:07 -0400 Subject: [PATCH 06/16] Update v2.1.0.rst rst ``code`` is two backticks (which my pre-commit didn't catch). Also, try to better guess the proper rst doc pattern for referencing test files in pandas/tests directory. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e722ba6fda5c0..8289f85171063 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -90,7 +90,7 @@ Other enhancements - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- :meth:`arrays.BaseMaskedArray.take` handle non-na float as fill value (triggered by `ufloat` NaN from `uncertainties` package) (:pull:`53970`) +- :meth:`arrays.BaseMaskedArray.take` handle non-na float as fill value (triggered by ``ufloat`` NaN from ``uncertainties`` package) (:pull:`53970`) - Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) - Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) @@ -554,7 +554,7 @@ Other - Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. 
It now keeps the original dtype (:issue:`52384`) - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- Change tests/extension/test_boolean.py to use pd.NA instead of np.nan (following similar patterns in ../test_integer.py and ../test_float.py), updating :func:`make_data`, :func:`data_missing`, :func:`data_missing_for_sorting`, :func:`data_for_grouping`, :func:`_check_op` (:pull:`53970`) +- Change ``~tests/extension/test_boolean.py`` to use pd.NA instead of np.nan (following similar patterns in ``~tests/extension/test_integer.py`` and ``~tests/extension/test_float.py``), updating :func:`make_data`, :func:`data_missing`, :func:`data_missing_for_sorting`, :func:`data_for_grouping`, :func:`_check_op` (:pull:`53970`) .. ***DO NOT USE THIS SECTION*** From 42ff1a824d5a0bfdb6172f4d50fd2bb97ebf0461 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 12 Jul 2023 09:42:22 -0400 Subject: [PATCH 07/16] Update v2.1.0.rst Try referencing pull request using html reference as there is no extlink definition for `:pull:` Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 8289f85171063..8f2bd4f1485f8 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -90,7 +90,7 @@ Other enhancements - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`) - 
:meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- :meth:`arrays.BaseMaskedArray.take` handle non-na float as fill value (triggered by ``ufloat`` NaN from ``uncertainties`` package) (:pull:`53970`) +- :meth:`arrays.BaseMaskedArray.take` handle non-na float as fill value (triggered by ``ufloat`` NaN from ``uncertainties`` package) (see `PR `_) - Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) - Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) @@ -108,7 +108,7 @@ Other enhancements - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) -- :meth:`GroupBy.first` if series contains only NA values (which might be NaN), return the first NA value, else return np.nan (:pull:`53970`) +- :meth:`GroupBy.first` if series contains only NA values (which might be NaN), return the first NA value, else return np.nan (see `PR `_) - :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) - :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) @@ -554,7 +554,7 @@ Other - Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. 
It now keeps the original dtype (:issue:`52384`) - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- Change ``~tests/extension/test_boolean.py`` to use pd.NA instead of np.nan (following similar patterns in ``~tests/extension/test_integer.py`` and ``~tests/extension/test_float.py``), updating :func:`make_data`, :func:`data_missing`, :func:`data_missing_for_sorting`, :func:`data_for_grouping`, :func:`_check_op` (:pull:`53970`) +- Change ``~tests/extension/test_boolean.py`` to use pd.NA instead of np.nan (following similar patterns in ``~tests/extension/test_integer.py`` and ``~tests/extension/test_float.py``), updating :func:`make_data`, :func:`data_missing`, :func:`data_missing_for_sorting`, :func:`data_for_grouping`, :func:`_check_op` (see `PR `_) .. ***DO NOT USE THIS SECTION*** From e53a62c295fa0e858ae81036bf28c88de88b1dd9 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Fri, 14 Jul 2023 06:35:14 -0400 Subject: [PATCH 08/16] Update test_json.py Having observed that GH#39098 works in the current release, allow the test that previously XFAIL'd to run (and pass). This is done to make CI/CD happy, and has nothing to do with changes otherwise present in this PR. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/tests/extension/json/test_json.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 85fa439f42ced..45f4abe1f74d5 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -356,10 +356,8 @@ def test_groupby_extension_no_sort(self): """ super().test_groupby_extension_no_sort() - # GH#39098 is now fixed, so we don't need to XFAIL, nor subclass this test - # @pytest.mark.xpass(reason="GH#39098: Converts agg result to object") - # def test_groupby_agg_extension(self, data_for_grouping): - # super().test_groupby_agg_extension(data_for_grouping) + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): From b8083a9a2559d22e41991a1d4c41fa266b64dfd3 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Mon, 17 Jul 2023 22:24:35 -0400 Subject: [PATCH 09/16] Address code review comments Removed the superfluous check testing whether fill_value was a NaN or not (in pandas/core/arrays/masked.py::take). pylint properly flags an unnecessary implementation of TestGroupby.test_groupby_agg_extension (which merely calls the parent method by the same name). By deleting that implementation, the parent method is called automatically. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/arrays/masked.py | 2 +- pandas/tests/extension/json/test_json.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0e5ac9ad2757b..25d7a52df708d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -872,7 +872,7 @@ def take( # TODO(jreback) what if we have a non-na float as a fill value? # NaN with uncertainties is scalar but does not register as `isna`, # so use fact that NaN != NaN - if allow_fill and notna(fill_value) and fill_value == fill_value: + if allow_fill and notna(fill_value): fill_mask = np.asarray(indexer) == -1 result[fill_mask] = fill_value mask = mask ^ fill_mask diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 45f4abe1f74d5..fb2208b304f9e 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -356,9 +356,6 @@ def test_groupby_extension_no_sort(self): """ super().test_groupby_extension_no_sort() - def test_groupby_agg_extension(self, data_for_grouping): - super().test_groupby_agg_extension(data_for_grouping) - class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): From ed69055b0038ef8623f1591b3d7924412487dd2f Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 08:33:51 -0400 Subject: [PATCH 10/16] Code review cleanups Patches in response to four review comments/questions received. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/arrays/masked.py | 2 -- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 2 +- pandas/tests/extension/test_boolean.py | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index cdd9cd63cbcbf..1506cb3f750fb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -876,8 +876,6 @@ def take( # we only fill where the indexer is null # not existing missing values # TODO(jreback) what if we have a non-na float as a fill value? - # NaN with uncertainties is scalar but does not register as `isna`, - # so use fact that NaN != NaN if allow_fill and notna(fill_value): fill_mask = np.asarray(indexer) == -1 result[fill_mask] = fill_value diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cae5a4dc2d036..9442ae43b9517 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3274,7 +3274,7 @@ def first(x: Series): if not len(arr): nan_arr = x.array[isna(x.array)] if not len(nan_arr): - return np.nan + return x.array._na_value return nan_arr[0] return arr[0] diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c6834f7cf73ec..3c4a22d009406 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -590,7 +590,7 @@ def nkeys(self) -> int: def get_iterator( self, data: NDFrameT, axis: AxisInt = 0 - ) -> Iterator[tuple[Hashable, NDFrameT]]: # Doesn't work with non-hashable EA types + ) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 7d336f3ccacc1..fb8e8803bdbb3 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -28,7 +28,7 @@ def make_data(): - return [True, False] * 4 + [pd.NA] + [True, False] * 44 + [pd.NA] + [True, 
False] + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] @pytest.fixture From 65cf17a8927b2c4d73b60433317e477422d37eb0 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 10:05:03 -0400 Subject: [PATCH 11/16] Update groupby.py `_na_value` is a property of ExtensionDtype, not ExtensionArray. Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9442ae43b9517..8bc39f0ef0ff6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3274,7 +3274,7 @@ def first(x: Series): if not len(arr): nan_arr = x.array[isna(x.array)] if not len(nan_arr): - return x.array._na_value + return x.array.dtype._na_value return nan_arr[0] return arr[0] From 2f92811574da5f4666b08e148424462f5ea8ca96 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 10:08:26 -0400 Subject: [PATCH 12/16] Update test_boolean.py Testing in Python 3.12 reveals that we *do* need to fix this after all. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/tests/extension/test_boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index fb8e8803bdbb3..7d336f3ccacc1 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -28,7 +28,7 @@ def make_data(): - return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + return [True, False] * 4 + [pd.NA] + [True, False] * 44 + [pd.NA] + [True, False] @pytest.fixture From e572b3d140fca85fe570176871e6ea173ede5712 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 11:23:37 -0400 Subject: [PATCH 13/16] _na_value is not a documented field of xarray Alas, _na_value is not a documented field of xarray. Current code checks do not seem to cover this code path, so the first failure was found by attribute checks in the doc build process. Also re-reverting test_boolean.py back to original state. The python-dev checks first tripped on boolean REPL, not groupby checks. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/extension/test_boolean.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8bc39f0ef0ff6..cae5a4dc2d036 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3274,7 +3274,7 @@ def first(x: Series): if not len(arr): nan_arr = x.array[isna(x.array)] if not len(nan_arr): - return x.array.dtype._na_value + return np.nan return nan_arr[0] return arr[0] diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 7d336f3ccacc1..fb8e8803bdbb3 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -28,7 +28,7 @@ def make_data(): - return [True, False] * 4 + [pd.NA] + [True, False] * 44 + [pd.NA] + [True, False] + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] @pytest.fixture From 2964f44251d847e026335b53f6d202864b5f4324 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 15:42:33 -0400 Subject: [PATCH 14/16] Finish removing unneeded code and doc changes. Not much left for this PR after all! 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 4 +--- pandas/core/groupby/groupby.py | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d5b7d316e307c..7601756f44eee 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -141,7 +141,6 @@ Other enhancements - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- :meth:`arrays.BaseMaskedArray.take` handle non-na float as fill value (triggered by ``ufloat`` NaN from ``uncertainties`` package) (see `PR `_) - :meth:`pandas.read_html` now supports the ``storage_options`` keyword when used with a URL, allowing users to add headers the outbound HTTP request (:issue:`49944`) - Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) @@ -490,7 +489,6 @@ Conversion - Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set (:issue:`51922`) - Bug in :meth:`DataFrame.insert` raising ``TypeError`` if ``loc`` is ``np.int64`` (:issue:`53193`) - Bug in :meth:`HDFStore.select` loses precision of large int when stored and retrieved (:issue:`54186`) -- Strings ^^^^^^^ @@ -646,7 +644,7 @@ Other - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`) - Fixed 
incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- Change ``~tests/extension/test_boolean.py`` to use pd.NA instead of np.nan (following similar patterns in ``~tests/extension/test_integer.py`` and ``~tests/extension/test_float.py``), updating :func:`make_data`, :func:`data_missing`, :func:`data_missing_for_sorting`, :func:`data_for_grouping`, :func:`_check_op` (see `PR `_) +- .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cae5a4dc2d036..64d874a31c428 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3272,10 +3272,7 @@ def first(x: Series): """Helper function for first item that isn't NA.""" arr = x.array[notna(x.array)] if not len(arr): - nan_arr = x.array[isna(x.array)] - if not len(nan_arr): - return np.nan - return nan_arr[0] + return np.nan return arr[0] if isinstance(obj, DataFrame): From 6644fe095e189e41c7b8016e7ab574f1336ffff1 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:17:12 -0400 Subject: [PATCH 15/16] Update v2.1.0.rst Clean up more doc changes missed in previous commit. 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7601756f44eee..bc187361493c0 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -160,7 +160,6 @@ Other enhancements - :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) -- :meth:`GroupBy.first` if series contains only NA values (which might be NaN), return the first NA value, else return np.nan (see `PR `_) - :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) - :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) @@ -448,7 +447,6 @@ Datetimelike - Bug in constructing a :class:`Timestamp` with ``ts_input=pd.NA`` raising ``TypeError`` (:issue:`45481`) - Bug in parsing datetime strings with weekday but no day e.g. 
"2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`) - Timedelta ^^^^^^^^^ - :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) @@ -499,7 +497,6 @@ Interval ^^^^^^^^ - :meth:`pd.IntervalIndex.get_indexer` and :meth:`pd.IntervalIndex.get_indexer_nonunique` raising if ``target`` is read-only array (:issue:`53703`) - Bug in :class:`IntervalDtype` where the object could be kept alive when deleted (:issue:`54184`) -- Indexing ^^^^^^^^ @@ -615,7 +612,6 @@ ExtensionArray - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the :class:`DataFrame` repr would not work when a column would have an :class:`ArrowDtype` with an ``pyarrow.ExtensionDtype`` (:issue:`54063`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) -- Styler ^^^^^^ From c9e0de37fb2a81976071699e901958d1e869702f Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 26 Jul 2023 18:11:30 -0400 Subject: [PATCH 16/16] Update groupby.py Turns out that this fixes GH#39098. Maybe I should resubmit under that guise just this one patch? 
Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/core/groupby/groupby.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 64d874a31c428..09616cc38212b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3272,7 +3272,10 @@ def first(x: Series): """Helper function for first item that isn't NA.""" arr = x.array[notna(x.array)] if not len(arr): - return np.nan + nan_arr = x.array[isna(x.array)] + if not len(nan_arr): + return np.nan + return nan_arr[0] return arr[0] if isinstance(obj, DataFrame): @@ -3331,7 +3334,10 @@ def last(x: Series): """Helper function for last item that isn't NA.""" arr = x.array[notna(x.array)] if not len(arr): - return np.nan + nan_arr = x.array[isna(x.array)] + if not len(nan_arr): + return np.nan + return nan_arr[-1] return arr[-1] if isinstance(obj, DataFrame):