From e62a14f5e3b159e0e91649c12b31b587bb8c95d0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Apr 2019 14:06:17 -0700 Subject: [PATCH 01/13] Added failing test --- pandas/tests/groupby/test_nth.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 7a3d189d3020e..d02daae96841c 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -434,3 +434,14 @@ def test_nth_column_order(): columns=['C', 'B'], index=Index([1, 2], name='A')) assert_frame_equal(result, expected) + + +def test_nth_nan_in_grouper(): + # GH 26011 + df = DataFrame([[np.nan, 1, np.nan], + [2., 3, 4.]], columns=list('abc')) + result = df.groupby('a').nth(0) + expected = pd.DataFrame([[3, 4.]], columns=list('bc'), + index=Index([2.0], name='a')) + + assert_frame_equal(result, expected) From 823225b5024e5f0e139bc28b50cd419b7bd2ec64 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 09:40:44 -0700 Subject: [PATCH 02/13] Added fix in code --- pandas/core/groupby/groupby.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bd8a8852964e3..ed5eb28daed44 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -12,7 +12,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import types -from typing import FrozenSet, Optional, Tuple, Type +from typing import FrozenSet, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -1546,7 +1546,9 @@ def backfill(self, limit=None): @Substitution(name='groupby') @Substitution(see_also=_common_see_also) - def nth(self, n, dropna=None): + def nth(self, + n: Union[int, List[int]], + dropna: Optional[Union[bool, str]] = None) -> DataFrame: """ Take the nth row from each group if n is an int, or a subset of rows if n is a list of ints. @@ -1636,11 +1638,13 @@ def nth(self, n, dropna=None): -nth_values) mask = mask_left | mask_right + ids, _, _ = self.grouper.group_info + mask = mask | ids != -1 # Drop NA values in grouping + out = self._selected_obj[mask] if not self.as_index: return out - ids, _, _ = self.grouper.group_info out.index = self.grouper.result_index[ids[mask]] return out.sort_index() if self.sort else out From 068936f4057fa5001daab504c9a36c3b930516b8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 09:50:38 -0700 Subject: [PATCH 03/13] Expanded test coverage scope --- pandas/tests/groupby/test_nth.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index d02daae96841c..6d07ab0008adb 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -436,12 +436,18 @@ def test_nth_column_order(): assert_frame_equal(result, expected) -def test_nth_nan_in_grouper(): +@pytest.mark.parametrize("dropna", [None, 'any', 'all']) +def test_nth_nan_in_grouper(dropna): # GH 26011 - df = DataFrame([[np.nan, 1, np.nan], - [2., 3, 4.]], columns=list('abc')) - result = df.groupby('a').nth(0) - expected = pd.DataFrame([[3, 4.]], columns=list('bc'), - index=Index([2.0], name='a')) + df = DataFrame([ + [np.nan, 0, 1], + ['abc', 2, 3], + [np.nan, 4, 5], + ['def', 6, 7], + [np.nan, 8, 9], + ], columns=list('abc')) + result = df.groupby('a').nth(0, dropna=dropna) + expected = pd.DataFrame([[2, 3], [6, 7]], columns=list('bc'), + index=Index(['abc', 'def'], name='a')) assert_frame_equal(result, expected) From 871676483644edc074f8e71ad211502ebd959a95 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 09:52:26 -0700 Subject: [PATCH 04/13] Whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d6d572bcb9889..08c4273e979bc 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -389,6 +389,7 @@ Groupby/Resample/Rolling - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) - Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) +- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) Reshaping ^^^^^^^^^ From 3000d2bc99042ce1566914b67b5be512e6cf7c2f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 11:49:04 -0700 Subject: [PATCH 05/13] Fixed implementation --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ed5eb28daed44..48b26e57d6d0a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1639,7 +1639,7 @@ def nth(self, mask = mask_left | mask_right ids, _, _ = self.grouper.group_info - mask = mask | ids != -1 # Drop NA values in grouping + mask = mask & (ids != -1) # Drop NA values in grouping out = self._selected_obj[mask] if not self.as_index: From 84ddd6d04d9487946749bd8030b8bd9da6853fe2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 13:02:31 -0700 Subject: [PATCH 06/13] Typing and doc fixups --- pandas/core/groupby/groupby.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 48b26e57d6d0a..c29928a9bf06a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -12,7 +12,8 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import types -from typing import FrozenSet, List, Optional, Tuple, Type, Union +from typing import ( + cast, Collection, FrozenSet, List, Optional, Tuple, Type, Union) import warnings import numpy as np @@ -1547,16 +1548,15 @@ def backfill(self, limit=None): @Substitution(name='groupby') @Substitution(see_also=_common_see_also) def nth(self, - n: Union[int, List[int]], + n: Union[int, Collection[int]], dropna: Optional[Union[bool, str]] = None) -> DataFrame: """ Take the nth row from each group if n is an int, or a subset of rows if n is a list of ints. If dropna, will take the nth non-null row, dropna is either - Truthy (if a Series) or 'all', 'any' (if a DataFrame); - this is equivalent to calling dropna(how=dropna) before the - groupby. + 'all' or 'any'; this is equivalent to calling dropna(how=dropna) + before the groupby. Parameters ---------- @@ -1669,6 +1669,7 @@ def nth(self, # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf + n = cast(n, int) max_len = n if n >= 0 else - 1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) From 84d343a18b6fbf4252c1559af8373abc646b737c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 13:06:34 -0700 Subject: [PATCH 07/13] Fixed dropna annotation --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c29928a9bf06a..4e0d1073c4a33 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1549,7 +1549,7 @@ def backfill(self, limit=None): @Substitution(see_also=_common_see_also) def nth(self, n: Union[int, Collection[int]], - dropna: Optional[Union[bool, str]] = None) -> DataFrame: + dropna: Optional[str] = None) -> DataFrame: """ Take the nth row from each group if n is an int, or a subset of rows if n is a list of ints. From 6f40fbe2eb6bb347b41ce2a1d5f3095832ba5d42 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 13:23:27 -0700 Subject: [PATCH 08/13] Replaced Collection reference with List in annotation --- pandas/core/groupby/groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e0d1073c4a33..7596c414818c4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -12,8 +12,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import types -from typing import ( - cast, Collection, FrozenSet, List, Optional, Tuple, Type, Union) +from typing import cast, FrozenSet, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -1548,7 +1547,7 @@ def backfill(self, limit=None): @Substitution(name='groupby') @Substitution(see_also=_common_see_also) def nth(self, - n: Union[int, Collection[int]], + n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: """ Take the nth row from each group if n is an int, or a subset of rows From e2d006d18969a4ae061ddfeda33c5f58fd13c230 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Apr 2019 14:32:32 -0700 Subject: [PATCH 09/13] Fixed cast expression --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7596c414818c4..0915f3660c6f6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1668,7 +1668,7 @@ def nth(self, # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf - n = cast(n, int) + n = cast(int, n) max_len = n if n >= 0 else - 1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) From 0b7eb6cb165e0daf596f395ee1f6fbbb3e25cc61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 22 Apr 2019 13:50:08 -0700 Subject: [PATCH 10/13] Moved comment position --- pandas/core/groupby/groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0915f3660c6f6..3c736fb37ba55 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1638,7 +1638,9 @@ def nth(self, mask = mask_left | mask_right ids, _, _ = self.grouper.group_info - mask = mask & (ids != -1) # Drop NA values in grouping + + # Drop NA values in grouping + mask = mask & (ids != -1) out = self._selected_obj[mask] if not self.as_index: From 48d90ee140a061c00459efaf9ebe9ae636c3d6cc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 22 Apr 2019 13:52:32 -0700 Subject: [PATCH 11/13] Removed re-assignment of nth_values for typing --- pandas/core/groupby/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3c736fb37ba55..d011766ad3d1e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1628,13 +1628,13 @@ def nth(self, else: raise TypeError("n needs to be an int or a list/set/tuple of ints") - nth_values = np.array(nth_values, dtype=np.intp) + nth_array = np.array(nth_values, dtype=np.intp) self._set_group_selection() if not dropna: - mask_left = np.in1d(self._cumcount_array(), nth_values) + mask_left = np.in1d(self._cumcount_array(), nth_array) mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, - -nth_values) + -nth_array) mask = mask_left | mask_right ids, _, _ = self.grouper.group_info From 7eea5e3b05337cccba5936fa91bf3efdeb3d041f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 22 Apr 2019 14:28:55 -0700 Subject: [PATCH 12/13] typing fixup --- pandas/core/groupby/groupby.py | 141 +++++++++++++++++---------------- 1 file changed, 73 insertions(+), 68 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d011766ad3d1e..55a1ad0e712c7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -12,7 +12,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import types -from typing import cast, FrozenSet, List, Optional, Tuple, Type, Union +from typing import FrozenSet, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -1618,20 +1618,20 @@ def nth(self, 4 2 5.0 """ - if isinstance(n, int): - nth_values = [n] - elif isinstance(n, (set, list, tuple)): - nth_values = list(set(n)) - if dropna is not None: - raise ValueError( - "dropna option with a list of nth values is not supported") - else: + valid_containers = (set, list, tuple) + if not isinstance(n, (valid_containers, int)): raise TypeError("n needs to be an int or a list/set/tuple of ints") - nth_array = np.array(nth_values, dtype=np.intp) - self._set_group_selection() - if not dropna: + + if isinstance(n, int): + nth_values = [n] + elif isinstance(n, valid_containers): + nth_values = list(set(n)) + + nth_array = np.array(nth_values, dtype=np.intp) + self._set_group_selection() + mask_left = np.in1d(self._cumcount_array(), nth_array) mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) @@ -1650,65 +1650,70 @@ def nth(self, return out.sort_index() if self.sort else out - if dropna not in ['any', 'all']: - if isinstance(self._selected_obj, Series) and dropna is True: - warnings.warn("the dropna={dropna} keyword is deprecated," - "use dropna='all' instead. " - "For a Series groupby, dropna must be " - "either None, 'any' or 'all'.".format( - dropna=dropna), - FutureWarning, - stacklevel=2) - dropna = 'all' - else: - # Note: when agg-ing picker doesn't raise this, - # just returns NaN - raise ValueError("For a DataFrame groupby, dropna must be " - "either None, 'any' or 'all', " - "(was passed {dropna}).".format( - dropna=dropna)) - - # old behaviour, but with all and any support for DataFrames. - # modified in GH 7559 to have better perf - n = cast(int, n) - max_len = n if n >= 0 else - 1 - n - dropped = self.obj.dropna(how=dropna, axis=self.axis) - - # get a new grouper for our dropped obj - if self.keys is None and self.level is None: - - # we don't have the grouper info available - # (e.g. we have selected out - # a column that is not in the current object) - axis = self.grouper.axis - grouper = axis[axis.isin(dropped.index)] - else: + if isinstance(n, valid_containers): + raise ValueError( + "dropna option with a list of nth values is not supported") - # create a grouper with the original parameters, but on the dropped - # object - from pandas.core.groupby.grouper import _get_grouper - grouper, _, _ = _get_grouper(dropped, key=self.keys, - axis=self.axis, level=self.level, - sort=self.sort, - mutated=self.mutated) - - grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) - sizes, result = grb.size(), grb.nth(n) - mask = (sizes < max_len).values - - # set the results which don't meet the criteria - if len(result) and mask.any(): - result.loc[mask] = np.nan - - # reset/reindex to the original groups - if (len(self.obj) == len(dropped) or - len(result) == len(self.grouper.result_index)): - result.index = self.grouper.result_index - else: - result = result.reindex(self.grouper.result_index) + if dropna not in ['any', 'all']: + if isinstance(self._selected_obj, Series) and dropna is True: + warnings.warn("the dropna={dropna} keyword is deprecated," + "use dropna='all' instead. " + "For a Series groupby, dropna must be " + "either None, 'any' or 'all'.".format( + dropna=dropna), + FutureWarning, + stacklevel=2) + dropna = 'all' + else: + # Note: when agg-ing picker doesn't raise this, + # just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed {dropna}).".format( + dropna=dropna)) + + # old behaviour, but with all and any support for DataFrames. + # modified in GH 7559 to have better perf + max_len = n if n >= 0 else - 1 - n + dropped = self.obj.dropna(how=dropna, axis=self.axis) + + # get a new grouper for our dropped obj + if self.keys is None and self.level is None: + + # we don't have the grouper info available + # (e.g. we have selected out + # a column that is not in the current object) + axis = self.grouper.axis + grouper = axis[axis.isin(dropped.index)] - return result + else: + + # create a grouper with the original parameters, but on dropped + # object + from pandas.core.groupby.grouper import _get_grouper + grouper, _, _ = _get_grouper(dropped, key=self.keys, + axis=self.axis, level=self.level, + sort=self.sort, + mutated=self.mutated) + + grb = dropped.groupby( + grouper, as_index=self.as_index, sort=self.sort) + sizes, result = grb.size(), grb.nth(n) + mask = (sizes < max_len).values + + # set the results which don't meet the criteria + if len(result) and mask.any(): + result.loc[mask] = np.nan + + # reset/reindex to the original groups + if (len(self.obj) == len(dropped) or + len(result) == len(self.grouper.result_index)): + result.index = self.grouper.result_index + else: + result = result.reindex(self.grouper.result_index) + + return result def quantile(self, q=0.5, interpolation='linear'): """ From c2a0b8e999744554011ffa3ea9608d486b7f1557 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 28 Apr 2019 18:33:33 -0700 Subject: [PATCH 13/13] dedented and added comment --- pandas/core/groupby/groupby.py | 122 ++++++++++++++++----------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 55a1ad0e712c7..945885e34fa1e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1650,70 +1650,70 @@ def nth(self, return out.sort_index() if self.sort else out - else: - if isinstance(n, valid_containers): - raise ValueError( - "dropna option with a list of nth values is not supported") - - if dropna not in ['any', 'all']: - if isinstance(self._selected_obj, Series) and dropna is True: - warnings.warn("the dropna={dropna} keyword is deprecated," - "use dropna='all' instead. " - "For a Series groupby, dropna must be " - "either None, 'any' or 'all'.".format( - dropna=dropna), - FutureWarning, - stacklevel=2) - dropna = 'all' - else: - # Note: when agg-ing picker doesn't raise this, - # just returns NaN - raise ValueError("For a DataFrame groupby, dropna must be " - "either None, 'any' or 'all', " - "(was passed {dropna}).".format( - dropna=dropna)) - - # old behaviour, but with all and any support for DataFrames. - # modified in GH 7559 to have better perf - max_len = n if n >= 0 else - 1 - n - dropped = self.obj.dropna(how=dropna, axis=self.axis) - - # get a new grouper for our dropped obj - if self.keys is None and self.level is None: - - # we don't have the grouper info available - # (e.g. we have selected out - # a column that is not in the current object) - axis = self.grouper.axis - grouper = axis[axis.isin(dropped.index)] - + # dropna is truthy + if isinstance(n, valid_containers): + raise ValueError( + "dropna option with a list of nth values is not supported") + + if dropna not in ['any', 'all']: + if isinstance(self._selected_obj, Series) and dropna is True: + warnings.warn("the dropna={dropna} keyword is deprecated," + "use dropna='all' instead. " + "For a Series groupby, dropna must be " + "either None, 'any' or 'all'.".format( + dropna=dropna), + FutureWarning, + stacklevel=2) + dropna = 'all' else: + # Note: when agg-ing picker doesn't raise this, + # just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed {dropna}).".format( + dropna=dropna)) + + # old behaviour, but with all and any support for DataFrames. + # modified in GH 7559 to have better perf + max_len = n if n >= 0 else - 1 - n + dropped = self.obj.dropna(how=dropna, axis=self.axis) + + # get a new grouper for our dropped obj + if self.keys is None and self.level is None: + + # we don't have the grouper info available + # (e.g. we have selected out + # a column that is not in the current object) + axis = self.grouper.axis + grouper = axis[axis.isin(dropped.index)] - # create a grouper with the original parameters, but on dropped - # object - from pandas.core.groupby.grouper import _get_grouper - grouper, _, _ = _get_grouper(dropped, key=self.keys, - axis=self.axis, level=self.level, - sort=self.sort, - mutated=self.mutated) - - grb = dropped.groupby( - grouper, as_index=self.as_index, sort=self.sort) - sizes, result = grb.size(), grb.nth(n) - mask = (sizes < max_len).values - - # set the results which don't meet the criteria - if len(result) and mask.any(): - result.loc[mask] = np.nan - - # reset/reindex to the original groups - if (len(self.obj) == len(dropped) or - len(result) == len(self.grouper.result_index)): - result.index = self.grouper.result_index - else: - result = result.reindex(self.grouper.result_index) + else: - return result + # create a grouper with the original parameters, but on dropped + # object + from pandas.core.groupby.grouper import _get_grouper + grouper, _, _ = _get_grouper(dropped, key=self.keys, + axis=self.axis, level=self.level, + sort=self.sort, + mutated=self.mutated) + + grb = dropped.groupby( + grouper, as_index=self.as_index, sort=self.sort) + sizes, result = grb.size(), grb.nth(n) + mask = (sizes < max_len).values + + # set the results which don't meet the criteria + if len(result) and mask.any(): + result.loc[mask] = np.nan + + # reset/reindex to the original groups + if (len(self.obj) == len(dropped) or + len(result) == len(self.grouper.result_index)): + result.index = self.grouper.result_index + else: + result = result.reindex(self.grouper.result_index) + + return result def quantile(self, q=0.5, interpolation='linear'): """