From 15a27eaa36ba09a81bba58c0886824c24cac94ba Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sun, 19 Apr 2020 01:22:54 +0200 Subject: [PATCH 1/9] BUG: Maintain the order of the bins in group_quantile. Updated tests #33200 --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/groupby.pyx | 11 +++++++---- pandas/tests/groupby/test_function.py | 18 +++++++++++++----- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a797090a83444..13340c7cf11d8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -603,6 +603,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) +- Bug in :meth:`SeriesGroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 53e66c4b8723d..31333b16d590a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -779,9 +779,10 @@ def group_quantile(ndarray[float64_t] out, non_na_counts[lab] += 1 # Get an index of values sorted by labels and then values - order = (values, labels) - sort_arr = np.lexsort(order).astype(np.int64, copy=False) - + sort_arr = np.arange(len(labels), dtype=np.int64) + mask = labels != -1 + order = (np.asarray(values)[mask], labels[mask]) + sort_arr[mask] = np.lexsort(order).astype(np.int64, copy=False) with nogil: for i in range(ngroups): # Figure out how many group elements there are @@ -819,7 +820,9 @@ def group_quantile(ndarray[float64_t] out, # Increment the index reference in sorted_arr for the next group grp_start += grp_sz - + print(out) + # out = np.roll(out, -(len(out) - np.sum(counts))) + print(out) # ---------------------------------------------------------------------- # group_nth, group_last, group_rank diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 346de55f551df..e195b1d487614 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1507,14 +1507,22 @@ def test_quantile_missing_group_values_no_segfaults(): grp.quantile() -def test_quantile_missing_group_values_correct_results(): - # GH 28662 - data = np.array([1.0, np.nan, 3.0, np.nan]) - df = pd.DataFrame(dict(key=data, val=range(4))) +@pytest.mark.parametrize( + "key, val, expected_key, expected_val", + [ + ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 1.0]), + (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), + ], +) +def test_quantile_missing_group_values_correct_results( + key, val, expected_key, expected_val +): + # GH 28662, GH 33200 + df = pd.DataFrame({"key": key, "val": val}) result = df.groupby("key").quantile() expected = pd.DataFrame( - [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + expected_val, index=pd.Index(expected_key, name="key"), columns=["val"] ) tm.assert_frame_equal(result, expected) From 8e6af0e0e485a501debd4e2cf2fc7eb58af5b721 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sun, 19 Apr 2020 21:55:22 +0200 Subject: [PATCH 2/9] BUG: Fix quantile calculation #33200 --- pandas/_libs/groupby.pyx | 80 +++++++++++++-------------- pandas/tests/groupby/test_function.py | 3 +- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 31333b16d590a..46db384591bd5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -778,51 +778,49 @@ def group_quantile(ndarray[float64_t] out, if not mask[i]: non_na_counts[lab] += 1 - # Get an index of values sorted by labels and then values - sort_arr = np.arange(len(labels), dtype=np.int64) - mask = labels != -1 - order = (np.asarray(values)[mask], labels[mask]) - sort_arr[mask] = np.lexsort(order).astype(np.int64, copy=False) - with nogil: - for i in range(ngroups): - # Figure out how many group elements there are - grp_sz = counts[i] - non_na_sz = non_na_counts[i] + if labels.any(): + # Get an index of values sorted by labels and then values + labels[labels==-1] = np.max(labels) + 1 + order = (values, labels) + sort_arr= np.lexsort(order).astype(np.int64, copy=False) + with nogil: + for i in range(ngroups): + # Figure out how many group elements there are + grp_sz = counts[i] + non_na_sz = non_na_counts[i] - if non_na_sz == 0: - out[i] = NaN - else: - # Calculate where to retrieve the desired value - # Casting to int will intentionally truncate result - idx = grp_start + (q * (non_na_sz - 1)) - - val = values[sort_arr[idx]] - # If requested quantile falls evenly on a particular index - # then write that index's value out. Otherwise interpolate - q_idx = q * (non_na_sz - 1) - frac = q_idx % 1 - - if frac == 0.0 or interp == INTERPOLATION_LOWER: - out[i] = val + if non_na_sz == 0: + out[i] = NaN else: - next_val = values[sort_arr[idx + 1]] - if interp == INTERPOLATION_LINEAR: - out[i] = val + (next_val - val) * frac - elif interp == INTERPOLATION_HIGHER: - out[i] = next_val - elif interp == INTERPOLATION_MIDPOINT: - out[i] = (val + next_val) / 2.0 - elif interp == INTERPOLATION_NEAREST: - if frac > .5 or (frac == .5 and q > .5): # Always OK? + # Calculate where to retrieve the desired value + # Casting to int will intentionally truncate result + idx = grp_start + (q * (non_na_sz - 1)) + + val = values[sort_arr[idx]] + # If requested quantile falls evenly on a particular index + # then write that index's value out. Otherwise interpolate + q_idx = q * (non_na_sz - 1) + frac = q_idx % 1 + + if frac == 0.0 or interp == INTERPOLATION_LOWER: + out[i] = val + else: + next_val = values[sort_arr[idx + 1]] + if interp == INTERPOLATION_LINEAR: + out[i] = val + (next_val - val) * frac + elif interp == INTERPOLATION_HIGHER: out[i] = next_val - else: - out[i] = val + elif interp == INTERPOLATION_MIDPOINT: + out[i] = (val + next_val) / 2.0 + elif interp == INTERPOLATION_NEAREST: + if frac > .5 or (frac == .5 and q > .5): # Always OK? + out[i] = next_val + else: + out[i] = val + + # Increment the index reference in sorted_arr for the next group + grp_start += grp_sz - # Increment the index reference in sorted_arr for the next group - grp_start += grp_sz - print(out) - # out = np.roll(out, -(len(out) - np.sum(counts))) - print(out) # ---------------------------------------------------------------------- # group_nth, group_last, group_rank diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e195b1d487614..99a798c25efe2 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1510,7 +1510,8 @@ def test_quantile_missing_group_values_no_segfaults(): @pytest.mark.parametrize( "key, val, expected_key, expected_val", [ - ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 1.0]), + ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), + ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), ], ) From 5832ba9db488a49a9d782a44cba6c5bbbce9d4c8 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sun, 26 Apr 2020 18:33:09 +0200 Subject: [PATCH 3/9] BUG: Fix quantile calculation. Only move -1 labels if there are any labels #33200 --- pandas/_libs/groupby.pyx | 77 +++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 46db384591bd5..a8340f46d585e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -779,47 +779,50 @@ def group_quantile(ndarray[float64_t] out, non_na_counts[lab] += 1 if labels.any(): - # Get an index of values sorted by labels and then values + # Put '-1' (NaN) labels as the last group so it does not interfere + # with the calculations. labels[labels==-1] = np.max(labels) + 1 - order = (values, labels) - sort_arr= np.lexsort(order).astype(np.int64, copy=False) - with nogil: - for i in range(ngroups): - # Figure out how many group elements there are - grp_sz = counts[i] - non_na_sz = non_na_counts[i] + # Get an index of values sorted by labels and then values + order = (values, labels) + sort_arr = np.lexsort(order).astype(np.int64, copy=False) + + with nogil: + for i in range(ngroups): + # Figure out how many group elements there are + grp_sz = counts[i] + non_na_sz = non_na_counts[i] - if non_na_sz == 0: - out[i] = NaN + if non_na_sz == 0: + out[i] = NaN + else: + # Calculate where to retrieve the desired value + # Casting to int will intentionally truncate result + idx = grp_start + (q * (non_na_sz - 1)) + + val = values[sort_arr[idx]] + # If requested quantile falls evenly on a particular index + # then write that index's value out. Otherwise interpolate + q_idx = q * (non_na_sz - 1) + frac = q_idx % 1 + + if frac == 0.0 or interp == INTERPOLATION_LOWER: + out[i] = val else: - # Calculate where to retrieve the desired value - # Casting to int will intentionally truncate result - idx = grp_start + (q * (non_na_sz - 1)) - - val = values[sort_arr[idx]] - # If requested quantile falls evenly on a particular index - # then write that index's value out. Otherwise interpolate - q_idx = q * (non_na_sz - 1) - frac = q_idx % 1 - - if frac == 0.0 or interp == INTERPOLATION_LOWER: - out[i] = val - else: - next_val = values[sort_arr[idx + 1]] - if interp == INTERPOLATION_LINEAR: - out[i] = val + (next_val - val) * frac - elif interp == INTERPOLATION_HIGHER: + next_val = values[sort_arr[idx + 1]] + if interp == INTERPOLATION_LINEAR: + out[i] = val + (next_val - val) * frac + elif interp == INTERPOLATION_HIGHER: + out[i] = next_val + elif interp == INTERPOLATION_MIDPOINT: + out[i] = (val + next_val) / 2.0 + elif interp == INTERPOLATION_NEAREST: + if frac > .5 or (frac == .5 and q > .5): # Always OK? out[i] = next_val - elif interp == INTERPOLATION_MIDPOINT: - out[i] = (val + next_val) / 2.0 - elif interp == INTERPOLATION_NEAREST: - if frac > .5 or (frac == .5 and q > .5): # Always OK? - out[i] = next_val - else: - out[i] = val - - # Increment the index reference in sorted_arr for the next group - grp_start += grp_sz + else: + out[i] = val + + # Increment the index reference in sorted_arr for the next group + grp_start += grp_sz # ---------------------------------------------------------------------- From e70e3ca43551cb197dc85e1a85939df9effb26d4 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Tue, 28 Apr 2020 18:22:40 +0200 Subject: [PATCH 4/9] BUG: Explicit quantile to 0.5. Fix format #33200 --- pandas/_libs/groupby.pyx | 2 +- pandas/tests/groupby/test_function.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index a8340f46d585e..3d2a0163fadd9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -781,7 +781,7 @@ def group_quantile(ndarray[float64_t] out, if labels.any(): # Put '-1' (NaN) labels as the last group so it does not interfere # with the calculations. - labels[labels==-1] = np.max(labels) + 1 + labels[labels == -1] = np.max(labels) + 1 # Get an index of values sorted by labels and then values order = (values, labels) sort_arr = np.lexsort(order).astype(np.int64, copy=False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 6a1d8447a3978..fcd3ce44f1482 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1500,7 +1500,7 @@ def test_quantile_missing_group_values_correct_results( # GH 28662, GH 33200 df = pd.DataFrame({"key": key, "val": val}) - result = df.groupby("key").quantile() + result = df.groupby("key").quantile(0.5) expected = pd.DataFrame( expected_val, index=pd.Index(expected_key, name="key"), columns=["val"] ) From b0c309bd041fca7f30cd760b157a407cccaffd85 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Wed, 29 Apr 2020 19:00:33 +0200 Subject: [PATCH 5/9] BUG: Fix whatsnew. #33200 --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 64c9c85ae0460..79617d7e95c37 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -644,7 +644,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) -- Bug in :meth:`SeriesGroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`) +- Bug in :meth:`GroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`) Reshaping ^^^^^^^^^ From e7c0eac0d8b63b88fd5cc41a32af52b0c4dc2fd7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 15 May 2020 15:49:13 +0100 Subject: [PATCH 6/9] troubleshoot ci timeouts --- pandas/_libs/groupby.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c9c8b1552509c..e08a54b130f3c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -779,9 +779,10 @@ def group_quantile(ndarray[float64_t] out, if labels.any(): # Put '-1' (NaN) labels as the last group so it does not interfere # with the calculations. - labels[labels == -1] = np.max(labels) + 1 - # Get an index of values sorted by labels and then values - order = (values, labels) + labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels) + else: + labels_for_lexsort = labels # Get an index of values sorted by labels and then values + order = (values, labels_for_lexsort) sort_arr = np.lexsort(order).astype(np.int64, copy=False) with nogil: From 07fa080d340f603e4b8cebf2f1ccb14abc2cf39f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 15 May 2020 16:27:45 +0100 Subject: [PATCH 7/9] issue number and coments --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/_libs/groupby.pyx | 3 ++- pandas/tests/groupby/test_quantile.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 37ae589d3dc28..298416889f867 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -814,7 +814,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) -- Bug in :meth:`GroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`) +- Bug in :meth:`GroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`, :issue:`33569`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e08a54b130f3c..4e792da31e1d5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -776,12 +776,13 @@ def group_quantile(ndarray[float64_t] out, if not mask[i]: non_na_counts[lab] += 1 + # Get an index of values sorted by labels and then values if labels.any(): # Put '-1' (NaN) labels as the last group so it does not interfere # with the calculations. labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels) else: - labels_for_lexsort = labels # Get an index of values sorted by labels and then values + labels_for_lexsort = labels order = (values, labels_for_lexsort) sort_arr = np.lexsort(order).astype(np.int64, copy=False) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 0b4f86e1223f2..4bfbffe0c68c9 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -192,7 +192,7 @@ def test_quantile_missing_group_values_no_segfaults(): def test_quantile_missing_group_values_correct_results( key, val, expected_key, expected_val ): - # GH 28662, GH 33200 + # GH 28662, GH 33200, GH 33569 df = pd.DataFrame({"key": key, "val": val}) result = df.groupby("key").quantile(0.5) From c6f41a4398e902ed04db8f6a212744a2919380e0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 19 May 2020 13:37:23 +0100 Subject: [PATCH 8/9] add more test cases --- pandas/tests/groupby/test_quantile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 4bfbffe0c68c9..00b87b668080a 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -187,6 +187,8 @@ def test_quantile_missing_group_values_no_segfaults(): ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), + ([0], [42], [0], [42.0]), + ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), ], ) def test_quantile_missing_group_values_correct_results( From f21e332d4ee2663fedb8f5166c182e418e34f0f0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 20 May 2020 16:20:33 +0100 Subject: [PATCH 9/9] update test per comment --- pandas/tests/groupby/test_quantile.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 00b87b668080a..8cfd8035502c3 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -197,10 +197,16 @@ def test_quantile_missing_group_values_correct_results( # GH 28662, GH 33200, GH 33569 df = pd.DataFrame({"key": key, "val": val}) - result = df.groupby("key").quantile(0.5) expected = pd.DataFrame( expected_val, index=pd.Index(expected_key, name="key"), columns=["val"] ) + + grp = df.groupby("key") + + result = grp.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = grp.quantile() tm.assert_frame_equal(result, expected)