From 31260be708c663124b675ff6b9d56409057ed20d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Aug 2019 12:56:46 -0500 Subject: [PATCH 1/4] Backport PR #27826 for 0.25.3 release * BUG: Fix groupby quantile segfault Validate that q is between 0 and 1. Closes #27470 * prettier --- pandas/_libs/groupby.pyx | 5 +++++ pandas/tests/groupby/test_function.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3f18572abca1..3069bbbf34bb7 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -719,6 +719,11 @@ def group_quantile(ndarray[float64_t] out, ndarray[int64_t] counts, non_na_counts, sort_arr assert values.shape[0] == N + + if not (0 <= q <= 1): + raise ValueError("'q' must be between 0 and 1. Got" + " '{}' instead".format(q)) + inter_methods = { 'linear': INTERPOLATION_LINEAR, 'lower': INTERPOLATION_LOWER, diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5d1a1fd938500..d89233f2fd603 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1316,6 +1316,17 @@ def test_quantile_raises(): df.groupby("key").quantile() +def test_quantile_out_of_bounds_q_raises(): + # https://github.com/pandas-dev/pandas/issues/27470 + df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) + g = df.groupby([0, 0, 0, 1, 1, 1]) + with pytest.raises(ValueError, match="Got '50.0' instead"): + g.quantile(50) + + with pytest.raises(ValueError, match="Got '-1.0' instead"): + g.quantile(-1) + + # pipe # -------------------------------- From cdacc3fa2bd49a2652e1d779dfccd087a45fdf48 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 30 Oct 2019 11:29:07 -0700 Subject: [PATCH 2/4] Backport PR #29173 for 0.25.3 release --- pandas/_libs/groupby.pyx | 3 +++ pandas/tests/groupby/test_function.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3069bbbf34bb7..a7dce26bdceea 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -741,6 +741,9 @@ def group_quantile(ndarray[float64_t] out, with nogil: for i in range(N): lab = labels[i] + if lab == -1: # NA group label + continue + counts[lab] += 1 if not mask[i]: non_na_counts[lab] += 1 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index d89233f2fd603..da97dbfa7a2b8 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1327,6 +1327,29 @@ def test_quantile_out_of_bounds_q_raises(): g.quantile(-1) +def test_quantile_missing_group_values_no_segfaults(): + # GH 28662 + data = np.array([1.0, np.nan, 1.0]) + df = pd.DataFrame(dict(key=data, val=range(3))) + + # Random segfaults; would have been guaranteed in loop + grp = df.groupby("key") + for _ in range(100): + grp.quantile() + + +def test_quantile_missing_group_values_correct_results(): + # GH 28662 + data = np.array([1.0, np.nan, 3.0, np.nan]) + df = pd.DataFrame(dict(key=data, val=range(4))) + + result = df.groupby("key").quantile() + expected = pd.DataFrame( + [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + ) + tm.assert_frame_equal(result, expected) + + # pipe # -------------------------------- From 8c80df0e42a5f4a2474c827c0823815125978ecd Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 30 Oct 2019 18:02:00 -0700 Subject: [PATCH 3/4] Backport PR #29296 for 0.25.3 release --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v0.25.3.rst | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 doc/source/whatsnew/v0.25.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 6cfda147da312..fde778449751a 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 0.25 .. toctree:: :maxdepth: 2 + v0.25.3 v0.25.2 v0.25.1 v0.25.0 diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst new file mode 100644 index 0000000000000..f73a3f956f42e --- /dev/null +++ b/doc/source/whatsnew/v0.25.3.rst @@ -0,0 +1,22 @@ +.. _whatsnew_0253: + +What's new in 0.25.3 (October 31, 2019) +--------------------------------------- + +These are the changes in pandas 0.25.3. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0253.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :meth:`DataFrameGroupBy.quantile` where NA values in the grouping could cause segfaults or incorrect results (:issue:`28882`) + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.25.2..HEAD From 166b4bb4843688555ef6dded522df01decebb533 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 30 Oct 2019 11:38:20 -0700 Subject: [PATCH 4/4] Backport PR #29294 for 0.25.3 whatsnew --- pandas/io/parsers.py | 2 +- pandas/tests/io/parser/test_common.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 45444ffa0c8bd..1674a9f794fd9 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2935,7 +2935,7 @@ def _next_iter_line(self, row_num): if self.warn_bad_lines or self.error_bad_lines: msg = str(e) - if "NULL byte" in msg: + if "NULL byte" in msg or "line contains NUL" in msg: msg = ( "NULL byte detected. This byte " "cannot be processed in Python's " diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 4a54d43de667a..b94d5cd497ccf 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1898,10 +1898,7 @@ def test_null_byte_char(all_parsers): out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - if compat.PY38: - msg = "line contains NUL" - else: - msg = "NULL byte detected" + msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names)