From 9e3d15f81f5c93c2fc7aa41f95a71e8a93228a14 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 16 Nov 2023 08:16:26 -0500 Subject: [PATCH 1/4] fix groupby and apply UB --- pandas/core/methods/selectn.py | 5 ++++- pandas/tests/groupby/test_cumulative.py | 1 + scripts/tests/data/deps_minimum.toml | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 894791cb46371..6571eb0aad5d7 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -139,7 +139,10 @@ def compute(self, method: str) -> Series: # arr passed into kth_smallest must be contiguous. We copy # here because kth_smallest will modify its input - kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) + if n == 0: # avoid OOB access with kth_smallest_c + kth_val = 0 + else: + kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 25534865b3486..a3fda7c118ba4 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -60,6 +60,7 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) +@pytest.mark.skip # TODO: add expected_ub marker def test_groupby_cumprod_overflow(): # GH#37493 if we overflow we return garbage consistent with numpy df = DataFrame({"key": ["b"] * 4, "value": 100_000}) diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index c74ad3d17a4a9..c12b03b989d07 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -383,6 +383,7 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", + "expected_ub : mark a test known to have undefined behavior", ] [tool.mypy] From 01b2752b71916071d97c362ff0346d4bf96621ae Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 16 Nov 2023 08:50:14 -0500 Subject: [PATCH 2/4] fix/skip parser overflow --- pandas/tests/io/parser/common/test_float.py | 3 ++- pandas/tests/io/test_orc.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 4b23774ee2d5b..37788df27ac48 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -40,7 +40,7 @@ def test_scientific_no_exponent(all_parsers_all_precisions): tm.assert_frame_equal(df_roundtrip, df) -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -999999999]) def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): # GH#38753 parser, precision = all_parsers_all_precisions @@ -51,6 +51,7 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): tm.assert_frame_equal(result, expected) +@pytest.mark.skip # TODO: figure out why linux is so weird on this @xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a4021311fc963..a6d426873ed99 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -75,6 +75,7 @@ def test_orc_reader_empty(dirpath): tm.assert_equal(expected, got) +@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_basic(dirpath): data = { "boolean1": np.array([False, True], dtype="bool"), @@ -122,6 +123,7 @@ def test_orc_reader_decimal(dirpath): tm.assert_equal(expected, got) +@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_date_low(dirpath): data = { "time": np.array( @@ -163,6 +165,7 @@ def test_orc_reader_date_low(dirpath): tm.assert_equal(expected, got) +@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_date_high(dirpath): data = { "time": np.array( @@ -204,6 +207,7 @@ def test_orc_reader_date_high(dirpath): tm.assert_equal(expected, got) +@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_snappy_compressed(dirpath): data = { "int1": np.array( From bc474ba8714e22f72d2343ccc0e0532ad4effa6c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 16 Nov 2023 13:06:53 -0500 Subject: [PATCH 3/4] revert unneeded --- pandas/tests/groupby/test_cumulative.py | 1 - pandas/tests/io/parser/common/test_float.py | 3 +-- pandas/tests/io/test_orc.py | 4 ---- scripts/tests/data/deps_minimum.toml | 1 - 4 files changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index a3fda7c118ba4..25534865b3486 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -60,7 +60,6 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) -@pytest.mark.skip # TODO: add expected_ub marker def test_groupby_cumprod_overflow(): # GH#37493 if we overflow we return garbage consistent with numpy df = DataFrame({"key": ["b"] * 4, "value": 100_000}) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 37788df27ac48..4b23774ee2d5b 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -40,7 +40,7 @@ def test_scientific_no_exponent(all_parsers_all_precisions): tm.assert_frame_equal(df_roundtrip, df) -@pytest.mark.parametrize("neg_exp", [-617, -100000, -999999999]) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): # GH#38753 parser, precision = all_parsers_all_precisions @@ -51,7 +51,6 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): tm.assert_frame_equal(result, expected) -@pytest.mark.skip # TODO: figure out why linux is so weird on this @xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a6d426873ed99..a4021311fc963 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -75,7 +75,6 @@ def test_orc_reader_empty(dirpath): tm.assert_equal(expected, got) -@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_basic(dirpath): data = { "boolean1": np.array([False, True], dtype="bool"), @@ -123,7 +122,6 @@ def test_orc_reader_decimal(dirpath): tm.assert_equal(expected, got) -@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_date_low(dirpath): data = { "time": np.array( @@ -165,7 +163,6 @@ def test_orc_reader_date_low(dirpath): tm.assert_equal(expected, got) -@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_date_high(dirpath): data = { "time": np.array( @@ -207,7 +204,6 @@ def test_orc_reader_date_high(dirpath): tm.assert_equal(expected, got) -@pytest.mark.skip # segfaults looking for /usr/share/zoneinfo/US def test_orc_reader_snappy_compressed(dirpath): data = { "int1": np.array( diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index c12b03b989d07..c74ad3d17a4a9 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -383,7 +383,6 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", - "expected_ub : mark a test known to have undefined behavior", ] [tool.mypy] From 71520980d73cad143a26fe1c9c1f916d1461e396 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 17 Nov 2023 14:04:06 -0500 Subject: [PATCH 4/4] feedback --- pandas/core/methods/selectn.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 6571eb0aad5d7..843c54de25bc9 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -139,10 +139,8 @@ def compute(self, method: str) -> Series: # arr passed into kth_smallest must be contiguous. We copy # here because kth_smallest will modify its input - if n == 0: # avoid OOB access with kth_smallest_c - kth_val = 0 - else: - kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) + # avoid OOB access with kth_smallest_c when n <= 0 + kth_val = libalgos.kth_smallest(arr.copy(order="C"), max(n - 1, 0)) (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")]