From 9e3d15f81f5c93c2fc7aa41f95a71e8a93228a14 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 16 Nov 2023 08:16:26 -0500
Subject: [PATCH 1/4] fix groupby and apply UB

---
 pandas/core/methods/selectn.py          | 5 ++++-
 pandas/tests/groupby/test_cumulative.py | 1 +
 scripts/tests/data/deps_minimum.toml    | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py
index 894791cb46371..6571eb0aad5d7 100644
--- a/pandas/core/methods/selectn.py
+++ b/pandas/core/methods/selectn.py
@@ -139,7 +139,10 @@ def compute(self, method: str) -> Series:
 
         # arr passed into kth_smallest must be contiguous. We copy
         # here because kth_smallest will modify its input
-        kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
+        if n == 0:  # avoid OOB access with kth_smallest_c
+            kth_val = 0
+        else:
+            kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
         (ns,) = np.nonzero(arr <= kth_val)
         inds = ns[arr[ns].argsort(kind="mergesort")]
 
diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py
index 25534865b3486..a3fda7c118ba4 100644
--- a/pandas/tests/groupby/test_cumulative.py
+++ b/pandas/tests/groupby/test_cumulative.py
@@ -60,6 +60,7 @@ def test_groupby_cumprod():
     tm.assert_series_equal(actual, expected)
 
 
+@pytest.mark.skip  # TODO: add expected_ub marker
 def test_groupby_cumprod_overflow():
     # GH#37493 if we overflow we return garbage consistent with numpy
     df = DataFrame({"key": ["b"] * 4, "value": 100_000})
diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml
index c74ad3d17a4a9..c12b03b989d07 100644
--- a/scripts/tests/data/deps_minimum.toml
+++ b/scripts/tests/data/deps_minimum.toml
@@ -383,6 +383,7 @@ markers = [
   "clipboard: mark a pd.read_clipboard test",
   "arm_slow: mark a test as slow for arm64 architecture",
   "arraymanager: mark a test to run with ArrayManager enabled",
+  "expected_ub : mark a test known to have undefined behavior",
 ]
 
 [tool.mypy]

From 01b2752b71916071d97c362ff0346d4bf96621ae Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 16 Nov 2023 08:50:14 -0500
Subject: [PATCH 2/4] fix/skip parser overflow

---
 pandas/tests/io/parser/common/test_float.py | 3 ++-
 pandas/tests/io/test_orc.py                 | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
index 4b23774ee2d5b..37788df27ac48 100644
--- a/pandas/tests/io/parser/common/test_float.py
+++ b/pandas/tests/io/parser/common/test_float.py
@@ -40,7 +40,7 @@ def test_scientific_no_exponent(all_parsers_all_precisions):
     tm.assert_frame_equal(df_roundtrip, df)
 
 
-@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
+@pytest.mark.parametrize("neg_exp", [-617, -100000, -999999999])
 def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
     # GH#38753
     parser, precision = all_parsers_all_precisions
@@ -51,6 +51,7 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.skip  # TODO: figure out why linux is so weird on this
 @xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
 @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
 def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index a4021311fc963..a6d426873ed99 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -75,6 +75,7 @@ def test_orc_reader_empty(dirpath):
     tm.assert_equal(expected, got)
 
 
+@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_basic(dirpath):
     data = {
         "boolean1": np.array([False, True], dtype="bool"),
@@ -122,6 +123,7 @@ def test_orc_reader_decimal(dirpath):
     tm.assert_equal(expected, got)
 
 
+@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_date_low(dirpath):
     data = {
         "time": np.array(
@@ -163,6 +165,7 @@ def test_orc_reader_date_low(dirpath):
     tm.assert_equal(expected, got)
 
 
+@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_date_high(dirpath):
     data = {
         "time": np.array(
@@ -204,6 +207,7 @@ def test_orc_reader_date_high(dirpath):
     tm.assert_equal(expected, got)
 
 
+@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_snappy_compressed(dirpath):
     data = {
         "int1": np.array(

From bc474ba8714e22f72d2343ccc0e0532ad4effa6c Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 16 Nov 2023 13:06:53 -0500
Subject: [PATCH 3/4] revert unneeded

---
 pandas/tests/groupby/test_cumulative.py     | 1 -
 pandas/tests/io/parser/common/test_float.py | 3 +--
 pandas/tests/io/test_orc.py                 | 4 ----
 scripts/tests/data/deps_minimum.toml        | 1 -
 4 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py
index a3fda7c118ba4..25534865b3486 100644
--- a/pandas/tests/groupby/test_cumulative.py
+++ b/pandas/tests/groupby/test_cumulative.py
@@ -60,7 +60,6 @@ def test_groupby_cumprod():
     tm.assert_series_equal(actual, expected)
 
 
-@pytest.mark.skip  # TODO: add expected_ub marker
 def test_groupby_cumprod_overflow():
     # GH#37493 if we overflow we return garbage consistent with numpy
     df = DataFrame({"key": ["b"] * 4, "value": 100_000})
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
index 37788df27ac48..4b23774ee2d5b 100644
--- a/pandas/tests/io/parser/common/test_float.py
+++ b/pandas/tests/io/parser/common/test_float.py
@@ -40,7 +40,7 @@ def test_scientific_no_exponent(all_parsers_all_precisions):
     tm.assert_frame_equal(df_roundtrip, df)
 
 
-@pytest.mark.parametrize("neg_exp", [-617, -100000, -999999999])
+@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
 def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
     # GH#38753
     parser, precision = all_parsers_all_precisions
@@ -51,7 +51,6 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.skip  # TODO: figure out why linux is so weird on this
 @xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
 @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
 def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index a6d426873ed99..a4021311fc963 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -75,7 +75,6 @@ def test_orc_reader_empty(dirpath):
     tm.assert_equal(expected, got)
 
 
-@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_basic(dirpath):
     data = {
         "boolean1": np.array([False, True], dtype="bool"),
@@ -123,7 +122,6 @@ def test_orc_reader_decimal(dirpath):
     tm.assert_equal(expected, got)
 
 
-@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_date_low(dirpath):
     data = {
         "time": np.array(
@@ -165,7 +163,6 @@ def test_orc_reader_date_low(dirpath):
     tm.assert_equal(expected, got)
 
 
-@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_date_high(dirpath):
     data = {
         "time": np.array(
@@ -207,7 +204,6 @@ def test_orc_reader_date_high(dirpath):
     tm.assert_equal(expected, got)
 
 
-@pytest.mark.skip  # segfaults looking for /usr/share/zoneinfo/US
 def test_orc_reader_snappy_compressed(dirpath):
     data = {
         "int1": np.array(
diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml
index c12b03b989d07..c74ad3d17a4a9 100644
--- a/scripts/tests/data/deps_minimum.toml
+++ b/scripts/tests/data/deps_minimum.toml
@@ -383,7 +383,6 @@ markers = [
   "clipboard: mark a pd.read_clipboard test",
   "arm_slow: mark a test as slow for arm64 architecture",
   "arraymanager: mark a test to run with ArrayManager enabled",
-  "expected_ub : mark a test known to have undefined behavior",
 ]
 
 [tool.mypy]

From 71520980d73cad143a26fe1c9c1f916d1461e396 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 17 Nov 2023 14:04:06 -0500
Subject: [PATCH 4/4] feedback

---
 pandas/core/methods/selectn.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py
index 6571eb0aad5d7..843c54de25bc9 100644
--- a/pandas/core/methods/selectn.py
+++ b/pandas/core/methods/selectn.py
@@ -139,10 +139,8 @@ def compute(self, method: str) -> Series:
 
         # arr passed into kth_smallest must be contiguous. We copy
         # here because kth_smallest will modify its input
-        if n == 0:  # avoid OOB access with kth_smallest_c
-            kth_val = 0
-        else:
-            kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
+        # avoid OOB access with kth_smallest_c when n <= 0
+        kth_val = libalgos.kth_smallest(arr.copy(order="C"), max(n - 1, 0))
         (ns,) = np.nonzero(arr <= kth_val)
         inds = ns[arr[ns].argsort(kind="mergesort")]