
TST: Refactor slow tests #53891


Merged (5 commits, Jun 28, 2023)
Changes from 4 commits
1 change: 1 addition & 0 deletions pandas/_libs/parsers.pyi
@@ -12,6 +12,7 @@ from pandas._typing import (
)

STR_NA_VALUES: set[str]
+DEFAULT_BUFFER_HEURISTIC: int

def sanitize_objects(
    values: npt.NDArray[np.object_],
4 changes: 3 additions & 1 deletion pandas/_libs/parsers.pyx
@@ -118,6 +118,8 @@ cdef:
    float64_t NEGINF = -INF
    int64_t DEFAULT_CHUNKSIZE = 256 * 1024

+DEFAULT_BUFFER_HEURISTIC = 2 ** 20
Member: Can this be set as a property of the TextReader? It is a bit ambiguous in a pyx file, but with an all-caps name in the global namespace I would expect this to be a compile-time constant; attaching it as a property would make things clearer.

Member: Or, if this is just for testing, maybe you can patch buffer_lines directly? The naming here is a bit unclear when scoped outside of the initializer.

Member Author: Do you have the same concern about DEFAULT_CHUNKSIZE above too?

Ideally, I think these magic numbers should at least be made obvious so they can be configured or removed (#53781), and moving this into the TextReader would make that less obvious?

Member (@WillAyd, Jun 28, 2023): Unless I am mistaken about how Cython generates the code, comparing this to DEFAULT_CHUNKSIZE is exactly the problem: that one is a compile-time constant, whereas this value can be modified at runtime, yet they both look the same.
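For context on that distinction, a rough sketch of the two kinds of module-level names in a .pyx file (illustrative only, not the PR's code; the names are borrowed from above):

    # sketch.pyx -- illustrating the ambiguity discussed here
    from libc.stdint cimport int64_t

    cdef:
        # C-level global: typed in a cdef block, this compiles to a C variable
        # that is not an attribute of the module, so Python code (including
        # monkeypatch) can never see or modify it.
        int64_t DEFAULT_CHUNKSIZE = 256 * 1024

    # Python-level global: a bare assignment creates an ordinary module
    # attribute, so it can be reassigned at runtime from Python.
    DEFAULT_BUFFER_HEURISTIC = 2 ** 20

The two declarations look alike in the source even though only the second one is patchable from a test.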

Member Author: Could you suggest how I could make DEFAULT_BUFFER_HEURISTIC a property on the cdef TextReader class? I'm having trouble defining it in a way that I could monkeypatch.

Member: Hmm, does declaring it cpdef help at all? Not worth going down a rabbit hole if it's a holdup.

Member Author: Here are my unsuccessful attempts so far, just to double-check:

    cdef public:
        int64_t leading_cols, table_width, DEFAULT_BUFFER_HEURISTIC=2**20
                                          ^
------------------------------------------------------------

pandas/_libs/parsers.pyx:365:43: Cannot assign default value to fields in cdef classes, structs or unions
    cpdef DEFAULT_BUFFER_HEURISTIC=2**20
                                 ^
------------------------------------------------------------

pandas/_libs/parsers.pyx:377:34: Cannot assign default value to fields in cdef classes, structs or unions
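(For reference: Cython rejects inline defaults on cdef-class fields regardless of visibility. A minimal sketch of the usual workaround, not tried in this PR, assigns the default in __cinit__ instead; the attribute name below is hypothetical.)

    from libc.stdint cimport int64_t

    cdef class TextReader:
        # declared without a default; "cdef public" exposes it to Python
        cdef public int64_t buffer_heuristic

        def __cinit__(self):
            self.buffer_heuristic = 2 ** 20  # default assigned here instead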

Member: Cool, thanks for checking. Let's not get hung up on it for now then - I think it's just a wart in how C-level and Python-level names get expressed in the Cython global namespace. We can always come back and refactor if we establish a better pattern.

Member Author: Thanks for confirming!



cdef extern from "pandas/portable.h":
    # I *think* this is here so that strcasecmp is defined on Windows
@@ -584,7 +586,7 @@ cdef class TextReader:
            raise EmptyDataError("No columns to parse from file")

        # Compute buffer_lines as function of table width.
-        heuristic = 2**20 // self.table_width
+        heuristic = DEFAULT_BUFFER_HEURISTIC // self.table_width
        self.buffer_lines = 1
        while self.buffer_lines * 2 < heuristic:
            self.buffer_lines *= 2
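To make the heuristic concrete, a small Python sketch of the same loop with an assumed table width (values are illustrative only):

    DEFAULT_BUFFER_HEURISTIC = 2**20
    table_width = 10  # hypothetical column count

    heuristic = DEFAULT_BUFFER_HEURISTIC // table_width  # 104857
    buffer_lines = 1
    while buffer_lines * 2 < heuristic:
        buffer_lines *= 2

    # buffer_lines == 65536: the first power of two whose double reaches the
    # heuristic, so one buffer of buffer_lines rows stays under ~2**20 cells.
    assert buffer_lines == 65536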
21 changes: 8 additions & 13 deletions pandas/tests/indexes/datetimes/test_date_range.py
@@ -212,11 +212,16 @@ def test_date_range_int64_overflow_non_recoverable(self):
            date_range(end="1969-11-14", periods=106752 * 24, freq="H")

    @pytest.mark.slow
-    def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
+    @pytest.mark.parametrize(
+        "s_ts, e_ts", [("2262-02-23", "1969-11-14"), ("1970-02-01", "1677-10-22")]
+    )
+    def test_date_range_int64_overflow_stride_endpoint_different_signs(
+        self, s_ts, e_ts
+    ):
        # cases where stride * periods overflow int64 and stride/endpoint
        # have different signs
-        start = Timestamp("2262-02-23")
-        end = Timestamp("1969-11-14")
+        start = Timestamp(s_ts)
+        end = Timestamp(e_ts)

        expected = date_range(start=start, end=end, freq="-1H")
        assert expected[0] == start
@@ -225,16 +230,6 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
        dti = date_range(end=end, periods=len(expected), freq="-1H")
        tm.assert_index_equal(dti, expected)

-        start2 = Timestamp("1970-02-01")
-        end2 = Timestamp("1677-10-22")
-
-        expected2 = date_range(start=start2, end=end2, freq="-1H")
-        assert expected2[0] == start2
-        assert expected2[-1] == end2
-
-        dti2 = date_range(start=start2, periods=len(expected2), freq="-1H")
-        tm.assert_index_equal(dti2, expected2)

    def test_date_range_out_of_bounds(self):
        # GH#14187
        msg = "Cannot generate range"
11 changes: 8 additions & 3 deletions pandas/tests/io/parser/common/test_chunksize.py
@@ -7,6 +7,7 @@
import numpy as np
import pytest

+from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

from pandas import (
@@ -162,14 +163,18 @@ def test_chunk_begins_with_newline_whitespace(all_parsers):


@pytest.mark.slow
-def test_chunks_have_consistent_numerical_type(all_parsers):
+def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
    # mainly an issue with the C parser
+    heuristic = 2**3
    parser = all_parsers
-    integers = [str(i) for i in range(499999)]
+    integers = [str(i) for i in range(heuristic - 1)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with tm.assert_produces_warning(None):
-        result = parser.read_csv(StringIO(data))
+        with monkeypatch.context() as m:
+            m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
+            result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float
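A note on why such a tiny heuristic still exercises the original bug: under the buffer_lines loop in parsers.pyx above, the patched value keeps each read buffer to a few rows, so the float values still arrive in a different chunk than most of the integers. A quick sketch of the arithmetic (assuming the same loop):

    heuristic = 2**3 // 1  # table_width == 1 for the single-column file
    buffer_lines = 1
    while buffer_lines * 2 < heuristic:
        buffer_lines *= 2

    # buffer_lines == 4, so the 16-row file above spans several buffers and
    # the cross-chunk dtype coercion is hit with 16 rows instead of ~1,000,000.
    assert buffer_lines == 4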
13 changes: 9 additions & 4 deletions pandas/tests/io/parser/dtypes/test_categorical.py
@@ -8,6 +8,8 @@
import numpy as np
import pytest

+from pandas._libs import parsers as libparsers

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
@@ -105,13 +107,16 @@ def test_categorical_dtype_missing(all_parsers):

@xfail_pyarrow
@pytest.mark.slow
-def test_categorical_dtype_high_cardinality_numeric(all_parsers):
+def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
    # see gh-18186
+    # was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
    parser = all_parsers
-    data = np.sort([str(i) for i in range(524289)])
+    heuristic = 2**5
+    data = np.sort([str(i) for i in range(heuristic + 1)])
    expected = DataFrame({"a": Categorical(data, ordered=True)})

-    actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
+    with monkeypatch.context() as m:
+        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
+        actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
    actual["a"] = actual["a"].cat.reorder_categories(
        np.sort(actual.a.cat.categories), ordered=True
    )
26 changes: 0 additions & 26 deletions pandas/tests/io/parser/test_c_parser_only.py
@@ -44,32 +44,6 @@ def test_buffer_overflow(c_parser_only, malformed):
        parser.read_csv(StringIO(malformed))


-def test_buffer_rd_bytes(c_parser_only):
Member Author: Based on the original issue, this sounded like a PY2 bug specifically, so I don't think this needs testing anymore.

-    # see gh-12098: src->buffer in the C parser can be freed twice leading
-    # to a segfault if a corrupt gzip file is read with 'read_csv', and the
-    # buffer is filled more than once before gzip raises an Exception.
-
-    data = (
-        "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09"
-        "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0"
-        "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00"
-        "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
-    )
-    parser = c_parser_only
-
-    for _ in range(100):
-        try:
-            parser.read_csv_check_warnings(
-                RuntimeWarning,
-                "compression has no effect when passing a non-binary object as input",
-                StringIO(data),
-                compression="gzip",
-                delim_whitespace=True,
-            )
-        except Exception:
-            pass


def test_delim_whitespace_custom_terminator(c_parser_only):
    # See gh-12912
    data = "a b c~1 2 3~4 5 6~7 8 9"
47 changes: 19 additions & 28 deletions pandas/tests/io/parser/test_multi_thread.py
@@ -22,38 +22,16 @@
]


-def _construct_dataframe(num_rows):
-    """
-    Construct a DataFrame for testing.
-
-    Parameters
-    ----------
-    num_rows : int
-        The number of rows for our DataFrame.
-
-    Returns
-    -------
-    df : DataFrame
-    """
-    df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
-    df["foo"] = "foo"
-    df["bar"] = "bar"
-    df["baz"] = "baz"
-    df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s")
-    df["int"] = np.arange(num_rows, dtype="int64")
-    return df


def test_multi_thread_string_io_read_csv(all_parsers):
    # see gh-11786
    parser = all_parsers
-    max_row_range = 10000
-    num_files = 100
+    max_row_range = 100
+    num_files = 10

-    bytes_to_df = [
+    bytes_to_df = (
        "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
        for _ in range(num_files)
-    ]
+    )

    # Read all files in many threads.
    with ExitStack() as stack:
@@ -141,11 +119,24 @@ def reader(arg):
def test_multi_thread_path_multipart_read_csv(all_parsers):
    # see gh-11786
    num_tasks = 4
-    num_rows = 100000
+    num_rows = 48

    parser = all_parsers
    file_name = "__thread_pool_reader__.csv"
-    df = _construct_dataframe(num_rows)
+    df = DataFrame(
+        {
+            "a": np.random.rand(num_rows),
+            "b": np.random.rand(num_rows),
+            "c": np.random.rand(num_rows),
+            "d": np.random.rand(num_rows),
+            "e": np.random.rand(num_rows),
+            "foo": ["foo"] * num_rows,
+            "bar": ["bar"] * num_rows,
+            "baz": ["baz"] * num_rows,
+            "date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
+            "int": np.arange(num_rows, dtype="int64"),
+        }
+    )

    with tm.ensure_clean(file_name) as path:
        df.to_csv(path)
27 changes: 12 additions & 15 deletions pandas/tests/test_sorting.py
@@ -96,32 +96,29 @@ def test_int64_overflow_groupby_large_range(self):

    @pytest.mark.parametrize("agg", ["mean", "median"])
    def test_int64_overflow_groupby_large_df_shuffled(self, agg):
-        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
-        i = np.random.choice(len(arr), len(arr) * 4)
+        rs = np.random.RandomState(42)
+        arr = rs.randint(-1 << 12, 1 << 12, (1 << 15, 5))
+        i = rs.choice(len(arr), len(arr) * 4)
        arr = np.vstack((arr, arr[i]))  # add some duplicate rows

-        i = np.random.permutation(len(arr))
+        i = rs.permutation(len(arr))
        arr = arr[i]  # shuffle rows

        df = DataFrame(arr, columns=list("abcde"))
-        df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10
+        df["jim"], df["joe"] = np.zeros((2, len(df)))
        gr = df.groupby(list("abcde"))

        # verify this is testing what it is supposed to test!
        assert is_int64_overflow_possible(gr.grouper.shape)

-        # manually compute groupings
-        jim, joe = defaultdict(list), defaultdict(list)
-        for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]):
-            jim[key].append(a)
-            joe[key].append(b)

-        assert len(gr) == len(jim)
-        mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde"))
+        mi = MultiIndex.from_arrays(
+            [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)],
+            names=list("abcde"),
+        )

-        f = lambda a: np.fromiter(map(getattr(np, agg), a), dtype="f8")
-        arr = np.vstack((f(jim.values()), f(joe.values()))).T
-        res = DataFrame(arr, columns=["jim", "joe"], index=mi).sort_index()
+        res = DataFrame(
+            np.zeros((len(mi), 2)), columns=["jim", "joe"], index=mi
+        ).sort_index()

        tm.assert_frame_equal(getattr(gr, agg)(), res)

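A note on why the refactored expectation holds: with jim and joe filled with zeros, every group's mean and median is exactly 0.0, so the expected frame reduces to a zero matrix indexed by the sorted unique key rows. A minimal sketch of that equivalence on toy data (values are hypothetical):

    import numpy as np
    import pandas as pd

    arr = np.array([[1, 2], [1, 2], [3, 4]])  # duplicate key row on purpose
    df = pd.DataFrame(arr, columns=["a", "b"])
    df["jim"], df["joe"] = np.zeros((2, len(df)))

    # groupby-mean of all-zero columns is zero for every group ...
    res = df.groupby(["a", "b"]).mean()
    assert (res == 0).all().all()

    # ... and the result's index is exactly the sorted unique key rows.
    mi = pd.MultiIndex.from_arrays(list(np.unique(arr, axis=0).T), names=["a", "b"])
    assert res.index.equals(mi)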