diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 3b6e4dca47b14..253bb7303cefb 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -12,6 +12,7 @@ from pandas._typing import ( ) STR_NA_VALUES: set[str] +DEFAULT_BUFFER_HEURISTIC: int def sanitize_objects( values: npt.NDArray[np.object_], diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e97f214616ea6..6d66e21ce49f5 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -118,6 +118,8 @@ cdef: float64_t NEGINF = -INF int64_t DEFAULT_CHUNKSIZE = 256 * 1024 +DEFAULT_BUFFER_HEURISTIC = 2 ** 20 + cdef extern from "pandas/portable.h": # I *think* this is here so that strcasecmp is defined on Windows @@ -584,7 +586,7 @@ cdef class TextReader: raise EmptyDataError("No columns to parse from file") # Compute buffer_lines as function of table width. - heuristic = 2**20 // self.table_width + heuristic = DEFAULT_BUFFER_HEURISTIC // self.table_width self.buffer_lines = 1 while self.buffer_lines * 2 < heuristic: self.buffer_lines *= 2 diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 1d6e7077de786..2e2e33e2fb366 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -212,11 +212,16 @@ def test_date_range_int64_overflow_non_recoverable(self): date_range(end="1969-11-14", periods=106752 * 24, freq="H") @pytest.mark.slow - def test_date_range_int64_overflow_stride_endpoint_different_signs(self): + @pytest.mark.parametrize( + "s_ts, e_ts", [("2262-02-23", "1969-11-14"), ("1970-02-01", "1677-10-22")] + ) + def test_date_range_int64_overflow_stride_endpoint_different_signs( + self, s_ts, e_ts + ): # cases where stride * periods overflow int64 and stride/endpoint # have different signs - start = Timestamp("2262-02-23") - end = Timestamp("1969-11-14") + start = Timestamp(s_ts) + end = Timestamp(e_ts) expected = date_range(start=start, end=end, freq="-1H") assert expected[0] == start @@ -225,16 +230,6 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs(self): dti = date_range(end=end, periods=len(expected), freq="-1H") tm.assert_index_equal(dti, expected) - start2 = Timestamp("1970-02-01") - end2 = Timestamp("1677-10-22") - - expected2 = date_range(start=start2, end=end2, freq="-1H") - assert expected2[0] == start2 - assert expected2[-1] == end2 - - dti2 = date_range(start=start2, periods=len(expected2), freq="-1H") - tm.assert_index_equal(dti2, expected2) - def test_date_range_out_of_bounds(self): # GH#14187 msg = "Cannot generate range" diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 6be7269cb8433..d407f98029e8d 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,6 +7,7 @@ import numpy as np import pytest +from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning from pandas import ( @@ -162,14 +163,18 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): @pytest.mark.slow -def test_chunks_have_consistent_numerical_type(all_parsers): +def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): + # mainly an issue with the C parser + heuristic = 2**3 parser = all_parsers - integers = [str(i) for i in range(499999)] + integers = [str(i) for i in range(heuristic - 1)] data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) # Coercions should work without warnings. with tm.assert_produces_warning(None): - result = parser.read_csv(StringIO(data)) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + result = parser.read_csv(StringIO(data)) assert type(result.a[0]) is np.float64 assert result.a.dtype == float diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 33422d41c2f93..8671bccbc1bbd 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._libs import parsers as libparsers + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -105,13 +107,16 @@ def test_categorical_dtype_missing(all_parsers): @xfail_pyarrow @pytest.mark.slow -def test_categorical_dtype_high_cardinality_numeric(all_parsers): +def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch): # see gh-18186 + # was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) + heuristic = 2**5 + data = np.sort([str(i) for i in range(heuristic + 1)]) expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") actual["a"] = actual["a"].cat.reorder_categories( np.sort(actual.a.cat.categories), ordered=True ) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 818c4f3522606..ceee9f13e07f8 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -44,32 +44,6 @@ def test_buffer_overflow(c_parser_only, malformed): parser.read_csv(StringIO(malformed)) -def test_buffer_rd_bytes(c_parser_only): - # see gh-12098: src->buffer in the C parser can be freed twice leading - # to a segfault if a corrupt gzip file is read with 'read_csv', and the - # buffer is filled more than once before gzip raises an Exception. - - data = ( - "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" - "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" - "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" - "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" - ) - parser = c_parser_only - - for _ in range(100): - try: - parser.read_csv_check_warnings( - RuntimeWarning, - "compression has no effect when passing a non-binary object as input", - StringIO(data), - compression="gzip", - delim_whitespace=True, - ) - except Exception: - pass - - def test_delim_whitespace_custom_terminator(c_parser_only): # See gh-12912 data = "a b c~1 2 3~4 5 6~7 8 9" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 562b99090dfab..c3520a92f11b3 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -22,38 +22,16 @@ ] -def _construct_dataframe(num_rows): - """ - Construct a DataFrame for testing. - - Parameters - ---------- - num_rows : int - The number of rows for our DataFrame. - - Returns - ------- - df : DataFrame - """ - df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde")) - df["foo"] = "foo" - df["bar"] = "bar" - df["baz"] = "baz" - df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s") - df["int"] = np.arange(num_rows, dtype="int64") - return df - - def test_multi_thread_string_io_read_csv(all_parsers): # see gh-11786 parser = all_parsers - max_row_range = 10000 - num_files = 100 + max_row_range = 100 + num_files = 10 - bytes_to_df = [ + bytes_to_df = ( "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode() for _ in range(num_files) - ] + ) # Read all files in many threads. with ExitStack() as stack: @@ -141,11 +119,24 @@ def reader(arg): def test_multi_thread_path_multipart_read_csv(all_parsers): # see gh-11786 num_tasks = 4 - num_rows = 100000 + num_rows = 48 parser = all_parsers file_name = "__thread_pool_reader__.csv" - df = _construct_dataframe(num_rows) + df = DataFrame( + { + "a": np.random.rand(num_rows), + "b": np.random.rand(num_rows), + "c": np.random.rand(num_rows), + "d": np.random.rand(num_rows), + "e": np.random.rand(num_rows), + "foo": ["foo"] * num_rows, + "bar": ["bar"] * num_rows, + "baz": ["baz"] * num_rows, + "date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"), + "int": np.arange(num_rows, dtype="int64"), + } + ) with tm.ensure_clean(file_name) as path: df.to_csv(path) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index bdad8174c160d..45cdc3c332a9b 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -96,32 +96,29 @@ def test_int64_overflow_groupby_large_range(self): @pytest.mark.parametrize("agg", ["mean", "median"]) def test_int64_overflow_groupby_large_df_shuffled(self, agg): - arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) - i = np.random.choice(len(arr), len(arr) * 4) + rs = np.random.RandomState(42) + arr = rs.randint(-1 << 12, 1 << 12, (1 << 15, 5)) + i = rs.choice(len(arr), len(arr) * 4) arr = np.vstack((arr, arr[i])) # add some duplicate rows - i = np.random.permutation(len(arr)) + i = rs.permutation(len(arr)) arr = arr[i] # shuffle rows df = DataFrame(arr, columns=list("abcde")) - df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10 + df["jim"], df["joe"] = np.zeros((2, len(df))) gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! assert is_int64_overflow_possible(gr.grouper.shape) - # manually compute groupings - jim, joe = defaultdict(list), defaultdict(list) - for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]): - jim[key].append(a) - joe[key].append(b) - - assert len(gr) == len(jim) - mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) + mi = MultiIndex.from_arrays( + [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], + names=list("abcde"), + ) - f = lambda a: np.fromiter(map(getattr(np, agg), a), dtype="f8") - arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=["jim", "joe"], index=mi).sort_index() + res = DataFrame( + np.zeros((len(mi), 2)), columns=["jim", "joe"], index=mi + ).sort_index() tm.assert_frame_equal(getattr(gr, agg)(), res)