Skip to content

Commit ee8e01e

Browse files
authored
TST: Refactor slow tests (#53891)
* Address more slow tests
* Parameterize slow test
* Reduce data size in multi_thread
* Use constant data for test_int64_overflow_groupby_large_df_shuffled
1 parent b01dc62 commit ee8e01e

File tree

8 files changed

+60
-90
lines changed

8 files changed

+60
-90
lines changed

pandas/_libs/parsers.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ from pandas._typing import (
1212
)
1313

1414
STR_NA_VALUES: set[str]
15+
DEFAULT_BUFFER_HEURISTIC: int
1516

1617
def sanitize_objects(
1718
values: npt.NDArray[np.object_],

pandas/_libs/parsers.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ cdef:
118118
float64_t NEGINF = -INF
119119
int64_t DEFAULT_CHUNKSIZE = 256 * 1024
120120

121+
DEFAULT_BUFFER_HEURISTIC = 2 ** 20
122+
121123

122124
cdef extern from "pandas/portable.h":
123125
# I *think* this is here so that strcasecmp is defined on Windows
@@ -584,7 +586,7 @@ cdef class TextReader:
584586
raise EmptyDataError("No columns to parse from file")
585587

586588
# Compute buffer_lines as function of table width.
587-
heuristic = 2**20 // self.table_width
589+
heuristic = DEFAULT_BUFFER_HEURISTIC // self.table_width
588590
self.buffer_lines = 1
589591
while self.buffer_lines * 2 < heuristic:
590592
self.buffer_lines *= 2

pandas/tests/indexes/datetimes/test_date_range.py

+8-13
Original file line numberDiff line numberDiff line change
@@ -212,11 +212,16 @@ def test_date_range_int64_overflow_non_recoverable(self):
212212
date_range(end="1969-11-14", periods=106752 * 24, freq="H")
213213

214214
@pytest.mark.slow
215-
def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
215+
@pytest.mark.parametrize(
216+
"s_ts, e_ts", [("2262-02-23", "1969-11-14"), ("1970-02-01", "1677-10-22")]
217+
)
218+
def test_date_range_int64_overflow_stride_endpoint_different_signs(
219+
self, s_ts, e_ts
220+
):
216221
# cases where stride * periods overflow int64 and stride/endpoint
217222
# have different signs
218-
start = Timestamp("2262-02-23")
219-
end = Timestamp("1969-11-14")
223+
start = Timestamp(s_ts)
224+
end = Timestamp(e_ts)
220225

221226
expected = date_range(start=start, end=end, freq="-1H")
222227
assert expected[0] == start
@@ -225,16 +230,6 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
225230
dti = date_range(end=end, periods=len(expected), freq="-1H")
226231
tm.assert_index_equal(dti, expected)
227232

228-
start2 = Timestamp("1970-02-01")
229-
end2 = Timestamp("1677-10-22")
230-
231-
expected2 = date_range(start=start2, end=end2, freq="-1H")
232-
assert expected2[0] == start2
233-
assert expected2[-1] == end2
234-
235-
dti2 = date_range(start=start2, periods=len(expected2), freq="-1H")
236-
tm.assert_index_equal(dti2, expected2)
237-
238233
def test_date_range_out_of_bounds(self):
239234
# GH#14187
240235
msg = "Cannot generate range"

pandas/tests/io/parser/common/test_chunksize.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
import pytest
99

10+
from pandas._libs import parsers as libparsers
1011
from pandas.errors import DtypeWarning
1112

1213
from pandas import (
@@ -162,14 +163,18 @@ def test_chunk_begins_with_newline_whitespace(all_parsers):
162163

163164

164165
@pytest.mark.slow
165-
def test_chunks_have_consistent_numerical_type(all_parsers):
166+
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
167+
# mainly an issue with the C parser
168+
heuristic = 2**3
166169
parser = all_parsers
167-
integers = [str(i) for i in range(499999)]
170+
integers = [str(i) for i in range(heuristic - 1)]
168171
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
169172

170173
# Coercions should work without warnings.
171174
with tm.assert_produces_warning(None):
172-
result = parser.read_csv(StringIO(data))
175+
with monkeypatch.context() as m:
176+
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
177+
result = parser.read_csv(StringIO(data))
173178

174179
assert type(result.a[0]) is np.float64
175180
assert result.a.dtype == float

pandas/tests/io/parser/dtypes/test_categorical.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import numpy as np
99
import pytest
1010

11+
from pandas._libs import parsers as libparsers
12+
1113
from pandas.core.dtypes.dtypes import CategoricalDtype
1214

1315
import pandas as pd
@@ -105,13 +107,16 @@ def test_categorical_dtype_missing(all_parsers):
105107

106108
@xfail_pyarrow
107109
@pytest.mark.slow
108-
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
110+
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
109111
# see gh-18186
112+
# was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
110113
parser = all_parsers
111-
data = np.sort([str(i) for i in range(524289)])
114+
heuristic = 2**5
115+
data = np.sort([str(i) for i in range(heuristic + 1)])
112116
expected = DataFrame({"a": Categorical(data, ordered=True)})
113-
114-
actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
117+
with monkeypatch.context() as m:
118+
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
119+
actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
115120
actual["a"] = actual["a"].cat.reorder_categories(
116121
np.sort(actual.a.cat.categories), ordered=True
117122
)

pandas/tests/io/parser/test_c_parser_only.py

-26
Original file line numberDiff line numberDiff line change
@@ -44,32 +44,6 @@ def test_buffer_overflow(c_parser_only, malformed):
4444
parser.read_csv(StringIO(malformed))
4545

4646

47-
def test_buffer_rd_bytes(c_parser_only):
48-
# see gh-12098: src->buffer in the C parser can be freed twice leading
49-
# to a segfault if a corrupt gzip file is read with 'read_csv', and the
50-
# buffer is filled more than once before gzip raises an Exception.
51-
52-
data = (
53-
"\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09"
54-
"\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0"
55-
"\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00"
56-
"\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
57-
)
58-
parser = c_parser_only
59-
60-
for _ in range(100):
61-
try:
62-
parser.read_csv_check_warnings(
63-
RuntimeWarning,
64-
"compression has no effect when passing a non-binary object as input",
65-
StringIO(data),
66-
compression="gzip",
67-
delim_whitespace=True,
68-
)
69-
except Exception:
70-
pass
71-
72-
7347
def test_delim_whitespace_custom_terminator(c_parser_only):
7448
# See gh-12912
7549
data = "a b c~1 2 3~4 5 6~7 8 9"

pandas/tests/io/parser/test_multi_thread.py

+19-28
Original file line numberDiff line numberDiff line change
@@ -22,38 +22,16 @@
2222
]
2323

2424

25-
def _construct_dataframe(num_rows):
26-
"""
27-
Construct a DataFrame for testing.
28-
29-
Parameters
30-
----------
31-
num_rows : int
32-
The number of rows for our DataFrame.
33-
34-
Returns
35-
-------
36-
df : DataFrame
37-
"""
38-
df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
39-
df["foo"] = "foo"
40-
df["bar"] = "bar"
41-
df["baz"] = "baz"
42-
df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s")
43-
df["int"] = np.arange(num_rows, dtype="int64")
44-
return df
45-
46-
4725
def test_multi_thread_string_io_read_csv(all_parsers):
4826
# see gh-11786
4927
parser = all_parsers
50-
max_row_range = 10000
51-
num_files = 100
28+
max_row_range = 100
29+
num_files = 10
5230

53-
bytes_to_df = [
31+
bytes_to_df = (
5432
"\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
5533
for _ in range(num_files)
56-
]
34+
)
5735

5836
# Read all files in many threads.
5937
with ExitStack() as stack:
@@ -141,11 +119,24 @@ def reader(arg):
141119
def test_multi_thread_path_multipart_read_csv(all_parsers):
142120
# see gh-11786
143121
num_tasks = 4
144-
num_rows = 100000
122+
num_rows = 48
145123

146124
parser = all_parsers
147125
file_name = "__thread_pool_reader__.csv"
148-
df = _construct_dataframe(num_rows)
126+
df = DataFrame(
127+
{
128+
"a": np.random.rand(num_rows),
129+
"b": np.random.rand(num_rows),
130+
"c": np.random.rand(num_rows),
131+
"d": np.random.rand(num_rows),
132+
"e": np.random.rand(num_rows),
133+
"foo": ["foo"] * num_rows,
134+
"bar": ["bar"] * num_rows,
135+
"baz": ["baz"] * num_rows,
136+
"date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
137+
"int": np.arange(num_rows, dtype="int64"),
138+
}
139+
)
149140

150141
with tm.ensure_clean(file_name) as path:
151142
df.to_csv(path)

pandas/tests/test_sorting.py

+12-15
Original file line numberDiff line numberDiff line change
@@ -96,32 +96,29 @@ def test_int64_overflow_groupby_large_range(self):
9696

9797
@pytest.mark.parametrize("agg", ["mean", "median"])
9898
def test_int64_overflow_groupby_large_df_shuffled(self, agg):
99-
arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
100-
i = np.random.choice(len(arr), len(arr) * 4)
99+
rs = np.random.RandomState(42)
100+
arr = rs.randint(-1 << 12, 1 << 12, (1 << 15, 5))
101+
i = rs.choice(len(arr), len(arr) * 4)
101102
arr = np.vstack((arr, arr[i])) # add some duplicate rows
102103

103-
i = np.random.permutation(len(arr))
104+
i = rs.permutation(len(arr))
104105
arr = arr[i] # shuffle rows
105106

106107
df = DataFrame(arr, columns=list("abcde"))
107-
df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10
108+
df["jim"], df["joe"] = np.zeros((2, len(df)))
108109
gr = df.groupby(list("abcde"))
109110

110111
# verify this is testing what it is supposed to test!
111112
assert is_int64_overflow_possible(gr.grouper.shape)
112113

113-
# manually compute groupings
114-
jim, joe = defaultdict(list), defaultdict(list)
115-
for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]):
116-
jim[key].append(a)
117-
joe[key].append(b)
118-
119-
assert len(gr) == len(jim)
120-
mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde"))
114+
mi = MultiIndex.from_arrays(
115+
[ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)],
116+
names=list("abcde"),
117+
)
121118

122-
f = lambda a: np.fromiter(map(getattr(np, agg), a), dtype="f8")
123-
arr = np.vstack((f(jim.values()), f(joe.values()))).T
124-
res = DataFrame(arr, columns=["jim", "joe"], index=mi).sort_index()
119+
res = DataFrame(
120+
np.zeros((len(mi), 2)), columns=["jim", "joe"], index=mi
121+
).sort_index()
125122

126123
tm.assert_frame_equal(getattr(gr, agg)(), res)
127124

0 commit comments

Comments (0)