Skip to content

Commit 11cdde6

Browse files
authored
TST: Use numpy random generator for testing data (#54209)
* TST: Use numpy random generator with ruff NPY002 * Fix other testing functions * Fix random_sample * Fix more usage * Fix more * Replace more * address rand * More fixes * Fix more standard_normal * Fix more * Fix * Address more * Fix more test * fix more tests * Try addressing windows tests * Address conftest, ignore asv * adjust once more * Another dtype * fix another unseeded default_rng * Add a rule for unseeded default_rng * Remove space * other fixes
1 parent 5e97e67 commit 11cdde6

File tree

302 files changed

+3645
-2351
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

302 files changed

+3645
-2351
lines changed

.pre-commit-config.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,9 @@ repos:
251251
252252
# os.remove
253253
|os\.remove
254+
255+
# Unseeded numpy default_rng
256+
|default_rng\(\)
254257
files: ^pandas/tests/
255258
types_or: [python, cython, rst]
256259
- id: unwanted-patterns-in-ea-tests

pandas/_testing/__init__.py

+15-41
Original file line numberDiff line numberDiff line change
@@ -391,9 +391,9 @@ def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index:
391391
if is_unsigned_integer_dtype(dtype):
392392
values += 2 ** (dtype.itemsize * 8 - 1)
393393
elif dtype.kind == "f":
394-
values = np.random.random_sample(k) - np.random.random_sample(1)
394+
values = np.random.default_rng(2).random(k) - np.random.default_rng(2).random(1)
395395
values.sort()
396-
values = values * (10 ** np.random.randint(0, 9))
396+
values = values * (10 ** np.random.default_rng(2).integers(0, 9))
397397
else:
398398
raise NotImplementedError(f"wrong dtype {dtype}")
399399

@@ -487,7 +487,7 @@ def all_timeseries_index_generator(k: int = 10) -> Iterable[Index]:
487487
# make series
488488
def make_rand_series(name=None, dtype=np.float64) -> Series:
489489
index = makeStringIndex(_N)
490-
data = np.random.randn(_N)
490+
data = np.random.default_rng(2).standard_normal(_N)
491491
with np.errstate(invalid="ignore"):
492492
data = data.astype(dtype, copy=False)
493493
return Series(data, index=index, name=name)
@@ -510,21 +510,30 @@ def makeObjectSeries(name=None) -> Series:
510510

511511
def getSeriesData() -> dict[str, Series]:
512512
index = makeStringIndex(_N)
513-
return {c: Series(np.random.randn(_N), index=index) for c in getCols(_K)}
513+
return {
514+
c: Series(np.random.default_rng(i).standard_normal(_N), index=index)
515+
for i, c in enumerate(getCols(_K))
516+
}
514517

515518

516519
def makeTimeSeries(nper=None, freq: Frequency = "B", name=None) -> Series:
517520
if nper is None:
518521
nper = _N
519522
return Series(
520-
np.random.randn(nper), index=makeDateIndex(nper, freq=freq), name=name
523+
np.random.default_rng(2).standard_normal(nper),
524+
index=makeDateIndex(nper, freq=freq),
525+
name=name,
521526
)
522527

523528

524529
def makePeriodSeries(nper=None, name=None) -> Series:
525530
if nper is None:
526531
nper = _N
527-
return Series(np.random.randn(nper), index=makePeriodIndex(nper), name=name)
532+
return Series(
533+
np.random.default_rng(2).standard_normal(nper),
534+
index=makePeriodIndex(nper),
535+
name=name,
536+
)
528537

529538

530539
def getTimeSeriesData(nper=None, freq: Frequency = "B") -> dict[str, Series]:
@@ -787,40 +796,6 @@ def makeCustomDataframe(
787796
return DataFrame(data, index, columns, dtype=dtype)
788797

789798

790-
def _create_missing_idx(nrows, ncols, density: float, random_state=None):
791-
if random_state is None:
792-
random_state = np.random
793-
else:
794-
random_state = np.random.RandomState(random_state)
795-
796-
# below is cribbed from scipy.sparse
797-
size = round((1 - density) * nrows * ncols)
798-
# generate a few more to ensure unique values
799-
min_rows = 5
800-
fac = 1.02
801-
extra_size = min(size + min_rows, fac * size)
802-
803-
def _gen_unique_rand(rng, _extra_size):
804-
ind = rng.rand(int(_extra_size))
805-
return np.unique(np.floor(ind * nrows * ncols))[:size]
806-
807-
ind = _gen_unique_rand(random_state, extra_size)
808-
while ind.size < size:
809-
extra_size *= 1.05
810-
ind = _gen_unique_rand(random_state, extra_size)
811-
812-
j = np.floor(ind * 1.0 / nrows).astype(int)
813-
i = (ind - j * nrows).astype(int)
814-
return i.tolist(), j.tolist()
815-
816-
817-
def makeMissingDataframe(density: float = 0.9, random_state=None) -> DataFrame:
818-
df = makeDataFrame()
819-
i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state)
820-
df.iloc[i, j] = np.nan
821-
return df
822-
823-
824799
class SubclassedSeries(Series):
825800
_metadata = ["testattr", "name"]
826801

@@ -1131,7 +1106,6 @@ def shares_memory(left, right) -> bool:
11311106
"makeFloatSeries",
11321107
"makeIntervalIndex",
11331108
"makeIntIndex",
1134-
"makeMissingDataframe",
11351109
"makeMixedDataFrame",
11361110
"makeMultiIndex",
11371111
"makeNumericIndex",

pandas/_testing/_random.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ def rands_array(
1717
Generate an array of byte strings.
1818
"""
1919
retval = (
20-
np.random.choice(RANDS_CHARS, size=nchars * np.prod(size), replace=replace)
20+
np.random.default_rng(2)
21+
.choice(RANDS_CHARS, size=nchars * np.prod(size), replace=replace)
2122
.view((np.str_, nchars))
2223
.reshape(size)
2324
)
@@ -31,4 +32,4 @@ def rands(nchars) -> str:
3132
See `rands_array` if you want to create an array of random strings.
3233
3334
"""
34-
return "".join(np.random.choice(RANDS_CHARS, nchars))
35+
return "".join(np.random.default_rng(2).choice(RANDS_CHARS, nchars))

pandas/conftest.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,9 @@ def multiindex_dataframe_random_data(
561561
"""DataFrame with 2 level MultiIndex with random data"""
562562
index = lexsorted_two_level_string_multiindex
563563
return DataFrame(
564-
np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp")
564+
np.random.default_rng(2).standard_normal((10, 3)),
565+
index=index,
566+
columns=Index(["A", "B", "C"], name="exp"),
565567
)
566568

567569

@@ -614,7 +616,7 @@ def _create_mi_with_dt64tz_level():
614616
"float32": tm.makeFloatIndex(100, dtype="float32"),
615617
"float64": tm.makeFloatIndex(100, dtype="float64"),
616618
"bool-object": tm.makeBoolIndex(10).astype(object),
617-
"bool-dtype": Index(np.random.randn(10) < 0),
619+
"bool-dtype": Index(np.random.default_rng(2).standard_normal(10) < 0),
618620
"complex64": tm.makeNumericIndex(100, dtype="float64").astype("complex64"),
619621
"complex128": tm.makeNumericIndex(100, dtype="float64").astype("complex128"),
620622
"categorical": tm.makeCategoricalIndex(100),
@@ -744,7 +746,7 @@ def datetime_series() -> Series:
744746
def _create_series(index):
745747
"""Helper for the _series dict"""
746748
size = len(index)
747-
data = np.random.randn(size)
749+
data = np.random.default_rng(2).standard_normal(size)
748750
return Series(data, index=index, name="a", copy=False)
749751

750752

@@ -773,7 +775,7 @@ def series_with_multilevel_index() -> Series:
773775
]
774776
tuples = zip(*arrays)
775777
index = MultiIndex.from_tuples(tuples)
776-
data = np.random.randn(8)
778+
data = np.random.default_rng(2).standard_normal(8)
777779
ser = Series(data, index=index)
778780
ser.iloc[3] = np.NaN
779781
return ser
@@ -946,7 +948,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
946948
datetime(2000, 1, 5),
947949
]
948950

949-
return Series(np.random.randn(len(dates)), index=dates)
951+
return Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates)
950952

951953

952954
# ----------------------------------------------------------------

pandas/tests/apply/test_frame_apply.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ def test_apply_reduce_to_dict():
376376

377377

378378
def test_apply_differently_indexed():
379-
df = DataFrame(np.random.randn(20, 10))
379+
df = DataFrame(np.random.default_rng(2).standard_normal((20, 10)))
380380

381381
result = df.apply(Series.describe, axis=0)
382382
expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns)
@@ -463,9 +463,9 @@ def test_apply_convert_objects():
463463
"shiny",
464464
"shiny",
465465
],
466-
"D": np.random.randn(11),
467-
"E": np.random.randn(11),
468-
"F": np.random.randn(11),
466+
"D": np.random.default_rng(2).standard_normal(11),
467+
"E": np.random.default_rng(2).standard_normal(11),
468+
"F": np.random.default_rng(2).standard_normal(11),
469469
}
470470
)
471471

@@ -659,7 +659,7 @@ def test_apply_category_equalness(val):
659659
def test_infer_row_shape():
660660
# GH 17437
661661
# if row shape is changing, infer it
662-
df = DataFrame(np.random.rand(10, 2))
662+
df = DataFrame(np.random.default_rng(2).random((10, 2)))
663663
result = df.apply(np.fft.fft, axis=0).shape
664664
assert result == (10, 2)
665665

@@ -816,7 +816,7 @@ def test_with_listlike_columns():
816816
# GH 17348
817817
df = DataFrame(
818818
{
819-
"a": Series(np.random.randn(4)),
819+
"a": Series(np.random.default_rng(2).standard_normal(4)),
820820
"b": ["a", "list", "of", "words"],
821821
"ts": date_range("2016-10-01", periods=4, freq="H"),
822822
}
@@ -862,7 +862,9 @@ def test_infer_output_shape_columns():
862862
def test_infer_output_shape_listlike_columns():
863863
# GH 16353
864864

865-
df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"])
865+
df = DataFrame(
866+
np.random.default_rng(2).standard_normal((6, 3)), columns=["A", "B", "C"]
867+
)
866868

867869
result = df.apply(lambda x: [1, 2, 3], axis=1)
868870
expected = Series([[1, 2, 3] for t in df.itertuples()])
@@ -911,7 +913,9 @@ def fun(x):
911913
def test_consistent_coerce_for_shapes(lst):
912914
# we want column names to NOT be propagated
913915
# just because the shape matches the input shape
914-
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
916+
df = DataFrame(
917+
np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
918+
)
915919

916920
result = df.apply(lambda x: lst, axis=1)
917921
expected = Series([lst for t in df.itertuples()])

pandas/tests/apply/test_invalid_arg.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def test_series_nested_renamer(renamer):
9393

9494
def test_apply_dict_depr():
9595
tsdf = DataFrame(
96-
np.random.randn(10, 3),
96+
np.random.default_rng(2).standard_normal((10, 3)),
9797
columns=["A", "B", "C"],
9898
index=date_range("1/1/2000", periods=10),
9999
)
@@ -190,9 +190,9 @@ def test_apply_modify_traceback():
190190
"shiny",
191191
"shiny",
192192
],
193-
"D": np.random.randn(11),
194-
"E": np.random.randn(11),
195-
"F": np.random.randn(11),
193+
"D": np.random.default_rng(2).standard_normal(11),
194+
"E": np.random.default_rng(2).standard_normal(11),
195+
"F": np.random.default_rng(2).standard_normal(11),
196196
}
197197
)
198198

pandas/tests/apply/test_series_apply.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def f(x):
7676

7777
@pytest.mark.parametrize("convert_dtype", [True, False])
7878
def test_apply_convert_dtype_deprecated(convert_dtype):
79-
ser = Series(np.random.randn(10))
79+
ser = Series(np.random.default_rng(2).standard_normal(10))
8080

8181
def func(x):
8282
return x if x > 0 else np.nan

0 commit comments

Comments
 (0)