From 9ac1f1f81abcc28451d091910a440a877919a786 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Aug 2023 16:03:48 -0700 Subject: [PATCH 1/4] remove tm.rands --- pandas/_testing/__init__.py | 5 +---- pandas/_testing/_io.py | 4 ++-- pandas/_testing/_random.py | 10 ---------- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/extension/base/setitem.py | 2 +- pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/io/formats/test_format.py | 7 +++---- pandas/tests/io/pytables/test_errors.py | 2 +- pandas/tests/series/methods/test_astype.py | 16 +++++++++++++--- pandas/tests/util/test_util.py | 5 ----- 10 files changed, 23 insertions(+), 32 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 78c882dc94a99..4ce0a3e7836e7 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -54,10 +54,7 @@ round_trip_pickle, write_to_compressed, ) -from pandas._testing._random import ( - rands, - rands_array, -) +from pandas._testing._random import rands_array from pandas._testing._warnings import ( assert_produces_warning, maybe_produces_warning, diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 49fde6d08fa11..edbba9452b50a 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -9,6 +9,7 @@ Any, Callable, ) +import uuid import zipfile from pandas.compat import ( @@ -18,7 +19,6 @@ from pandas.compat._optional import import_optional_dependency import pandas as pd -from pandas._testing._random import rands from pandas._testing.contexts import ensure_clean if TYPE_CHECKING: @@ -56,7 +56,7 @@ def round_trip_pickle( """ _path = path if _path is None: - _path = f"__{rands(10)}__.pickle" + _path = f"__{uuid.uuid4()}__.pickle" with ensure_clean(_path) as temp_path: pd.to_pickle(obj, temp_path) return pd.read_pickle(temp_path) diff --git a/pandas/_testing/_random.py b/pandas/_testing/_random.py index 4306a72700aff..fabcd62739c69 100644 --- a/pandas/_testing/_random.py +++ b/pandas/_testing/_random.py @@ -23,13 +23,3 @@ def rands_array( .reshape(size) ) return retval.astype(dtype) - - -def rands(nchars) -> str: - """ - Generate one random byte string. - - See `rands_array` if you want to create an array of random strings. - - """ - return "".join(np.random.default_rng(2).choice(RANDS_CHARS, nchars)) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 73c8afee4083a..faa38a7c03447 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -272,7 +272,7 @@ def test_getitem_series_integer_with_missing_raises(self, data, idx): msg = "Cannot index with an integer indexer containing NA values" # TODO: this raises KeyError about labels not found (it tries label-based) - ser = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))]) with pytest.raises(ValueError, match=msg): ser[idx] diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 76aa560fd17a2..1085ada920ccc 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -197,7 +197,7 @@ def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): # TODO(xfail) this raises KeyError about labels not found (it tries label-based) # for list of labels with Series if box_in_series: - arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + arr = pd.Series(data, index=[chr(100 + i) for i in range(len(data))]) msg = "Cannot index with an integer indexer containing NA values" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 0394241955e9b..262ed69ca7099 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -203,7 +203,7 @@ def test_timestamp_compare(self, left, right): "dates2": pd.date_range("20010102", periods=10), "intcol": np.random.default_rng(2).integers(1000000000, size=10), "floatcol": np.random.default_rng(2).standard_normal(10), - "stringcol": list(tm.rands(10)), + "stringcol": [chr(100 + i) for i in range(10)], } ) df.loc[np.random.default_rng(2).random(len(df)) > 0.5, "dates2"] = pd.NaT diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0938e7fc6f28b..46892af86ec49 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -214,10 +214,9 @@ def test_repr_truncation(self): { "A": np.random.default_rng(2).standard_normal(10), "B": [ - tm.rands( - np.random.default_rng(2).integers(max_len - 1, max_len + 1) - ) - for i in range(10) + "a" + * np.random.default_rng(2).integers(max_len - 1, max_len + 1) + for _ in range(10) ], } ) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index e8e62d3fdd33b..44bdbfc3fdd7e 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -153,7 +153,7 @@ def test_append_with_diff_col_name_types_raises_value_error(setup_path): df5 = DataFrame({("1", 2, object): np.random.default_rng(2).standard_normal(10)}) with ensure_clean_store(setup_path) as store: - name = f"df_{tm.rands(10)}" + name = "df_diff_valerror" store.append(name, df) for d in (df2, df3, df4, df5): diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index f367d611d592a..b6c409397c9fb 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -29,6 +29,16 @@ import pandas._testing as tm +def rand_str(nchars: int) -> str: + """ + Generate one random byte string. + """ + RANDS_CHARS = np.array( + list(string.ascii_letters + string.digits), dtype=(np.str_, 1) + ) + return "".join(np.random.default_rng(2).choice(RANDS_CHARS, nchars)) + + class TestAstypeAPI: def test_astype_unitless_dt64_raises(self): # GH#47844 @@ -129,8 +139,8 @@ def test_astype_empty_constructor_equality(self, dtype): @pytest.mark.parametrize( "series", [ - Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), - Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]), + Series([string.digits * 10, rand_str(63), rand_str(64), rand_str(1000)]), + Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), ], ) def test_astype_str_map(self, dtype, series): @@ -382,7 +392,7 @@ def test_astype_unicode(self): # default encoding to utf-8 digits = string.digits test_series = [ - Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), + Series([digits * 10, rand_str(63), rand_str(64), rand_str(1000)]), Series(["データーサイエンス、お前はもう死んでいる"]), ] diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 802be634192a3..1d146771e6a42 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -6,11 +6,6 @@ import pandas._testing as tm -def test_rands(): - r = tm.rands(10) - assert len(r) == 10 - - def test_rands_array_1d(): arr = tm.rands_array(5, size=10) assert arr.shape == (10,) From 4ab42cc9f2f2b4b3eb7dec86afe189d068750f4c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:32:05 -0700 Subject: [PATCH 2/4] remove rands array --- asv_bench/benchmarks/array.py | 8 +++---- asv_bench/benchmarks/series_methods.py | 2 +- asv_bench/benchmarks/strings.py | 6 ++--- pandas/_testing/__init__.py | 17 ++++++++++++- pandas/_testing/_random.py | 25 -------------------- pandas/tests/arithmetic/test_numeric.py | 2 +- pandas/tests/frame/test_repr_info.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_rank.py | 4 ++-- pandas/tests/io/formats/test_format.py | 24 +++++++++---------- pandas/tests/io/pytables/test_round_trip.py | 4 +--- pandas/tests/reshape/merge/test_multi.py | 2 +- pandas/tests/series/indexing/test_getitem.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 2 +- pandas/tests/util/test_hashing.py | 2 +- pandas/tests/util/test_util.py | 12 ---------- 16 files changed, 44 insertions(+), 72 deletions(-) delete mode 100644 pandas/_testing/_random.py diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index ecd8c26ba6ca5..506f334d83d01 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -2,8 +2,6 @@ import pandas as pd -from .pandas_vb_common import tm - class BooleanArray: def setup(self): @@ -56,7 +54,7 @@ def time_from_tuples(self): class StringArray: def setup(self): N = 100_000 - values = tm.rands_array(3, N) + values = np.array(["a"] * N, dtype=object) self.values_obj = np.array(values, dtype="object") self.values_str = np.array(values, dtype="U") self.values_list = values.tolist() @@ -80,7 +78,7 @@ def setup(self, multiple_chunks): import pyarrow as pa except ImportError: raise NotImplementedError - strings = tm.rands_array(3, 10_000) + strings = np.array(["a"] * 10_000, dtype=object) if multiple_chunks: chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)] self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks)) @@ -127,7 +125,7 @@ def setup(self, dtype, hasna): elif dtype == "int64[pyarrow]": data = np.arange(N) elif dtype == "string[pyarrow]": - data = tm.rands_array(10, N) + data = np.array(["a"] * N, dtype=object) elif dtype == "timestamp[ns][pyarrow]": data = pd.date_range("2000-01-01", freq="s", periods=N) else: diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 492d075173e17..76cc803ecc2da 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -104,7 +104,7 @@ def setup(self, dtype): data = np.arange(N) na_value = NA elif dtype in ("string", "string[pyarrow]"): - data = tm.rands_array(5, N) + data = np.array(["a"] * N, dtype=object) na_value = NA else: raise NotImplementedError diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 9f1aeb7670628..1cc3939c8080b 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -34,9 +34,7 @@ class Construction: dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object} def setup(self, pd_type, dtype): - series_arr = tm.rands_array( - nchars=10, size=10**5, dtype=self.dtype_mapping[dtype] - ) + series_arr = np.array(["a"] * 10_000, dtype=self.dtype_mapping[dtype]) if pd_type == "series": self.arr = series_arr elif pd_type == "frame": @@ -276,7 +274,7 @@ def time_iter(self, dtype): class StringArrayConstruction: def setup(self): - self.series_arr = tm.rands_array(nchars=10, size=10**5) + self.series_arr = np.array(["a"] * 10**5, dtype=object) self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) def time_string_array_construction(self): diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 4ce0a3e7836e7..03bcbaaf2e25b 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -54,7 +54,6 @@ round_trip_pickle, write_to_compressed, ) -from pandas._testing._random import rands_array from pandas._testing._warnings import ( assert_produces_warning, maybe_produces_warning, @@ -346,6 +345,22 @@ def to_array(obj): # Others +def rands_array( + nchars, size: int, dtype: NpDtype = "O", replace: bool = True +) -> np.ndarray: + """ + Generate an array of byte strings. + """ + chars = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) + retval = ( + np.random.default_rng(2) + .choice(chars, size=nchars * np.prod(size), replace=replace) + .view((np.str_, nchars)) + .reshape(size) + ) + return retval.astype(dtype) + + def getCols(k) -> str: return string.ascii_uppercase[:k] diff --git a/pandas/_testing/_random.py b/pandas/_testing/_random.py deleted file mode 100644 index fabcd62739c69..0000000000000 --- a/pandas/_testing/_random.py +++ /dev/null @@ -1,25 +0,0 @@ -from __future__ import annotations - -import string -from typing import TYPE_CHECKING - -import numpy as np - -if TYPE_CHECKING: - from pandas._typing import NpDtype -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) - - -def rands_array( - nchars, size: int, dtype: NpDtype = "O", replace: bool = True -) -> np.ndarray: - """ - Generate an array of byte strings. - """ - retval = ( - np.random.default_rng(2) - .choice(RANDS_CHARS, size=nchars * np.prod(size), replace=replace) - .view((np.str_, nchars)) - .reshape(size) - ) - return retval.astype(dtype) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 42fa03b38f6ff..7f0996da2e2f2 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -881,7 +881,7 @@ def test_add_frames(self, first, second, expected): # TODO: This came from series.test.test_operators, needs cleanup def test_series_frame_radd_bug(self, fixed_now_ts): # GH#353 - vals = Series(tm.rands_array(5, 10)) + vals = Series(tm.makeStringIndex()) result = "foo_" + vals expected = vals.map(lambda x: "foo_" + x) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 0c9e5e01fa644..49375658abfee 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -265,7 +265,7 @@ def test_str_to_bytes_raises(self): def test_very_wide_info_repr(self): df = DataFrame( np.random.default_rng(2).standard_normal((10, 20)), - columns=tm.rands_array(10, 20), + columns=np.array(["a" * 10] * 20, dtype=object), ) repr(df) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fc0efb74a9b62..09d1814e1b0a1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1361,7 +1361,7 @@ def test_cython_grouper_series_bug_noncontig(): def test_series_grouper_noncontig_index(): - index = Index(tm.rands_array(10, 100)) + index = Index(["a" * 10] * 100) values = Series(np.random.default_rng(2).standard_normal(50), index=index[::2]) labels = np.random.default_rng(2).integers(0, 5, 50) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 41bfa121624ea..26881bdd18274 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -31,8 +31,8 @@ def test_rank_unordered_categorical_typeerror(): def test_rank_apply(): - lev1 = tm.rands_array(10, 100) - lev2 = tm.rands_array(10, 130) + lev1 = np.array(["a" * 10] * 100, dtype=object) + lev2 = np.array(["b" * 10] * 130, dtype=object) lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int) lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 46892af86ec49..592b8d206fa30 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1176,7 +1176,7 @@ def test_wide_repr(self): 20, ): max_cols = get_option("display.max_columns") - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) with option_context("display.expand_frame_repr", False): rep_str = repr(df) @@ -1202,7 +1202,7 @@ def test_wide_repr_wide_columns(self): def test_wide_repr_named(self): with option_context("mode.sim_interactive", True, "display.max_columns", 20): max_cols = get_option("display.max_columns") - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) df.index.name = "DataFrame Index" with option_context("display.expand_frame_repr", False): rep_str = repr(df) @@ -1219,9 +1219,9 @@ def test_wide_repr_named(self): def test_wide_repr_multiindex(self): with option_context("mode.sim_interactive", True, "display.max_columns", 20): - midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) + midx = MultiIndex.from_arrays([["a" * 5] * 10] * 2) max_cols = get_option("display.max_columns") - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), index=midx) + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10, index=midx) df.index.names = ["Level 0", "Level 1"] with option_context("display.expand_frame_repr", False): rep_str = repr(df) @@ -1239,10 +1239,10 @@ def test_wide_repr_multiindex(self): def test_wide_repr_multiindex_cols(self): with option_context("mode.sim_interactive", True, "display.max_columns", 20): max_cols = get_option("display.max_columns") - midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) - mcols = MultiIndex.from_arrays(tm.rands_array(3, size=(2, max_cols - 1))) + midx = MultiIndex.from_arrays([["a" * 5] * 10] * 2) + mcols = MultiIndex.from_arrays([["b" * 3] * (max_cols - 1)] * 2) df = DataFrame( - tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols + [["c" * 25] * (max_cols - 1)] * 10, index=midx, columns=mcols ) df.index.names = ["Level 0", "Level 1"] with option_context("display.expand_frame_repr", False): @@ -1258,7 +1258,7 @@ def test_wide_repr_multiindex_cols(self): def test_wide_repr_unicode(self): with option_context("mode.sim_interactive", True, "display.max_columns", 20): max_cols = 20 - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + df = DataFrame([["a" * 25] * 10] * (max_cols - 1)) with option_context("display.expand_frame_repr", False): rep_str = repr(df) with option_context("display.expand_frame_repr", True): @@ -1896,11 +1896,11 @@ def test_repr_html_mathjax(self): def test_repr_html_wide(self): max_cols = 20 - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." not in df._repr_html_() - wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1))) + wide_df = DataFrame([["a" * 25] * (max_cols + 1)] * 10) with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." in wide_df._repr_html_() @@ -1910,14 +1910,14 @@ def test_repr_html_wide_multiindex_cols(self): mcols = MultiIndex.from_product( [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"] ) - df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) + df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols) reg_repr = df._repr_html_() assert "..." not in reg_repr mcols = MultiIndex.from_product( (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"] ) - df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) + df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols) with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." in df._repr_html_() diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 84c8c0a314342..8ffdc421492a5 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -54,9 +54,7 @@ def roundtrip(key, obj, **kwargs): def test_long_strings(setup_path): # GH6166 - df = DataFrame( - {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) - ) + df = DataFrame({"a": tm.makeStringIndex(10)}, index=tm.makeStringIndex(10)) with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=["a"]) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index b43275f3ce4af..088d1e7e3c85e 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -193,7 +193,7 @@ def test_merge_multiple_cols_with_mixed_cols_index(self): def test_compress_group_combinations(self): # ~ 40000000 possible unique groups - key1 = tm.rands_array(10, 10000) + key1 = tm.makeStringIndex(10000) key1 = np.tile(key1, 2) key2 = key1[::-1] diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 93ccc336468ea..458988491aae8 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -69,7 +69,7 @@ def test_getitem_unrecognized_scalar(self): assert result == 2 def test_getitem_negative_out_of_bounds(self): - ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) + ser = Series(["a"] * 10, index=["a"] * 10) msg = "index -11 is out of bounds for axis 0 with size 10" warn_msg = "Series.__getitem__ treating keys as positions is deprecated" diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index b40e4276ccfe7..f1e66212c131a 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -173,7 +173,7 @@ def test_object_series_setitem_dt64array_exact_match(self): class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): - ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) + ser = Series(["a"] * 10, index=["a"] * 10) msg = "index -11 is out of bounds for axis 0 with size 10" warn_msg = "Series.__setitem__ treating keys as positions is deprecated" diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index a23d0c1c13e09..e78b042a09231 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -328,7 +328,7 @@ def test_alternate_encoding(index): @pytest.mark.parametrize("l_add", [0, 1]) def test_same_len_hash_collisions(l_exp, l_add): length = 2 ** (l_exp + 8) + l_add - s = tm.rands_array(length, 2) + s = tm.makeStringIndex(length).to_numpy() result = hash_array(s, "utf8") assert not result[0] == result[1] diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 1d146771e6a42..5718480fdec5e 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -6,18 +6,6 @@ import pandas._testing as tm -def test_rands_array_1d(): - arr = tm.rands_array(5, size=10) - assert arr.shape == (10,) - assert len(arr[0]) == 5 - - -def test_rands_array_2d(): - arr = tm.rands_array(7, size=(10, 10)) - assert arr.shape == (10, 10) - assert len(arr[1, 1]) == 7 - - def test_numpy_err_state_is_default(): expected = {"over": "warn", "divide": "warn", "invalid": "warn", "under": "ignore"} import numpy as np From 1b250b44926de41105d260eba605fb97bbd0aa7e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:22:52 -0700 Subject: [PATCH 3/4] Address failures --- asv_bench/benchmarks/strings.py | 4 ++-- pandas/_testing/__init__.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 1cc3939c8080b..712d2afb81d08 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -34,7 +34,7 @@ class Construction: dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object} def setup(self, pd_type, dtype): - series_arr = np.array(["a"] * 10_000, dtype=self.dtype_mapping[dtype]) + series_arr = np.array(["a" * 10] * 100_000, dtype=self.dtype_mapping[dtype]) if pd_type == "series": self.arr = series_arr elif pd_type == "frame": @@ -274,7 +274,7 @@ def time_iter(self, dtype): class StringArrayConstruction: def setup(self): - self.series_arr = np.array(["a"] * 10**5, dtype=object) + self.series_arr = np.array(["a" * 10] * 10**5, dtype=object) self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) def time_string_array_construction(self): diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 03bcbaaf2e25b..483c5ad59872f 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -1139,7 +1139,6 @@ def shares_memory(left, right) -> bool: "NULL_OBJECTS", "OBJECT_DTYPES", "raise_assert_detail", - "rands", "reset_display_options", "raises_chained_assignment_error", "round_trip_localpath", From e360fea192c7c22934b0617a13a6352fae264ccb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:00:11 -0700 Subject: [PATCH 4/4] Use unique values --- asv_bench/benchmarks/array.py | 6 +++--- asv_bench/benchmarks/series_methods.py | 2 +- asv_bench/benchmarks/strings.py | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 506f334d83d01..09c4acc0ab309 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -54,7 +54,7 @@ def time_from_tuples(self): class StringArray: def setup(self): N = 100_000 - values = np.array(["a"] * N, dtype=object) + values = np.array([str(i) for i in range(N)], dtype=object) self.values_obj = np.array(values, dtype="object") self.values_str = np.array(values, dtype="U") self.values_list = values.tolist() @@ -78,7 +78,7 @@ def setup(self, multiple_chunks): import pyarrow as pa except ImportError: raise NotImplementedError - strings = np.array(["a"] * 10_000, dtype=object) + strings = np.array([str(i) for i in range(10_000)], dtype=object) if multiple_chunks: chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)] self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks)) @@ -125,7 +125,7 @@ def setup(self, dtype, hasna): elif dtype == "int64[pyarrow]": data = np.arange(N) elif dtype == "string[pyarrow]": - data = np.array(["a"] * N, dtype=object) + data = np.array([str(i) for i in range(N)], dtype=object) elif dtype == "timestamp[ns][pyarrow]": data = pd.date_range("2000-01-01", freq="s", periods=N) else: diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 76cc803ecc2da..288369145576e 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -104,7 +104,7 @@ def setup(self, dtype): data = np.arange(N) na_value = NA elif dtype in ("string", "string[pyarrow]"): - data = np.array(["a"] * N, dtype=object) + data = np.array([str(i) * 5 for i in range(N)], dtype=object) na_value = NA else: raise NotImplementedError diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 712d2afb81d08..d70d9d0aa5227 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -34,7 +34,9 @@ class Construction: dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object} def setup(self, pd_type, dtype): - series_arr = np.array(["a" * 10] * 100_000, dtype=self.dtype_mapping[dtype]) + series_arr = np.array( + [str(i) * 10 for i in range(100_000)], dtype=self.dtype_mapping[dtype] + ) if pd_type == "series": self.arr = series_arr elif pd_type == "frame": @@ -274,7 +276,7 @@ def time_iter(self, dtype): class StringArrayConstruction: def setup(self): - self.series_arr = np.array(["a" * 10] * 10**5, dtype=object) + self.series_arr = np.array([str(i) * 10 for i in range(10**5)], dtype=object) self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) def time_string_array_construction(self):