Skip to content

Commit 8336e36

Browse files
committed
Merge pull request pandas-dev#8575 from immerrr/optimize-makestringindex
TST/PERF: optimize tm.makeStringIndex
2 parents 778cfe4 + cf599d9 commit 8336e36

21 files changed, with 134 additions and 105 deletions

doc/source/merging.rst

+2-2
Original file line number | Diff line number | Diff line change
@@ -130,9 +130,9 @@ behavior:
130130

131131
.. ipython:: python
132132
133-
from pandas.util.testing import rands
133+
from pandas.util.testing import rands_array
134134
df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'],
135-
index=[rands(5) for _ in range(10)])
135+
index=rands_array(5, 10))
136136
df
137137
138138
concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']],

pandas/core/common.py

-5
Original file line number | Diff line number | Diff line change
@@ -2110,11 +2110,6 @@ def _count_not_none(*args):
21102110
# miscellaneous python tools
21112111

21122112

2113-
def rands(n):
2114-
"""Generates a random alphanumeric string of length *n*"""
2115-
from random import Random
2116-
import string
2117-
return ''.join(Random().sample(string.ascii_letters + string.digits, n))
21182113

21192114

21202115
def adjoin(space, *lists):

pandas/io/tests/test_pytables.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -198,8 +198,8 @@ def test_long_strings(self):
198198
# GH6166
199199
# unconversion of long strings was being chopped in earlier
200200
# versions of numpy < 1.7.2
201-
df = DataFrame({'a': [tm.rands(100) for _ in range(10)]},
202-
index=[tm.rands(100) for _ in range(10)])
201+
df = DataFrame({'a': tm.rands_array(100, size=10)},
202+
index=tm.rands_array(100, size=10))
203203

204204
with ensure_clean_store(self.path) as store:
205205
store.append('df', df, data_columns=['a'])

pandas/tests/test_common.py

-5
Original file line number | Diff line number | Diff line change
@@ -274,11 +274,6 @@ def test_repr_binary_type():
274274
assert_equal(res, b)
275275

276276

277-
def test_rands():
278-
r = com.rands(10)
279-
assert(len(r) == 10)
280-
281-
282277
def test_adjoin():
283278
data = [['a', 'b', 'c'],
284279
['dd', 'ee', 'ff'],

pandas/tests/test_format.py

+23-27
Original file line number | Diff line number | Diff line change
@@ -1201,9 +1201,8 @@ def test_pprint_thing(self):
12011201

12021202
def test_wide_repr(self):
12031203
with option_context('mode.sim_interactive', True, 'display.show_dimensions', True):
1204-
col = lambda l, k: [tm.rands(k) for _ in range(l)]
12051204
max_cols = get_option('display.max_columns')
1206-
df = DataFrame([col(max_cols - 1, 25) for _ in range(10)])
1205+
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
12071206
set_option('display.expand_frame_repr', False)
12081207
rep_str = repr(df)
12091208

@@ -1227,9 +1226,8 @@ def test_wide_repr_wide_columns(self):
12271226

12281227
def test_wide_repr_named(self):
12291228
with option_context('mode.sim_interactive', True):
1230-
col = lambda l, k: [tm.rands(k) for _ in range(l)]
12311229
max_cols = get_option('display.max_columns')
1232-
df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
1230+
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
12331231
df.index.name = 'DataFrame Index'
12341232
set_option('display.expand_frame_repr', False)
12351233

@@ -1249,11 +1247,10 @@ def test_wide_repr_named(self):
12491247

12501248
def test_wide_repr_multiindex(self):
12511249
with option_context('mode.sim_interactive', True):
1252-
col = lambda l, k: [tm.rands(k) for _ in range(l)]
1253-
midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
1254-
np.array(col(10, 5))])
1250+
midx = pandas.MultiIndex.from_arrays(
1251+
tm.rands_array(5, size=(2, 10)))
12551252
max_cols = get_option('display.max_columns')
1256-
df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
1253+
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)),
12571254
index=midx)
12581255
df.index.names = ['Level 0', 'Level 1']
12591256
set_option('display.expand_frame_repr', False)
@@ -1274,12 +1271,11 @@ def test_wide_repr_multiindex(self):
12741271
def test_wide_repr_multiindex_cols(self):
12751272
with option_context('mode.sim_interactive', True):
12761273
max_cols = get_option('display.max_columns')
1277-
col = lambda l, k: [tm.rands(k) for _ in range(l)]
1278-
midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
1279-
np.array(col(10, 5))])
1280-
mcols = pandas.MultiIndex.from_arrays([np.array(col(max_cols-1, 3)),
1281-
np.array(col(max_cols-1, 3))])
1282-
df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
1274+
midx = pandas.MultiIndex.from_arrays(
1275+
tm.rands_array(5, size=(2, 10)))
1276+
mcols = pandas.MultiIndex.from_arrays(
1277+
tm.rands_array(3, size=(2, max_cols - 1)))
1278+
df = DataFrame(tm.rands_array(25, (10, max_cols - 1)),
12831279
index=midx, columns=mcols)
12841280
df.index.names = ['Level 0', 'Level 1']
12851281
set_option('display.expand_frame_repr', False)
@@ -1296,9 +1292,8 @@ def test_wide_repr_multiindex_cols(self):
12961292

12971293
def test_wide_repr_unicode(self):
12981294
with option_context('mode.sim_interactive', True):
1299-
col = lambda l, k: [tm.randu(k) for _ in range(l)]
13001295
max_cols = get_option('display.max_columns')
1301-
df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
1296+
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
13021297
set_option('display.expand_frame_repr', False)
13031298
rep_str = repr(df)
13041299
set_option('display.expand_frame_repr', True)
@@ -1877,30 +1872,31 @@ def test_repr_html(self):
18771872
self.reset_display_options()
18781873

18791874
def test_repr_html_wide(self):
1880-
row = lambda l, k: [tm.rands(k) for _ in range(l)]
18811875
max_cols = get_option('display.max_columns')
1882-
df = DataFrame([row(max_cols-1, 25) for _ in range(10)])
1876+
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
18831877
reg_repr = df._repr_html_()
18841878
assert "..." not in reg_repr
18851879

1886-
wide_df = DataFrame([row(max_cols+1, 25) for _ in range(10)])
1880+
wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1)))
18871881
wide_repr = wide_df._repr_html_()
18881882
assert "..." in wide_repr
18891883

18901884
def test_repr_html_wide_multiindex_cols(self):
1891-
row = lambda l, k: [tm.rands(k) for _ in range(l)]
18921885
max_cols = get_option('display.max_columns')
18931886

1894-
tuples = list(itertools.product(np.arange(max_cols//2), ['foo', 'bar']))
1895-
mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
1896-
df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
1887+
mcols = pandas.MultiIndex.from_product([np.arange(max_cols//2),
1888+
['foo', 'bar']],
1889+
names=['first', 'second'])
1890+
df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
1891+
columns=mcols)
18971892
reg_repr = df._repr_html_()
18981893
assert '...' not in reg_repr
18991894

1900-
1901-
tuples = list(itertools.product(np.arange(1+(max_cols//2)), ['foo', 'bar']))
1902-
mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
1903-
df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
1895+
mcols = pandas.MultiIndex.from_product((np.arange(1+(max_cols//2)),
1896+
['foo', 'bar']),
1897+
names=['first', 'second'])
1898+
df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
1899+
columns=mcols)
19041900
wide_repr = df._repr_html_()
19051901
assert '...' in wide_repr
19061902

pandas/tests/test_frame.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -4734,7 +4734,7 @@ def test_bytestring_with_unicode(self):
47344734

47354735
def test_very_wide_info_repr(self):
47364736
df = DataFrame(np.random.randn(10, 20),
4737-
columns=[tm.rands(10) for _ in range(20)])
4737+
columns=tm.rands_array(10, 20))
47384738
repr(df)
47394739

47404740
def test_repr_column_name_unicode_truncation_bug(self):

pandas/tests/test_groupby.py

+3-4
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88

99
from pandas import date_range,bdate_range, Timestamp
1010
from pandas.core.index import Index, MultiIndex, Int64Index
11-
from pandas.core.common import rands
1211
from pandas.core.api import Categorical, DataFrame
1312
from pandas.core.groupby import (SpecificationError, DataError,
1413
_nargsort, _lexsort_indexer)
@@ -2579,7 +2578,7 @@ def test_cython_grouper_series_bug_noncontig(self):
25792578
self.assertTrue(result.isnull().all())
25802579

25812580
def test_series_grouper_noncontig_index(self):
2582-
index = Index([tm.rands(10) for _ in range(100)])
2581+
index = Index(tm.rands_array(10, 100))
25832582

25842583
values = Series(np.random.randn(50), index=index[::2])
25852584
labels = np.random.randint(0, 5, 50)
@@ -2869,8 +2868,8 @@ def test_column_select_via_attr(self):
28692868
assert_frame_equal(result, expected)
28702869

28712870
def test_rank_apply(self):
2872-
lev1 = np.array([rands(10) for _ in range(100)], dtype=object)
2873-
lev2 = np.array([rands(10) for _ in range(130)], dtype=object)
2871+
lev1 = tm.rands_array(10, 100)
2872+
lev2 = tm.rands_array(10, 130)
28742873
lab1 = np.random.randint(0, 100, size=500)
28752874
lab2 = np.random.randint(0, 130, size=500)
28762875

pandas/tests/test_series.py

+2-4
Original file line number | Diff line number | Diff line change
@@ -327,8 +327,7 @@ def test_getitem_setitem_ellipsis(self):
327327
self.assertTrue((result == 5).all())
328328

329329
def test_getitem_negative_out_of_bounds(self):
330-
s = Series([tm.rands(5) for _ in range(10)],
331-
index=[tm.rands(10) for _ in range(10)])
330+
s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
332331

333332
self.assertRaises(IndexError, s.__getitem__, -11)
334333
self.assertRaises(IndexError, s.__setitem__, -11, 'foo')
@@ -3852,11 +3851,10 @@ def _check_op(arr, op):
38523851
_check_op(arr, operator.floordiv)
38533852

38543853
def test_series_frame_radd_bug(self):
3855-
from pandas.util.testing import rands
38563854
import operator
38573855

38583856
# GH 353
3859-
vals = Series([rands(5) for _ in range(10)])
3857+
vals = Series(tm.rands_array(5, 10))
38603858
result = 'foo_' + vals
38613859
expected = vals.map(lambda x: 'foo_' + x)
38623860
assert_series_equal(result, expected)

pandas/tests/test_util.py

+16
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,22 @@ def test_bad_deprecate_kwarg(self):
5959
def f4(new=None):
6060
pass
6161

62+
63+
def test_rands():
64+
r = tm.rands(10)
65+
assert(len(r) == 10)
66+
67+
68+
def test_rands_array():
69+
arr = tm.rands_array(5, size=10)
70+
assert(arr.shape == (10,))
71+
assert(len(arr[0]) == 5)
72+
73+
arr = tm.rands_array(7, size=(10, 10))
74+
assert(arr.shape == (10, 10))
75+
assert(len(arr[1, 1]) == 7)
76+
77+
6278
if __name__ == '__main__':
6379
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
6480
exit=False)

pandas/tools/tests/test_merge.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
from pandas.tseries.index import DatetimeIndex
1515
from pandas.tools.merge import merge, concat, ordered_merge, MergeError
1616
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
17-
assert_almost_equal, rands,
17+
assert_almost_equal,
1818
makeCustomDataframe as mkdf,
1919
assertRaisesRegexp)
2020
from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
@@ -913,7 +913,7 @@ def test_merge_right_vs_left(self):
913913
def test_compress_group_combinations(self):
914914

915915
# ~ 40000000 possible unique groups
916-
key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
916+
key1 = tm.rands_array(10, 10000)
917917
key1 = np.tile(key1, 2)
918918
key2 = key1[::-1]
919919

pandas/util/testing.py

+45-9
Original file line number | Diff line number | Diff line change
@@ -193,15 +193,50 @@ def randbool(size=(), p=0.5):
193193
return rand(*size) <= p
194194

195195

196-
def rands(n):
197-
choices = string.ascii_letters + string.digits
198-
return ''.join(random.choice(choices) for _ in range(n))
196+
RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
197+
dtype=(np.str_, 1))
198+
RANDU_CHARS = np.array(list(u("").join(map(unichr, lrange(1488, 1488 + 26))) +
199+
string.digits), dtype=(np.unicode_, 1))
200+
201+
202+
def rands_array(nchars, size, dtype='O'):
203+
"""Generate an array of byte strings."""
204+
retval = (choice(RANDS_CHARS, size=nchars * np.prod(size))
205+
.view((np.str_, nchars)).reshape(size))
206+
if dtype is None:
207+
return retval
208+
else:
209+
return retval.astype(dtype)
210+
211+
212+
def randu_array(nchars, size, dtype='O'):
213+
"""Generate an array of unicode strings."""
214+
retval = (choice(RANDU_CHARS, size=nchars * np.prod(size))
215+
.view((np.unicode_, nchars)).reshape(size))
216+
if dtype is None:
217+
return retval
218+
else:
219+
return retval.astype(dtype)
199220

200221

201-
def randu(n):
202-
choices = u("").join(map(unichr, lrange(1488, 1488 + 26)))
203-
choices += string.digits
204-
return ''.join([random.choice(choices) for _ in range(n)])
222+
def rands(nchars):
223+
"""
224+
Generate one random byte string.
225+
226+
See `rands_array` if you want to create an array of random strings.
227+
228+
"""
229+
return ''.join(choice(RANDS_CHARS, nchars))
230+
231+
232+
def randu(nchars):
233+
"""
234+
Generate one random unicode string.
235+
236+
See `randu_array` if you want to create an array of random unicode strings.
237+
238+
"""
239+
return ''.join(choice(RANDU_CHARS, nchars))
205240

206241

207242
def choice(x, size=10):
@@ -743,10 +778,11 @@ def getArangeMat():
743778

744779
# make index
745780
def makeStringIndex(k=10):
746-
return Index([rands(10) for _ in range(k)])
781+
return Index(rands_array(nchars=10, size=k))
782+
747783

748784
def makeUnicodeIndex(k=10):
749-
return Index([randu(10) for _ in range(k)])
785+
return Index(randu_array(nchars=10, size=k))
750786

751787
def makeBoolIndex(k=10):
752788
if k == 1:

vb_suite/frame_ctor.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,8 @@
1717

1818
setup = common_setup + """
1919
N, K = 5000, 50
20-
index = [rands(10) for _ in xrange(N)]
21-
columns = [rands(10) for _ in xrange(K)]
20+
index = tm.makeStringIndex(N)
21+
columns = tm.makeStringIndex(K)
2222
frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
2323
2424
try:

vb_suite/groupby.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,7 @@ def f():
187187
setup = common_setup + """
188188
K = 1000
189189
N = 100000
190-
uniques = np.array([rands(10) for x in xrange(K)], dtype='O')
190+
uniques = tm.makeStringIndex(K).values
191191
s = Series(np.tile(uniques, N // K))
192192
"""
193193

0 commit comments

Comments
 (0)