Skip to content

TST/PERF: optimize tm.makeStringIndex #8575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 20, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@ behavior:

.. ipython:: python

from pandas.util.testing import rands
from pandas.util.testing import rands_array
df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'],
index=[rands(5) for _ in range(10)])
index=rands_array(5, 10))
df

concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']],
Expand Down
5 changes: 0 additions & 5 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2110,11 +2110,6 @@ def _count_not_none(*args):
# miscellaneous python tools


def rands(n):
    """Generate a random alphanumeric string of length *n*.

    Each character is drawn independently (with replacement) from the ASCII
    letters and digits, so a character may repeat and any non-negative
    length is supported.

    Parameters
    ----------
    n : int
        Number of characters to generate.

    Returns
    -------
    str
        A string of exactly ``n`` characters.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    from random import Random
    import string

    if n < 0:
        # Random.sample rejected negative lengths with ValueError; keep that.
        raise ValueError("n must be non-negative")

    alphabet = string.ascii_letters + string.digits
    # Random.sample picks *without* replacement: it could never repeat a
    # character and raised ValueError as soon as n exceeded the 62-character
    # alphabet. Choosing per position has neither limitation.
    pick = Random().choice
    return ''.join(pick(alphabet) for _ in range(n))


def adjoin(space, *lists):
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ def test_long_strings(self):
# GH6166
# unconversion of long strings was being chopped in earlier
# versions of numpy < 1.7.2
df = DataFrame({'a': [tm.rands(100) for _ in range(10)]},
index=[tm.rands(100) for _ in range(10)])
df = DataFrame({'a': tm.rands_array(100, size=10)},
index=tm.rands_array(100, size=10))

with ensure_clean_store(self.path) as store:
store.append('df', df, data_columns=['a'])
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,6 @@ def test_repr_binary_type():
assert_equal(res, b)


def test_rands():
    # com.rands must hand back exactly as many characters as requested.
    requested = 10
    generated = com.rands(requested)
    assert len(generated) == requested


def test_adjoin():
data = [['a', 'b', 'c'],
['dd', 'ee', 'ff'],
Expand Down
50 changes: 23 additions & 27 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,9 +1201,8 @@ def test_pprint_thing(self):

def test_wide_repr(self):
with option_context('mode.sim_interactive', True, 'display.show_dimensions', True):
col = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols - 1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
set_option('display.expand_frame_repr', False)
rep_str = repr(df)

Expand All @@ -1227,9 +1226,8 @@ def test_wide_repr_wide_columns(self):

def test_wide_repr_named(self):
with option_context('mode.sim_interactive', True):
col = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
df.index.name = 'DataFrame Index'
set_option('display.expand_frame_repr', False)

Expand All @@ -1249,11 +1247,10 @@ def test_wide_repr_named(self):

def test_wide_repr_multiindex(self):
with option_context('mode.sim_interactive', True):
col = lambda l, k: [tm.rands(k) for _ in range(l)]
midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
np.array(col(10, 5))])
midx = pandas.MultiIndex.from_arrays(
tm.rands_array(5, size=(2, 10)))
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)),
index=midx)
df.index.names = ['Level 0', 'Level 1']
set_option('display.expand_frame_repr', False)
Expand All @@ -1274,12 +1271,11 @@ def test_wide_repr_multiindex(self):
def test_wide_repr_multiindex_cols(self):
with option_context('mode.sim_interactive', True):
max_cols = get_option('display.max_columns')
col = lambda l, k: [tm.rands(k) for _ in range(l)]
midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
np.array(col(10, 5))])
mcols = pandas.MultiIndex.from_arrays([np.array(col(max_cols-1, 3)),
np.array(col(max_cols-1, 3))])
df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
midx = pandas.MultiIndex.from_arrays(
tm.rands_array(5, size=(2, 10)))
mcols = pandas.MultiIndex.from_arrays(
tm.rands_array(3, size=(2, max_cols - 1)))
df = DataFrame(tm.rands_array(25, (10, max_cols - 1)),
index=midx, columns=mcols)
df.index.names = ['Level 0', 'Level 1']
set_option('display.expand_frame_repr', False)
Expand All @@ -1296,9 +1292,8 @@ def test_wide_repr_multiindex_cols(self):

def test_wide_repr_unicode(self):
with option_context('mode.sim_interactive', True):
col = lambda l, k: [tm.randu(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
set_option('display.expand_frame_repr', False)
rep_str = repr(df)
set_option('display.expand_frame_repr', True)
Expand Down Expand Up @@ -1877,30 +1872,31 @@ def test_repr_html(self):
self.reset_display_options()

def test_repr_html_wide(self):
row = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([row(max_cols-1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
reg_repr = df._repr_html_()
assert "..." not in reg_repr

wide_df = DataFrame([row(max_cols+1, 25) for _ in range(10)])
wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1)))
wide_repr = wide_df._repr_html_()
assert "..." in wide_repr

def test_repr_html_wide_multiindex_cols(self):
row = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')

tuples = list(itertools.product(np.arange(max_cols//2), ['foo', 'bar']))
mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
mcols = pandas.MultiIndex.from_product([np.arange(max_cols//2),
['foo', 'bar']],
names=['first', 'second'])
df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
columns=mcols)
reg_repr = df._repr_html_()
assert '...' not in reg_repr


tuples = list(itertools.product(np.arange(1+(max_cols//2)), ['foo', 'bar']))
mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
mcols = pandas.MultiIndex.from_product((np.arange(1+(max_cols//2)),
['foo', 'bar']),
names=['first', 'second'])
df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
columns=mcols)
wide_repr = df._repr_html_()
assert '...' in wide_repr

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4734,7 +4734,7 @@ def test_bytestring_with_unicode(self):

def test_very_wide_info_repr(self):
df = DataFrame(np.random.randn(10, 20),
columns=[tm.rands(10) for _ in range(20)])
columns=tm.rands_array(10, 20))
repr(df)

def test_repr_column_name_unicode_truncation_bug(self):
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from pandas import date_range,bdate_range, Timestamp
from pandas.core.index import Index, MultiIndex, Int64Index
from pandas.core.common import rands
from pandas.core.api import Categorical, DataFrame
from pandas.core.groupby import (SpecificationError, DataError,
_nargsort, _lexsort_indexer)
Expand Down Expand Up @@ -2579,7 +2578,7 @@ def test_cython_grouper_series_bug_noncontig(self):
self.assertTrue(result.isnull().all())

def test_series_grouper_noncontig_index(self):
index = Index([tm.rands(10) for _ in range(100)])
index = Index(tm.rands_array(10, 100))

values = Series(np.random.randn(50), index=index[::2])
labels = np.random.randint(0, 5, 50)
Expand Down Expand Up @@ -2869,8 +2868,8 @@ def test_column_select_via_attr(self):
assert_frame_equal(result, expected)

def test_rank_apply(self):
lev1 = np.array([rands(10) for _ in range(100)], dtype=object)
lev2 = np.array([rands(10) for _ in range(130)], dtype=object)
lev1 = tm.rands_array(10, 100)
lev2 = tm.rands_array(10, 130)
lab1 = np.random.randint(0, 100, size=500)
lab2 = np.random.randint(0, 130, size=500)

Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,7 @@ def test_getitem_setitem_ellipsis(self):
self.assertTrue((result == 5).all())

def test_getitem_negative_out_of_bounds(self):
s = Series([tm.rands(5) for _ in range(10)],
index=[tm.rands(10) for _ in range(10)])
s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))

self.assertRaises(IndexError, s.__getitem__, -11)
self.assertRaises(IndexError, s.__setitem__, -11, 'foo')
Expand Down Expand Up @@ -3852,11 +3851,10 @@ def _check_op(arr, op):
_check_op(arr, operator.floordiv)

def test_series_frame_radd_bug(self):
from pandas.util.testing import rands
import operator

# GH 353
vals = Series([rands(5) for _ in range(10)])
vals = Series(tm.rands_array(5, 10))
result = 'foo_' + vals
expected = vals.map(lambda x: 'foo_' + x)
assert_series_equal(result, expected)
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,22 @@ def test_bad_deprecate_kwarg(self):
def f4(new=None):
pass


def test_rands():
    # tm.rands must hand back exactly as many characters as requested.
    requested = 10
    generated = tm.rands(requested)
    assert len(generated) == requested


def test_rands_array():
    # Scalar size: a 1-D array of ten strings, five characters each.
    flat = tm.rands_array(5, size=10)
    assert flat.shape == (10,)
    assert len(flat[0]) == 5

    # Tuple size: a 10x10 grid of seven-character strings.
    grid = tm.rands_array(7, size=(10, 10))
    assert grid.shape == (10, 10)
    assert len(grid[1, 1]) == 7


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
4 changes: 2 additions & 2 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.tseries.index import DatetimeIndex
from pandas.tools.merge import merge, concat, ordered_merge, MergeError
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
assert_almost_equal, rands,
assert_almost_equal,
makeCustomDataframe as mkdf,
assertRaisesRegexp)
from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
Expand Down Expand Up @@ -913,7 +913,7 @@ def test_merge_right_vs_left(self):
def test_compress_group_combinations(self):

# ~ 40000000 possible unique groups
key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
key1 = tm.rands_array(10, 10000)
key1 = np.tile(key1, 2)
key2 = key1[::-1]

Expand Down
54 changes: 45 additions & 9 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,50 @@ def randbool(size=(), p=0.5):
return rand(*size) <= p


def rands(n):
    """Return a random string of *n* ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    # One independent draw per output position.
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)
# Alphabet used by rands / rands_array: one array element per character,
# holding string.ascii_letters followed by string.digits, stored with a
# length-1 np.str_ dtype.
RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
dtype=(np.str_, 1))
# Alphabet used by randu / randu_array: characters built from the 26
# consecutive code points starting at 1488 (U+05D0, the Hebrew letter range)
# followed by string.digits, stored with a length-1 np.unicode_ dtype.
# NOTE(review): u, unichr and lrange are not defined in this view; presumably
# the pandas.compat py2/py3 helpers (lrange ~ list(range(...))) -- confirm.
RANDU_CHARS = np.array(list(u("").join(map(unichr, lrange(1488, 1488 + 26))) +
string.digits), dtype=(np.unicode_, 1))


def rands_array(nchars, size, dtype='O'):
    """
    Build an array of random strings drawn from RANDS_CHARS.

    ``nchars * np.prod(size)`` single characters are drawn with `choice`,
    reinterpreted as fixed-width strings of ``nchars`` characters and
    reshaped to ``size``.  The result is cast with ``astype(dtype)`` unless
    ``dtype`` is None, in which case the fixed-width array is returned as is.
    """
    total_chars = nchars * np.prod(size)
    flat_chars = choice(RANDS_CHARS, size=total_chars)
    # Glue every run of `nchars` single characters into one string element.
    strings = flat_chars.view((np.str_, nchars)).reshape(size)
    if dtype is None:
        return strings
    return strings.astype(dtype)


def randu_array(nchars, size, dtype='O'):
    """
    Build an array of random unicode strings drawn from RANDU_CHARS.

    ``nchars * np.prod(size)`` single characters are drawn with `choice`,
    reinterpreted as fixed-width unicode strings of ``nchars`` characters and
    reshaped to ``size``.  The result is cast with ``astype(dtype)`` unless
    ``dtype`` is None, in which case the fixed-width array is returned as is.
    """
    total_chars = nchars * np.prod(size)
    flat_chars = choice(RANDU_CHARS, size=total_chars)
    # Glue every run of `nchars` single characters into one string element.
    strings = flat_chars.view((np.unicode_, nchars)).reshape(size)
    if dtype is None:
        return strings
    return strings.astype(dtype)


def randu(n):
    """Return a random string of *n* characters from the unicode alphabet."""
    # 26 consecutive code points starting at 1488, then the ASCII digits.
    letters = u("").join(map(unichr, lrange(1488, 1488 + 26)))
    alphabet = letters + string.digits
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)
def rands(nchars):
    """
    Build a single random string of `nchars` characters.

    The characters are drawn from RANDS_CHARS with `choice` and joined.
    For a whole array of random strings use `rands_array` instead.

    """
    picked = choice(RANDS_CHARS, nchars)
    return ''.join(picked)


def randu(nchars):
    """
    Build a single random unicode string of `nchars` characters.

    The characters are drawn from RANDU_CHARS with `choice` and joined.
    For a whole array of random unicode strings use `randu_array` instead.

    """
    picked = choice(RANDU_CHARS, nchars)
    return ''.join(picked)


def choice(x, size=10):
Expand Down Expand Up @@ -743,10 +778,11 @@ def getArangeMat():

# make index
def makeStringIndex(k=10):
return Index([rands(10) for _ in range(k)])
return Index(rands_array(nchars=10, size=k))


def makeUnicodeIndex(k=10):
return Index([randu(10) for _ in range(k)])
return Index(randu_array(nchars=10, size=k))

def makeBoolIndex(k=10):
if k == 1:
Expand Down
4 changes: 2 additions & 2 deletions vb_suite/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

setup = common_setup + """
N, K = 5000, 50
index = [rands(10) for _ in xrange(N)]
columns = [rands(10) for _ in xrange(K)]
index = tm.makeStringIndex(N)
columns = tm.makeStringIndex(K)
frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)

try:
Expand Down
2 changes: 1 addition & 1 deletion vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def f():
setup = common_setup + """
K = 1000
N = 100000
uniques = np.array([rands(10) for x in xrange(K)], dtype='O')
uniques = tm.makeStringIndex(K).values
s = Series(np.tile(uniques, N // K))
"""

Expand Down
Loading