Skip to content

ENH: Add batteries-included index and DataFrame generators to tm #2446

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from pandas.tseries.period import PeriodIndex

# Bare-name aliases for the container classes, so code in this module can
# write e.g. `Index(...)` / `MultiIndex.from_tuples(...)` directly.
# NOTE(review): `index`, `series`, `frame` and `panel` are bound outside this
# view -- presumably the corresponding pandas.core modules; confirm.
Index = index.Index
MultiIndex = index.MultiIndex
Series = series.Series
DataFrame = frame.DataFrame
Panel = panel.Panel
Expand Down Expand Up @@ -334,6 +335,165 @@ def makePanel():
def makePanel4D():
    """Return a Panel4D holding three makePanel() results.

    The panels are stored under the keys 'l1', 'l2' and 'l3'; each one comes
    from its own call to makePanel().
    """
    panels = {'l1': makePanel(), 'l2': makePanel(), 'l3': makePanel()}
    return Panel4D(panels)

def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None,
                    idx_type=None):
    """Create an Index/MultiIndex with the given size, levels and names.

    nentries - number of entries in index
    nlevels - number of levels (1 produces a plain Index, > 1 a MultiIndex)
    prefix - a string prefix for labels
    names - (Optional), bool or list of strings. If True will use default
       names (prefix + level number), if False will use no names, if a list
       is given, the name of each level in the index will be taken from the
       list. A single string is accepted when nlevels == 1.
    ndupe_l - (Optional), list of ints, one per level: the number of rows
       for which the label will be repeated at the corresponding level. You
       can specify just the first few levels, the rest will use the default
       multiplicity of 1. The list passed in is never modified.
    idx_type - "i"/"f"/"s"/"u"/"dt".
       If idx_type is not None, `nlevels` must be 1.
       "i"/"f" creates an integer/float index,
       "s"/"u" creates a string/unicode index,
       "dt" creates a datetime index.

       If unspecified, string labels of the form
       "<prefix>_l<level>_g<group>" will be generated, in increasing
       group order.

    Returns the generated Index (or MultiIndex when nlevels > 1).

    Raises AssertionError on inconsistent arguments, and ValueError for an
    unknown `idx_type` when assertions are disabled.
    """
    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # make the singleton case uniform *before* validating `names`, otherwise
    # a plain string such as "foo" would be measured in characters
    if isinstance(names, string_types) and nlevels == 1:
        names = [names]

    # multiplicities are per level; copy so the caller's list is not mutated
    if ndupe_l is None:
        ndupe_l = []
    else:
        ndupe_l = list(ndupe_l)

    assert len(ndupe_l) <= nlevels
    assert names is None or names is False or names is True or \
        len(names) == nlevels
    assert idx_type is None or \
        (idx_type in ('i', 'f', 's', 'u', 'dt') and nlevels == 1)

    if names is True:
        # build default names
        names = [prefix + str(i) for i in range(nlevels)]
    elif names is False:
        # pass None to index constructor for no name
        names = None

    # specific 1D index type requested?
    if idx_type is not None:
        idx_func = dict(i=makeIntIndex, f=makeFloatIndex, s=makeStringIndex,
                        u=makeUnicodeIndex, dt=makeDateIndex).get(idx_type)
        if idx_func is None:
            raise ValueError('"%s" is not a legal value for `idx_type`, use '
                             '"i"/"f"/"s"/"u"/"dt".' % idx_type)
        idx = idx_func(nentries)
        # but we need to fill in the name
        if names:
            idx.name = names[0]
        return idx

    # levels the caller did not mention get a multiplicity of 1
    ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
    assert all(x > 0 for x in ndupe_l)

    # one list of labels per level
    levels = []
    for i in range(nlevels):
        ndupe = ndupe_l[i]
        # enough groups to cover nentries even when ndupe does not divide it
        ngroups = nentries // ndupe + 1
        # emit the labels directly in group order; sorting the strings would
        # put e.g. "g10" before "g2" and truncate away the wrong labels
        labels = [prefix + '_l%d_g' % i + str(j)
                  for j in range(ngroups) for _ in range(ndupe)]
        levels.append(labels[:nentries])

    # convert the per-level labels to an index
    if nlevels == 1:
        name = None if names is None else names[0]
        return Index(levels[0], name=name)

    # materialize the zip so this also works where zip() is lazy
    tuples = list(zip(*levels))
    return MultiIndex.from_tuples(tuples, names=names)

def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True,
                        c_idx_nlevels=1, r_idx_nlevels=1, data_gen_f=None,
                        c_ndupe_l=None, r_ndupe_l=None, dtype=None,
                        c_idx_type=None, r_idx_type=None):
    """Build a DataFrame whose row and column indexes come from makeCustomIndex.

    nrows, ncols - number of data rows/cols
    c_idx_names, r_idx_names - False/True/list of strings, forwarded as the
        `names` argument of makeCustomIndex for the column/row index:
        False yields no names, True yields default names, a list supplies
        the name of each level.
    c_idx_nlevels - number of levels in the columns index (must be > 0).
    r_idx_nlevels - number of levels in the rows index (must be > 0).
    data_gen_f - a function f(row, col) returning the value stored at that
        position. The default generator yields strings of the form "RxCy"
        built from the row and column positions.
    c_ndupe_l, r_ndupe_l - list of integers forwarded as `ndupe_l` to
        makeCustomIndex: the number of duplicates for each label at a given
        level of the corresponding index. The default `None` gives every
        label a multiplicity of 1. A partial list covering only the first
        levels is accepted. If the multiplicity doesn't divide nrows/ncols,
        the last label may appear fewer times.
    dtype - passed to the DataFrame constructor as is, in case you want more
        control in conjunction with a custom `data_gen_f`.
    r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt".
        If an idx_type is not None, the matching idx_nlevels must be 1.
        "i"/"f" creates an integer/float index,
        "s"/"u" creates a string/unicode index,
        "dt" creates a datetime index.

        If unspecified, string labels will be generated.

    Examples:

    # 5 rows, 3 columns, default names on both, single index on both axes
    >> makeCustomDataframe(5,3)

    # make the data a random int between 1 and 100
    >> makeCustomDataframe(5,3,data_gen_f=lambda r,c:randint(1,100))

    # 2-level multiindex on rows with each label duplicated twice on the
    # first level, default names on both axes, single-level columns
    >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2])

    # DatetimeIndex on rows, index with unicode labels on columns,
    # no names on either axis
    >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False,
                             r_idx_type="dt",c_idx_type="u")

    # 4-level multiindex on rows with names provided, 2-level multiindex
    # on columns with default labels and default names.
    >> a=makeCustomDataframe(5,3,r_idx_nlevels=4,
                             r_idx_names=["FEE","FI","FO","FAM"],
                             c_idx_nlevels=2)

    """
    legal_types = ('i', 'f', 's', 'u', 'dt')

    assert c_idx_nlevels > 0
    assert r_idx_nlevels > 0
    # a typed index is always flat, so it is only allowed with one level
    assert r_idx_type is None or \
        (r_idx_type in legal_types and r_idx_nlevels == 1)
    assert c_idx_type is None or \
        (c_idx_type in legal_types and c_idx_nlevels == 1)

    columns = makeCustomIndex(ncols,
                              nlevels=c_idx_nlevels,
                              prefix='C',
                              names=c_idx_names,
                              ndupe_l=c_ndupe_l,
                              idx_type=c_idx_type)
    index = makeCustomIndex(nrows,
                            nlevels=r_idx_nlevels,
                            prefix='R',
                            names=r_idx_names,
                            ndupe_l=r_ndupe_l,
                            idx_type=r_idx_type)

    if data_gen_f is None:
        # by default, generate data based on location
        def data_gen_f(r, c):
            return "R%dC%d" % (r, c)

    # fill the table row by row, left to right
    data = []
    for r in range(nrows):
        row = []
        for c in range(ncols):
            row.append(data_gen_f(r, c))
        data.append(row)

    return DataFrame(data, index, columns, dtype=dtype)

def add_nans(panel):
I, J, N = panel.shape
for i, item in enumerate(panel.items):
Expand Down