# Helpers added to pandas/util/testing.py.  They expect the surrounding module
# to provide `index`, `Index` and `DataFrame`, plus the makeIntIndex /
# makeFloatIndex / makeStringIndex / makeUnicodeIndex / makeDateIndex
# factories that are looked up when an explicit `idx_type` is requested.
MultiIndex = index.MultiIndex


def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None,
                    idx_type=None):
    """Create an Index/MultiIndex with the given dimensions, levels and names.

    nentries - number of entries in the index
    nlevels - number of levels (> 1 produces a MultiIndex, 1 a flat Index)
    prefix - a string prefix for the generated labels and default names
    names - (Optional), bool or list of strings. If True, default names
       (prefix + level number) are used; if False or None the index is
       unnamed; if a list is given, the name of each level is taken from the
       list. A single string is accepted when nlevels == 1.
    ndupe_l - (Optional), list of ints, one per level: the number of
       consecutive rows that share a label at that level. You can specify
       just the first few levels, the rest use the default of 1. If a value
       doesn't divide nentries, the last label has lower multiplicity.
       The list passed in is not modified.
    idx_type - "i"/"f"/"s"/"u"/"dt".
       If idx_type is not None, `nlevels` must be 1.
        "i"/"f" creates an integer/float index,
        "s"/"u" creates a string/unicode index
        "dt" create a datetime index.

        if unspecified, string labels of the form
        "<prefix>_l<level>_g<group>" will be generated.

    Raises AssertionError on inconsistent arguments and ValueError for an
    unknown `idx_type` (reachable only when asserts are disabled).
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    # A bare string only makes sense as the name of a single-level index.
    # Wrap it before validating so the length check counts names rather
    # than characters.
    if isinstance(names, string_types) and nlevels == 1:
        names = [names]

    if names is True:
        # build default names
        names = [prefix + str(i) for i in range(nlevels)]
    elif names is None or names is False:
        # pass None to the index constructor for no name
        names = None
    else:
        names = list(names)
        assert len(names) == nlevels

    # ndupe_l holds one multiplicity per *level*.  Work on a copy so that
    # padding it below never leaks into the caller's list.
    ndupe_l = [] if ndupe_l is None else list(ndupe_l)
    assert len(ndupe_l) <= nlevels

    assert idx_type is None or \
        (idx_type in ('i', 'f', 's', 'u', 'dt') and nlevels == 1)

    # specific 1D index type requested?
    if idx_type is not None:
        idx_func = dict(i=makeIntIndex, f=makeFloatIndex, s=makeStringIndex,
                        u=makeUnicodeIndex, dt=makeDateIndex).get(idx_type)
        if idx_func is None:
            raise ValueError('"%s" is not a legal value for `idx_type`, use '
                             '"i"/"f"/"s"/"u"/"dt".' % idx_type)
        idx = idx_func(nentries)
        # but we need to fill in the name
        if names:
            idx.name = names[0]
        return idx

    # levels without an explicit multiplicity get unique labels
    ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
    assert all(x > 0 for x in ndupe_l)

    # One list of labels per level.  Entry `pos` belongs to group
    # `pos // ndupe`, so groups appear in numeric order and only the final
    # group can end up short.
    level_labels = []
    for lvl, ndupe in enumerate(ndupe_l):
        stem = prefix + '_l%d_g' % lvl
        level_labels.append([stem + str(pos // ndupe)
                             for pos in range(nentries)])

    if nlevels == 1:
        # a single level is a plain Index, whatever the number of entries
        name = names[0] if names else None
        return Index(level_labels[0], name=name)

    return MultiIndex.from_tuples(list(zip(*level_labels)), names=names)


def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True,
                        c_idx_nlevels=1, r_idx_nlevels=1, data_gen_f=None,
                        c_ndupe_l=None, r_ndupe_l=None, dtype=None,
                        c_idx_type=None, r_idx_type=None):
    """Create a DataFrame with custom-built row and column indexes.

    nrows, ncols - number of data rows/cols
    c_idx_names, r_idx_names - False/True/list of strings, yields no names,
        default names or uses the provided names for the levels of the
        corresponding index. You can provide a single string when
        the corresponding index has a single level.
    c_idx_nlevels - number of levels in columns index. > 1 will yield
        MultiIndex
    r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex
    data_gen_f - a function f(row, col) which returns the data value at that
        position; the default generator yields values of the form "RxCy"
        based on position.
    c_ndupe_l, r_ndupe_l - list of integers, determines the number
        of duplicates for each label at a given level of the corresponding
        index. The default `None` value produces a multiplicity of 1 across
        all levels, i.e. a unique index. Will accept a partial list of
        length N < idx_nlevels, for just the first N levels. If ndupe
        doesn't divide nrows/ncols, the last label might have lower
        multiplicity.
    dtype - passed to the DataFrame constructor as is, in case you wish to
        have more control in conjunction with a custom `data_gen_f`
    r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt".
        If idx_type is not None, the matching `idx_nlevels` must be 1.
        "i"/"f" creates an integer/float index,
        "s"/"u" creates a string/unicode index
        "dt" create a datetime index.

        if unspecified, string labels will be generated.

    Examples:

    # 5 rows, 3 columns, default names on both, single index on both axes
    >> makeCustomDataframe(5, 3)

    # make the data a random int between 1 and 100
    # (randint as in `from random import randint`)
    >> makeCustomDataframe(5, 3, data_gen_f=lambda r, c: randint(1, 100))

    # 2-level MultiIndex on rows with each label duplicated twice on the
    # first level, default names on both axes, single index on columns
    >> a = makeCustomDataframe(5, 3, r_idx_nlevels=2, r_ndupe_l=[2])

    # DatetimeIndex on rows, index with unicode labels on columns,
    # no names on either axis
    >> a = makeCustomDataframe(5, 3, c_idx_names=False, r_idx_names=False,
                               r_idx_type="dt", c_idx_type="u")

    # 4-level MultiIndex on rows with names provided, 2-level MultiIndex
    # on columns with default labels and default names.
    >> a = makeCustomDataframe(5, 3, r_idx_nlevels=4,
                               r_idx_names=["FEE", "FI", "FO", "FAM"],
                               c_idx_nlevels=2)

    """
    assert c_idx_nlevels > 0
    assert r_idx_nlevels > 0
    assert r_idx_type is None or \
        (r_idx_type in ('i', 'f', 's', 'u', 'dt') and r_idx_nlevels == 1)
    assert c_idx_type is None or \
        (c_idx_type in ('i', 'f', 's', 'u', 'dt') and c_idx_nlevels == 1)

    columns = makeCustomIndex(ncols, nlevels=c_idx_nlevels, prefix='C',
                              names=c_idx_names, ndupe_l=c_ndupe_l,
                              idx_type=c_idx_type)
    index = makeCustomIndex(nrows, nlevels=r_idx_nlevels, prefix='R',
                            names=r_idx_names, ndupe_l=r_ndupe_l,
                            idx_type=r_idx_type)

    # by default, generate data based on location
    if data_gen_f is None:
        def data_gen_f(r, c):
            return "R%dC%d" % (r, c)

    data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)]

    return DataFrame(data, index=index, columns=columns, dtype=dtype)