pandas-dev · ghost · Dec 6, 2012
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -26,6 +26,7 @@
 from pandas.tseries.period import PeriodIndex
 
 Index = index.Index
+MultiIndex = index.MultiIndex
 Series = series.Series
 DataFrame = frame.DataFrame
 Panel = panel.Panel
@@ -334,6 +335,165 @@ def makePanel():
 def makePanel4D():
     return Panel4D(dict(l1 = makePanel(), l2 = makePanel(), l3 = makePanel()))
 
+def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None,
+                    idx_type=None):
+    """Create an Index/MultiIndex with the given size, levels and names.
+
+    nentries - number of entries in index
+    nlevels - number of levels (> 1 produces a MultiIndex)
+    prefix - a string prefix for labels
+    names - (Optional), bool or list of strings. If True, default names
+       (prefix + level number) are used; if False, no names; if a list is
+       given, the name of each level is taken from it. A single string is
+       accepted when nlevels == 1.
+    ndupe_l - (Optional), list of ints, one per level: the number of rows
+       for which a label is repeated at that level. You can specify just
+       the first few levels, the rest default to 1. The list is not modified.
+    idx_type - "i"/"f"/"s"/"u"/"dt".
+       If idx_type is not None, `nlevels` must be 1.
+       "i"/"f" creates an integer/float index,
+       "s"/"u" creates a string/unicode index,
+       "dt" creates a datetime index.
+       If unspecified, string labels of the form
+       "<prefix>_l<level>_g<group>" are generated.
+
+    Returns an Index when nlevels == 1, otherwise a MultiIndex.
+    Raises AssertionError on inconsistent arguments and ValueError on an
+    unknown `idx_type` (reachable only when asserts are disabled).
+    """
+
+    from collections import Counter
+    # make singleton case uniform *before* validating, so that a plain string
+    # name is not rejected because len("name") != nlevels
+    if isinstance(names, basestring) and nlevels == 1:
+        names = [names]
+
+    # ndupe_l holds one entry per *level*; work on a copy so the caller's
+    # list is never mutated when it gets padded below
+    ndupe_l = [1] * nlevels if ndupe_l is None else list(ndupe_l)
+    assert len(ndupe_l) <= nlevels
+    assert names is None or names is False or names is True or \
+        len(names) == nlevels
+    assert idx_type is None or \
+           (idx_type in ('i', 'f', 's', 'u', 'dt') and nlevels == 1)
+
+    if names is True:
+        # build default names
+        names = [prefix + str(i) for i in range(nlevels)]
+    elif names is False:
+        # pass None to index constructor for no name
+        names = None
+
+    # specific 1D index type requested?
+    idx_func = dict(i=makeIntIndex, f=makeFloatIndex, s=makeStringIndex,
+                    u=makeUnicodeIndex, dt=makeDateIndex).get(idx_type)
+    if idx_func:
+        idx = idx_func(nentries)
+        # but we need to fill in the name
+        if names:
+            idx.name = names[0]
+        return idx
+    elif idx_type is not None:
+        raise ValueError('"%s" is not a legal value for `idx_type`, use  '
+                         '"i"/"f"/"s"/"u"/"dt".' % idx_type)
+
+    # pad the per-level multiplicities with the default of 1
+    ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
+    assert all([x > 0 for x in ndupe_l])
+
+    # build one list of labels per level
+    levels = []
+    for i in range(nlevels):
+        # enough groups to cover nentries even if ndupe_l[i] doesn't divide it
+        div_factor = nentries // ndupe_l[i] + 1
+        cnt = Counter()
+        for j in range(div_factor):
+            cnt[prefix + '_l%d_g' % i + str(j)] = ndupe_l[i]
+        # cute Counter trick: elements() repeats each label `count` times
+        levels.append(sorted(cnt.elements())[:nentries])
+
+    # the number of *levels* decides between a flat Index and a MultiIndex
+    if nlevels == 1:
+        return Index(levels[0], name=None if names is None else names[0])
+    return MultiIndex.from_tuples(list(zip(*levels)), names=names)
+
+def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True,
+                        c_idx_nlevels=1, r_idx_nlevels=1, data_gen_f=None,
+                        c_ndupe_l=None, r_ndupe_l=None, dtype=None,
+                        c_idx_type=None, r_idx_type=None):
+    """Build a DataFrame with custom row/column indexes for testing.
+
+    nrows, ncols - number of data rows/cols
+    c_idx_names, r_idx_names - False/True/list of strings: no names, default
+         names, or the given names for the levels of the corresponding index;
+         forwarded as `names` to makeCustomIndex.
+    c_idx_nlevels - number of levels in columns index. > 1 yields MultiIndex
+    r_idx_nlevels - number of levels in rows index. > 1 yields MultiIndex
+    data_gen_f - a function f(row, col) which returns the data value at that
+         position; the default generator yields strings of the form "RxCy"
+         based on position.
+    c_ndupe_l, r_ndupe_l - list of integers, the number of duplicates for each
+         label at a given level of the corresponding index. The default `None`
+         gives a multiplicity of 1 on all levels, i.e. a unique index. A partial
+         list for just the first N levels is accepted. If ndupe doesn't divide
+         nrows/ncols, the last label might have lower multiplicity.
+    dtype - passed to the DataFrame constructor as is, in case you wish to
+         have more control in conjunction with a custom `data_gen_f`
+    r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt".
+         If not None, the matching `*_idx_nlevels` must be 1.
+         "i"/"f" creates an integer/float index,
+         "s"/"u" creates a string/unicode index,
+         "dt" creates a datetime index.
+         If unspecified, string labels are generated.
+
+    Examples:
+
+    # 5 rows, 3 columns, default names and a single-level index on both axes
+    >> makeCustomDataframe(5,3)
+
+    # make the data a random int between 1 and 100
+    >> makeCustomDataframe(5,3,data_gen_f=lambda r,c:randint(1,100))
+
+    # 2-level MultiIndex on rows, each first-level label duplicated twice
+    >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2])
+
+    # DatetimeIndex on rows, unicode labels on columns, no names
+    >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False,
+                             r_idx_type="dt",c_idx_type="u")
+
+    # 4-level MultiIndex on rows with names provided, 2-level on columns
+    >> a=makeCustomDataframe(5,3,r_idx_nlevels=4,c_idx_nlevels=2,
+                             r_idx_names=["FEE","FI","FO","FAM"])
+    """
+
+    legal_types = ('i', 'f', 's', 'u', 'dt')
+    assert c_idx_nlevels > 0
+    assert r_idx_nlevels > 0
+    assert r_idx_type is None or \
+           (r_idx_type in legal_types and r_idx_nlevels == 1)
+    assert c_idx_type is None or \
+           (c_idx_type in legal_types and c_idx_nlevels == 1)
+
+    # columns get the 'C' prefix, rows the 'R' prefix
+    col_index = makeCustomIndex(ncols, nlevels=c_idx_nlevels, prefix='C',
+                                names=c_idx_names, ndupe_l=c_ndupe_l,
+                                idx_type=c_idx_type)
+    row_index = makeCustomIndex(nrows, nlevels=r_idx_nlevels, prefix='R',
+                                names=r_idx_names, ndupe_l=r_ndupe_l,
+                                idx_type=r_idx_type)
+
+    # by default, generate data based on location
+    if data_gen_f is None:
+        def data_gen_f(r, c):
+            return "R%dC%d" % (r, c)
+
+    # fill the cells row by row
+    rows = []
+    for r in range(nrows):
+        rows.append([data_gen_f(r, c) for c in range(ncols)])
+
+    return DataFrame(rows, row_index, col_index, dtype=dtype)
+
 def add_nans(panel):
     I, J, N = panel.shape
     for i, item in enumerate(panel.items):