ENH: Index(...) constructor creates a MultiIndex when appropriate.

adgaudio · jreback · commit b7492fe124fc · 2014-04-09T19:23:49.000-04:00
- Series and DataFrame constructors autodetect when index/columns
should be a MultiIndex
- prevents some seg faults in calls to cython funcs
- add tupleize_cols kwarg and update tests to address PR comments
- support name= xor names= in Index(tuples, ....) constructor
- docs

BUG: Index.identical(other) didn't compare type(other) to type(self)
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -761,6 +761,7 @@ This is equivalent to the following
 
 .. _basics.reindexing:
 
+
 Reindexing and altering labels
 ------------------------------
 
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -1643,15 +1643,21 @@ can think of ``MultiIndex`` an array of tuples where each tuple is unique. A
 ``MultiIndex`` can be created from a list of arrays (using
 ``MultiIndex.from_arrays``), an array of tuples (using
 ``MultiIndex.from_tuples``), or a crossed set of iterables (using
-``MultiIndex.from_product``).
+``MultiIndex.from_product``).  The ``Index`` constructor will attempt to return
+a ``MultiIndex`` when it is passed a list of tuples.  The following examples
+demonstrate different ways to initialize MultiIndexes.
+
 
 .. ipython:: python
 
    arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
    tuples = list(zip(*arrays))
    tuples
-   index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+
+   multi_index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+   multi_index
+
    s = Series(randn(8), index=index)
    s
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -53,6 +53,9 @@ pandas 0.14.0
 New features
 ~~~~~~~~~~~~
 
+- ``Index`` returns a ``MultiIndex`` if passed a list of tuples.
+  ``DataFrame(dict)`` and ``Series(dict)`` create ``MultiIndex``
+  columns and index where applicable (:issue:`4187`)
 - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
 - Added the ``sym_diff`` method to ``Index`` (:issue:`5543`)
 - Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``.  The Julian
@@ -264,6 +267,8 @@ Bug Fixes
 ~~~~~~~~~
 
 - Bug in Series ValueError when index doesn't match data (:issue:`6532`)
+- Prevent segfault due to MultiIndex not being supported in HDFStore table
+  format (:issue:`1848`)
 - Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`)
 - Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`)
 - Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`)
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -405,6 +405,18 @@ Deprecations
 Enhancements
 ~~~~~~~~~~~~
 
+- ``DataFrame`` and ``Series`` will create a ``MultiIndex`` if passed a dict keyed by tuples
+
+  .. ipython:: python
+
+     Series({('a', 'b'): 1, ('a', 'a'): 0,
+                    ('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4})
+     pandas.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
+                       ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
+                       ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
+                       ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
+                       ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
+
 - ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`)
 - ``pd.read_clipboard`` will, if 'sep' is unspecified, try to detect data copied from a spreadsheet
   and parse accordingly. (:issue:`6223`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -317,9 +317,9 @@ def _init_dict(self, data, index, columns, dtype=None):
         else:
             keys = list(data.keys())
             if not isinstance(data, OrderedDict):
-                keys = _try_sort(list(data.keys()))
+                keys = _try_sort(keys)
             columns = data_names = Index(keys)
-            arrays = [data[k] for k in columns]
+            arrays = [data[k] for k in keys]
 
         return _arrays_to_mgr(arrays, data_names, index, columns,
                               dtype=dtype)
@@ -4496,7 +4496,7 @@ def extract_index(data):
     index = None
     if len(data) == 0:
         index = Index([])
-    elif len(data) > 0 and index is None:
+    elif len(data) > 0:
         raw_lengths = []
         indexes = []
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1164,8 +1164,7 @@ def groups(self):
         else:
             to_groupby = lzip(*(ping.grouper for ping in self.groupings))
             to_groupby = Index(to_groupby)
-
-            return self.axis.groupby(to_groupby)
+            return self.axis.groupby(to_groupby.values)
 
     @cache_readonly
     def group_info(self):
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -71,6 +71,8 @@ class Index(IndexOpsMixin, FrozenNDArray):
         Make a copy of input ndarray
     name : object
         Name to be stored in the index
+    tupleize_cols : bool (default: True)
+        When True, attempt to create a MultiIndex if possible
 
     Notes
     -----
@@ -99,7 +101,7 @@ class Index(IndexOpsMixin, FrozenNDArray):
     _engine_type = _index.ObjectEngine
 
     def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False,
-                **kwargs):
+                tupleize_cols=True, **kwargs):
 
         # no class inference!
         if fastpath:
@@ -139,8 +141,19 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False,
 
         elif np.isscalar(data):
             cls._scalar_data_error(data)
-
         else:
+            if tupleize_cols and isinstance(data, list) and data:
+                try:
+                    sorted(data)
+                    has_mixed_types = False
+                except (TypeError, UnicodeDecodeError):
+                    has_mixed_types = True  # python3 only
+                if isinstance(data[0], tuple) and not has_mixed_types:
+                    try:
+                        return MultiIndex.from_tuples(
+                            data, names=name or kwargs.get('names'))
+                    except (TypeError, KeyError):
+                        pass  # python2 - MultiIndex fails on mixed types
             # other iterable of some kind
             subarr = com._asarray_tuplesafe(data, dtype=object)
 
@@ -808,7 +821,8 @@ def identical(self, other):
         """
         return (self.equals(other) and
                 all((getattr(self, c, None) == getattr(other, c, None)
-                     for c in self._comparables)))
+                     for c in self._comparables)) and
+                type(self) == type(other))
 
     def asof(self, label):
         """
@@ -1743,11 +1757,11 @@ def insert(self, loc, item):
         -------
         new_index : Index
         """
-        index = np.asarray(self)
-        # because numpy is fussy with tuples
-        item_idx = Index([item], dtype=index.dtype)
-        new_index = np.concatenate((index[:loc], item_idx, index[loc:]))
-        return Index(new_index, name=self.name)
+        _self = np.asarray(self)
+        item_idx = Index([item], dtype=self.dtype).values
+        idx = np.concatenate(
+            (_self[:loc], item_idx, _self[loc:]))
+        return Index(idx, name=self.name)
 
     def drop(self, labels):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -22,9 +22,9 @@
                                 _values_from_object,
                                 _possibly_cast_to_datetime, _possibly_castable,
                                 _possibly_convert_platform,
+                                _try_sort,
                                 ABCSparseArray, _maybe_match_name,
                                 _ensure_object, SettingWithCopyError)
-
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
                                _ensure_index)
 from pandas.core.indexing import (
@@ -180,7 +180,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                     if isinstance(data, OrderedDict):
                         index = Index(data)
                     else:
-                        index = Index(sorted(data))
+                        index = Index(_try_sort(data))
                 try:
                     if isinstance(index, DatetimeIndex):
                         # coerce back to datetime objects for lookup
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -58,6 +58,8 @@ def infer_dtype(object _values):
             _values = list(_values)
         values = list_to_object_array(_values)
 
+    values = getattr(values, 'values', values)
+
     val_kind = values.dtype.type
     if val_kind in _TYPE_MAP:
         return _TYPE_MAP[val_kind]
@@ -1029,6 +1031,8 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
         # kludge, for Series
         return np.empty(0, dtype='f8')
 
+    keys = getattr(keys, 'values', keys)
+
     for i in range(n):
         val = util.get_value_1d(keys, i)
         if val in mapping:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -181,12 +181,12 @@ def test_getitem_list(self):
         # tuples
         df = DataFrame(randn(8, 3),
                        columns=Index([('foo', 'bar'), ('baz', 'qux'),
-                                      ('peek', 'aboo')], name='sth'))
+                                      ('peek', 'aboo')], name=['sth', 'sth2']))
 
         result = df[[('foo', 'bar'), ('baz', 'qux')]]
         expected = df.ix[:, :2]
         assert_frame_equal(result, expected)
-        self.assertEqual(result.columns.name, 'sth')
+        self.assertEqual(result.columns.names, ['sth', 'sth2'])
 
     def test_setitem_list(self):
 
@@ -2499,6 +2499,31 @@ def test_constructor_dict_of_tuples(self):
         expected = DataFrame(dict((k, list(v)) for k, v in compat.iteritems(data)))
         assert_frame_equal(result, expected, check_dtype=False)
 
+    def test_constructor_dict_multiindex(self):
+        check = lambda result, expected: tm.assert_frame_equal(
+            result, expected, check_dtype=True, check_index_type=True,
+            check_column_type=True, check_names=True)
+        d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
+             ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4},
+             ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}}
+        _d = sorted(d.items())
+        df = DataFrame(d)
+        expected = DataFrame(
+            [x[1] for x in _d],
+            index=MultiIndex.from_tuples([x[0] for x in _d])).T
+        expected.index = MultiIndex.from_tuples(expected.index)
+        check(df, expected)
+
+        d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111}
+        _d.insert(0, ('z', d['z']))
+        expected = DataFrame(
+            [x[1] for x in _d],
+            index=Index([x[0] for x in _d], tupleize_cols=False)).T
+        expected.index = Index(expected.index, tupleize_cols=False)
+        df = DataFrame(d)
+        df = df.reindex(columns=expected.columns, index=expected.index)
+        check(df, expected)
+
     def _check_basic_constructor(self, empty):
         "mat: 2d matrix with shpae (3, 2) to input. empty - makes sized objects"
         mat = empty((2, 3), dtype=float)
@@ -2922,8 +2947,8 @@ class CustomDict(dict):
     def test_constructor_ragged(self):
         data = {'A': randn(10),
                 'B': randn(8)}
-        assertRaisesRegexp(ValueError, 'arrays must all be same length',
-                           DataFrame, data)
+        with assertRaisesRegexp(ValueError, 'arrays must all be same length'):
+            DataFrame(data)
 
     def test_constructor_scalar(self):
         idx = Index(lrange(3))
@@ -12105,7 +12130,8 @@ def test_index_namedtuple(self):
         IndexType = namedtuple("IndexType", ["a", "b"])
         idx1 = IndexType("foo", "bar")
         idx2 = IndexType("baz", "bof")
-        index = Index([idx1, idx2], name="composite_index")
+        index = Index([idx1, idx2],
+                      name="composite_index", tupleize_cols=False)
         df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"])
         self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1)
 
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -48,7 +48,8 @@ def setUp(self):
             intIndex = tm.makeIntIndex(100),
             floatIndex = tm.makeFloatIndex(100),
             empty = Index([]),
-            tuples = Index(lzip(['foo', 'bar', 'baz'], [1, 2, 3])),
+            tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'],
+                                                 [1, 2, 3]))
         )
         for name, ind in self.indices.items():
             setattr(self, name, ind)
@@ -230,6 +231,10 @@ def test_identical(self):
         i2 = i2.rename('foo')
         self.assert_(i1.identical(i2))
 
+        i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')])
+        i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False)
+        self.assertFalse(i3.identical(i4))
+
     def test_is_(self):
         ind = Index(range(10))
         self.assertTrue(ind.is_(ind))
@@ -987,18 +992,24 @@ def test_equals(self):
         self.assert_(same_values.equals(self.index))
 
     def test_identical(self):
+        i = Index(self.index.copy())
+        self.assertTrue(i.identical(self.index))
 
-        i = self.index.copy()
-        same_values = Index(i, dtype=object)
-        self.assert_(i.identical(same_values))
+        same_values_different_type = Index(i, dtype=object)
+        self.assertFalse(i.identical(same_values_different_type))
 
-        i = self.index.copy()
+        i = self.index.copy(dtype=object)
         i = i.rename('foo')
         same_values = Index(i, dtype=object)
-        self.assert_(same_values.identical(self.index))
+        self.assertTrue(same_values.identical(self.index.copy(dtype=object)))
 
         self.assertFalse(i.identical(self.index))
-        self.assert_(Index(same_values, name='foo').identical(i))
+        self.assertTrue(Index(same_values, name='foo', dtype=object
+                              ).identical(i))
+
+        self.assertFalse(
+            self.index.copy(dtype=object)
+            .identical(self.index.copy(dtype='int64')))
 
     def test_get_indexer(self):
         target = Int64Index(np.arange(10))
@@ -2217,6 +2228,12 @@ def test_identical(self):
         mi2 = mi2.set_names(['new1', 'new2'])
         self.assert_(mi.identical(mi2))
 
+        mi3 = Index(mi.tolist(), names=mi.names)
+        mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False)
+        self.assert_(mi.identical(mi3))
+        self.assert_(not mi.identical(mi4))
+        self.assert_(mi.equals(mi4))
+
     def test_is_(self):
         mi = MultiIndex.from_tuples(lzip(range(10), range(10)))
         self.assertTrue(mi.is_(mi))
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -633,6 +633,26 @@ def test_constructor_dict(self):
         expected.ix[1] = 1
         assert_series_equal(result, expected)
 
+    def test_constructor_dict_multiindex(self):
+        check = lambda result, expected: tm.assert_series_equal(
+            result, expected, check_dtype=True, check_index_type=True,
+            check_series_type=True)
+        d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.}
+        _d = sorted(d.items())
+        ser = Series(d)
+        expected = Series([x[1] for x in _d],
+                          index=MultiIndex.from_tuples([x[0] for x in _d]))
+        check(ser, expected)
+
+        d['z'] = 111.
+        _d.insert(0, ('z', d['z']))
+        ser = Series(d)
+        expected = Series(
+            [x[1] for x in _d],
+            index=Index([x[0] for x in _d], tupleize_cols=False))
+        ser = ser.reindex(index=expected.index)
+        check(ser, expected)
+
     def test_constructor_subclass_dict(self):
         data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
         series = Series(data)