Skip to content

Commit b7492fe

Browse files
adgaudiojreback
authored andcommitted
ENH: Index(...) constructor creates a MultiIndex when appropriate.
- Series and DataFrame constructor autodetect when index/columns should be MultiIndex - prevents some seg faults in calls to cython funcs - add tupleize_cols kwarg and update tests to git PR comments - support name= xor names= in Index(tuples, ....) constructor - docs BUG: Index.identical(other) didn't compare type(other) to type(self)
1 parent f8c566c commit b7492fe

12 files changed

+133
-29
lines changed

doc/source/basics.rst

+1
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@ This is equivalent to the following
761761
762762
.. _basics.reindexing:
763763

764+
764765
Reindexing and altering labels
765766
------------------------------
766767

doc/source/indexing.rst

+8-2
Original file line numberDiff line numberDiff line change
@@ -1643,15 +1643,21 @@ can think of ``MultiIndex`` an array of tuples where each tuple is unique. A
16431643
``MultiIndex`` can be created from a list of arrays (using
16441644
``MultiIndex.from_arrays``), an array of tuples (using
16451645
``MultiIndex.from_tuples``), or a crossed set of iterables (using
1646-
``MultiIndex.from_product``).
1646+
``MultiIndex.from_product``). The ``Index`` constructor will attempt to return
1647+
a ``MultiIndex`` when it is passed a list of tuples. The following examples
1648+
demonstrate different ways to initialize MultiIndexes.
1649+
16471650

16481651
.. ipython:: python
16491652
16501653
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
16511654
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
16521655
tuples = list(zip(*arrays))
16531656
tuples
1654-
index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
1657+
1658+
multi_index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
1659+
multi_index
1660+
16551661
s = Series(randn(8), index=index)
16561662
s
16571663

doc/source/release.rst

+5
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ pandas 0.14.0
5353
New features
5454
~~~~~~~~~~~~
5555

56+
- ``Index`` returns a ``MultiIndex`` if passed a list of tuples.
57+
``DataFrame(dict)`` and ``Series(dict)`` create ``MultiIndex``
58+
columns and index where applicable (:issue:`4187`)
5659
- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
5760
- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`)
5861
- Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``. The Julian
@@ -264,6 +267,8 @@ Bug Fixes
264267
~~~~~~~~~
265268

266269
- Bug in Series ValueError when index doesn't match data (:issue:`6532`)
270+
- Prevent segfault due to MultiIndex not being supported in HDFStore table
271+
format (:issue:`1848`)
267272
- Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`)
268273
- Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`)
269274
- Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`)

doc/source/v0.14.0.txt

+12
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,18 @@ Deprecations
405405
Enhancements
406406
~~~~~~~~~~~~
407407

408+
- ``DataFrame`` and ``Series`` will create a ``MultiIndex`` when passed a dict whose keys are tuples
409+
410+
.. ipython:: python
411+
412+
Series({('a', 'b'): 1, ('a', 'a'): 0,
413+
('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4})
414+
pandas.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
415+
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
416+
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
417+
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
418+
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
419+
408420
- ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`)
409421
- ``pd.read_clipboard`` will, if 'sep' is unspecified, try to detect data copied from a spreadsheet
410422
and parse accordingly. (:issue:`6223`)

pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -317,9 +317,9 @@ def _init_dict(self, data, index, columns, dtype=None):
317317
else:
318318
keys = list(data.keys())
319319
if not isinstance(data, OrderedDict):
320-
keys = _try_sort(list(data.keys()))
320+
keys = _try_sort(keys)
321321
columns = data_names = Index(keys)
322-
arrays = [data[k] for k in columns]
322+
arrays = [data[k] for k in keys]
323323

324324
return _arrays_to_mgr(arrays, data_names, index, columns,
325325
dtype=dtype)
@@ -4496,7 +4496,7 @@ def extract_index(data):
44964496
index = None
44974497
if len(data) == 0:
44984498
index = Index([])
4499-
elif len(data) > 0 and index is None:
4499+
elif len(data) > 0:
45004500
raw_lengths = []
45014501
indexes = []
45024502

pandas/core/groupby.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1164,8 +1164,7 @@ def groups(self):
11641164
else:
11651165
to_groupby = lzip(*(ping.grouper for ping in self.groupings))
11661166
to_groupby = Index(to_groupby)
1167-
1168-
return self.axis.groupby(to_groupby)
1167+
return self.axis.groupby(to_groupby.values)
11691168

11701169
@cache_readonly
11711170
def group_info(self):

pandas/core/index.py

+22-8
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ class Index(IndexOpsMixin, FrozenNDArray):
7171
Make a copy of input ndarray
7272
name : object
7373
Name to be stored in the index
74+
tupleize_cols : bool (default: True)
75+
When True, attempt to create a MultiIndex if possible
7476
7577
Notes
7678
-----
@@ -99,7 +101,7 @@ class Index(IndexOpsMixin, FrozenNDArray):
99101
_engine_type = _index.ObjectEngine
100102

101103
def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False,
102-
**kwargs):
104+
tupleize_cols=True, **kwargs):
103105

104106
# no class inference!
105107
if fastpath:
@@ -139,8 +141,19 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False,
139141

140142
elif np.isscalar(data):
141143
cls._scalar_data_error(data)
142-
143144
else:
145+
if tupleize_cols and isinstance(data, list) and data:
146+
try:
147+
sorted(data)
148+
has_mixed_types = False
149+
except (TypeError, UnicodeDecodeError):
150+
has_mixed_types = True # python3 only
151+
if isinstance(data[0], tuple) and not has_mixed_types:
152+
try:
153+
return MultiIndex.from_tuples(
154+
data, names=name or kwargs.get('names'))
155+
except (TypeError, KeyError):
156+
pass # python2 - MultiIndex fails on mixed types
144157
# other iterable of some kind
145158
subarr = com._asarray_tuplesafe(data, dtype=object)
146159

@@ -808,7 +821,8 @@ def identical(self, other):
808821
"""
809822
return (self.equals(other) and
810823
all((getattr(self, c, None) == getattr(other, c, None)
811-
for c in self._comparables)))
824+
for c in self._comparables)) and
825+
type(self) == type(other))
812826

813827
def asof(self, label):
814828
"""
@@ -1743,11 +1757,11 @@ def insert(self, loc, item):
17431757
-------
17441758
new_index : Index
17451759
"""
1746-
index = np.asarray(self)
1747-
# because numpy is fussy with tuples
1748-
item_idx = Index([item], dtype=index.dtype)
1749-
new_index = np.concatenate((index[:loc], item_idx, index[loc:]))
1750-
return Index(new_index, name=self.name)
1760+
_self = np.asarray(self)
1761+
item_idx = Index([item], dtype=self.dtype).values
1762+
idx = np.concatenate(
1763+
(_self[:loc], item_idx, _self[loc:]))
1764+
return Index(idx, name=self.name)
17511765

17521766
def drop(self, labels):
17531767
"""

pandas/core/series.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222
_values_from_object,
2323
_possibly_cast_to_datetime, _possibly_castable,
2424
_possibly_convert_platform,
25+
_try_sort,
2526
ABCSparseArray, _maybe_match_name,
2627
_ensure_object, SettingWithCopyError)
27-
2828
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
2929
_ensure_index)
3030
from pandas.core.indexing import (
@@ -180,7 +180,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
180180
if isinstance(data, OrderedDict):
181181
index = Index(data)
182182
else:
183-
index = Index(sorted(data))
183+
index = Index(_try_sort(data))
184184
try:
185185
if isinstance(index, DatetimeIndex):
186186
# coerce back to datetime objects for lookup

pandas/src/inference.pyx

+4
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ def infer_dtype(object _values):
5858
_values = list(_values)
5959
values = list_to_object_array(_values)
6060

61+
values = getattr(values, 'values', values)
62+
6163
val_kind = values.dtype.type
6264
if val_kind in _TYPE_MAP:
6365
return _TYPE_MAP[val_kind]
@@ -1029,6 +1031,8 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
10291031
# kludge, for Series
10301032
return np.empty(0, dtype='f8')
10311033

1034+
keys = getattr(keys, 'values', keys)
1035+
10321036
for i in range(n):
10331037
val = util.get_value_1d(keys, i)
10341038
if val in mapping:

pandas/tests/test_frame.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,12 @@ def test_getitem_list(self):
181181
# tuples
182182
df = DataFrame(randn(8, 3),
183183
columns=Index([('foo', 'bar'), ('baz', 'qux'),
184-
('peek', 'aboo')], name='sth'))
184+
('peek', 'aboo')], name=['sth', 'sth2']))
185185

186186
result = df[[('foo', 'bar'), ('baz', 'qux')]]
187187
expected = df.ix[:, :2]
188188
assert_frame_equal(result, expected)
189-
self.assertEqual(result.columns.name, 'sth')
189+
self.assertEqual(result.columns.names, ['sth', 'sth2'])
190190

191191
def test_setitem_list(self):
192192

@@ -2499,6 +2499,31 @@ def test_constructor_dict_of_tuples(self):
24992499
expected = DataFrame(dict((k, list(v)) for k, v in compat.iteritems(data)))
25002500
assert_frame_equal(result, expected, check_dtype=False)
25012501

2502+
def test_constructor_dict_multiindex(self):
2503+
check = lambda result, expected: tm.assert_frame_equal(
2504+
result, expected, check_dtype=True, check_index_type=True,
2505+
check_column_type=True, check_names=True)
2506+
d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
2507+
('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4},
2508+
('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}}
2509+
_d = sorted(d.items())
2510+
df = DataFrame(d)
2511+
expected = DataFrame(
2512+
[x[1] for x in _d],
2513+
index=MultiIndex.from_tuples([x[0] for x in _d])).T
2514+
expected.index = MultiIndex.from_tuples(expected.index)
2515+
check(df, expected)
2516+
2517+
d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111}
2518+
_d.insert(0, ('z', d['z']))
2519+
expected = DataFrame(
2520+
[x[1] for x in _d],
2521+
index=Index([x[0] for x in _d], tupleize_cols=False)).T
2522+
expected.index = Index(expected.index, tupleize_cols=False)
2523+
df = DataFrame(d)
2524+
df = df.reindex(columns=expected.columns, index=expected.index)
2525+
check(df, expected)
2526+
25022527
def _check_basic_constructor(self, empty):
25032528
"mat: 2d matrix with shpae (3, 2) to input. empty - makes sized objects"
25042529
mat = empty((2, 3), dtype=float)
@@ -2922,8 +2947,8 @@ class CustomDict(dict):
29222947
def test_constructor_ragged(self):
29232948
data = {'A': randn(10),
29242949
'B': randn(8)}
2925-
assertRaisesRegexp(ValueError, 'arrays must all be same length',
2926-
DataFrame, data)
2950+
with assertRaisesRegexp(ValueError, 'arrays must all be same length'):
2951+
DataFrame(data)
29272952

29282953
def test_constructor_scalar(self):
29292954
idx = Index(lrange(3))
@@ -12105,7 +12130,8 @@ def test_index_namedtuple(self):
1210512130
IndexType = namedtuple("IndexType", ["a", "b"])
1210612131
idx1 = IndexType("foo", "bar")
1210712132
idx2 = IndexType("baz", "bof")
12108-
index = Index([idx1, idx2], name="composite_index")
12133+
index = Index([idx1, idx2],
12134+
name="composite_index", tupleize_cols=False)
1210912135
df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"])
1211012136
self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1)
1211112137

pandas/tests/test_index.py

+24-7
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def setUp(self):
4848
intIndex = tm.makeIntIndex(100),
4949
floatIndex = tm.makeFloatIndex(100),
5050
empty = Index([]),
51-
tuples = Index(lzip(['foo', 'bar', 'baz'], [1, 2, 3])),
51+
tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'],
52+
[1, 2, 3]))
5253
)
5354
for name, ind in self.indices.items():
5455
setattr(self, name, ind)
@@ -230,6 +231,10 @@ def test_identical(self):
230231
i2 = i2.rename('foo')
231232
self.assert_(i1.identical(i2))
232233

234+
i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')])
235+
i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False)
236+
self.assertFalse(i3.identical(i4))
237+
233238
def test_is_(self):
234239
ind = Index(range(10))
235240
self.assertTrue(ind.is_(ind))
@@ -987,18 +992,24 @@ def test_equals(self):
987992
self.assert_(same_values.equals(self.index))
988993

989994
def test_identical(self):
995+
i = Index(self.index.copy())
996+
self.assertTrue(i.identical(self.index))
990997

991-
i = self.index.copy()
992-
same_values = Index(i, dtype=object)
993-
self.assert_(i.identical(same_values))
998+
same_values_different_type = Index(i, dtype=object)
999+
self.assertFalse(i.identical(same_values_different_type))
9941000

995-
i = self.index.copy()
1001+
i = self.index.copy(dtype=object)
9961002
i = i.rename('foo')
9971003
same_values = Index(i, dtype=object)
998-
self.assert_(same_values.identical(self.index))
1004+
self.assertTrue(same_values.identical(self.index.copy(dtype=object)))
9991005

10001006
self.assertFalse(i.identical(self.index))
1001-
self.assert_(Index(same_values, name='foo').identical(i))
1007+
self.assertTrue(Index(same_values, name='foo', dtype=object
1008+
).identical(i))
1009+
1010+
self.assertFalse(
1011+
self.index.copy(dtype=object)
1012+
.identical(self.index.copy(dtype='int64')))
10021013

10031014
def test_get_indexer(self):
10041015
target = Int64Index(np.arange(10))
@@ -2217,6 +2228,12 @@ def test_identical(self):
22172228
mi2 = mi2.set_names(['new1', 'new2'])
22182229
self.assert_(mi.identical(mi2))
22192230

2231+
mi3 = Index(mi.tolist(), names=mi.names)
2232+
mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False)
2233+
self.assert_(mi.identical(mi3))
2234+
self.assert_(not mi.identical(mi4))
2235+
self.assert_(mi.equals(mi4))
2236+
22202237
def test_is_(self):
22212238
mi = MultiIndex.from_tuples(lzip(range(10), range(10)))
22222239
self.assertTrue(mi.is_(mi))

pandas/tests/test_series.py

+20
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,26 @@ def test_constructor_dict(self):
633633
expected.ix[1] = 1
634634
assert_series_equal(result, expected)
635635

636+
def test_constructor_dict_multiindex(self):
637+
check = lambda result, expected: tm.assert_series_equal(
638+
result, expected, check_dtype=True, check_index_type=True,
639+
check_series_type=True)
640+
d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.}
641+
_d = sorted(d.items())
642+
ser = Series(d)
643+
expected = Series([x[1] for x in _d],
644+
index=MultiIndex.from_tuples([x[0] for x in _d]))
645+
check(ser, expected)
646+
647+
d['z'] = 111.
648+
_d.insert(0, ('z', d['z']))
649+
ser = Series(d)
650+
expected = Series(
651+
[x[1] for x in _d],
652+
index=Index([x[0] for x in _d], tupleize_cols=False))
653+
ser = ser.reindex(index=expected.index)
654+
check(ser, expected)
655+
636656
def test_constructor_subclass_dict(self):
637657
data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
638658
series = Series(data)

0 commit comments

Comments
 (0)