diff --git a/doc/source/release.rst b/doc/source/release.rst
index 96004737c4d0f..3f148748081b9 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -549,6 +549,7 @@ Bug Fixes
   - A zero length series written in Fixed format not deserializing properly.
     (:issue:`4708`)
   - Fixed decoding perf issue on pyt3 (:issue:`5441`)
+  - Validate levels in a multi-index before storing (:issue:`5527`)
   - Fixed bug in tslib.tz_convert(vals, tz1, tz2): it could raise IndexError
     exception while trying to access trans[pos + 1] (:issue:`4496`)
   - The ``by`` argument now works correctly with the ``layout`` argument
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index d2fe1e0638192..db2028c70dc20 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2688,6 +2688,14 @@ def is_multi_index(self):
         """ the levels attribute is 1 or a list in the case of a multi-index """
         return isinstance(self.levels,list)
 
+    def validate_multiindex(self, obj):
+        """ validate that we can store the multi-index; reset and return the new object """
+        levels = [ l if l is not None else "level_{0}".format(i) for i, l in enumerate(obj.index.names) ]
+        try:
+            return obj.reset_index(), levels
+        except (ValueError):
+            raise ValueError("duplicate names/columns in the multi-index when storing as a table")
+
     @property
     def nrows_expected(self):
         """ based on our axes, compute the expected nrows """
@@ -3701,10 +3709,9 @@ class AppendableMultiSeriesTable(AppendableSeriesTable):
     def write(self, obj, **kwargs):
         """ we are going to write this as a frame table """
         name = obj.name or 'values'
-        cols = list(obj.index.names)
+        obj, self.levels = self.validate_multiindex(obj)
+        cols = list(self.levels)
         cols.append(name)
-        self.levels = list(obj.index.names)
-        obj = obj.reset_index()
         obj.columns = cols
         return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)
 
@@ -3764,6 +3771,7 @@ class AppendableMultiFrameTable(AppendableFrameTable):
     table_type = u('appendable_multiframe')
     obj_type = DataFrame
     ndim = 2
+    _re_levels = re.compile(r"^level_\d+$")
 
     @property
     def table_type_short(self):
@@ -3774,11 +3782,11 @@ def write(self, obj, data_columns=None, **kwargs):
             data_columns = []
         elif data_columns is True:
             data_columns = obj.columns[:]
-        for n in obj.index.names:
+        obj, self.levels = self.validate_multiindex(obj)
+        for n in self.levels:
             if n not in data_columns:
                 data_columns.insert(0, n)
-        self.levels = obj.index.names
-        return super(AppendableMultiFrameTable, self).write(obj=obj.reset_index(), data_columns=data_columns, **kwargs)
+        return super(AppendableMultiFrameTable, self).write(obj=obj, data_columns=data_columns, **kwargs)
 
     def read(self, columns=None, **kwargs):
         if columns is not None:
@@ -3787,7 +3795,11 @@ def read(self, columns=None, **kwargs):
                     columns.insert(0, n)
         df = super(AppendableMultiFrameTable, self).read(
             columns=columns, **kwargs)
-        df.set_index(self.levels, inplace=True)
+        df = df.set_index(self.levels)
+
+        # remove names for 'level_%d'
+        df.index = df.index.set_names([ None if self._re_levels.search(l) else l for l in df.index.names ])
+
         return df
 
 
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 3ab818a7fbe1a..1953f79482a22 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -1572,6 +1572,51 @@ def test_column_multiindex(self):
             store.put('df1',df,format='table')
             tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True)
 
+    def test_store_multiindex(self):
+
+        # validate multi-index names
+        # GH 5527
+        with ensure_clean_store(self.path) as store:
+
+            def make_index(names=None):
+                return MultiIndex.from_tuples([( datetime.datetime(2013,12,d), s, t) for d in range(1,3) for s in range(2) for t in range(3)],
+                                              names=names)
+
+
+            # no names
+            _maybe_remove(store, 'df')
+            df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index())
+            store.append('df',df)
+            tm.assert_frame_equal(store.select('df'),df)
+
+            # partial names
+            _maybe_remove(store, 'df')
+            df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date',None,None]))
+            store.append('df',df)
+            tm.assert_frame_equal(store.select('df'),df)
+
+            # series
+            _maybe_remove(store, 's')
+            s = Series(np.zeros(12), index=make_index(['date',None,None]))
+            store.append('s',s)
+            tm.assert_series_equal(store.select('s'),s)
+
+            # dup with column
+            _maybe_remove(store, 'df')
+            df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','a','t']))
+            self.assertRaises(ValueError, store.append, 'df',df)
+
+            # dup within level
+            _maybe_remove(store, 'df')
+            df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','date','date']))
+            self.assertRaises(ValueError, store.append, 'df',df)
+
+            # fully names
+            _maybe_remove(store, 'df')
+            df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','s','t']))
+            store.append('df',df)
+            tm.assert_frame_equal(store.select('df'),df)
+
     def test_pass_spec_to_storer(self):
 
         df = tm.makeDataFrame()