Skip to content

Commit 68946db

Browse files
committed
Merge pull request #5634 from jreback/hdf_mi
BUG: Validate levels in a multi-index before storing in a HDFStore (GH5527)
2 parents 1200d9a + 8e59546 commit 68946db

File tree

3 files changed

+65
-7
lines changed

3 files changed

+65
-7
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,7 @@ Bug Fixes
549549
- A zero length series written in Fixed format not deserializing properly.
550550
(:issue:`4708`)
551551
- Fixed decoding perf issue on py3 (:issue:`5441`)
552+
- Validate levels in a multi-index before storing (:issue:`5527`)
552553
- Fixed bug in tslib.tz_convert(vals, tz1, tz2): it could raise IndexError
553554
exception while trying to access trans[pos + 1] (:issue:`4496`)
554555
- The ``by`` argument now works correctly with the ``layout`` argument

pandas/io/pytables.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -2688,6 +2688,14 @@ def is_multi_index(self):
26882688
""" the levels attribute is 1 or a list in the case of a multi-index """
26892689
return isinstance(self.levels,list)
26902690

2691+
def validate_multiindex(self, obj):
    """Flatten a multi-indexed object so that it can be stored as a table.

    Every level of ``obj.index`` needs a name. A level whose name is
    ``None`` is given the placeholder name ``level_<position>``.

    Returns
    -------
    tuple
        ``(obj.reset_index(), levels)`` where ``levels`` is the list of
        level names, with placeholders substituted for unnamed levels.

    Raises
    ------
    ValueError
        If ``obj.reset_index()`` raises ``ValueError``; it is reported as
        duplicate names/columns in the multi-index.
    """
    levels = []
    for position, level_name in enumerate(obj.index.names):
        if level_name is None:
            level_name = "level_{0}".format(position)
        levels.append(level_name)

    try:
        flattened = obj.reset_index()
    except ValueError:
        raise ValueError("duplicate names/columns in the multi-index when storing as a table")
    return flattened, levels
2698+
26912699
@property
26922700
def nrows_expected(self):
26932701
""" based on our axes, compute the expected nrows """
@@ -3701,10 +3709,9 @@ class AppendableMultiSeriesTable(AppendableSeriesTable):
37013709
def write(self, obj, **kwargs):
37023710
""" we are going to write this as a frame table """
37033711
name = obj.name or 'values'
3704-
cols = list(obj.index.names)
3712+
obj, self.levels = self.validate_multiindex(obj)
3713+
cols = list(self.levels)
37053714
cols.append(name)
3706-
self.levels = list(obj.index.names)
3707-
obj = obj.reset_index()
37083715
obj.columns = cols
37093716
return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)
37103717

@@ -3764,6 +3771,7 @@ class AppendableMultiFrameTable(AppendableFrameTable):
37643771
table_type = u('appendable_multiframe')
37653772
obj_type = DataFrame
37663773
ndim = 2
3774+
# Matches the placeholder names ("level_<digits>") given to unnamed index
# levels. Written as a raw string so that ``\d`` reaches the regex engine
# without relying on an invalid str escape (which warns on newer Pythons).
_re_levels = re.compile(r"^level_\d+$")
37673775

37683776
@property
37693777
def table_type_short(self):
@@ -3774,11 +3782,11 @@ def write(self, obj, data_columns=None, **kwargs):
37743782
data_columns = []
37753783
elif data_columns is True:
37763784
data_columns = obj.columns[:]
3777-
for n in obj.index.names:
3785+
obj, self.levels = self.validate_multiindex(obj)
3786+
for n in self.levels:
37783787
if n not in data_columns:
37793788
data_columns.insert(0, n)
3780-
self.levels = obj.index.names
3781-
return super(AppendableMultiFrameTable, self).write(obj=obj.reset_index(), data_columns=data_columns, **kwargs)
3789+
return super(AppendableMultiFrameTable, self).write(obj=obj, data_columns=data_columns, **kwargs)
37823790

37833791
def read(self, columns=None, **kwargs):
37843792
if columns is not None:
@@ -3787,7 +3795,11 @@ def read(self, columns=None, **kwargs):
37873795
columns.insert(0, n)
37883796
df = super(AppendableMultiFrameTable, self).read(
37893797
columns=columns, **kwargs)
3790-
df.set_index(self.levels, inplace=True)
3798+
df = df.set_index(self.levels)
3799+
3800+
# remove names for 'level_%d'
3801+
df.index = df.index.set_names([ None if self._re_levels.search(l) else l for l in df.index.names ])
3802+
37913803
return df
37923804

37933805

pandas/io/tests/test_pytables.py

+45
Original file line numberDiff line numberDiff line change
@@ -1572,6 +1572,51 @@ def test_column_multiindex(self):
15721572
store.put('df1',df,format='table')
15731573
tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True)
15741574

1575+
def test_store_multiindex(self):
    """Store multi-indexed objects whose level names are missing, partial,
    complete or clashing, and check the round trip or the raised error."""

    # GH 5527: multi-index level names are validated before storing
    with ensure_clean_store(self.path) as store:

        def make_index(names=None):
            # 2 dates x 2 x 3 -> 12 index entries
            tuples = [(datetime.datetime(2013, 12, day), s, t)
                      for day in range(1, 3)
                      for s in range(2)
                      for t in range(3)]
            return MultiIndex.from_tuples(tuples, names=names)

        def make_frame(names=None):
            return DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
                             index=make_index(names))

        # no level names at all
        _maybe_remove(store, 'df')
        frame = make_frame()
        store.append('df', frame)
        tm.assert_frame_equal(store.select('df'), frame)

        # only the first level is named
        _maybe_remove(store, 'df')
        frame = make_frame(['date', None, None])
        store.append('df', frame)
        tm.assert_frame_equal(store.select('df'), frame)

        # a series with a partially named index
        _maybe_remove(store, 's')
        series = Series(np.zeros(12), index=make_index(['date', None, None]))
        store.append('s', series)
        tm.assert_series_equal(store.select('s'), series)

        # a level name duplicates a column name
        _maybe_remove(store, 'df')
        frame = make_frame(['date', 'a', 't'])
        self.assertRaises(ValueError, store.append, 'df', frame)

        # the same name is used for several levels
        _maybe_remove(store, 'df')
        frame = make_frame(['date', 'date', 'date'])
        self.assertRaises(ValueError, store.append, 'df', frame)

        # fully named
        _maybe_remove(store, 'df')
        frame = make_frame(['date', 's', 't'])
        store.append('df', frame)
        tm.assert_frame_equal(store.select('df'), frame)
1619+
15751620
def test_pass_spec_to_storer(self):
15761621

15771622
df = tm.makeDataFrame()

0 commit comments

Comments
 (0)