Skip to content

BUG: Validate levels in a multi-index before storing in a HDFStore (GH5527) #5634

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 3, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ Bug Fixes
- A zero length series written in Fixed format not deserializing properly.
(:issue:`4708`)
- Fixed decoding perf issue on pyt3 (:issue:`5441`)
- Validate levels in a multi-index before storing (:issue:`5527`)
- Fixed bug in tslib.tz_convert(vals, tz1, tz2): it could raise IndexError
exception while trying to access trans[pos + 1] (:issue:`4496`)
- The ``by`` argument now works correctly with the ``layout`` argument
Expand Down
26 changes: 19 additions & 7 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2688,6 +2688,14 @@ def is_multi_index(self):
""" the levels attribute is 1 or a list in the case of a multi-index """
return isinstance(self.levels,list)

def validate_multiindex(self, obj):
    """Check that the multi-index of ``obj`` can be stored, and flatten it.

    Returns a 2-tuple ``(flat, level_names)``: ``flat`` is
    ``obj.reset_index()`` and ``level_names`` holds one entry per index
    level, using ``level_<position>`` for any level whose name is None.

    Raises ValueError if ``reset_index`` raises ValueError.
    """
    level_names = []
    for position, level_name in enumerate(obj.index.names):
        if level_name is None:
            # unnamed level: fall back to a positional placeholder
            level_name = "level_{0}".format(position)
        level_names.append(level_name)

    try:
        flat = obj.reset_index()
    except ValueError:
        raise ValueError("duplicate names/columns in the multi-index when storing as a table")
    return flat, level_names

@property
def nrows_expected(self):
""" based on our axes, compute the expected nrows """
Expand Down Expand Up @@ -3701,10 +3709,9 @@ class AppendableMultiSeriesTable(AppendableSeriesTable):
def write(self, obj, **kwargs):
    """ we are going to write this as a frame table

    The multi-index is validated and flattened by ``validate_multiindex``;
    the level names it returns are kept on ``self.levels`` and become the
    leading column names, followed by the series name (``'values'`` when
    the series name is falsy).  Remaining keyword arguments are passed
    through to the parent ``write``.
    """
    name = obj.name or 'values'
    # validate first: this both resets the index and supplies the
    # (possibly placeholder) level names, so the index must not be
    # read or reset again afterwards
    obj, self.levels = self.validate_multiindex(obj)
    cols = list(self.levels)
    cols.append(name)
    obj.columns = cols
    return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)

Expand Down Expand Up @@ -3764,6 +3771,7 @@ class AppendableMultiFrameTable(AppendableFrameTable):
# class-level settings for the multi-index frame table
table_type = u('appendable_multiframe')
obj_type = DataFrame
ndim = 2
# matches the positional placeholder names ('level_0', 'level_1', ...)
# given to unnamed index levels; raw string so that \d is the regex
# digit class rather than an invalid string escape
_re_levels = re.compile(r"^level_\d+$")

@property
def table_type_short(self):
Expand All @@ -3774,11 +3782,11 @@ def write(self, obj, data_columns=None, **kwargs):
data_columns = []
elif data_columns is True:
data_columns = obj.columns[:]
for n in obj.index.names:
obj, self.levels = self.validate_multiindex(obj)
for n in self.levels:
if n not in data_columns:
data_columns.insert(0, n)
self.levels = obj.index.names
return super(AppendableMultiFrameTable, self).write(obj=obj.reset_index(), data_columns=data_columns, **kwargs)
return super(AppendableMultiFrameTable, self).write(obj=obj, data_columns=data_columns, **kwargs)

def read(self, columns=None, **kwargs):
if columns is not None:
Expand All @@ -3787,7 +3795,11 @@ def read(self, columns=None, **kwargs):
columns.insert(0, n)
df = super(AppendableMultiFrameTable, self).read(
columns=columns, **kwargs)
df.set_index(self.levels, inplace=True)
df = df.set_index(self.levels)

# remove names for 'level_%d'
df.index = df.index.set_names([ None if self._re_levels.search(l) else l for l in df.index.names ])

return df


Expand Down
45 changes: 45 additions & 0 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1572,6 +1572,51 @@ def test_column_multiindex(self):
store.put('df1',df,format='table')
tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True)

def test_store_multiindex(self):
    """Multi-index level names are validated when appending to a store (GH 5527)."""

    def make_index(names=None):
        # 2 dates x 2 x 3 -> 12 index entries
        tuples = [(datetime.datetime(2013, 12, day), s, t)
                  for day in range(1, 3)
                  for s in range(2)
                  for t in range(3)]
        return MultiIndex.from_tuples(tuples, names=names)

    def make_frame(names=None):
        return DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
                         index=make_index(names))

    with ensure_clean_store(self.path) as store:

        # no level names, then only the first level named: both round-trip
        for names in (None, ['date', None, None]):
            _maybe_remove(store, 'df')
            expected = make_frame(names)
            store.append('df', expected)
            tm.assert_frame_equal(store.select('df'), expected)

        # a series with partially named levels round-trips as well
        _maybe_remove(store, 's')
        series = Series(np.zeros(12), index=make_index(['date', None, None]))
        store.append('s', series)
        tm.assert_series_equal(store.select('s'), series)

        # a level name clashing with a column, then one name reused
        # for every level: both must be rejected
        for names in (['date', 'a', 't'], ['date', 'date', 'date']):
            _maybe_remove(store, 'df')
            clashing = make_frame(names)
            self.assertRaises(ValueError, store.append, 'df', clashing)

        # every level named
        _maybe_remove(store, 'df')
        expected = make_frame(['date', 's', 't'])
        store.append('df', expected)
        tm.assert_frame_equal(store.select('df'), expected)

def test_pass_spec_to_storer(self):

df = tm.makeDataFrame()
Expand Down