Skip to content

Commit 8273356

Browse files
committed
BUG: invalid column names in a HDF5 table format
Have DataFrame.to_hdf() raise an error when using pytables with non-string column types. Fixes #9057
1 parent 08d60e6 commit 8273356

File tree

3 files changed

+44
-2
lines changed

3 files changed

+44
-2
lines changed

doc/source/whatsnew/v0.17.0.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ Bug Fixes
7373
- Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties returning ``np.int`` type, not built-in ``int``. (:issue:`10050`)
7474
- Bug in ``NaT`` raising ``AttributeError`` when accessing the ``daysinmonth`` and ``dayofweek`` properties. (:issue:`10096`)
7575

76-
76+
- Bug in ``DataFrame.to_hdf()`` where table format would raise a seemingly unrelated error for invalid (non-string) column names. This is now explicitly forbidden. (:issue:`9057`)
7777
- Bug in getting timezone data with ``dateutil`` on various platforms (:issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)
7878
- Bug in displaying datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. (:issue:`10170`)
7979

@@ -94,4 +94,3 @@ Bug Fixes
9494

9595

9696
- Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`)
97-

pandas/io/pytables.py

+14
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ def _tables():
257257
def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
258258
append=None, **kwargs):
259259
""" store this object, close it if we opened it """
260+
260261
if append:
261262
f = lambda store: store.append(key, value, **kwargs)
262263
else:
@@ -1535,6 +1536,12 @@ def maybe_set_size(self, min_itemsize=None, **kwargs):
15351536
self.typ = _tables(
15361537
).StringCol(itemsize=min_itemsize, pos=self.pos)
15371538

1539+
def validate(self, handler, append, **kwargs):
    """Run the pre-write checks for this column.

    At present the only check is the one performed by
    ``validate_names``. ``handler``, ``append`` and any extra keyword
    arguments are accepted but not used by this method.
    """
    self.validate_names()
1541+
1542+
def validate_names(self):
    """Hook for checking this column's labels.

    This version performs no checks and returns ``None``.
    """
1544+
15381545
def validate_and_set(self, handler, append, **kwargs):
15391546
self.set_table(handler.table)
15401547
self.validate_col()
@@ -2080,6 +2087,10 @@ class DataIndexableCol(DataCol):
20802087
""" represent a data column that can be indexed """
20812088
is_data_indexable = True
20822089

2090+
def validate_names(self):
    """Reject labels that do not form an object-dtype ``Index``.

    Raises
    ------
    ValueError
        If ``Index(self.values)`` is not an object index.
    """
    labels = Index(self.values)
    if labels.is_object():
        return
    raise ValueError("cannot have non-object label DataIndexableCol")
2093+
20832094
def get_atom_string(self, block, itemsize):
20842095
return _tables().StringCol(itemsize=itemsize)
20852096

@@ -3756,6 +3767,9 @@ def write(self, obj, axes=None, append=False, complib=None,
37563767
min_itemsize=min_itemsize,
37573768
**kwargs)
37583769

3770+
for a in self.axes:
3771+
a.validate(self, append)
3772+
37593773
if not self.is_exists:
37603774

37613775
# create the table

pandas/io/tests/test_pytables.py

+29
Original file line numberDiff line numberDiff line change
@@ -4640,6 +4640,35 @@ def test_colums_multiindex_modified(self):
46404640
df_loaded = read_hdf(path, 'df', columns=cols2load)
46414641
self.assertTrue(cols2load_original == cols2load)
46424642

4643+
def test_to_hdf_with_object_column_names(self):
    """Check ``to_hdf`` table format against column-index types (GH9057).

    Index types in ``types_should_fail`` must raise ``ValueError`` with
    the DataIndexableCol message. Those in ``types_should_run`` must
    write and read back with at least one row.
    """
    # GH9057
    # Writing HDF5 table format should only work for string-like
    # column types

    types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
                         tm.makeDateIndex, tm.makeTimedeltaIndex,
                         tm.makePeriodIndex]
    types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]

    # Which list the unicode index belongs to depends on the Python
    # major version.
    if compat.PY3:
        types_should_run.append(tm.makeUnicodeIndex)
    else:
        types_should_fail.append(tm.makeUnicodeIndex)

    for index in types_should_fail:
        df = DataFrame(np.random.randn(10, 2), columns=index(2))
        with ensure_clean_path(self.path) as path:
            # assertRaises(..., msg=...) does not check the exception
            # text; match it explicitly so an unrelated ValueError
            # cannot make this test pass.
            with self.assertRaisesRegexp(
                    ValueError,
                    "cannot have non-object label DataIndexableCol"):
                df.to_hdf(path, 'df', format='table', data_columns=True)

    for index in types_should_run:
        df = DataFrame(np.random.randn(10, 2), columns=index(2))
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table', data_columns=True)
            result = pd.read_hdf(
                path, 'df',
                where="index = [{0}]".format(df.index[0]))
            # Use the TestCase assertion rather than a bare assert,
            # which is stripped under ``python -O``.
            self.assertTrue(len(result))
4671+
46434672

46444673
def _test_sort(obj):
46454674
if isinstance(obj, DataFrame):

0 commit comments

Comments
 (0)