From 90f63b08071a149d47c42c30401d831d1e5a1092 Mon Sep 17 00:00:00 2001 From: Mali Akmanalp Date: Mon, 22 May 2017 17:06:01 -0700 Subject: [PATCH 1/2] BUG: Handle numpy strings in index names in HDF5 #13492 --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/io/pytables.py | 2 ++ pandas/tests/io/test_pytables.py | 23 ++++++++++++++++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 379249b6e55d6..7bca4174da297 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -70,6 +70,7 @@ I/O - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) - Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) - Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`) +- Bug where ``pd.read_hdf()`` returns numpy strings for index names (:issue:`13492`) - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b838260d1f73c..9b2f7a677853a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2568,6 +2568,8 @@ def read_index_node(self, node, start=None, stop=None): if 'name' in node._v_attrs: name = node._v_attrs.name + if isinstance(name, compat.string_types): + name = compat.text_type(name) index_class = self._alias_to_class(getattr(node._v_attrs, 'index_class', '')) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index ae14f74ece31c..040345db83c2b 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -16,7 +16,7 @@ date_range, timedelta_range, Index, DatetimeIndex, isnull) -from pandas.compat import is_platform_windows, PY3, PY35, BytesIO +from pandas.compat import is_platform_windows, PY3, PY35, BytesIO, text_type from pandas.io.formats.printing import pprint_thing tables = pytest.importorskip('tables') @@ -2920,6 +2920,27 @@ def test_store_index_name_with_tz(self): recons = store['frame'] tm.assert_frame_equal(recons, df) + @pytest.mark.parametrize('table_format', ['table', 'fixed']) + def test_store_index_name_numpy_str(self, table_format): + # GH #13492 + idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1), + datetime.date(2000, 1, 2)]), + name=u('cols\u05d2')) + idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1), + datetime.date(2010, 1, 2)]), + name=u('rows\u05d0')) + df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) + + # This used to fail, returning numpy strings instead of python strings. + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format=table_format) + df2 = read_hdf(path, 'df') + + assert_frame_equal(df, df2, check_names=True) + + assert type(df2.index.name) == text_type + assert type(df2.columns.name) == text_type + def test_store_series_name(self): df = tm.makeDataFrame() series = df['A'] From 70a55ece6e20ee33d3f0a2666beabc58bbadd1c3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Jun 2017 19:59:00 -0500 Subject: [PATCH 2/2] REF: refactor to _ensure_str --- pandas/io/pytables.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9b2f7a677853a..4a1b12414bcc5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -73,6 +73,18 @@ def _ensure_encoding(encoding): return encoding +def _ensure_str(name): + """Ensure that an index / column name is a str (python 3) or + unicode (python 2); otherwise they may be np.string dtype. + Non-string dtypes are passed through unchanged. + + https://github.com/pandas-dev/pandas/issues/13492 + """ + if isinstance(name, compat.string_types): + name = compat.text_type(name) + return name + + Term = Expr @@ -2567,9 +2579,7 @@ def read_index_node(self, node, start=None, stop=None): name = None if 'name' in node._v_attrs: - name = node._v_attrs.name - if isinstance(name, compat.string_types): - name = compat.text_type(name) + name = _ensure_str(node._v_attrs.name) index_class = self._alias_to_class(getattr(node._v_attrs, 'index_class', ''))