diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 736554672a089..f9ae7c32e956c 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -42,3 +42,10 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + + +- Bug in ``HDFStore.append`` with strings whose encoded length exceeded the max unencoded length (:issue:`11234`) + + + + diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ac2358cb3d231..4de641bb67926 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1860,7 +1860,8 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, ) # itemsize is the maximum length of a string (along any dimension) - itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + data_converted = _convert_string_array(data, encoding) + itemsize = data_converted.itemsize # specified min_itemsize? if isinstance(min_itemsize, dict): @@ -1877,10 +1878,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, self.itemsize = itemsize self.kind = 'string' self.typ = self.get_atom_string(block, itemsize) - self.set_data(self.convert_string_data(data, itemsize, encoding)) - - def convert_string_data(self, data, itemsize, encoding): - return _convert_string_array(data, encoding, itemsize) + self.set_data(data_converted.astype('|S%d' % itemsize, copy=False)) def get_atom_coltype(self, kind=None): """ return the PyTables column class for this column """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index df2a659100305..167170f7cd7c5 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4292,6 +4292,22 @@ def f(): compat_assert_produces_warning(PerformanceWarning, f) + + def test_unicode_longer_encoded(self): + # GH 11234 + char = '\u0394' + df = pd.DataFrame({'A': [char]}) + with ensure_clean_store(self.path) as store: + store.put('df', df, format='table', encoding='utf-8') + result = 
store.get('df') + tm.assert_frame_equal(result, df) + + df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']}) + with ensure_clean_store(self.path) as store: + store.put('df', df, format='table', encoding='utf-8') + result = store.get('df') + tm.assert_frame_equal(result, df) + def test_store_datetime_mixed(self): df = DataFrame(