Skip to content

BUG: HDFStore.append with encoded string itemsize #11240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,10 @@ Performance Improvements

Bug Fixes
~~~~~~~~~


- Bug in ``HDFStore.append`` with strings whose encoded length exceeded the max unencoded length (:issue:`11234`)




8 changes: 3 additions & 5 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1860,7 +1860,8 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
)

# itemsize is the maximum length of a string (along any dimension)
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
data_converted = _convert_string_array(data, encoding)
itemsize = data_converted.itemsize

# specified min_itemsize?
if isinstance(min_itemsize, dict):
Expand All @@ -1877,10 +1878,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
self.itemsize = itemsize
self.kind = 'string'
self.typ = self.get_atom_string(block, itemsize)
self.set_data(self.convert_string_data(data, itemsize, encoding))

# Thin wrapper: forwards to the module-level _convert_string_array helper.
# Note the argument order differs from this method's signature: the helper is
# called as (data, encoding, itemsize).
# NOTE(review): in this diff view this method appears to be on the removed side
# of the hunk (the new code calls _convert_string_array directly) -- confirm
# against the raw patch.
def convert_string_data(self, data, itemsize, encoding):
return _convert_string_array(data, encoding, itemsize)
self.set_data(data_converted.astype('|S%d' % itemsize, copy=False))

def get_atom_coltype(self, kind=None):
""" return the PyTables column class for this column """
Expand Down
16 changes: 16 additions & 0 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4292,6 +4292,22 @@ def f():

compat_assert_produces_warning(PerformanceWarning, f)


def test_unicode_longer_encoded(self):
# GH 11234
# Round-trip check: a frame containing a non-ASCII character is written to a
# table-format store with encoding='utf-8', read back, and compared with the
# original frame.
# U+0394 (Greek capital delta) is one character but encodes to more than one
# byte in UTF-8, so its encoded length exceeds its unencoded length.
# NOTE(review): this presumes the literal below is a unicode string (Python 3
# or unicode_literals) -- confirm for the Python versions under test.
char = '\u0394'
# Case 1: single column whose only value is the non-ASCII character.
df = pd.DataFrame({'A': [char]})
with ensure_clean_store(self.path) as store:
store.put('df', df, format='table', encoding='utf-8')
result = store.get('df')
tm.assert_frame_equal(result, df)

# Case 2: column 'A' mixes an ASCII value with the non-ASCII character,
# alongside an ASCII-only column 'B'.
df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']})
with ensure_clean_store(self.path) as store:
store.put('df', df, format='table', encoding='utf-8')
result = store.get('df')
tm.assert_frame_equal(result, df)

def test_store_datetime_mixed(self):

df = DataFrame(
Expand Down