Skip to content

Commit 26db172

Browse files
TomAugspurger authored and jreback committed
BUG: HDFStore.append with encoded string itemsize, #11234
The failure occurred when the maximum length of the unencoded strings was smaller than their maximum encoded length.
1 parent 924c419 commit 26db172

File tree

3 files changed

+20
-6
lines changed

3 files changed

+20
-6
lines changed

doc/source/whatsnew/v0.17.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Bug Fixes
4444
~~~~~~~~~
4545

4646
- Bug in ``.to_latex()`` output broken when the index has a name (:issue:`10660`)
47-
47+
- Bug in ``HDFStore.append`` with strings whose encoded length exceeded the max unencoded length (:issue:`11234`)
4848

4949

5050

pandas/io/pytables.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -1860,7 +1860,8 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
18601860
)
18611861

18621862
# itemsize is the maximum length of a string (along any dimension)
1863-
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
1863+
data_converted = _convert_string_array(data, encoding)
1864+
itemsize = data_converted.itemsize
18641865

18651866
# specified min_itemsize?
18661867
if isinstance(min_itemsize, dict):
@@ -1877,10 +1878,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
18771878
self.itemsize = itemsize
18781879
self.kind = 'string'
18791880
self.typ = self.get_atom_string(block, itemsize)
1880-
self.set_data(self.convert_string_data(data, itemsize, encoding))
1881-
1882-
def convert_string_data(self, data, itemsize, encoding):
1883-
return _convert_string_array(data, encoding, itemsize)
1881+
self.set_data(data_converted.astype('|S%d' % itemsize, copy=False))
18841882

18851883
def get_atom_coltype(self, kind=None):
18861884
""" return the PyTables column class for this column """

pandas/io/tests/test_pytables.py

+16
Original file line numberDiff line numberDiff line change
@@ -4292,6 +4292,22 @@ def f():
42924292

42934293
compat_assert_produces_warning(PerformanceWarning, f)
42944294

4295+
4296+
def test_unicode_longer_encoded(self):
4297+
# GH 11234
4298+
char = '\u0394'
4299+
df = pd.DataFrame({'A': [char]})
4300+
with ensure_clean_store(self.path) as store:
4301+
store.put('df', df, format='table', encoding='utf-8')
4302+
result = store.get('df')
4303+
tm.assert_frame_equal(result, df)
4304+
4305+
df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']})
4306+
with ensure_clean_store(self.path) as store:
4307+
store.put('df', df, format='table', encoding='utf-8')
4308+
result = store.get('df')
4309+
tm.assert_frame_equal(result, df)
4310+
42954311
def test_store_datetime_mixed(self):
42964312

42974313
df = DataFrame(

0 commit comments

Comments
 (0)