
Commit d1b0dac

Force all levels of multi indexes to unicode
Currently we only convert the index to unicode for plain indexes (e.g. RangeIndex), not for MultiIndexes. This adds handling for that case, along with a test.
1 parent: 5b26664 · commit: d1b0dac
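
For context on why the old check missed this case: indexing into a MultiIndex yields tuples, so the existing type(df.index[0]) == bytes guard was never true for a bytes-valued MultiIndex and the levels were left undecoded. A minimal pandas-only illustration (sample data assumed, not taken from the commit):

import pandas as pd

df = pd.DataFrame({'col': [1, 2]},
                  index=pd.MultiIndex.from_tuples([(b'a', b'x'), (b'b', b'y')]))

# df.index[0] is the tuple (b'a', b'x'), not bytes, so the old guard never fired.
print(type(df.index[0]))                      # <class 'tuple'>
# The bytes live one level down; that is what the new per-level loop targets.
print(type(df.index.get_level_values(0)[0]))  # <class 'bytes'>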

2 files changed: +58 -15 lines changed

arctic/serialization/numpy_records.py (+12 -2)
@@ -288,8 +288,18 @@ def deserialize(self, item, force_bytes_to_unicode=False):
                 if type(df[c].iloc[0]) == bytes:
                     df[c] = df[c].str.decode('utf-8')

-            if type(df.index[0]) == bytes:
-                df.index = df.index.astype('unicode')
+            if isinstance(df.index, MultiIndex):
+                unicode_indexes = []
+                # MultiIndex requires a conversion at each level.
+                for level in range(len(df.index.levels)):
+                    _index = df.index.get_level_values(level)
+                    if isinstance(_index[0], bytes):
+                        _index = _index.astype('unicode')
+                    unicode_indexes.append(_index)
+                df.index = unicode_indexes
+            else:
+                if type(df.index[0]) == bytes:
+                    df.index = df.index.astype('unicode')

             if type(df.columns[0]) == bytes:
                 df.columns = df.index.astype('unicode')
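
The hunk above is the heart of the change; here is a standalone sketch of the same per-level conversion (sample frame assumed, and using an explicit .str.decode('utf-8') in place of the commit's astype('unicode'), purely to make the decoding step visible):

import pandas as pd

df = pd.DataFrame({'col': [1, 2]},
                  index=pd.MultiIndex.from_tuples([(b'ele1', b'uni_type1'),
                                                   (b'ele2', b'uni_type2')]))

# Walk the levels, decode the bytes-valued ones, pass the rest through.
unicode_indexes = []
for level in range(len(df.index.levels)):
    _index = df.index.get_level_values(level)
    if isinstance(_index[0], bytes):
        _index = _index.str.decode('utf-8')  # the commit uses _index.astype('unicode')
    unicode_indexes.append(_index)

# Assigning the list of per-level arrays makes pandas rebuild the MultiIndex.
df.index = unicode_indexes

assert all(isinstance(x, str)
           for level in range(df.index.nlevels)
           for x in df.index.get_level_values(level))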

tests/integration/store/test_pandas_store.py (+46 -13)
@@ -1017,13 +1017,10 @@ def test_mutable_df(library):
     assert read_s.data.__array__().flags['WRITEABLE']


+@pytest.mark.skipif(six.PY3, reason="Skip for Python3")
 def test_forced_encodings_with_df_mixed_types(library):
     sample_data = {'str_col': ['a', 'b'], u'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
     # This is for testing py2 bytes vs unicode serialization issues. Ignoring Py3 for now.
-    if six.PY3:
-        assert True
-        return
-
     # ===================BEFORE===================
     df = pd.DataFrame(sample_data, index=['str_type', u'uni_type'])
     assert type(df['str_col'][0]) == bytes
@@ -1060,13 +1057,10 @@ def test_forced_encodings_with_df_mixed_types(library):
     assert all([type(x) == unicode for x in df_forced_unicode.index])


+@pytest.mark.skipif(six.PY3, reason="Skip for Python3")
 def test_forced_encodings_with_df(library):
     sample_data = {'str_col': ['a', 'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
     # This is for testing py2 bytes vs unicode serialization issues. Ignoring Py3 for now.
-    if six.PY3:
-        assert True
-        return
-
     # ===================BEFORE===================
     df = pd.DataFrame(sample_data, index=['str_type', 'uni_type'])
     assert type(df['str_col'][0]) == bytes
@@ -1094,13 +1088,10 @@ def test_forced_encodings_with_df(library):
     assert all([type(x) == unicode for x in df_forced_unicode.index])


+@pytest.mark.skipif(six.PY2, reason="Skip for Python2")
 def test_forced_encodings_with_df_py3(library):
     sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
-    unicode_type = unicode if six.PY2 else str
-    # This is for testing reading in py3 with bytes index.
-    if six.PY2:
-        assert True
-        return
+    unicode_type = str

     # ===================BEFORE===================
     df = pd.DataFrame(sample_data, index=[b'str_type', b'uni_type'])
@@ -1127,3 +1118,45 @@ def test_forced_encodings_with_df_py3(library):
     # Should force everything to be unicode_type now.
     assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
     assert all([type(x) == unicode_type for x in df_forced_unicode.index])
+
+
+@pytest.mark.skipif(six.PY2, reason="Skip for Python2")
+def test_forced_encodings_with_df_py3_multi_index(library):
+    sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
+    unicode_type = str
+
+    # ===================BEFORE===================
+    multi_index_df = pd.DataFrame(sample_data,
+                                  index=pd.MultiIndex.from_tuples([(b'ele1', b'uni_type1'), (b'ele2', b'uni_type2')]))
+    assert type(multi_index_df['str_col'][0]) == bytes
+    assert type(multi_index_df['unicode_col'][0]) == unicode_type
+    # Check that all column names are stored as is by pandas
+    assert all([type(x) == unicode_type for x in multi_index_df.columns])
+    assert all([
+        type(multi_index_df.index.get_level_values(level)[0]) == bytes
+        for level in range(len(multi_index_df.index.levels))
+    ])
+
+    library.write('dummy', multi_index_df)
+
+    # ===================READ BACK WITHOUT FORCED ENCODING===================
+    df_normal = library.read('dummy').data
+    assert type(df_normal['str_col'][0]) == bytes
+    assert type(df_normal['unicode_col'][0]) == unicode_type
+    # Arctic currently converts all columns to unicode_type and keeps the index type as is
+    assert all([type(x) == unicode_type for x in df_normal.columns])
+    assert all([
+        type(df_normal.index.get_level_values(level)[0]) == bytes
+        for level in range(len(df_normal.index.levels))
+    ])
+
+    # ===================READ BACK WITH FORCED ENCODING===================
+    df_forced_unicode = library.read('dummy', force_bytes_to_unicode=True).data
+    assert type(df_forced_unicode['str_col'][0]) == unicode_type
+    assert type(df_forced_unicode['unicode_col'][0]) == unicode_type
+    # Should force everything to be unicode_type now.
+    assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
+    assert all([
+        type(df_forced_unicode.index.get_level_values(level)[0]) == unicode_type
+        for level in range(len(df_forced_unicode.index.levels))
+    ])
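
The new test drives this end to end through library.read; for reference, a minimal usage sketch against a VersionStore-backed library (the host, library name, and sample frame here are assumptions, not from the commit):

import pandas as pd
from arctic import Arctic

store = Arctic('localhost')                 # assumed MongoDB host
store.initialize_library('user.scratch')    # one-off setup; assumed library name
lib = store['user.scratch']

# A frame with a bytes MultiIndex, along the lines of the test above.
multi_index_df = pd.DataFrame({'str_col': [b'a', b'b']},
                              index=pd.MultiIndex.from_tuples([(b'ele1', b'x'), (b'ele2', b'y')]))

lib.write('dummy', multi_index_df)
item = lib.read('dummy', force_bytes_to_unicode=True)
df = item.data                              # every bytes index level now comes back as unicode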
