
Commit f421fc6

committed
Use astype vs using str accessor to convert to unicode
When read back with py3, the str accessor in pandas 0.22.0 does not work on indexes whose data is all of bytes type; astype works for all of these cases. A test for reading under py3 is added as well.
1 parent 7e3793f commit f421fc6
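For context, here is a minimal sketch (not part of the commit) of the index behavior the message describes, assuming pandas 0.22.0 under py3; the index values are hypothetical:

import pandas as pd

# A bytes-valued index, as produced when data written from py2 is read back in py3.
idx = pd.Index([b'row1', b'row2'])

# Per the commit message, idx.str.decode('utf-8') does not work here under
# pandas 0.22.0 on py3 when every element is bytes, while astype does:
unicode_idx = idx.astype('unicode')  # the conversion this commit switches to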

File tree

2 files changed: +43 −5 lines changed

arctic/serialization/numpy_records.py

+8-5
@@ -278,18 +278,21 @@ def deserialize(self, item, force_bytes_to_unicode=False):
             # of people migrating to py3. # https://github.com/manahl/arctic/issues/598
             # This should not be used in a normal flow; instead, write unicode strings
             # if you want to work with str in py3.
-            def convert_pandas_column_to_unicode(col):
-                return col.str.decode('utf-8')

             for c in df.select_dtypes(object):
+                # The conversion does not use astype (as the index does) because pandas has a bug where it
+                # converts the data columns to unicode via str(); the objects here are bytes, e.g. b'abc',
+                # so the result is u"b'abc'", i.e. it includes the b prefix and quotes as well! This generally
+                # happens when bytes are converted with str() without specifying an encoding, e.g.
+                # str(b'abc') -> "b'abc'"; the fix is to pass the encoding, i.e. str(b'abc', 'utf-8') -> "abc".
                 if type(df[c].iloc[0]) == bytes:
-                    df[c] = convert_pandas_column_to_unicode(df[c])
+                    df[c] = df[c].str.decode('utf-8')

             if type(df.index[0]) == bytes:
-                df.index = convert_pandas_column_to_unicode(df.index)
+                df.index = df.index.astype('unicode')

             if type(df.columns[0]) == bytes:
-                df.columns = convert_pandas_column_to_unicode(df.columns)
+                df.columns = df.columns.astype('unicode')

         return df
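For reference, the str() behavior the new comment describes is plain Python 3, independent of pandas or arctic:

# str() without an encoding returns the repr of the bytes object:
str(b'abc')             # "b'abc'"  (keeps the b prefix and the quotes)
# passing the encoding decodes it properly:
str(b'abc', 'utf-8')    # "abc"
# equivalently, which is what Series.str.decode('utf-8') applies element-wise:
b'abc'.decode('utf-8')  # "abc"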

tests/integration/store/test_pandas_store.py

+35
@@ -1092,3 +1092,38 @@ def test_forced_encodings_with_df(library):
     # Should force everything to be unicode now.
     assert all([type(x) == unicode for x in df_forced_unicode.columns])
     assert all([type(x) == unicode for x in df_forced_unicode.index])
+
+
+def test_forced_encodings_with_df_py3(library):
+    sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
+    unicode_type = unicode if six.PY2 else str
+    # This test is for reading data with a bytes index back in py3.
+    if six.PY2:
+        assert True
+        return
+
+    # ===================BEFORE===================
+    df = pd.DataFrame(sample_data, index=[b'str_type', b'uni_type'])
+    assert type(df['str_col'][0]) == bytes
+    assert type(df['unicode_col'][0]) == unicode_type
+    # Check that all column names are stored as-is by pandas
+    assert all([type(x) == unicode_type for x in df.columns])
+    assert all([type(x) == bytes for x in df.index])
+
+    library.write('dummy', df)
+
+    # ===================READ BACK WITHOUT FORCED ENCODING===================
+    df_normal = library.read('dummy').data
+    assert type(df_normal['str_col'][0]) == bytes
+    assert type(df_normal['unicode_col'][0]) == unicode_type
+    # Arctic currently converts all columns to unicode_type and keeps the index type as-is
+    assert all([type(x) == unicode_type for x in df_normal.columns])
+    assert all([type(x) == bytes for x in df_normal.index])
+
+    # ===================READ BACK WITH FORCED ENCODING===================
+    df_forced_unicode = library.read('dummy', force_bytes_to_unicode=True).data
+    assert type(df_forced_unicode['str_col'][0]) == unicode_type
+    assert type(df_forced_unicode['unicode_col'][0]) == unicode_type
+    # Should force everything to be unicode_type now.
+    assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
+    assert all([type(x) == unicode_type for x in df_forced_unicode.index])
