
Commit da3bf8f

Merge pull request pandas-dev#765 from shashank88/handle_non_str_index
Use astype vs using str accessor to convert to unicode
2 parents 7e3793f + f421fc6 · commit da3bf8f

File tree

2 files changed: +43 -5 lines changed


arctic/serialization/numpy_records.py (+8 -5)
```diff
@@ -278,18 +278,21 @@ def deserialize(self, item, force_bytes_to_unicode=False):
             # of people migrating to py3: https://github.com/manahl/arctic/issues/598
             # This should not be used in a normal flow; instead, write unicode strings
             # if you want to work with str in py3.
-            def convert_pandas_column_to_unicode(col):
-                return col.str.decode('utf-8')
 
             for c in df.select_dtypes(object):
+                # Unlike the index, this conversion does not use astype, because pandas has a bug
+                # where astype converts an object column's values with str(). For bytes, e.g.
+                # b'abc', that yields u"b'abc'", i.e. it includes the b prefix as well! This happens
+                # whenever str() is applied to bytes without an encoding, e.g. str(b'abc') -> "b'abc'";
+                # the fix is to pass the encoding explicitly: str(b'abc', 'utf-8') -> "abc".
                 if type(df[c].iloc[0]) == bytes:
-                    df[c] = convert_pandas_column_to_unicode(df[c])
+                    df[c] = df[c].str.decode('utf-8')
 
             if type(df.index[0]) == bytes:
-                df.index = convert_pandas_column_to_unicode(df.index)
+                df.index = df.index.astype('unicode')
 
             if type(df.columns[0]) == bytes:
-                df.columns = convert_pandas_column_to_unicode(df.columns)
+                df.columns = df.columns.astype('unicode')
 
         return df
```
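Why the patch keeps `.str.decode('utf-8')` for the data columns but switches the index to `astype('unicode')` comes down to the str-on-bytes pitfall described in the in-diff comment. Below is a minimal, self-contained sketch (plain numpy/pandas, no Arctic required) of that behaviour; `Series.astype('unicode')` on object columns varies across pandas versions, so only the stable cases are asserted here.

```python
import numpy as np
import pandas as pd

# str() on bytes without an encoding leaks the repr, which is the bug
# the comment in the diff above describes:
assert str(b'abc') == "b'abc'"          # the b prefix and quotes are included
assert str(b'abc', 'utf-8') == 'abc'    # passing the encoding gives the text

# Data columns: explicit decoding via the .str accessor is unambiguous.
s = pd.Series([b'abc', b'def'])
assert s.str.decode('utf-8').tolist() == ['abc', 'def']

# Index: numpy's bytes-to-unicode astype decodes byte-wise, so a fixed-width
# bytes index (as deserialized from numpy records) converts cleanly:
assert np.array([b'str_type'], dtype='S8').astype('U8').tolist() == ['str_type']
```

The branch name (handle_non_str_index) suggests the practical motivation for the switch: `astype` also works on indexes that the `.str` accessor rejects, since `.str` requires uniformly string-like values.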
tests/integration/store/test_pandas_store.py (+35)
```diff
@@ -1092,3 +1092,38 @@ def test_forced_encodings_with_df(library):
     # Should force everything to be unicode now.
     assert all([type(x) == unicode for x in df_forced_unicode.columns])
     assert all([type(x) == unicode for x in df_forced_unicode.index])
+
+
+def test_forced_encodings_with_df_py3(library):
+    sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
+    unicode_type = unicode if six.PY2 else str
+    # This test covers reading a bytes index in py3; there is nothing to check on py2.
+    if six.PY2:
+        assert True
+        return
+
+    # =================== BEFORE ===================
+    df = pd.DataFrame(sample_data, index=[b'str_type', b'uni_type'])
+    assert type(df['str_col'][0]) == bytes
+    assert type(df['unicode_col'][0]) == unicode_type
+    # Check that all column names are stored as is by pandas.
+    assert all([type(x) == unicode_type for x in df.columns])
+    assert all([type(x) == bytes for x in df.index])
+
+    library.write('dummy', df)
+
+    # =================== READ BACK WITHOUT FORCED ENCODING ===================
+    df_normal = library.read('dummy').data
+    assert type(df_normal['str_col'][0]) == bytes
+    assert type(df_normal['unicode_col'][0]) == unicode_type
+    # Arctic currently converts all column names to unicode_type and keeps the index type as is.
+    assert all([type(x) == unicode_type for x in df_normal.columns])
+    assert all([type(x) == bytes for x in df_normal.index])
+
+    # =================== READ BACK WITH FORCED ENCODING ===================
+    df_forced_unicode = library.read('dummy', force_bytes_to_unicode=True).data
+    assert type(df_forced_unicode['str_col'][0]) == unicode_type
+    assert type(df_forced_unicode['unicode_col'][0]) == unicode_type
+    # Should force everything to be unicode_type now.
+    assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
+    assert all([type(x) == unicode_type for x in df_forced_unicode.index])
```
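For context, here is a hedged end-to-end sketch of the flag this test exercises; the store address and library name are hypothetical, and a running local MongoDB instance is assumed.

```python
import pandas as pd
from arctic import Arctic

store = Arctic('localhost')                 # assumes a local MongoDB instance
store.initialize_library('test.dummy_lib')  # hypothetical library name
library = store['test.dummy_lib']

df = pd.DataFrame({'str_col': [b'a', b'b']}, index=[b'x', b'y'])
library.write('dummy', df)

# Plain read: per the test above, a bytes index survives as bytes.
df_normal = library.read('dummy').data

# force_bytes_to_unicode=True routes through the code patched above:
# object columns are decoded with .str.decode('utf-8'), while bytes index
# and column labels are converted with .astype('unicode').
df_forced = library.read('dummy', force_bytes_to_unicode=True).data
assert all(type(x) == str for x in df_forced.index)  # py3: everything unicode
```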
