
Commit da3bf8f

Merge pull request pandas-dev#765 from shashank88/handle_non_str_index
Use astype vs using str accessor to convert to unicode
2 parents 7e3793f + f421fc6 · commit da3bf8f

File tree

2 files changed: +43 -5 lines changed


arctic/serialization/numpy_records.py (+8 -5)
```diff
@@ -278,18 +278,21 @@ def deserialize(self, item, force_bytes_to_unicode=False):
             # of people migrating to py3: https://github.com/manahl/arctic/issues/598
             # This should not be used in a normal flow; instead, write unicode strings
             # if you want to work with str in py3.
-            def convert_pandas_column_to_unicode(col):
-                return col.str.decode('utf-8')
 
             for c in df.select_dtypes(object):
+                # Unlike the index, this conversion does not use astype, because pandas has a bug
+                # where astype converts an object column's values with str(). For bytes, e.g.
+                # b'abc', that yields u"b'abc'", i.e. it includes the b prefix as well! This happens
+                # whenever str() is applied to bytes without an encoding, e.g. str(b'abc') -> "b'abc'";
+                # the fix is to pass the encoding explicitly: str(b'abc', 'utf-8') -> "abc".
                 if type(df[c].iloc[0]) == bytes:
-                    df[c] = convert_pandas_column_to_unicode(df[c])
+                    df[c] = df[c].str.decode('utf-8')
 
             if type(df.index[0]) == bytes:
-                df.index = convert_pandas_column_to_unicode(df.index)
+                df.index = df.index.astype('unicode')
 
             if type(df.columns[0]) == bytes:
-                df.columns = convert_pandas_column_to_unicode(df.columns)
+                df.columns = df.columns.astype('unicode')
 
         return df
```
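Why the patch keeps `.str.decode('utf-8')` for the data columns but switches the index to `astype('unicode')` comes down to the str-on-bytes pitfall described in the in-diff comment. Below is a minimal, self-contained sketch (plain numpy/pandas, no Arctic required) of that behaviour; `Series.astype('unicode')` on object columns varies across pandas versions, so only the stable cases are asserted here.

```python
import numpy as np
import pandas as pd

# str() on bytes without an encoding leaks the repr, which is the bug
# the comment in the diff above describes:
assert str(b'abc') == "b'abc'"          # the b prefix and quotes are included
assert str(b'abc', 'utf-8') == 'abc'    # passing the encoding gives the text

# Data columns: explicit decoding via the .str accessor is unambiguous.
s = pd.Series([b'abc', b'def'])
assert s.str.decode('utf-8').tolist() == ['abc', 'def']

# Index: numpy's bytes-to-unicode astype decodes byte-wise, so a fixed-width
# bytes index (as deserialized from numpy records) converts cleanly:
assert np.array([b'str_type'], dtype='S8').astype('U8').tolist() == ['str_type']
```

The branch name (handle_non_str_index) suggests the practical motivation for the switch: `astype` also works on indexes that the `.str` accessor rejects, since `.str` requires uniformly string-like values.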
tests/integration/store/test_pandas_store.py (+35)
```diff
@@ -1092,3 +1092,38 @@ def test_forced_encodings_with_df(library):
     # Should force everything to be unicode now.
     assert all([type(x) == unicode for x in df_forced_unicode.columns])
     assert all([type(x) == unicode for x in df_forced_unicode.index])
+
+
+def test_forced_encodings_with_df_py3(library):
+    sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
+    unicode_type = unicode if six.PY2 else str
+    # This test covers reading a bytes index in py3; there is nothing to check on py2.
+    if six.PY2:
+        assert True
+        return
+
+    # =================== BEFORE ===================
+    df = pd.DataFrame(sample_data, index=[b'str_type', b'uni_type'])
+    assert type(df['str_col'][0]) == bytes
+    assert type(df['unicode_col'][0]) == unicode_type
+    # Check that all column names are stored as is by pandas.
+    assert all([type(x) == unicode_type for x in df.columns])
+    assert all([type(x) == bytes for x in df.index])
+
+    library.write('dummy', df)
+
+    # =================== READ BACK WITHOUT FORCED ENCODING ===================
+    df_normal = library.read('dummy').data
+    assert type(df_normal['str_col'][0]) == bytes
+    assert type(df_normal['unicode_col'][0]) == unicode_type
+    # Arctic currently converts all column names to unicode_type and keeps the index type as is.
+    assert all([type(x) == unicode_type for x in df_normal.columns])
+    assert all([type(x) == bytes for x in df_normal.index])
+
+    # =================== READ BACK WITH FORCED ENCODING ===================
+    df_forced_unicode = library.read('dummy', force_bytes_to_unicode=True).data
+    assert type(df_forced_unicode['str_col'][0]) == unicode_type
+    assert type(df_forced_unicode['unicode_col'][0]) == unicode_type
+    # Should force everything to be unicode_type now.
+    assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
+    assert all([type(x) == unicode_type for x in df_forced_unicode.index])
```
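For context, here is a hedged end-to-end sketch of the flag this test exercises; the store address and library name are hypothetical, and a running local MongoDB instance is assumed.

```python
import pandas as pd
from arctic import Arctic

store = Arctic('localhost')                 # assumes a local MongoDB instance
store.initialize_library('test.dummy_lib')  # hypothetical library name
library = store['test.dummy_lib']

df = pd.DataFrame({'str_col': [b'a', b'b']}, index=[b'x', b'y'])
library.write('dummy', df)

# Plain read: per the test above, a bytes index survives as bytes.
df_normal = library.read('dummy').data

# force_bytes_to_unicode=True routes through the code patched above:
# object columns are decoded with .str.decode('utf-8'), while bytes index
# and column labels are converted with .astype('unicode').
df_forced = library.read('dummy', force_bytes_to_unicode=True).data
assert all(type(x) == str for x in df_forced.index)  # py3: everything unicode
```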
