Commit 7c4cc4e

Merge pull request pandas-dev#767 from shashank88/v1.79.1
Handle multi indexes when converting to unicode
2 parents: 5b26664 + 2e2ea54

5 files changed: +67 -17 lines

CHANGES.md (+3)

@@ -1,5 +1,8 @@
 ## Changelog
 
+### 1.79.1 (2019-05-03)
+* Bugfix: Pandas convert multiindexes to unicode if flag set.
+
 ### 1.79 (2019-05-02)
 * Bugfix: #765 Pandas data columns require encoding to convert to unicode.
 
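The flag the changelog entry refers to is force_bytes_to_unicode, passed at read time (see the deserialize signature and the tests below). A minimal usage sketch, assuming a MongoDB reachable on localhost and hypothetical library/symbol names:

import pandas as pd
from arctic import Arctic

# Hypothetical store, library and symbol names, for illustration only.
store = Arctic('localhost')
store.initialize_library('example.lib')
library = store['example.lib']

df = pd.DataFrame({'col': [1, 2]},
                  index=pd.MultiIndex.from_tuples([(b'a', b'x'), (b'b', b'y')]))
library.write('example_symbol', df)

# With the flag set, byte strings in the columns and in every
# MultiIndex level are converted to unicode on deserialization.
result = library.read('example_symbol', force_bytes_to_unicode=True).data
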
arctic/serialization/numpy_records.py (+12 -2)

@@ -288,8 +288,18 @@ def deserialize(self, item, force_bytes_to_unicode=False):
                 if type(df[c].iloc[0]) == bytes:
                     df[c] = df[c].str.decode('utf-8')
 
-            if type(df.index[0]) == bytes:
-                df.index = df.index.astype('unicode')
+            if isinstance(df.index, MultiIndex):
+                unicode_indexes = []
+                # MultiIndex requires a conversion at each level.
+                for level in range(len(df.index.levels)):
+                    _index = df.index.get_level_values(level)
+                    if isinstance(_index[0], bytes):
+                        _index = _index.astype('unicode')
+                    unicode_indexes.append(_index)
+                df.index = unicode_indexes
+            else:
+                if type(df.index[0]) == bytes:
+                    df.index = df.index.astype('unicode')
 
             if type(df.columns[0]) == bytes:
                 df.columns = df.index.astype('unicode')

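A standalone sketch of the per-level conversion the patch performs, outside of Arctic, assuming the pandas/NumPy versions arctic targeted at the time (where 'unicode' is the dtype alias for the native str type):

import pandas as pd

# A DataFrame with a bytes-valued MultiIndex, as seen when data written
# under Python 2 is read back under Python 3.
df = pd.DataFrame({'val': [1, 2]},
                  index=pd.MultiIndex.from_tuples([(b'a', b'x'), (b'b', b'y')]))

unicode_indexes = []
for level in range(len(df.index.levels)):
    _index = df.index.get_level_values(level)  # one flat Index per level
    if isinstance(_index[0], bytes):
        _index = _index.astype('unicode')      # convert this level only
    unicode_indexes.append(_index)

# Assigning a list of arrays rebuilds the index as a MultiIndex,
# one level per array.
df.index = unicode_indexes
assert all(type(x) == str for x in df.index.get_level_values(0))
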
docs/releasing.md (+5 -1)

@@ -4,7 +4,11 @@ Package is hosted here: https://pypi.python.org/pypi/arctic/
 
 ## General upload and packaging docs
 
-http://peterdowns.com/posts/first-time-with-pypi.html
+https://realpython.com/pypi-publish-python-package/
+
+The version number is of the format: <MAJOR>.<MINOR>.<BUGFIX>
+For minor bug fixes, increment the BUGFIX number. For new features, increment the MINOR number. MAJOR is only
+for a major (and possibly non backwards compatible) overhaul of arctic.
 
 ## Pre-requisites
 
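For illustration only, a small sketch of a bugfix bump under this scheme (the bump_bugfix helper is hypothetical, not part of arctic):

def bump_bugfix(version):
    """Increment the BUGFIX part of a <MAJOR>.<MINOR>[.<BUGFIX>] version."""
    parts = [int(p) for p in version.split('.')]
    while len(parts) < 3:
        parts.append(0)  # treat '1.79' as '1.79.0'
    parts[2] += 1
    return '.'.join(str(p) for p in parts)

assert bump_bugfix('1.79') == '1.79.1'  # the bump this release makes
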
setup.py (+1 -1)

@@ -65,7 +65,7 @@ def run_tests(self):
 
 setup(
     name="arctic",
-    version="1.80.0",
+    version="1.79.1",
     author="Man AHL Technology",
     author_email="[email protected]",
     description=("AHL Research Versioned TimeSeries and Tick store"),

tests/integration/store/test_pandas_store.py (+46 -13)

@@ -1017,13 +1017,10 @@ def test_mutable_df(library):
     assert read_s.data.__array__().flags['WRITEABLE']
 
 
+@pytest.mark.skipif(six.PY3, reason="Skip for Python3")
 def test_forced_encodings_with_df_mixed_types(library):
     sample_data = {'str_col': ['a', 'b'], u'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
     # This is for testing py2 bytes vs unicode serialization issues. Ignoring Py3 for now.
-    if six.PY3:
-        assert True
-        return
-
     # ===================BEFORE===================
     df = pd.DataFrame(sample_data, index=['str_type', u'uni_type'])
     assert type(df['str_col'][0]) == bytes
@@ -1060,13 +1057,10 @@ def test_forced_encodings_with_df_mixed_types(library):
     assert all([type(x) == unicode for x in df_forced_unicode.index])
 
 
+@pytest.mark.skipif(six.PY3, reason="Skip for Python3")
 def test_forced_encodings_with_df(library):
     sample_data = {'str_col': ['a', 'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
     # This is for testing py2 bytes vs unicode serialization issues. Ignoring Py3 for now.
-    if six.PY3:
-        assert True
-        return
-
     # ===================BEFORE===================
     df = pd.DataFrame(sample_data, index=['str_type', 'uni_type'])
     assert type(df['str_col'][0]) == bytes
@@ -1094,13 +1088,10 @@ def test_forced_encodings_with_df(library):
     assert all([type(x) == unicode for x in df_forced_unicode.index])
 
 
+@pytest.mark.skipif(six.PY2, reason="Skip for Python2")
 def test_forced_encodings_with_df_py3(library):
     sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
-    unicode_type = unicode if six.PY2 else str
-    # This is for testing reading in py3 with bytes index.
-    if six.PY2:
-        assert True
-        return
+    unicode_type = str
 
     # ===================BEFORE===================
     df = pd.DataFrame(sample_data, index=[b'str_type', b'uni_type'])
@@ -1127,3 +1118,45 @@ def test_forced_encodings_with_df_py3(library):
     # Should force everything to be unicode_type now.
     assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
     assert all([type(x) == unicode_type for x in df_forced_unicode.index])
+
+
+@pytest.mark.skipif(six.PY2, reason="Skip for Python2")
+def test_forced_encodings_with_df_py3_multi_index(library):
+    sample_data = {'str_col': [b'a', b'b'], 'unicode_col': [u'a', u'b'], 'int_col': [1, 2]}
+    unicode_type = str
+
+    # ===================BEFORE===================
+    multi_index_df = pd.DataFrame(sample_data,
+                                  index=pd.MultiIndex.from_tuples([(b'ele1', b'uni_type1'), (b'ele2', b'uni_type2')]))
+    assert type(multi_index_df['str_col'][0]) == bytes
+    assert type(multi_index_df['unicode_col'][0]) == unicode_type
+    # Check that all column names are stored as is by pandas.
+    assert all([type(x) == unicode_type for x in multi_index_df.columns])
+    assert all([
+        type(multi_index_df.index.get_level_values(level)[0]) == bytes
+        for level in range(len(multi_index_df.index.levels))
+    ])
+
+    library.write('dummy', multi_index_df)
+
+    # ===================READ BACK WITHOUT FORCED ENCODING===================
+    df_normal = library.read('dummy').data
+    assert type(df_normal['str_col'][0]) == bytes
+    assert type(df_normal['unicode_col'][0]) == unicode_type
+    # Arctic currently converts all columns to unicode_type and keeps the index type as is.
+    assert all([type(x) == unicode_type for x in df_normal.columns])
+    assert all([
+        type(df_normal.index.get_level_values(level)[0]) == bytes
+        for level in range(len(df_normal.index.levels))
+    ])
+
+    # ===================READ BACK WITH FORCED ENCODING===================
+    df_forced_unicode = library.read('dummy', force_bytes_to_unicode=True).data
+    assert type(df_forced_unicode['str_col'][0]) == unicode_type
+    assert type(df_forced_unicode['unicode_col'][0]) == unicode_type
+    # Should force everything to be unicode_type now.
+    assert all([type(x) == unicode_type for x in df_forced_unicode.columns])
+    assert all([
+        type(df_forced_unicode.index.get_level_values(level)[0]) == unicode_type
+        for level in range(len(df_forced_unicode.index.levels))
+    ])

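The refactor of the three pre-existing tests replaces an early return with a skipif mark: pytest then reports the tests as skipped rather than as a misleading pass. A minimal illustration of the pattern:

import pytest
import six

@pytest.mark.skipif(six.PY3, reason="Skip for Python3")
def test_py2_only_behaviour():
    # Runs only under Python 2, where a plain string literal is bytes.
    assert isinstance('abc', bytes)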