
Commit 4dc5867

Merge pull request pandas-dev#705 from shashank88/sortlevel
Fixes build breakage due to pandas 0.24.0 upgrade
2 parents a55ea28 + a56b4aa commit 4dc5867
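
Background: pandas 0.24.0 removed DataFrame.sortlevel, which had been deprecated since pandas 0.20 in favour of sort_index(level=...); arctic still called it, hence the build breakage. A minimal illustrative sketch of the failure and the fix (not code from this repo):

import pandas as pd

# A small two-level MultiIndex frame, deliberately out of order.
idx = pd.MultiIndex.from_tuples(
    [('2012-10-08', 'SPAM Index'), ('2012-09-08', 'SPAM Index')],
    names=['index 1', 'index 2'])
df = pd.DataFrame({'near': [2.0, 1.0]}, index=idx)

# df = df.sortlevel(level=0)   # AttributeError on pandas >= 0.24.0
df = df.sort_index(level=0)    # the replacement API, valid on old and new pandas
print(df)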

File tree

6 files changed: +113 -66 lines changed

arctic/multi_index.py

+2 -2

@@ -65,7 +65,7 @@ def fancy_group_by(df, grouping_level=0, aggregate_level=1, method='last', max_=
     # to work properly. We can check the sortdepth to see if this is in fact the case and resort if necessary.
     # TODO: this might need tweaking if the levels are around the wrong way
     if df.index.lexsort_depth < (aggregate_level + 1):
-        df = df.sortlevel(level=grouping_level)
+        df = df.sort_index(level=grouping_level)
 
     gb = df.groupby(level=grouping_level)
     if method == 'last':
@@ -115,7 +115,7 @@ def multi_index_insert_row(df, index_row, values_row):
         # We've just appended a row to an already-sorted dataframe
         return df
     # The df wasn't sorted or the row has to be put in the middle somewhere
-    return df.sortlevel()
+    return df.sort_index()
 
 
 def insert_at(df, sample_date, values):
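
For reference, the guard in the first hunk relies on MultiIndex.lexsort_depth, which reports how many leading index levels are lexically sorted, so the frame is only re-sorted when needed. A small illustration on the pandas versions this PR targets (lexsort_depth was deprecated in later pandas; this is not repo code):

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [('b', 2), ('a', 1), ('a', 2)], names=['grp', 'agg'])
df = pd.DataFrame({'x': [1, 2, 3]}, index=idx)

print(df.index.lexsort_depth)               # 0: no leading level is sorted
print(df.sort_index().index.lexsort_depth)  # 2: both levels sorted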

arctic/serialization/incremental.py

+5 -5 (whitespace-only change; indentation below is reconstructed)

@@ -148,11 +148,11 @@ def _calculate_rows_per_chunk(max_chunk_size, chunk):
     sze = sze if sze < max_chunk_size else max_chunk_size
     rows_per_chunk = int(max_chunk_size / sze)
     if rows_per_chunk < 1 and ARCTIC_AUTO_EXPAND_CHUNK_SIZE:
-            # If a row size is larger than chunk_size, use the maximum document size
-            logging.warning('Chunk size of {} is too small to fit a row ({}). '
-                            'Using maximum document size.'.format(max_chunk_size, MAX_DOCUMENT_SIZE))
-            # For huge rows, fall-back to using a very large document size, less than max-allowed by MongoDB
-            rows_per_chunk = int(MAX_DOCUMENT_SIZE / sze)
+        # If a row size is larger than chunk_size, use the maximum document size
+        logging.warning('Chunk size of {} is too small to fit a row ({}). '
+                        'Using maximum document size.'.format(max_chunk_size, MAX_DOCUMENT_SIZE))
+        # For huge rows, fall-back to using a very large document size, less than max-allowed by MongoDB
+        rows_per_chunk = int(MAX_DOCUMENT_SIZE / sze)
     if rows_per_chunk < 1:
         raise ArcticSerializationException("Serialization failed to split data into max sized chunks.")
     return rows_per_chunk

arctic/store/_ndarray_store.py

+1 -1 (whitespace-only change; indentation below is reconstructed)

@@ -686,7 +686,7 @@ def _do_write(self, collection, version, symbol, item, previous_version, segment
         set_spec['$set'] = segment
         bulk.append(pymongo.UpdateOne(segment_spec, set_spec, upsert=True))
     elif ARCTIC_FORWARD_POINTERS_CFG is FwPointersCfg.HYBRID:
-        bulk.append(pymongo.UpdateOne(segment_spec, set_spec))
+        bulk.append(pymongo.UpdateOne(segment_spec, set_spec))
     # With FwPointersCfg.ENABLED we make zero updates on existing segment documents, but:
     # - write only the new segment(s) documents
     # - write the new version document

tests/integration/store/test_bitemporal_store.py

+54 -40

@@ -10,7 +10,7 @@
 from pandas.util.testing import assert_frame_equal
 
 from arctic.date._mktz import mktz
-from tests.util import read_str_as_pandas
+from tests.util import read_str_as_pandas, multi_index_df_from_arrs
 
 pytest_plugins = ['arctic.fixtures.arctic']
 
@@ -211,55 +211,69 @@ def test_bitemporal_store_read_as_of_timezone(bitemporal_library):
 
 
 def test_multi_index_ts_read_write(bitemporal_library):
-    ts = read_str_as_pandas(""" index 1 | index 2 | near
-                   2012-09-08 17:06:11.040 | SPAM Index | 1.0
-                   2012-10-08 17:06:11.040 | SPAM Index | 2.0
-                   2012-10-09 17:06:11.040 | SPAM Index | 2.5
-                   2012-11-08 17:06:11.040 | SPAM Index | 3.0""", num_index=2)
+    ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2'),
+        index_arrs=[
+            ['2012-09-08 17:06:11.040', '2012-10-08 17:06:11.040', '2012-10-09 17:06:11.040', '2012-11-08 17:06:11.040'],
+            ['SPAM Index'] * 4
+        ],
+        data_dict={'near': [1.0, 2.0, 2.5, 3.0]}
+    )
     bitemporal_library.update('spam', ts)
     assert_frame_equal(ts, bitemporal_library.read('spam').data)
 
 
 def test_multi_index_ts_read_raw(bitemporal_library):
-    ts = read_str_as_pandas(""" index 1 | index 2 | near
-                   2012-09-08 17:06:11.040 | SPAM Index | 1.0
-                   2012-10-08 17:06:11.040 | SPAM Index | 2.0
-                   2012-10-09 17:06:11.040 | SPAM Index | 2.5
-                   2012-11-08 17:06:11.040 | SPAM Index | 3.0""", num_index=2)
-
-    expected_ts = read_str_as_pandas(""" index 1 | index 2 | observed_dt | near
-                   2012-09-08 17:06:11.040 | SPAM Index | 2015-01-01 | 1.0
-                   2012-10-08 17:06:11.040 | SPAM Index | 2015-01-01 | 2.0
-                   2012-10-09 17:06:11.040 | SPAM Index | 2015-01-01 | 2.5
-                   2012-11-08 17:06:11.040 | SPAM Index | 2015-01-01 | 3.0""", num_index=3)
+    ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2'),
+        index_arrs=[
+            ['2012-09-08 17:06:11.040', '2012-10-08 17:06:11.040', '2012-10-09 17:06:11.040', '2012-11-08 17:06:11.040'],
+            ['SPAM Index'] * 4
+        ],
+        data_dict={'near': [1.0, 2.0, 2.5, 3.0]}
+    )
+
+    expected_ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2', 'observed_dt'),
+        index_arrs=[
+            ['2012-09-08 17:06:11.040', '2012-10-08 17:06:11.040', '2012-10-09 17:06:11.040', '2012-11-08 17:06:11.040'],
+            ['SPAM Index'] * 4,
+            ['2015-01-01'] * 4,
+        ],
+        data_dict={'near': [1.0, 2.0, 2.5, 3.0]}
+    )
     bitemporal_library.update('spam', ts, as_of=dt(2015, 1, 1))
     assert_frame_equal(expected_ts.tz_localize(tz=LOCAL_TZ, level=2), bitemporal_library.read('spam', raw=True).data)
 
 
 def test_multi_index_update(bitemporal_library):
-    ts = read_str_as_pandas(""" index 1 | index 2 | near
-                   2012-09-08 17:06:11.040 | SPAM Index | 1.0
-                   2012-09-08 17:06:11.040 | EGG Index | 1.1
-                   2012-10-08 17:06:11.040 | SPAM Index | 2.0
-                   2012-10-08 17:06:11.040 | EGG Index | 2.1
-                   2012-10-09 17:06:11.040 | SPAM Index | 2.5
-                   2012-10-09 17:06:11.040 | EGG Index | 2.6
-                   2012-11-08 17:06:11.040 | SPAM Index | 3.0
-                   2012-11-08 17:06:11.040 | EGG Index | 3.1""", num_index=2)
-    ts2 = read_str_as_pandas(""" index 1 | index 2 | near
-                   2012-09-08 17:06:11.040 | SPAM Index | 1.2
-                   2012-09-08 17:06:11.040 | EGG Index | 1.6
-                   2012-12-08 17:06:11.040 | SPAM Index | 4.0""", num_index=2)
-    expected_ts = read_str_as_pandas(""" index 1 | index 2 | near
-                   2012-09-08 17:06:11.040 | EGG Index | 1.6
-                   2012-09-08 17:06:11.040 | SPAM Index | 1.2
-                   2012-10-08 17:06:11.040 | EGG Index | 2.1
-                   2012-10-08 17:06:11.040 | SPAM Index | 2.0
-                   2012-10-09 17:06:11.040 | EGG Index | 2.6
-                   2012-10-09 17:06:11.040 | SPAM Index | 2.5
-                   2012-11-08 17:06:11.040 | EGG Index | 3.1
-                   2012-11-08 17:06:11.040 | SPAM Index | 3.0
-                   2012-12-08 17:06:11.040 | SPAM Index | 4.0""", num_index=2)
+    sample_timerange = list(sorted(['2012-09-08 17:06:11.040', '2012-10-08 17:06:11.040', '2012-10-09 17:06:11.040', '2012-11-08 17:06:11.040'] * 2))
+    ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2'),
+        index_arrs=[
+            sample_timerange,
+            ['SPAM Index', 'EGG Index'] * 4
+        ],
+        data_dict={'near': [1.0, 1.1, 2.0, 2.1, 2.5, 2.6, 3.0, 3.1]}
+    )
+
+    ts2 = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2'),
+        index_arrs=[
+            ['2012-09-08 17:06:11.040', '2012-09-08 17:06:11.040', '2012-12-08 17:06:11.040'],
+            ['SPAM Index', 'EGG Index', 'SPAM Index'],
+        ],
+        data_dict={'near': [1.2, 1.6, 4.0]}
+    )
+
+    expected_ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2'),
+        index_arrs=[
+            sample_timerange + ['2012-12-08 17:06:11.040'],
+            ['EGG Index', 'SPAM Index'] * 4 + ['SPAM Index']
+        ],
+        data_dict={'near': [1.6, 1.2, 2.1, 2.0, 2.6, 2.5, 3.1, 3.0, 4.0]}
+    )
     bitemporal_library.update('spam', ts, as_of=dt(2015, 1, 1))
     bitemporal_library.update('spam', ts2, as_of=dt(2015, 1, 2))
     assert_frame_equal(expected_ts, bitemporal_library.read('spam').data)

tests/unit/test_multi_index.py

+38 -18

@@ -8,7 +8,7 @@
 from pandas.util.testing import assert_frame_equal
 
 from arctic.multi_index import groupby_asof, fancy_group_by, insert_at
-from tests.util import read_str_as_pandas
+from tests.util import multi_index_df_from_arrs
 
 
 def get_bitemporal_test_data():
@@ -97,23 +97,43 @@ def test__get_ts__unsorted_index():
 
 
 def test_fancy_group_by_multi_index():
-    ts = read_str_as_pandas(""" index 1 | index 2 | observed_dt | near
-                   2012-09-08 17:06:11.040 | SPAM Index | 2015-01-01 | 1.0
-                   2012-09-08 17:06:11.040 | EGG Index | 2015-01-01 | 1.6
-                   2012-10-08 17:06:11.040 | SPAM Index | 2015-01-01 | 2.0
-                   2012-10-08 17:06:11.040 | SPAM Index | 2015-01-05 | 4.2
-                   2012-10-08 17:06:11.040 | EGG Index | 2015-01-01 | 2.1
-                   2012-10-09 17:06:11.040 | SPAM Index | 2015-01-01 | 2.5
-                   2012-10-09 17:06:11.040 | EGG Index | 2015-01-01 | 2.6
-                   2012-11-08 17:06:11.040 | SPAM Index | 2015-01-01 | 3.0""", num_index=3)
-    expected_ts = read_str_as_pandas(""" index 1 | index 2 | near
-                   2012-09-08 17:06:11.040 | EGG Index | 1.6
-                   2012-09-08 17:06:11.040 | SPAM Index | 1.0
-                   2012-10-08 17:06:11.040 | EGG Index | 2.1
-                   2012-10-08 17:06:11.040 | SPAM Index | 4.2
-                   2012-10-09 17:06:11.040 | EGG Index | 2.6
-                   2012-10-09 17:06:11.040 | SPAM Index | 2.5
-                   2012-11-08 17:06:11.040 | SPAM Index | 3.0""", num_index=2)
+
+    ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2', 'observed_dt'),
+        index_arrs=[
+            [
+                '2012-09-08 17:06:11.040',
+                '2012-09-08 17:06:11.040',
+                '2012-10-08 17:06:11.040',
+                '2012-10-08 17:06:11.040',
+                '2012-10-08 17:06:11.040',
+                '2012-10-09 17:06:11.040',
+                '2012-10-09 17:06:11.040',
+                '2012-11-08 17:06:11.040',
+            ],
+            ['SPAM Index', 'EGG Index', 'SPAM Index', 'SPAM Index'] + ['EGG Index', 'SPAM Index'] * 2,
+            ['2015-01-01'] * 3 + ['2015-01-05'] + ['2015-01-01'] * 4
+        ],
+        data_dict={'near': [1.0, 1.6, 2.0, 4.2, 2.1, 2.5, 2.6, 3.0]}
+    )
+
+    expected_ts = multi_index_df_from_arrs(
+        index_headers=('index 1', 'index 2'),
+        index_arrs=[
+            [
+                '2012-09-08 17:06:11.040',
+                '2012-09-08 17:06:11.040',
+                '2012-10-08 17:06:11.040',
+                '2012-10-08 17:06:11.040',
+                '2012-10-09 17:06:11.040',
+                '2012-10-09 17:06:11.040',
+                '2012-11-08 17:06:11.040',
+            ],
+            ['EGG Index', 'SPAM Index'] * 3 + ['SPAM Index']
+        ],
+        data_dict={'near': [1.6, 1.0, 2.1, 4.2, 2.6, 2.5, 3.0]}
+    )
+
     assert_frame_equal(expected_ts, groupby_asof(ts, dt_col=['index 1', 'index 2'], asof_col='observed_dt'))
 
 
tests/util.py

+13 -0

@@ -1,3 +1,4 @@
+from __future__ import print_function
 try:
     import cStringIO as stringio
 except ImportError:
@@ -59,3 +60,15 @@ def run_as_main(fn, *args):
     print("run_as_main: %s" % str(args))
     sys.argv = ['progname'] + list(args)
     return fn()
+
+
+def multi_index_df_from_arrs(index_headers, index_arrs, data_dict):
+    parsed_indexes = []
+    for index in index_arrs:
+        try:
+            parsed_indexes.append(pandas.to_datetime(index))
+        except ValueError:
+            parsed_indexes.append(index)
+
+    m_index = pandas.MultiIndex.from_arrays(parsed_indexes, names=index_headers)
+    return pandas.DataFrame(data_dict, index=m_index)
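
The new helper replaces the pipe-delimited read_str_as_pandas tables in the tests above: each index level is passed as a plain array, levels that parse as datetimes become datetime levels, and levels where pandas.to_datetime raises ValueError (e.g. the 'SPAM Index' strings) are kept as-is. A short usage sketch mirroring the updated tests:

from tests.util import multi_index_df_from_arrs

df = multi_index_df_from_arrs(
    index_headers=('index 1', 'index 2'),
    index_arrs=[
        ['2012-09-08 17:06:11.040', '2012-10-08 17:06:11.040'],
        ['SPAM Index'] * 2,
    ],
    data_dict={'near': [1.0, 2.0]})
# 'index 1' is parsed to datetimes; 'index 2' stays as plain strings,
# matching the frames the old read_str_as_pandas tables produced.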
