From 508bc0cd27e3e6c3c02cb5765eb3aeeb6ed11e20 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 19 Oct 2013 12:48:38 -0400 Subject: [PATCH] BUG: union should not try to sort inplace because platform impls differ as to when sorting occurs for objects that cannot be compared --- pandas/computation/tests/test_eval.py | 174 +++++++++++++++++--------- pandas/core/index.py | 24 ++-- pandas/tests/test_index.py | 18 ++- 3 files changed, 144 insertions(+), 72 deletions(-) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 0275193031a07..e22b6218a2227 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +import warnings +import operator from itertools import product from distutils.version import LooseVersion @@ -28,7 +30,7 @@ from pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_produces_warning, assert_series_equal) -from pandas.compat import PY3, u +from pandas.compat import PY3, u, reduce _series_frame_incompatible = _bool_ops_syms _scalar_skip = 'in', 'not in' @@ -699,6 +701,16 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): #------------------------------------- # basic and complex alignment +def _is_datetime(x): + return issubclass(x.dtype.type, np.datetime64) + + +def should_warn(*args): + not_mono = not any(map(operator.attrgetter('is_monotonic'), args)) + only_one_dt = reduce(operator.xor, map(_is_datetime, args)) + return not_mono and only_one_dt + + class TestAlignment(object): index_types = 'i', 'u', 'dt' @@ -719,13 +731,20 @@ def check_basic_frame_alignment(self, engine, parser): tm.skip_if_no_ne(engine) args = product(self.lhs_index_types, self.index_types, self.index_types) - for lr_idx_type, rr_idx_type, c_idx_type in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type, - c_idx_type=c_idx_type) - df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type, - c_idx_type=c_idx_type) - res = pd.eval('df + df2', engine=engine, parser=parser) - assert_frame_equal(res, df + df2) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for lr_idx_type, rr_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type, + c_idx_type=c_idx_type) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type, + c_idx_type=c_idx_type) + # only warns if not monotonic and not sortable + if should_warn(df.index, df2.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df + df2', engine=engine, parser=parser) + else: + res = pd.eval('df + df2', engine=engine, parser=parser) + assert_frame_equal(res, df + df2) def test_basic_frame_alignment(self): for engine, parser in ENGINES_PARSERS: @@ -754,12 +773,20 @@ def check_medium_complex_frame_alignment(self, engine, parser): args = product(self.lhs_index_types, self.index_types, self.index_types, self.index_types) - for r1, c1, r2, c2 in args: - df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = mkdf(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - res = pd.eval('df + df2 + df3', engine=engine, parser=parser) - assert_frame_equal(res, df + df2 + df3) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + + for r1, c1, r2, c2 in args: + df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + if should_warn(df.index, df2.index, df3.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df + df2 + df3', engine=engine, + parser=parser) + else: + res = pd.eval('df + df2 + df3', engine=engine, parser=parser) + assert_frame_equal(res, df + df2 + df3) @slow def test_medium_complex_frame_alignment(self): @@ -775,20 +802,24 @@ def testit(r_idx_type, c_idx_type, index_name): index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) - res = pd.eval('df + s', engine=engine, parser=parser) + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df + s', engine=engine, parser=parser) + else: + res = pd.eval('df + s', engine=engine, parser=parser) + if r_idx_type == 'dt' or c_idx_type == 'dt': - if engine == 'numexpr': - expected = df.add(s) - else: - expected = df + s + expected = df.add(s) if engine == 'numexpr' else df + s else: expected = df + s assert_frame_equal(res, expected) args = product(self.lhs_index_types, self.index_types, ('index', 'columns')) - for r_idx_type, c_idx_type, index_name in args: - testit(r_idx_type, c_idx_type, index_name) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) def test_basic_frame_series_alignment(self): for engine, parser in ENGINES_PARSERS: @@ -802,13 +833,14 @@ def testit(r_idx_type, c_idx_type, index_name): c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) + if should_warn(s.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('s + df', engine=engine, parser=parser) + else: + res = pd.eval('s + df', engine=engine, parser=parser) - res = pd.eval('s + df', engine=engine, parser=parser) if r_idx_type == 'dt' or c_idx_type == 'dt': - if engine == 'numexpr': - expected = df.add(s) - else: - expected = s + df + expected = df.add(s) if engine == 'numexpr' else s + df else: expected = s + df assert_frame_equal(res, expected) @@ -820,8 +852,10 @@ def testit(r_idx_type, c_idx_type, index_name): # dt with dt args = product(['dt'], ['dt'], ('index', 'columns')) - for r_idx_type, c_idx_type, index_name in args: - testit(r_idx_type, c_idx_type, index_name) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) def test_basic_series_frame_alignment(self): for engine, parser in ENGINES_PARSERS: @@ -831,20 +865,29 @@ def check_series_frame_commutativity(self, engine, parser): tm.skip_if_no_ne(engine) args = product(self.lhs_index_types, self.index_types, ('+', '*'), ('index', 'columns')) - for r_idx_type, c_idx_type, op, index_name in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) - lhs = 's {0} df'.format(op) - rhs = 'df {0} s'.format(op) - a = pd.eval(lhs, engine=engine, parser=parser) - b = pd.eval(rhs, engine=engine, parser=parser) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r_idx_type, c_idx_type, op, index_name in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + a = pd.eval(lhs, engine=engine, parser=parser) + with tm.assert_produces_warning(RuntimeWarning): + b = pd.eval(rhs, engine=engine, parser=parser) + else: + a = pd.eval(lhs, engine=engine, parser=parser) + b = pd.eval(rhs, engine=engine, parser=parser) - if r_idx_type != 'dt' and c_idx_type != 'dt': - if engine == 'numexpr': - assert_frame_equal(a, b) + if r_idx_type != 'dt' and c_idx_type != 'dt': + if engine == 'numexpr': + assert_frame_equal(a, b) def test_series_frame_commutativity(self): for engine, parser in ENGINES_PARSERS: @@ -860,34 +903,41 @@ def check_complex_series_frame_alignment(self, engine, parser): m1 = 5 m2 = 2 * m1 - for r1, r2, c1, c2 in args: - index_name = random.choice(['index', 'columns']) - obj_name = random.choice(['df', 'df2']) - - df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - index = getattr(locals().get(obj_name), index_name) - s = Series(np.random.randn(n), index[:n]) - - if r2 == 'dt' or c2 == 'dt': - if engine == 'numexpr': - expected2 = df2.add(s) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r1, r2, c1, c2 in args: + index_name = random.choice(['index', 'columns']) + obj_name = random.choice(['df', 'df2']) + + df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals().get(obj_name), index_name) + s = Series(np.random.randn(n), index[:n]) + + if r2 == 'dt' or c2 == 'dt': + if engine == 'numexpr': + expected2 = df2.add(s) + else: + expected2 = df2 + s else: expected2 = df2 + s - else: - expected2 = df2 + s - if r1 == 'dt' or c1 == 'dt': - if engine == 'numexpr': - expected = expected2.add(df) + if r1 == 'dt' or c1 == 'dt': + if engine == 'numexpr': + expected = expected2.add(df) + else: + expected = expected2 + df else: expected = expected2 + df - else: - expected = expected2 + df - res = pd.eval('df2 + s + df', engine=engine, parser=parser) - tm.assert_equal(res.shape, expected.shape) - assert_frame_equal(res, expected) + if should_warn(df2.index, s.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df2 + s + df', engine=engine, + parser=parser) + else: + res = pd.eval('df2 + s + df', engine=engine, parser=parser) + tm.assert_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) @slow def test_complex_series_frame_alignment(self): diff --git a/pandas/core/index.py b/pandas/core/index.py index 10e5558e12542..148fa5ecd8dad 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,5 +1,6 @@ # pylint: disable=E1101,E1103,W0232 import datetime +import warnings from functools import partial import warnings from pandas.compat import range, zip, lrange, lzip, u, reduce @@ -997,29 +998,36 @@ def union(self, other): result.extend([x for x in other.values if x not in value_set]) else: indexer = self.get_indexer(other) - indexer = (indexer == -1).nonzero()[0] + indexer, = (indexer == -1).nonzero() if len(indexer) > 0: other_diff = com.take_nd(other.values, indexer, allow_fill=False) result = com._concat_compat((self.values, other_diff)) + try: + self.values[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning) + else: result.sort() - except Exception: - pass + else: - # contained in + result = self.values + try: - result = np.sort(self.values) - except TypeError: # pragma: no cover - result = self.values + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning) # for subclasses return self._wrap_union_result(other, result) def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None - return type(self)(data=result, name=name) + return self.__class__(data=result, name=name) def intersection(self, other): """ diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 9681a606c7c57..5def2039c5ee8 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -2,6 +2,7 @@ from datetime import datetime, timedelta from pandas.compat import range, lrange, lzip, u, zip +import sys import operator import pickle import re @@ -15,7 +16,6 @@ from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, InvalidIndexError) from pandas.tseries.index import DatetimeIndex -from pandas.core.frame import DataFrame from pandas.core.series import Series from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, assert_copy) @@ -844,11 +844,13 @@ def test_slice_keep_name(self): self.assertEqual(idx.name, idx[1:].name) def test_join_self(self): + # instance attributes of the form self.Index indices = 'unicode', 'str', 'date', 'int', 'float' kinds = 'outer', 'inner', 'left', 'right' for index_kind in indices: + res = getattr(self, '{0}Index'.format(index_kind)) + for kind in kinds: - res = getattr(self, '{0}Index'.format(index_kind)) joined = res.join(res, how=kind) self.assertIs(res, joined) @@ -860,6 +862,17 @@ def test_indexing_doesnt_change_class(self): self.assertTrue(idx[[0,1]].identical( pd.Index([1, 2], dtype=np.object_))) + def test_outer_join_sort(self): + left_idx = Index(np.random.permutation(15)) + right_idx = tm.makeDateIndex(10) + + with tm.assert_produces_warning(RuntimeWarning): + joined = left_idx.join(right_idx, how='outer') + # right_idx in this case because DatetimeIndex has join precedence over + # Int64Index + expected = right_idx.astype(object).union(left_idx.astype(object)) + tm.assert_index_equal(joined, expected) + class TestFloat64Index(tm.TestCase): _multiprocess_can_split_ = True @@ -2765,6 +2778,7 @@ def test_get_combined_index(): assert(result.equals(Index([]))) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)