BUG: union should not try to sort inplace because platform impls differ as to when sorting occurs for objects that cannot be compared #5266

Merged: 1 commit, merged on Jun 3, 2014
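
For context, a minimal sketch of the behaviour this change is after (assuming pandas at roughly the version of this PR; the index contents and date below are illustrative, not taken from the PR): taking the union of two indexes whose labels cannot be compared with one another, here plain integers and Timestamps, should emit a RuntimeWarning and return the combined labels unsorted, rather than sorting on some platforms and failing or silently skipping the sort on others.

import warnings
import pandas as pd

left = pd.Index([3, 1, 2]).astype(object)                       # unsorted integer labels
right = pd.date_range('2014-06-03', periods=3).astype(object)   # Timestamp labels

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always', RuntimeWarning)
    result = left.union(right)

# Expected after this PR: a RuntimeWarning along the lines of
# "... sort order is undefined for incomparable objects", and a result that
# simply contains every label from both sides, in no guaranteed order.
print([str(w.message) for w in caught])
print(result)
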
pandas/computation/tests/test_eval.py (174 changes: 112 additions, 62 deletions)
@@ -1,5 +1,7 @@
#!/usr/bin/env python

import warnings
import operator
from itertools import product
from distutils.version import LooseVersion

@@ -28,7 +30,7 @@
from pandas.util.testing import (assert_frame_equal, randbool,
assertRaisesRegexp,
assert_produces_warning, assert_series_equal)
from pandas.compat import PY3, u
from pandas.compat import PY3, u, reduce

_series_frame_incompatible = _bool_ops_syms
_scalar_skip = 'in', 'not in'
@@ -699,6 +701,16 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
#-------------------------------------
# basic and complex alignment

def _is_datetime(x):
return issubclass(x.dtype.type, np.datetime64)


def should_warn(*args):
not_mono = not any(map(operator.attrgetter('is_monotonic'), args))
only_one_dt = reduce(operator.xor, map(_is_datetime, args))
return not_mono and only_one_dt


class TestAlignment(object):

index_types = 'i', 'u', 'dt'
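
A brief note on the should_warn helper added just above (the snippet below is illustrative, not part of the diff): it predicts when alignment in these tests is expected to hit the incomparable-objects path, namely when none of the participating indexes is monotonic and exactly one of them is datetime-typed (the xor-reduce), which is when the new RuntimeWarning from Index.union should fire.

# Illustrative only; assumes pd and should_warn as defined in this test module.
idx_int = pd.Index([3, 1, 2])                    # not monotonic, not datetime
idx_dt = pd.date_range('2000-01-01', periods=3)  # monotonic, datetime
should_warn(idx_int, idx_dt)        # False: idx_dt is monotonic, no warning expected
should_warn(idx_int, idx_dt[::-1])  # True: nothing monotonic, exactly one datetime index
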
@@ -719,13 +731,20 @@ def check_basic_frame_alignment(self, engine, parser):
tm.skip_if_no_ne(engine)
args = product(self.lhs_index_types, self.index_types,
self.index_types)
for lr_idx_type, rr_idx_type, c_idx_type in args:
df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type,
c_idx_type=c_idx_type)
df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type,
c_idx_type=c_idx_type)
res = pd.eval('df + df2', engine=engine, parser=parser)
assert_frame_equal(res, df + df2)
with warnings.catch_warnings(record=True):
warnings.simplefilter('always', RuntimeWarning)
for lr_idx_type, rr_idx_type, c_idx_type in args:
df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type,
c_idx_type=c_idx_type)
df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type,
c_idx_type=c_idx_type)
# only warns if not monotonic and not sortable
if should_warn(df.index, df2.index):
with tm.assert_produces_warning(RuntimeWarning):
res = pd.eval('df + df2', engine=engine, parser=parser)
else:
res = pd.eval('df + df2', engine=engine, parser=parser)
assert_frame_equal(res, df + df2)

def test_basic_frame_alignment(self):
for engine, parser in ENGINES_PARSERS:
@@ -754,12 +773,20 @@ def check_medium_complex_frame_alignment(self, engine, parser):
args = product(self.lhs_index_types, self.index_types,
self.index_types, self.index_types)

for r1, c1, r2, c2 in args:
df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
df2 = mkdf(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
res = pd.eval('df + df2 + df3', engine=engine, parser=parser)
assert_frame_equal(res, df + df2 + df3)
with warnings.catch_warnings(record=True):
warnings.simplefilter('always', RuntimeWarning)

for r1, c1, r2, c2 in args:
df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
df2 = mkdf(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
if should_warn(df.index, df2.index, df3.index):
with tm.assert_produces_warning(RuntimeWarning):
res = pd.eval('df + df2 + df3', engine=engine,
parser=parser)
else:
res = pd.eval('df + df2 + df3', engine=engine, parser=parser)
assert_frame_equal(res, df + df2 + df3)

@slow
def test_medium_complex_frame_alignment(self):
@@ -775,20 +802,24 @@ def testit(r_idx_type, c_idx_type, index_name):
index = getattr(df, index_name)
s = Series(np.random.randn(5), index[:5])

res = pd.eval('df + s', engine=engine, parser=parser)
if should_warn(df.index, s.index):
with tm.assert_produces_warning(RuntimeWarning):
res = pd.eval('df + s', engine=engine, parser=parser)
else:
res = pd.eval('df + s', engine=engine, parser=parser)

if r_idx_type == 'dt' or c_idx_type == 'dt':
if engine == 'numexpr':
expected = df.add(s)
else:
expected = df + s
expected = df.add(s) if engine == 'numexpr' else df + s
else:
expected = df + s
assert_frame_equal(res, expected)

args = product(self.lhs_index_types, self.index_types,
('index', 'columns'))
for r_idx_type, c_idx_type, index_name in args:
testit(r_idx_type, c_idx_type, index_name)
with warnings.catch_warnings(record=True):
warnings.simplefilter('always', RuntimeWarning)
for r_idx_type, c_idx_type, index_name in args:
testit(r_idx_type, c_idx_type, index_name)

def test_basic_frame_series_alignment(self):
for engine, parser in ENGINES_PARSERS:
Expand All @@ -802,13 +833,14 @@ def testit(r_idx_type, c_idx_type, index_name):
c_idx_type=c_idx_type)
index = getattr(df, index_name)
s = Series(np.random.randn(5), index[:5])
if should_warn(s.index, df.index):
with tm.assert_produces_warning(RuntimeWarning):
res = pd.eval('s + df', engine=engine, parser=parser)
else:
res = pd.eval('s + df', engine=engine, parser=parser)

res = pd.eval('s + df', engine=engine, parser=parser)
if r_idx_type == 'dt' or c_idx_type == 'dt':
if engine == 'numexpr':
expected = df.add(s)
else:
expected = s + df
expected = df.add(s) if engine == 'numexpr' else s + df
else:
expected = s + df
assert_frame_equal(res, expected)
@@ -820,8 +852,10 @@ def testit(r_idx_type, c_idx_type, index_name):

# dt with dt
args = product(['dt'], ['dt'], ('index', 'columns'))
for r_idx_type, c_idx_type, index_name in args:
testit(r_idx_type, c_idx_type, index_name)
with warnings.catch_warnings(record=True):
warnings.simplefilter('always', RuntimeWarning)
for r_idx_type, c_idx_type, index_name in args:
testit(r_idx_type, c_idx_type, index_name)

def test_basic_series_frame_alignment(self):
for engine, parser in ENGINES_PARSERS:
Expand All @@ -831,20 +865,29 @@ def check_series_frame_commutativity(self, engine, parser):
tm.skip_if_no_ne(engine)
args = product(self.lhs_index_types, self.index_types, ('+', '*'),
('index', 'columns'))
for r_idx_type, c_idx_type, op, index_name in args:
df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
c_idx_type=c_idx_type)
index = getattr(df, index_name)
s = Series(np.random.randn(5), index[:5])

lhs = 's {0} df'.format(op)
rhs = 'df {0} s'.format(op)
a = pd.eval(lhs, engine=engine, parser=parser)
b = pd.eval(rhs, engine=engine, parser=parser)
with warnings.catch_warnings(record=True):
warnings.simplefilter('always', RuntimeWarning)
for r_idx_type, c_idx_type, op, index_name in args:
df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
c_idx_type=c_idx_type)
index = getattr(df, index_name)
s = Series(np.random.randn(5), index[:5])

lhs = 's {0} df'.format(op)
rhs = 'df {0} s'.format(op)
if should_warn(df.index, s.index):
with tm.assert_produces_warning(RuntimeWarning):
a = pd.eval(lhs, engine=engine, parser=parser)
with tm.assert_produces_warning(RuntimeWarning):
b = pd.eval(rhs, engine=engine, parser=parser)
else:
a = pd.eval(lhs, engine=engine, parser=parser)
b = pd.eval(rhs, engine=engine, parser=parser)

if r_idx_type != 'dt' and c_idx_type != 'dt':
if engine == 'numexpr':
assert_frame_equal(a, b)
if r_idx_type != 'dt' and c_idx_type != 'dt':
if engine == 'numexpr':
assert_frame_equal(a, b)

def test_series_frame_commutativity(self):
for engine, parser in ENGINES_PARSERS:
@@ -860,34 +903,41 @@ def check_complex_series_frame_alignment(self, engine, parser):
m1 = 5
m2 = 2 * m1

for r1, r2, c1, c2 in args:
index_name = random.choice(['index', 'columns'])
obj_name = random.choice(['df', 'df2'])

df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
index = getattr(locals().get(obj_name), index_name)
s = Series(np.random.randn(n), index[:n])

if r2 == 'dt' or c2 == 'dt':
if engine == 'numexpr':
expected2 = df2.add(s)
with warnings.catch_warnings(record=True):
warnings.simplefilter('always', RuntimeWarning)
for r1, r2, c1, c2 in args:
index_name = random.choice(['index', 'columns'])
obj_name = random.choice(['df', 'df2'])

df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
index = getattr(locals().get(obj_name), index_name)
s = Series(np.random.randn(n), index[:n])

if r2 == 'dt' or c2 == 'dt':
if engine == 'numexpr':
expected2 = df2.add(s)
else:
expected2 = df2 + s
else:
expected2 = df2 + s
else:
expected2 = df2 + s

if r1 == 'dt' or c1 == 'dt':
if engine == 'numexpr':
expected = expected2.add(df)
if r1 == 'dt' or c1 == 'dt':
if engine == 'numexpr':
expected = expected2.add(df)
else:
expected = expected2 + df
else:
expected = expected2 + df
else:
expected = expected2 + df

res = pd.eval('df2 + s + df', engine=engine, parser=parser)
tm.assert_equal(res.shape, expected.shape)
assert_frame_equal(res, expected)
if should_warn(df2.index, s.index, df.index):
with tm.assert_produces_warning(RuntimeWarning):
res = pd.eval('df2 + s + df', engine=engine,
parser=parser)
else:
res = pd.eval('df2 + s + df', engine=engine, parser=parser)
tm.assert_equal(res.shape, expected.shape)
assert_frame_equal(res, expected)

@slow
def test_complex_series_frame_alignment(self):
pandas/core/index.py (24 changes: 16 additions, 8 deletions)
@@ -1,5 +1,6 @@
# pylint: disable=E1101,E1103,W0232
import datetime
import warnings
from functools import partial
import warnings
from pandas.compat import range, zip, lrange, lzip, u, reduce
@@ -997,29 +998,36 @@ def union(self, other):
result.extend([x for x in other.values if x not in value_set])
else:
indexer = self.get_indexer(other)
indexer = (indexer == -1).nonzero()[0]
indexer, = (indexer == -1).nonzero()

if len(indexer) > 0:
other_diff = com.take_nd(other.values, indexer,
allow_fill=False)
result = com._concat_compat((self.values, other_diff))

try:
self.values[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning)
else:
result.sort()
except Exception:
pass

else:
# contained in
result = self.values

try:
result = np.sort(self.values)
except TypeError: # pragma: no cover
result = self.values
result = np.sort(result)
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning)

# for subclasses
return self._wrap_union_result(other, result)

def _wrap_union_result(self, other, result):
name = self.name if self.name == other.name else None
return type(self)(data=result, name=name)
return self.__class__(data=result, name=name)

def intersection(self, other):
"""
pandas/tests/test_index.py (18 changes: 16 additions, 2 deletions)
@@ -2,6 +2,7 @@

from datetime import datetime, timedelta
from pandas.compat import range, lrange, lzip, u, zip
import sys
import operator
import pickle
import re
@@ -15,7 +16,6 @@
from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex,
InvalidIndexError)
from pandas.tseries.index import DatetimeIndex
from pandas.core.frame import DataFrame
from pandas.core.series import Series
from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp,
assert_copy)
@@ -844,11 +844,13 @@ def test_slice_keep_name(self):
self.assertEqual(idx.name, idx[1:].name)

def test_join_self(self):
# instance attributes of the form self.<name>Index
indices = 'unicode', 'str', 'date', 'int', 'float'
kinds = 'outer', 'inner', 'left', 'right'
for index_kind in indices:
res = getattr(self, '{0}Index'.format(index_kind))

for kind in kinds:
res = getattr(self, '{0}Index'.format(index_kind))
joined = res.join(res, how=kind)
self.assertIs(res, joined)

@@ -860,6 +862,17 @@ def test_indexing_doesnt_change_class(self):
self.assertTrue(idx[[0,1]].identical(
pd.Index([1, 2], dtype=np.object_)))

def test_outer_join_sort(self):
left_idx = Index(np.random.permutation(15))
right_idx = tm.makeDateIndex(10)

with tm.assert_produces_warning(RuntimeWarning):
joined = left_idx.join(right_idx, how='outer')
# right_idx in this case because DatetimeIndex has join precedence over
# Int64Index
expected = right_idx.astype(object).union(left_idx.astype(object))
tm.assert_index_equal(joined, expected)


class TestFloat64Index(tm.TestCase):
_multiprocess_can_split_ = True
@@ -2765,6 +2778,7 @@ def test_get_combined_index():
assert(result.equals(Index([])))



if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)