diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 8c65f09937df4..a0a1b560d36f3 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -63,6 +63,27 @@ def time_index_datetime_union(self): self.rng.union(self.rng2) +class index_datetime_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.A = self.N - 20000 + self.B = self.N + 20000 + self.idx1 = DatetimeIndex(range(self.N)) + self.idx2 = DatetimeIndex(range(self.A, self.B)) + self.idx3 = DatetimeIndex(range(self.N, self.B)) + + def time_index_datetime_difference(self): + self.idx1.difference(self.idx2) + + def time_index_datetime_difference_disjoint(self): + self.idx1.difference(self.idx3) + + def time_index_datetime_symmetric_difference(self): + self.idx1.symmetric_difference(self.idx2) + + class index_float64_boolean_indexer(object): goal_time = 0.2 @@ -183,6 +204,40 @@ def time_index_int64_union(self): self.left.union(self.right) +class index_int64_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 500000 + self.options = np.arange(self.N) + self.left = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + self.right = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + + def time_index_int64_difference(self): + self.left.difference(self.right) + + def time_index_int64_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + +class index_str_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.strs = tm.rands_array(10, self.N) + self.left = Index(self.strs[:self.N * 2 // 3]) + self.right = Index(self.strs[self.N // 3:]) + + def time_str_difference(self): + self.left.difference(self.right) + + def time_str_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + class index_str_boolean_indexer(object): goal_time = 0.2 diff --git 
a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0b9695125c0a9..5e7130d99c8ec 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -395,7 +395,7 @@ resulting dtype will be upcast, which is unchanged from previous. pd.merge(df1, df2, how='outer', on='key') pd.merge(df1, df2, how='outer', on='key').dtypes -.. _whatsnew_0190.describe: +.. _whatsnew_0190.api.describe: ``.describe()`` changes ^^^^^^^^^^^^^^^^^^^^^^^ @@ -484,6 +484,34 @@ New Behavior: pd.NaT + 1 pd.NaT - 1 +.. _whatsnew_0190.api.difference: + +``Index.difference`` and ``.symmetric_difference`` changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. (:issue:`13514`) + +.. ipython:: python + + idx1 = pd.Index([1, 2, 3, np.nan]) + idx2 = pd.Index([0, 1, np.nan]) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: idx1.difference(idx2) + Out[3]: Float64Index([nan, 2.0, 3.0], dtype='float64') + + In [4]: idx1.symmetric_difference(idx2) + Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64') + +New Behavior: + +.. ipython:: python + + idx1.difference(idx2) + idx1.symmetric_difference(idx2) .. _whatsnew_0190.deprecations: @@ -520,7 +548,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - +- Improved performance of ``Index.difference`` (:issue:`12044`) .. 
_whatsnew_0190.bug_fixes: @@ -614,3 +642,5 @@ Bug Fixes - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) +- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) +- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c3ba734353a8d..5cc54e61f6b2a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -163,6 +163,104 @@ def isin(comps, values): return f(comps, values) +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): + """ + Sort ``values`` and reorder corresponding ``labels``. + ``values`` should be unique if ``labels`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``labels`` is not None. + labels : list_like + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``labels`` to mark "not found". + Ignored when ``labels`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``labels`` is None. + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_labels : ndarray + Reordered ``labels``; returned when ``labels`` is not None. 
+ + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``labels`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``labels`` is not None and ``values`` contain duplicates. + """ + if not is_list_like(values): + raise TypeError("Only list-like objects are allowed to be passed to " + "safe_sort as values") + values = np.array(values, copy=False) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, string_types) for x in values], + dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return _ensure_object(np.concatenate([nums, strs])) + + sorter = None + if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # labels: + + if labels is None: + return ordered + + if not is_list_like(labels): + raise TypeError("Only list-like objects or None are allowed to be " + "passed to safe_sort as labels") + labels = _ensure_platform_int(np.asarray(labels)) + + from pandas import Index + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if labels is not None") + + if sorter is None: + # mixed types + (hash_klass, _), values = _get_data_algo(values, _hashtables) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = _ensure_platform_int(t.lookup(ordered)) + + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = (labels < -len(values)) | (labels >= len(values)) | \ + (labels == na_sentinel) + + # (Out of bound indices will be masked with `na_sentinel` next, so we may + # deal with them here without performance loss using `mode='wrap'`.) 
+ new_labels = reverse_indexer.take(labels, mode='wrap') + np.putmask(new_labels, mask, na_sentinel) + + return ordered, new_labels + + def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -210,33 +308,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.to_array() if sort and len(uniques) > 0: - try: - sorter = uniques.argsort() - except: - # unorderable in py3 if mixed str/int - t = hash_klass(len(uniques)) - t.map_locations(_ensure_object(uniques)) - - # order ints before strings - ordered = np.concatenate([ - np.sort(np.array([e for i, e in enumerate(uniques) if f(e)], - dtype=object)) for f in - [lambda x: not isinstance(x, string_types), - lambda x: isinstance(x, string_types)]]) - sorter = _ensure_platform_int(t.lookup( - _ensure_object(ordered))) - - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = labels < 0 - labels = reverse_indexer.take(labels) - np.putmask(labels, mask, -1) - - uniques = uniques.take(sorter) + uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, + assume_unique=True) if is_datetimetz_type: - # reset tz uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( values.tz) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b013d6ccb0b8e..71d5fdd17ee5c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1773,7 +1773,7 @@ def _get_consensus_name(self, other): else: name = None if self.name != name: - return other._shallow_copy(name=name) + return self._shallow_copy(name=name) return self def union(self, other): @@ -1920,7 +1920,8 @@ def difference(self, other): Return a new Index with elements from the index that are not in `other`. - This is the sorted set difference of two Index objects. + This is the set difference of two Index objects. 
+ It's sorted if sorting is possible. Parameters ---------- @@ -1946,14 +1947,27 @@ def difference(self, other): other, result_name = self._convert_can_do_setop(other) - theDiff = sorted(set(self) - set(other)) - return Index(theDiff, name=result_name) + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + + return this._shallow_copy(the_diff, name=result_name) diff = deprecate('diff', difference) def symmetric_difference(self, other, result_name=None): """ - Compute the sorted symmetric difference of two Index objects. + Compute the symmetric difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1970,9 +1984,6 @@ def symmetric_difference(self, other, result_name=None): ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. - The sorting of a result containing ``NaN`` values is not guaranteed - across Python versions. See GitHub issue #6444. - Examples -------- >>> idx1 = Index([1, 2, 3, 4]) @@ -1990,8 +2001,26 @@ def symmetric_difference(self, other, result_name=None): if result_name is None: result_name = result_name_update - the_diff = sorted(set((self.difference(other)). 
- union(other.difference(self)))) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) + + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + attribs = self._get_attributes_dict() attribs['name'] = result_name if 'freq' in attribs: @@ -2000,6 +2029,36 @@ def symmetric_difference(self, other, result_name=None): sym_diff = deprecate('sym_diff', symmetric_difference) + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. + + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. 
+ + Returns + ------- + uniques : index + """ + if self.is_unique and not dropna: + return self + + values = self.values + + if not self.is_unique: + values = self.unique() + + if dropna: + try: + if self.hasnans: + values = values[~isnull(values)] + except NotImplementedError: + pass + + return self._shallow_copy(values) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index d6f7493bb25f9..92560363be8fe 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -287,6 +287,45 @@ def test_duplicates(self): self.assertEqual(result.name, 'foo') self.assert_index_equal(result, Index([ind[0]], name='foo')) + def test_get_unique_index(self): + for ind in self.indices.values(): + + # MultiIndex tested separately + if not len(ind) or isinstance(ind, MultiIndex): + continue + + idx = ind[[0] * 5] + idx_unique = ind[[0]] + # We test against `idx_unique`, so first we make sure it's unique + # and doesn't contain nans. 
+ self.assertTrue(idx_unique.is_unique) + try: + self.assertFalse(idx_unique.hasnans) + except NotImplementedError: + pass + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assert_index_equal(result, idx_unique) + + # nans: + + if not ind._can_hold_na: + continue + + vals = ind.values[[0] * 5] + vals[0] = np.nan + vals_unique = vals[:2] + idx_nan = ind._shallow_copy(vals) + idx_unique_nan = ind._shallow_copy(vals_unique) + self.assertTrue(idx_unique_nan.is_unique) + + for dropna, expected in zip([False, True], + [idx_unique_nan, idx_unique]): + for i in [idx_nan, idx_unique_nan]: + result = i._get_unique_index(dropna=dropna) + self.assert_index_equal(result, expected) + def test_sort(self): for ind in self.indices.values(): self.assertRaises(TypeError, ind.sort) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 06662e52e3a6f..cc5dd24292bb8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -640,47 +640,56 @@ def test_union(self): first = Index(list('ab'), name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([], name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index([]) second = Index(list('ab'), name='B') union = first.union(second) - 
self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index([], name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([]) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) def test_add(self): @@ -803,17 +812,19 @@ def test_symmetric_difference(self): self.assertTrue(tm.equalContents(result, expected)) # nans: - # GH #6444, sorting of nans. Make sure the number of nans is right - # and the correct non-nan values are there. punt on sorting. 
- idx1 = Index([1, 2, 3, np.nan]) + # GH 13514 change: {nan} - {nan} == {} + # (GH 6444, sorting of nans, is no longer an issue) + idx1 = Index([1, np.nan, 2, 3]) idx2 = Index([0, 1, np.nan]) + idx3 = Index([0, 1]) + result = idx1.symmetric_difference(idx2) - # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + expected = Index([0.0, 2.0, 3.0]) + tm.assert_index_equal(result, expected) - nans = pd.isnull(result) - self.assertEqual(nans.sum(), 1) - self.assertEqual((~nans).sum(), 3) - [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] + result = idx1.symmetric_difference(idx3) + expected = Index([0.0, 2.0, 3.0, np.nan]) + tm.assert_index_equal(result, expected) # other not an Index: idx1 = Index([1, 2, 3, 4], name='idx1') @@ -1665,6 +1676,149 @@ def test_string_index_repr(self): self.assertEqual(coerce(idx), expected) +class TestMixedIntIndex(Base, tm.TestCase): + # Mostly the tests from common.py for which the results differ + # in py2 and py3 because ints and strings are uncomparable in py3 + # (GH 13514) + + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) + self.setup_indices() + + def create_index(self): + return self.mixedIndex + + def test_order(self): + idx = self.create_index() + # 9816 deprecated + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + with tm.assert_produces_warning(FutureWarning): + idx.order() + else: + with tm.assert_produces_warning(FutureWarning): + idx.order() + + def test_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = idx.argsort() + else: + result = idx.argsort() + expected = np.array(idx).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_numpy_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = np.argsort(idx) + else: + result = 
np.argsort(idx) + expected = idx.argsort() + tm.assert_numpy_array_equal(result, expected) + + def test_copy_name(self): + # Check that "name" argument passed at initialization is honoured + # GH12309 + idx = self.create_index() + + first = idx.__class__(idx, copy=True, name='mario') + second = first.__class__(first, copy=False) + + # Even though "copy=False", we want a new object. + self.assertIsNot(first, second) + # Not using tm.assert_index_equal() since names differ: + self.assertTrue(idx.equals(first)) + + self.assertEqual(first.name, 'mario') + self.assertEqual(second.name, 'mario') + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + s3 = s1 * s2 + else: + s3 = s1 * s2 + self.assertEqual(s3.index.name, 'mario') + + def test_union_base(self): + idx = self.create_index() + first = idx[3:] + second = idx[:5] + + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + else: + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + + def test_intersection_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:5] + second = idx[:3] + result = first.intersection(second) + expected = Index([0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for 
case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + def test_difference_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.difference(second) + expected = Index([0, 1, 'a']) + self.assert_index_equal(result, expected) + + def test_symmetric_difference(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.symmetric_difference(second) + expected = Index([0, 1, 2, 'a', 'c']) + self.assert_index_equal(result, expected) + + def test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def test_get_combined_index(): from pandas.core.index import _get_combined_index result = _get_combined_index([]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e6a8aafc32be4..2734e90a1971b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1877,6 +1877,15 @@ def test_duplicate_meta_data(self): self.assertTrue(idx.has_duplicates) self.assertEqual(idx.drop_duplicates().names, idx.names) + def test_get_unique_index(self): + idx = self.index[[0, 1, 0, 1, 1, 0, 0]] + expected = self.index._shallow_copy(idx[[0, 1]]) + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assertTrue(result.is_unique) + self.assert_index_equal(result, expected) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cb90110c953c1..f18d869b3843d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -56,6 +56,83 @@ def test_strings(self): tm.assert_series_equal(result, expected) +class 
TestSafeSort(tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic_sort(self): + values = [3, 1, 2, 0, 4] + result = algos.safe_sort(values) + expected = np.array([0, 1, 2, 3, 4]) + tm.assert_numpy_array_equal(result, expected) + + values = list("baaacb") + result = algos.safe_sort(values) + expected = np.array(list("aaabbc")) + tm.assert_numpy_array_equal(result, expected) + + values = [] + result = algos.safe_sort(values) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected) + + def test_labels(self): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + labels = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # na_sentinel + labels = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels, + na_sentinel=99) + expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # out of bound indices + labels = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + labels = [] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([], dtype=np.int_) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_mixed_integer(self): + values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + result = algos.safe_sort(values) + expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(['b', 1, 0, 'a'], dtype=object) + labels = 
[0, 1, 2, 3, 0, -1, 1] + result, result_labels = algos.safe_sort(values, labels) + expected = np.array([0, 1, 'a', 'b'], dtype=object) + expected_labels = np.array([3, 1, 0, 2, 3, -1, 1]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_exceptions(self): + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects are allowed"): + algos.safe_sort(values=1) + + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects or None"): + algos.safe_sort(values=[0, 1, 2], labels=1) + + with tm.assertRaisesRegexp(ValueError, "values should be unique"): + algos.safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + + class TestFactorize(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 57d43f22757ea..258f36cb1b68f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3210,6 +3210,18 @@ def test_groupby_nonstring_columns(self): expected = df.groupby(df[0]).mean() assert_frame_equal(result, expected) + def test_groupby_mixed_type_columns(self): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) + expected = DataFrame([[1, 2]], columns=['B', 0], + index=Index([0], name='A')) + + result = df.groupby('A').first() + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').sum() + tm.assert_frame_equal(result, expected) + def test_cython_grouper_series_bug_noncontig(self): arr = np.empty((100, 100)) arr.fill(np.nan) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 5b66e55eb60b6..e7d165354ec6c 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1209,16 +1209,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - sorter = uniques.argsort() + l = len(left) + labels = np.concatenate([left, right]) - reverse_indexer = np.empty(len(sorter), dtype=np.int64) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - new_left = reverse_indexer.take(_ensure_platform_int(left)) - np.putmask(new_left, left == -1, -1) - - new_right = 
reverse_indexer.take(_ensure_platform_int(right)) - np.putmask(new_right, right == -1, -1) + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + new_labels = _ensure_int64(new_labels) + new_left, new_right = new_labels[:l], new_labels[l:] return new_left, new_right diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index 86aee0b4a01c9..cb84c1f06653b 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -536,6 +536,23 @@ def test_join_sort(self): joined = left.join(right, on='key', sort=False) self.assert_index_equal(joined.index, pd.Index(lrange(4))) + def test_join_mixed_non_unique_index(self): + # GH 12814, unorderable types in py3 with a non-unique index + df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) + df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + result = df1.join(df2) + expected = DataFrame({'a': [1, 2, 3, 3, 4], + 'b': [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, 'a']) + tm.assert_frame_equal(result, expected) + + df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) + df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + result = df3.join(df4) + expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, + index=[1, 2, 2, 'a']) + tm.assert_frame_equal(result, expected) + def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6),