pandas-dev · pijucha · Jun 22, 2016 · jreback · Jun 26, 2016 · pijucha
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -63,6 +63,27 @@ def time_index_datetime_union(self):
         self.rng.union(self.rng2)
 
 
+class index_datetime_set_difference(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.A = self.N - 20000
+        self.B = self.N + 20000
+        self.idx1 = DatetimeIndex(range(self.N))
+        self.idx2 = DatetimeIndex(range(self.A, self.B))
+        self.idx3 = DatetimeIndex(range(self.N, self.B))
+
+    def time_index_datetime_difference(self):
+        self.idx1.difference(self.idx2)
+
+    def time_index_datetime_difference_disjoint(self):
+        self.idx1.difference(self.idx3)
+
+    def time_index_datetime_symmetric_difference(self):
+        self.idx1.symmetric_difference(self.idx2)
+
+
 class index_float64_boolean_indexer(object):
     goal_time = 0.2
 
@@ -183,6 +204,40 @@ def time_index_int64_union(self):
         self.left.union(self.right)
 
 
+class index_int64_set_difference(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.options = np.arange(self.N)
+        self.left = Index(self.options.take(
+            np.random.permutation(self.N)[:(self.N // 2)]))
+        self.right = Index(self.options.take(
+            np.random.permutation(self.N)[:(self.N // 2)]))
+
+    def time_index_int64_difference(self):
+        self.left.difference(self.right)
+
+    def time_index_int64_symmetric_difference(self):
+        self.left.symmetric_difference(self.right)
+
+
+class index_str_set_difference(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.strs = tm.rands_array(10, self.N)
+        self.left = Index(self.strs[:self.N * 2 // 3])
+        self.right = Index(self.strs[self.N // 3:])
+
+    def time_str_difference(self):
+        self.left.difference(self.right)
+
+    def time_str_symmetric_difference(self):
+        self.left.symmetric_difference(self.right)
+
+
 class index_str_boolean_indexer(object):
     goal_time = 0.2
 

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -395,7 +395,7 @@ resulting dtype will be upcast, which is unchanged from previous.
    pd.merge(df1, df2, how='outer', on='key')
    pd.merge(df1, df2, how='outer', on='key').dtypes
 
-.. _whatsnew_0190.describe:
+.. _whatsnew_0190.api.describe:
 
 ``.describe()`` changes
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -484,6 +484,34 @@ New Behavior:
    pd.NaT + 1
    pd.NaT - 1
 
+.. _whatsnew_0190.api.difference:
+
+``Index.difference`` and ``.symmetric_difference`` changes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. (:issue:`13514`)
+
+.. ipython:: python
+
+   idx1 = pd.Index([1, 2, 3, np.nan])
+   idx2 = pd.Index([0, 1, np.nan])
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [3]: idx1.difference(idx2)
+   Out[3]: Float64Index([nan, 2.0, 3.0], dtype='float64')
+
+   In [4]: idx1.symmetric_difference(idx2)
+   Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64')
+
+New Behavior:
+
+.. ipython:: python
+
+   idx1.difference(idx2)
+   idx1.symmetric_difference(idx2)
 
 .. _whatsnew_0190.deprecations:
 
@@ -520,7 +548,7 @@ Performance Improvements
 
 - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
-
+- Improved performance of ``Index.difference`` (:issue:`12044`)
 
 .. _whatsnew_0190.bug_fixes:
 
@@ -614,3 +642,5 @@ Bug Fixes
 - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)
 
 - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient  (:issue:`13454`)
+- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`)
+- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -163,6 +163,104 @@ def isin(comps, values):
     return f(comps, values)
 
 
+def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
+    """
+    Sort ``values`` and reorder corresponding ``labels``.
+    ``values`` should be unique if ``labels`` is not None.
+    Safe for use with mixed types (int, str), orders ints before strs.
+
+    .. versionadded:: 0.19.0
+
+    Parameters
+    ----------
+    values : list-like
+        Sequence; must be unique if ``labels`` is not None.
+    labels : list_like
+        Indices to ``values``. All out of bound indices are treated as
+        "not found" and will be masked with ``na_sentinel``.
+    na_sentinel : int, default -1
+        Value in ``labels`` to mark "not found".
+        Ignored when ``labels`` is None.
+    assume_unique : bool, default False
+        When True, ``values`` are assumed to be unique, which can speed up
+        the calculation. Ignored when ``labels`` is None.
+
+    Returns
+    -------
+    ordered : ndarray
+        Sorted ``values``
+    new_labels : ndarray
+        Reordered ``labels``; returned when ``labels`` is not None.
+
+    Raises
+    ------
+    TypeError
+        * If ``values`` is not list-like or if ``labels`` is neither None
+        nor list-like
+        * If ``values`` cannot be sorted
+    ValueError
+        * If ``labels`` is not None and ``values`` contain duplicates.
+    """
+    if not is_list_like(values):
+        raise TypeError("Only list-like objects are allowed to be passed to"
+                        "safe_sort as values")
+    values = np.array(values, copy=False)
+
+    def sort_mixed(values):
+        # order ints before strings, safe in py3
+        str_pos = np.array([isinstance(x, string_types) for x in values],
+                           dtype=bool)
+        nums = np.sort(values[~str_pos])
+        strs = np.sort(values[str_pos])
+        return _ensure_object(np.concatenate([nums, strs]))
+
+    sorter = None
+    if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
+        # unorderable in py3 if mixed str/int
+        ordered = sort_mixed(values)
+    else:
+        try:
+            sorter = values.argsort()
+            ordered = values.take(sorter)
+        except TypeError:
+            # try this anyway
+            ordered = sort_mixed(values)
+
+    # labels:
+
+    if labels is None:
+        return ordered
+
+    if not is_list_like(labels):
+        raise TypeError("Only list-like objects or None are allowed to be"
+                        "passed to safe_sort as labels")
+    labels = _ensure_platform_int(np.asarray(labels))
+
+    from pandas import Index
+    if not assume_unique and not Index(values).is_unique:
+        raise ValueError("values should be unique if labels is not None")
+
+    if sorter is None:
+        # mixed types
+        (hash_klass, _), values = _get_data_algo(values, _hashtables)
+        t = hash_klass(len(values))
+        t.map_locations(values)
+        sorter = _ensure_platform_int(t.lookup(ordered))
+
+    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
+    reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+    mask = (labels < -len(values)) | (labels >= len(values)) | \
+        (labels == na_sentinel)
+
+    # (Out of bound indices will be masked with `na_sentinel` next, so we may
+    # deal with them here without performance loss using `mode='wrap'`.)
+    new_labels = reverse_indexer.take(labels, mode='wrap')
+    np.putmask(new_labels, mask, na_sentinel)
+
+    return ordered, new_labels
+
+
 def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     """
     Encode input values as an enumerated type or categorical variable
@@ -210,33 +308,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     uniques = uniques.to_array()
 
     if sort and len(uniques) > 0:
-        try:
-            sorter = uniques.argsort()
-        except:
-            # unorderable in py3 if mixed str/int
-            t = hash_klass(len(uniques))
-            t.map_locations(_ensure_object(uniques))
-
-            # order ints before strings
-            ordered = np.concatenate([
-                np.sort(np.array([e for i, e in enumerate(uniques) if f(e)],
-                                 dtype=object)) for f in
-                [lambda x: not isinstance(x, string_types),
-                 lambda x: isinstance(x, string_types)]])
-            sorter = _ensure_platform_int(t.lookup(
-                _ensure_object(ordered)))
-
-        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
-        reverse_indexer.put(sorter, np.arange(len(sorter)))
-
-        mask = labels < 0
-        labels = reverse_indexer.take(labels)
-        np.putmask(labels, mask, -1)
-
-        uniques = uniques.take(sorter)
+        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
+                                    assume_unique=True)
 
     if is_datetimetz_type:
-
         # reset tz
         uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
             values.tz)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -1773,7 +1773,7 @@ def _get_consensus_name(self, other):
             else:
                 name = None
             if self.name != name:
-                return other._shallow_copy(name=name)
+                return self._shallow_copy(name=name)
         return self
 
     def union(self, other):
@@ -1920,7 +1920,8 @@ def difference(self, other):
         Return a new Index with elements from the index that are not in
         `other`.
 
-        This is the sorted set difference of two Index objects.
+        This is the set difference of two Index objects.
+        It's sorted if sorting is possible.
 
         Parameters
         ----------
@@ -1946,14 +1947,27 @@ def difference(self, other):
 
         other, result_name = self._convert_can_do_setop(other)
 
-        theDiff = sorted(set(self) - set(other))
-        return Index(theDiff, name=result_name)
+        this = self._get_unique_index()
+
+        indexer = this.get_indexer(other)
+        indexer = indexer.take((indexer != -1).nonzero()[0])
+
+        label_diff = np.setdiff1d(np.arange(this.size), indexer,
+                                  assume_unique=True)
+        the_diff = this.values.take(label_diff)
+        try:
+            the_diff = algos.safe_sort(the_diff)
+        except TypeError:
+            pass
+
+        return this._shallow_copy(the_diff, name=result_name)
 
     diff = deprecate('diff', difference)
 
     def symmetric_difference(self, other, result_name=None):
         """
-        Compute the sorted symmetric difference of two Index objects.
+        Compute the symmetric difference of two Index objects.
+        It's sorted if sorting is possible.
 
         Parameters
         ----------
@@ -1970,9 +1984,6 @@ def symmetric_difference(self, other, result_name=None):
         ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
         ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped.
 
-        The sorting of a result containing ``NaN`` values is not guaranteed
-        across Python versions. See GitHub issue #6444.
-
         Examples
         --------
         >>> idx1 = Index([1, 2, 3, 4])
@@ -1990,8 +2001,26 @@ def symmetric_difference(self, other, result_name=None):
         if result_name is None:
             result_name = result_name_update
 
-        the_diff = sorted(set((self.difference(other)).
-                              union(other.difference(self))))
+        this = self._get_unique_index()
+        other = other._get_unique_index()
+        indexer = this.get_indexer(other)
+
+        # {this} minus {other}
+        common_indexer = indexer.take((indexer != -1).nonzero()[0])
+        left_indexer = np.setdiff1d(np.arange(this.size), common_indexer,
+                                    assume_unique=True)
+        left_diff = this.values.take(left_indexer)
+
+        # {other} minus {this}
+        right_indexer = (indexer == -1).nonzero()[0]
+        right_diff = other.values.take(right_indexer)
+
+        the_diff = _concat._concat_compat([left_diff, right_diff])
+        try:
+            the_diff = algos.safe_sort(the_diff)
+        except TypeError:
+            pass
+
         attribs = self._get_attributes_dict()
         attribs['name'] = result_name
         if 'freq' in attribs:
@@ -2000,6 +2029,36 @@ def symmetric_difference(self, other, result_name=None):
 
     sym_diff = deprecate('sym_diff', symmetric_difference)
 
+    def _get_unique_index(self, dropna=False):
+        """
+        Returns an index containing unique values.
+
+        Parameters
+        ----------
+        dropna : bool
+            If True, NaN values are dropped.
+
+        Returns
+        -------
+        uniques : index
+        """
+        if self.is_unique and not dropna:
+            return self
+
+        values = self.values
+
+        if not self.is_unique:
+            values = self.unique()
+
+        if dropna:
+            try:
+                if self.hasnans:
+                    values = values[~isnull(values)]
+            except NotImplementedError:
+                pass
+
+        return self._shallow_copy(values)
+
     def get_loc(self, key, method=None, tolerance=None):
         """
         Get integer location for requested label