Skip to content

Commit b225cac

Browse files
pijucha authored and jreback committed
BUG/PERF: Sort mixed-int in Py3, fix Index.difference
Fixes some issues from #13432; closes #12044; closes #12814. Author: Piotr Jucha <[email protected]>. Closes #13514 from pijucha/setop13432 and squashes the following commits: 3a96089 [Piotr Jucha] BUG/PERF: Sort mixed-int in Py3, fix Index.difference
1 parent 006bd0b commit b225cac

File tree

11 files changed

+583
-63
lines changed

11 files changed

+583
-63
lines changed

asv_bench/benchmarks/index_object.py

+55
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ def time_index_datetime_union(self):
6363
self.rng.union(self.rng2)
6464

6565

66+
class index_datetime_set_difference(object):
    """Benchmarks for set differences between DatetimeIndex objects."""
    goal_time = 0.2

    def setup(self):
        """Build a base index, an overlapping index and a disjoint index."""
        size, margin = 100000, 20000
        self.N = size
        self.A = size - margin
        self.B = size + margin

        base = DatetimeIndex(range(size))
        # range(A, B) straddles the end of the base range.
        overlapping = DatetimeIndex(range(self.A, self.B))
        # range(N, B) starts where the base range stops, so nothing is shared.
        disjoint = DatetimeIndex(range(size, self.B))
        self.idx1, self.idx2, self.idx3 = base, overlapping, disjoint

    def time_index_datetime_difference(self):
        """Time ``difference`` against the overlapping index."""
        left, right = self.idx1, self.idx2
        left.difference(right)

    def time_index_datetime_difference_disjoint(self):
        """Time ``difference`` against the disjoint index."""
        left, right = self.idx1, self.idx3
        left.difference(right)

    def time_index_datetime_symmetric_difference(self):
        """Time ``symmetric_difference`` against the overlapping index."""
        left, right = self.idx1, self.idx2
        left.symmetric_difference(right)
85+
86+
6687
class index_float64_boolean_indexer(object):
6788
goal_time = 0.2
6889

@@ -183,6 +204,40 @@ def time_index_int64_union(self):
183204
self.left.union(self.right)
184205

185206

207+
class index_int64_set_difference(object):
    """Benchmarks for set differences between integer Index objects."""
    goal_time = 0.2

    def setup(self):
        """Draw two random half-size samples from ``arange(N)``."""
        self.N = 500000
        self.options = np.arange(self.N)
        half = self.N // 2

        # Each side takes the first half of its own fresh permutation, so
        # the two samples are drawn independently of each other.
        left_positions = np.random.permutation(self.N)[:half]
        self.left = Index(self.options.take(left_positions))

        right_positions = np.random.permutation(self.N)[:half]
        self.right = Index(self.options.take(right_positions))

    def time_index_int64_difference(self):
        """Time ``left.difference(right)``."""
        left, right = self.left, self.right
        left.difference(right)

    def time_index_int64_symmetric_difference(self):
        """Time ``left.symmetric_difference(right)``."""
        left, right = self.left, self.right
        left.symmetric_difference(right)
223+
224+
225+
class index_str_set_difference(object):
    """Benchmarks for set differences between string Index objects."""
    goal_time = 0.2

    def setup(self):
        """Build two indexes from overlapping slices of one string array."""
        self.N = 10000
        self.strs = tm.rands_array(10, self.N)

        # left covers the first two thirds, right the last two thirds,
        # so the middle third is common to both.
        left_stop = self.N * 2 // 3
        right_start = self.N // 3
        self.left = Index(self.strs[:left_stop])
        self.right = Index(self.strs[right_start:])

    def time_str_difference(self):
        """Time ``left.difference(right)``."""
        left, right = self.left, self.right
        left.difference(right)

    def time_str_symmetric_difference(self):
        """Time ``left.symmetric_difference(right)``."""
        left, right = self.left, self.right
        left.symmetric_difference(right)
239+
240+
186241
class index_str_boolean_indexer(object):
187242
goal_time = 0.2
188243

doc/source/whatsnew/v0.19.0.txt

+32-2
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ resulting dtype will be upcast, which is unchanged from previous.
396396
pd.merge(df1, df2, how='outer', on='key')
397397
pd.merge(df1, df2, how='outer', on='key').dtypes
398398

399-
.. _whatsnew_0190.describe:
399+
.. _whatsnew_0190.api.describe:
400400

401401
``.describe()`` changes
402402
^^^^^^^^^^^^^^^^^^^^^^^
@@ -485,6 +485,34 @@ New Behavior:
485485
pd.NaT + 1
486486
pd.NaT - 1
487487

488+
.. _whatsnew_0190.api.difference:
489+
490+
``Index.difference`` and ``.symmetric_difference`` changes
491+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
492+
493+
``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. (:issue:`13514`)
494+
495+
.. ipython:: python
496+
497+
idx1 = pd.Index([1, 2, 3, np.nan])
498+
idx2 = pd.Index([0, 1, np.nan])
499+
500+
Previous Behavior:
501+
502+
.. code-block:: ipython
503+
504+
In [3]: idx1.difference(idx2)
505+
Out[3]: Float64Index([nan, 2.0, 3.0], dtype='float64')
506+
507+
In [4]: idx1.symmetric_difference(idx2)
508+
Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64')
509+
510+
New Behavior:
511+
512+
.. ipython:: python
513+
514+
idx1.difference(idx2)
515+
idx1.symmetric_difference(idx2)
488516

489517
.. _whatsnew_0190.deprecations:
490518

@@ -534,7 +562,7 @@ Performance Improvements
534562

535563
- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
536564
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
537-
565+
- Improved performance of ``Index.difference`` (:issue:`12044`)
538566

539567
.. _whatsnew_0190.bug_fixes:
540568

@@ -629,3 +657,5 @@ Bug Fixes
629657
- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)
630658

631659
- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`)
660+
- Bug in ``Index.union`` where an incorrect result was returned for a named empty index (:issue:`13432`)
661+
- Bug where ``Index.difference`` and ``DataFrame.join`` raised in Python 3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)

pandas/core/algorithms.py

+100-25
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,104 @@ def isin(comps, values):
163163
return f(comps, values)
164164

165165

166+
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # The trailing space matters: adjacent literals are concatenated.
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")
    # asarray copies only when needed, like ``np.array(..., copy=False)``
    # did, but does not raise on NumPy 2 when a copy is unavoidable.
    values = np.asarray(values)

    def sort_mixed(arr):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in arr],
                           dtype=bool)
        nums = np.sort(arr[~str_pos])
        strs = np.sort(arr[str_pos])
        return _ensure_object(np.concatenate([nums, strs]))

    sorter = None
    if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway; drop any sorter so it can never disagree
            # with the order produced by sort_mixed
            sorter = None
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover each sorted value's original position
        # through a hash table lookup
        (hash_klass, _), values = _get_data_algo(values, _hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    # reverse_indexer[old_position] == new_position
    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = ((labels < -len(values)) | (labels >= len(values)) |
            (labels == na_sentinel))

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, new_labels
262+
263+
166264
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
167265
"""
168266
Encode input values as an enumerated type or categorical variable
@@ -210,33 +308,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
210308
uniques = uniques.to_array()
211309

212310
if sort and len(uniques) > 0:
213-
try:
214-
sorter = uniques.argsort()
215-
except:
216-
# unorderable in py3 if mixed str/int
217-
t = hash_klass(len(uniques))
218-
t.map_locations(_ensure_object(uniques))
219-
220-
# order ints before strings
221-
ordered = np.concatenate([
222-
np.sort(np.array([e for i, e in enumerate(uniques) if f(e)],
223-
dtype=object)) for f in
224-
[lambda x: not isinstance(x, string_types),
225-
lambda x: isinstance(x, string_types)]])
226-
sorter = _ensure_platform_int(t.lookup(
227-
_ensure_object(ordered)))
228-
229-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
230-
reverse_indexer.put(sorter, np.arange(len(sorter)))
231-
232-
mask = labels < 0
233-
labels = reverse_indexer.take(labels)
234-
np.putmask(labels, mask, -1)
235-
236-
uniques = uniques.take(sorter)
311+
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
312+
assume_unique=True)
237313

238314
if is_datetimetz_type:
239-
240315
# reset tz
241316
uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
242317
values.tz)

pandas/indexes/base.py

+69-10
Original file line numberDiff line numberDiff line change
@@ -1773,7 +1773,7 @@ def _get_consensus_name(self, other):
17731773
else:
17741774
name = None
17751775
if self.name != name:
1776-
return other._shallow_copy(name=name)
1776+
return self._shallow_copy(name=name)
17771777
return self
17781778

17791779
def union(self, other):
@@ -1920,7 +1920,8 @@ def difference(self, other):
19201920
Return a new Index with elements from the index that are not in
19211921
`other`.
19221922
1923-
This is the sorted set difference of two Index objects.
1923+
This is the set difference of two Index objects.
1924+
It's sorted if sorting is possible.
19241925
19251926
Parameters
19261927
----------
@@ -1946,14 +1947,27 @@ def difference(self, other):
19461947

19471948
other, result_name = self._convert_can_do_setop(other)
19481949

1949-
theDiff = sorted(set(self) - set(other))
1950-
return Index(theDiff, name=result_name)
1950+
this = self._get_unique_index()
1951+
1952+
indexer = this.get_indexer(other)
1953+
indexer = indexer.take((indexer != -1).nonzero()[0])
1954+
1955+
label_diff = np.setdiff1d(np.arange(this.size), indexer,
1956+
assume_unique=True)
1957+
the_diff = this.values.take(label_diff)
1958+
try:
1959+
the_diff = algos.safe_sort(the_diff)
1960+
except TypeError:
1961+
pass
1962+
1963+
return this._shallow_copy(the_diff, name=result_name)
19511964

19521965
diff = deprecate('diff', difference)
19531966

19541967
def symmetric_difference(self, other, result_name=None):
19551968
"""
1956-
Compute the sorted symmetric difference of two Index objects.
1969+
Compute the symmetric difference of two Index objects.
1970+
It's sorted if sorting is possible.
19571971
19581972
Parameters
19591973
----------
@@ -1970,9 +1984,6 @@ def symmetric_difference(self, other, result_name=None):
19701984
``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
19711985
``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped.
19721986
1973-
The sorting of a result containing ``NaN`` values is not guaranteed
1974-
across Python versions. See GitHub issue #6444.
1975-
19761987
Examples
19771988
--------
19781989
>>> idx1 = Index([1, 2, 3, 4])
@@ -1990,8 +2001,26 @@ def symmetric_difference(self, other, result_name=None):
19902001
if result_name is None:
19912002
result_name = result_name_update
19922003

1993-
the_diff = sorted(set((self.difference(other)).
1994-
union(other.difference(self))))
2004+
this = self._get_unique_index()
2005+
other = other._get_unique_index()
2006+
indexer = this.get_indexer(other)
2007+
2008+
# {this} minus {other}
2009+
common_indexer = indexer.take((indexer != -1).nonzero()[0])
2010+
left_indexer = np.setdiff1d(np.arange(this.size), common_indexer,
2011+
assume_unique=True)
2012+
left_diff = this.values.take(left_indexer)
2013+
2014+
# {other} minus {this}
2015+
right_indexer = (indexer == -1).nonzero()[0]
2016+
right_diff = other.values.take(right_indexer)
2017+
2018+
the_diff = _concat._concat_compat([left_diff, right_diff])
2019+
try:
2020+
the_diff = algos.safe_sort(the_diff)
2021+
except TypeError:
2022+
pass
2023+
19952024
attribs = self._get_attributes_dict()
19962025
attribs['name'] = result_name
19972026
if 'freq' in attribs:
@@ -2000,6 +2029,36 @@ def symmetric_difference(self, other, result_name=None):
20002029

20012030
sym_diff = deprecate('sym_diff', symmetric_difference)
20022031

2032+
def _get_unique_index(self, dropna=False):
2033+
"""
2034+
Returns an index containing unique values.
2035+
2036+
Parameters
2037+
----------
2038+
dropna : bool
2039+
If True, NaN values are dropped.
2040+
2041+
Returns
2042+
-------
2043+
uniques : index
2044+
"""
2045+
if self.is_unique and not dropna:
2046+
return self
2047+
2048+
values = self.values
2049+
2050+
if not self.is_unique:
2051+
values = self.unique()
2052+
2053+
if dropna:
2054+
try:
2055+
if self.hasnans:
2056+
values = values[~isnull(values)]
2057+
except NotImplementedError:
2058+
pass
2059+
2060+
return self._shallow_copy(values)
2061+
20032062
def get_loc(self, key, method=None, tolerance=None):
20042063
"""
20052064
Get integer location for requested label

0 commit comments

Comments
 (0)