Skip to content

Commit faa5c5c

Browse files
author
Carlos Souza
committed
Merge remote-tracking branch 'upstream/master'
2 parents 43456a5 + bd169dc commit faa5c5c

File tree

15 files changed

+709
-441
lines changed

15 files changed

+709
-441
lines changed

doc/source/whatsnew/v0.20.0.txt

+58
Original file line numberDiff line numberDiff line change
@@ -750,6 +750,62 @@ New Behavior:
750750
TypeError: Cannot compare 2014-01-01 00:00:00 of
751751
type <class 'pandas.tslib.Timestamp'> to string column
752752

753+
.. _whatsnew_0200.api_breaking.index_order:
754+
755+
Index.intersection and inner join now preserve the order of the left Index
756+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
757+
758+
:meth:`Index.intersection` now preserves the order of the calling ``Index`` (left)
759+
instead of the other ``Index`` (right) (:issue:`15582`). This affects the inner
760+
joins (:meth:`DataFrame.join` and :func:`merge`) and the ``.align`` methods.
761+
762+
- ``Index.intersection``
763+
764+
.. ipython:: python
765+
766+
left = pd.Index([2, 1, 0])
767+
left
768+
right = pd.Index([1, 2, 3])
769+
right
770+
771+
Previous Behavior:
772+
773+
.. code-block:: ipython
774+
775+
In [4]: left.intersection(right)
776+
Out[4]: Int64Index([1, 2], dtype='int64')
777+
778+
New Behavior:
779+
780+
.. ipython:: python
781+
782+
left.intersection(right)
783+
784+
- ``DataFrame.join`` and ``pd.merge``
785+
786+
.. ipython:: python
787+
788+
left = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
789+
left
790+
right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3])
791+
right
792+
793+
Previous Behavior:
794+
795+
.. code-block:: ipython
796+
797+
In [4]: left.join(right, how='inner')
798+
Out[4]:
799+
a b
800+
1 10 100
801+
2 20 200
802+
803+
New Behavior:
804+
805+
.. ipython:: python
806+
807+
left.join(right, how='inner')
808+
753809

754810
.. _whatsnew_0200.api:
755811

@@ -887,6 +943,7 @@ Bug Fixes
887943

888944
- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`)
889945

946+
- Properly set ``__name__`` and ``__qualname__`` for ``Groupby.*`` functions (:issue:`14620`)
890947
- Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`)
891948
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
892949
- Bug in ``.replace()`` that may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`)
@@ -983,6 +1040,7 @@ Bug Fixes
9831040

9841041
- Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which caused incorrectly formatted files to be produced for some locales (:issue:`13856`)
9851042
- Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`)
1043+
- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`)
9861044

9871045
- Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
9881046
- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`)

pandas/core/frame.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,14 @@
124124
----------%s
125125
right : DataFrame
126126
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
127-
* left: use only keys from left frame (SQL: left outer join)
128-
* right: use only keys from right frame (SQL: right outer join)
129-
* outer: use union of keys from both frames (SQL: full outer join)
130-
* inner: use intersection of keys from both frames (SQL: inner join)
127+
* left: use only keys from left frame, similar to a SQL left outer join;
128+
preserve key order
129+
* right: use only keys from right frame, similar to a SQL right outer join;
130+
preserve key order
131+
* outer: use union of keys from both frames, similar to a SQL full outer
132+
join; sort keys lexicographically
133+
* inner: use intersection of keys from both frames, similar to a SQL inner
134+
join; preserve the order of the left keys
131135
on : label or list
132136
Field names to join on. Must be found in both DataFrames. If on is
133137
None and not merging on indexes, then it merges on the intersection of
@@ -147,7 +151,8 @@
147151
Use the index from the right DataFrame as the join key. Same caveats as
148152
left_index
149153
sort : boolean, default False
150-
Sort the join keys lexicographically in the result DataFrame
154+
Sort the join keys lexicographically in the result DataFrame. If False,
155+
the order of the join keys depends on the join type (how keyword)
151156
suffixes : 2-length sequence (tuple, list, ...)
152157
Suffix to apply to overlapping column names in the left and right
153158
side, respectively
@@ -4472,16 +4477,18 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
44724477
* left: use calling frame's index (or column if on is specified)
44734478
* right: use other frame's index
44744479
* outer: form union of calling frame's index (or column if on is
4475-
specified) with other frame's index
4480+
specified) with other frame's index, and sort it
4481+
lexicographically
44764482
* inner: form intersection of calling frame's index (or column if
4477-
on is specified) with other frame's index
4483+
on is specified) with other frame's index, preserving the order
4484+
of the calling frame's index
44784485
lsuffix : string
44794486
Suffix to use from left frame's overlapping columns
44804487
rsuffix : string
44814488
Suffix to use from right frame's overlapping columns
44824489
sort : boolean, default False
44834490
Order result DataFrame lexicographically by the join key. If False,
4484-
preserves the index order of the calling (left) DataFrame
4491+
the order of the join key depends on the join type (how keyword)
44854492
44864493
Notes
44874494
-----

pandas/core/groupby.py

+75-68
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
)
1313

1414
from pandas import compat
15-
from pandas.compat.numpy import function as nv
16-
from pandas.compat.numpy import _np_version_under1p8
15+
from pandas.compat.numpy import function as nv, _np_version_under1p8
16+
from pandas.compat import set_function_name
1717

1818
from pandas.types.common import (is_numeric_dtype,
1919
is_timedelta64_dtype, is_datetime64_dtype,
@@ -172,64 +172,6 @@
172172
'cummin', 'cummax'])
173173

174174

175-
def _groupby_function(name, alias, npfunc, numeric_only=True,
176-
_convert=False):
177-
178-
_local_template = "Compute %(f)s of group values"
179-
180-
@Substitution(name='groupby', f=name)
181-
@Appender(_doc_template)
182-
@Appender(_local_template)
183-
def f(self, **kwargs):
184-
if 'numeric_only' not in kwargs:
185-
kwargs['numeric_only'] = numeric_only
186-
self._set_group_selection()
187-
try:
188-
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
189-
except AssertionError as e:
190-
raise SpecificationError(str(e))
191-
except Exception:
192-
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
193-
if _convert:
194-
result = result._convert(datetime=True)
195-
return result
196-
197-
f.__name__ = name
198-
199-
return f
200-
201-
202-
def _first_compat(x, axis=0):
203-
204-
def _first(x):
205-
206-
x = np.asarray(x)
207-
x = x[notnull(x)]
208-
if len(x) == 0:
209-
return np.nan
210-
return x[0]
211-
212-
if isinstance(x, DataFrame):
213-
return x.apply(_first, axis=axis)
214-
else:
215-
return _first(x)
216-
217-
218-
def _last_compat(x, axis=0):
219-
def _last(x):
220-
221-
x = np.asarray(x)
222-
x = x[notnull(x)]
223-
if len(x) == 0:
224-
return np.nan
225-
return x[-1]
226-
227-
if isinstance(x, DataFrame):
228-
return x.apply(_last, axis=axis)
229-
else:
230-
return _last(x)
231-
232-
233175
class Grouper(object):
234176
"""
235177
A Grouper allows the user to specify a groupby instruction for a target
@@ -1184,14 +1126,76 @@ def size(self):
11841126
result.name = getattr(self, 'name', None)
11851127
return result
11861128

1187-
sum = _groupby_function('sum', 'add', np.sum)
1188-
prod = _groupby_function('prod', 'prod', np.prod)
1189-
min = _groupby_function('min', 'min', np.min, numeric_only=False)
1190-
max = _groupby_function('max', 'max', np.max, numeric_only=False)
1191-
first = _groupby_function('first', 'first', _first_compat,
1192-
numeric_only=False, _convert=True)
1193-
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
1194-
_convert=True)
1129+
@classmethod
1130+
def _add_numeric_operations(cls):
1131+
""" add numeric operations to the GroupBy generically """
1132+
1133+
def groupby_function(name, alias, npfunc,
1134+
numeric_only=True, _convert=False):
1135+
1136+
_local_template = "Compute %(f)s of group values"
1137+
1138+
@Substitution(name='groupby', f=name)
1139+
@Appender(_doc_template)
1140+
@Appender(_local_template)
1141+
def f(self, **kwargs):
1142+
if 'numeric_only' not in kwargs:
1143+
kwargs['numeric_only'] = numeric_only
1144+
self._set_group_selection()
1145+
try:
1146+
return self._cython_agg_general(
1147+
alias, alt=npfunc, **kwargs)
1148+
except AssertionError as e:
1149+
raise SpecificationError(str(e))
1150+
except Exception:
1151+
result = self.aggregate(
1152+
lambda x: npfunc(x, axis=self.axis))
1153+
if _convert:
1154+
result = result._convert(datetime=True)
1155+
return result
1156+
1157+
set_function_name(f, name, cls)
1158+
1159+
return f
1160+
1161+
def first_compat(x, axis=0):
1162+
1163+
def first(x):
1164+
1165+
x = np.asarray(x)
1166+
x = x[notnull(x)]
1167+
if len(x) == 0:
1168+
return np.nan
1169+
return x[0]
1170+
1171+
if isinstance(x, DataFrame):
1172+
return x.apply(first, axis=axis)
1173+
else:
1174+
return first(x)
1175+
1176+
def last_compat(x, axis=0):
1177+
1178+
def last(x):
1179+
1180+
x = np.asarray(x)
1181+
x = x[notnull(x)]
1182+
if len(x) == 0:
1183+
return np.nan
1184+
return x[-1]
1185+
1186+
if isinstance(x, DataFrame):
1187+
return x.apply(last, axis=axis)
1188+
else:
1189+
return last(x)
1190+
1191+
cls.sum = groupby_function('sum', 'add', np.sum)
1192+
cls.prod = groupby_function('prod', 'prod', np.prod)
1193+
cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
1194+
cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
1195+
cls.first = groupby_function('first', 'first', first_compat,
1196+
numeric_only=False, _convert=True)
1197+
cls.last = groupby_function('last', 'last', last_compat,
1198+
numeric_only=False, _convert=True)
11951199

11961200
@Substitution(name='groupby')
11971201
@Appender(_doc_template)
@@ -1604,6 +1608,9 @@ def tail(self, n=5):
16041608
return self._selected_obj[mask]
16051609

16061610

1611+
GroupBy._add_numeric_operations()
1612+
1613+
16071614
@Appender(GroupBy.__doc__)
16081615
def groupby(obj, by, **kwds):
16091616
if isinstance(obj, Series):

pandas/indexes/base.py

+19-8
Original file line numberDiff line numberDiff line change
@@ -2089,8 +2089,8 @@ def intersection(self, other):
20892089
"""
20902090
Form the intersection of two Index objects.
20912091
2092-
This returns a new Index with elements common to the index and `other`.
2093-
Sortedness of the result is not guaranteed.
2092+
This returns a new Index with elements common to the index and `other`,
2093+
preserving the order of the calling index.
20942094
20952095
Parameters
20962096
----------
@@ -2128,15 +2128,15 @@ def intersection(self, other):
21282128
pass
21292129

21302130
try:
2131-
indexer = Index(self._values).get_indexer(other._values)
2131+
indexer = Index(other._values).get_indexer(self._values)
21322132
indexer = indexer.take((indexer != -1).nonzero()[0])
21332133
except:
21342134
# duplicates
2135-
indexer = Index(self._values).get_indexer_non_unique(
2136-
other._values)[0].unique()
2135+
indexer = Index(other._values).get_indexer_non_unique(
2136+
self._values)[0].unique()
21372137
indexer = indexer[indexer != -1]
21382138

2139-
taken = self.take(indexer)
2139+
taken = other.take(indexer)
21402140
if self.name != other.name:
21412141
taken.name = None
21422142
return taken
@@ -2831,8 +2831,7 @@ def _reindex_non_unique(self, target):
28312831
new_index = self._shallow_copy_with_infer(new_labels, freq=None)
28322832
return new_index, indexer, new_indexer
28332833

2834-
def join(self, other, how='left', level=None, return_indexers=False):
2835-
"""
2834+
_index_shared_docs['join'] = """
28362835
*this is an internal non-public method*
28372836
28382837
Compute join_index and indexers to conform data
@@ -2844,11 +2843,20 @@ def join(self, other, how='left', level=None, return_indexers=False):
28442843
how : {'left', 'right', 'inner', 'outer'}
28452844
level : int or level name, default None
28462845
return_indexers : boolean, default False
2846+
sort : boolean, default False
2847+
Sort the join keys lexicographically in the result Index. If False,
2848+
the order of the join keys depends on the join type (how keyword)
2849+
2850+
.. versionadded:: 0.20.0
28472851
28482852
Returns
28492853
-------
28502854
join_index, (left_indexer, right_indexer)
28512855
"""
2856+
2857+
@Appender(_index_shared_docs['join'])
2858+
def join(self, other, how='left', level=None, return_indexers=False,
2859+
sort=False):
28522860
from .multi import MultiIndex
28532861
self_is_mi = isinstance(self, MultiIndex)
28542862
other_is_mi = isinstance(other, MultiIndex)
@@ -2929,6 +2937,9 @@ def join(self, other, how='left', level=None, return_indexers=False):
29292937
elif how == 'outer':
29302938
join_index = self.union(other)
29312939

2940+
if sort:
2941+
join_index = join_index.sort_values()
2942+
29322943
if return_indexers:
29332944
if join_index is self:
29342945
lindexer = None

0 commit comments

Comments
 (0)