Skip to content

Commit 8d2c2a8

Browse files
committed
BUG: groupby level should preserve level order, perf enhancement. docs fix. moved join op code from internals to merge.py
1 parent 5d2b2e3 commit 8d2c2a8

File tree

14 files changed

+457
-279
lines changed

14 files changed

+457
-279
lines changed

doc/source/basics.rst

+9 -9
Original file line number | Diff line number | Diff line change
@@ -692,10 +692,11 @@ produces the "keys" of the objects, namely:
692692

693693
Thus, for example:
694694

695-
.. ipython:: python
695+
.. ipython::
696696

697-
for col in df:
698-
print col
697+
In [0]: for col in df:
698+
...: print col
699+
...:
699700

700701
iteritems
701702
~~~~~~~~~
@@ -709,13 +710,12 @@ key-value pairs:
709710

710711
For example:
711712

712-
.. ipython:: python
713-
714-
for item, frame in wp.iteritems():
715-
print item
716-
print frame
713+
.. ipython::
717714

718-
.. _basics.sorting:
715+
In [0]: for item, frame in wp.iteritems():
716+
...: print item
717+
...: print frame
718+
...:
719719

720720
Sorting by index and value
721721
--------------------------

doc/source/io.rst

+2 -2
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
np.set_printoptions(precision=4, suppress=True)
1616
import matplotlib.pyplot as plt
1717
plt.close('all')
18-
clipdf = DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']},
18+
clipdf = DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']},
1919
index=['x','y','z'])
2020
2121
*******************************
@@ -295,7 +295,7 @@ which, if set to ``True``, will additionally output the length of the Series.
295295

296296

297297
Writing to HTML format
298-
~~~~~~~~~~~~~~~~~~~~~
298+
~~~~~~~~~~~~~~~~~~~~~~
299299

300300
.. _io.html:
301301

pandas/core/frame.py

+20 -12
Original file line number | Diff line number | Diff line change
@@ -2730,7 +2730,7 @@ def _concat_missing(values, n):
27302730
def _get_raw_column(self, col):
27312731
return self._data.get(col)
27322732

2733-
def join(self, other, on=None, how=None, lsuffix='', rsuffix=''):
2733+
def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
27342734
"""
27352735
Join columns with other DataFrame either on index or on a key
27362736
column.
@@ -2762,8 +2762,24 @@ def join(self, other, on=None, how=None, lsuffix='', rsuffix=''):
27622762
-------
27632763
joined : DataFrame
27642764
"""
2765-
if how is None:
2766-
how = 'left'
2765+
# For SparseDataFrame's benefit
2766+
return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
2767+
rsuffix=rsuffix)
2768+
2769+
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
2770+
# from pandas.tools.merge import merge
2771+
2772+
# if isinstance(other, Series):
2773+
# assert(other.name is not None)
2774+
# other = DataFrame({other.name : other})
2775+
2776+
# return merge(self, other, left_on=on, left_index=on is None,
2777+
# right_index=True, suffixes=(lsuffix, rsuffix))
2778+
2779+
if isinstance(other, Series):
2780+
assert(other.name is not None)
2781+
other = DataFrame({other.name : other})
2782+
27672783
if on is not None:
27682784
return self._join_on(other, on, how, lsuffix, rsuffix)
27692785
else:
@@ -2773,10 +2789,6 @@ def _join_on(self, other, on, how, lsuffix, rsuffix):
27732789
if how not in ('left', 'inner'): # pragma: no cover
27742790
raise Exception('Only inner / left joins currently supported')
27752791

2776-
if isinstance(other, Series):
2777-
assert(other.name is not None)
2778-
other = DataFrame({other.name : other})
2779-
27802792
if isinstance(on, (list, tuple)):
27812793
if len(on) == 1:
27822794
join_key = self[on[0]].values
@@ -2792,11 +2804,7 @@ def _join_on(self, other, on, how, lsuffix, rsuffix):
27922804
return self._constructor(new_data)
27932805

27942806
def _join_index(self, other, how, lsuffix, rsuffix):
2795-
from pandas.core.internals import join_managers
2796-
2797-
if isinstance(other, Series):
2798-
assert(other.name is not None)
2799-
other = DataFrame({other.name : other})
2807+
from pandas.tools.merge import join_managers
28002808

28012809
thisdata, otherdata = self._data._maybe_rename_join(
28022810
other._data, lsuffix, rsuffix, copydata=False)

pandas/core/groupby.py

+68 -20
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ def _obj_with_exclusions(self):
149149

150150
@property
151151
def _group_shape(self):
152-
return tuple(len(ping.ids) for ping in self.groupings)
152+
return tuple(len(ping.counts) for ping in self.groupings)
153153

154154
@property
155155
def _agg_stride_shape(self):
@@ -535,24 +535,35 @@ def __init__(self, index, grouper=None, name=None, level=None):
535535
if isinstance(grouper, Series) and name is None:
536536
self.name = grouper.name
537537

538+
# pre-computed
539+
self._was_factor = False
540+
538541
if level is not None:
539542
if not isinstance(level, int):
540543
assert(level in index.names)
541544
level = index.names.index(level)
542545

543546
inds = index.labels[level]
544-
labels = index.levels[level].take(inds)
547+
level_index = index.levels[level]
548+
545549
if self.name is None:
546550
self.name = index.names[level]
547551

552+
# XXX complete hack
553+
554+
level_values = index.levels[level].take(inds)
548555
if grouper is not None:
549-
self.grouper = labels.map(self.grouper)
556+
self.grouper = level_values.map(self.grouper)
550557
else:
551-
self.grouper = labels
552-
553-
# no level passed
554-
if not isinstance(self.grouper, np.ndarray):
555-
self.grouper = self.index.map(self.grouper)
558+
self._was_factor = True
559+
self._labels = inds
560+
self._group_index = level_index
561+
self._counts = lib.group_count(inds, len(level_index))
562+
self.grouper = level_values
563+
else:
564+
# no level passed
565+
if not isinstance(self.grouper, np.ndarray):
566+
self.grouper = self.index.map(self.grouper)
556567

557568
def __repr__(self):
558569
return 'Grouping(%s)' % self.name
@@ -563,6 +574,7 @@ def __iter__(self):
563574
_labels = None
564575
_ids = None
565576
_counts = None
577+
_group_index = None
566578

567579
@cache_readonly
568580
def indices(self):
@@ -577,7 +589,11 @@ def labels(self):
577589
@property
578590
def ids(self):
579591
if self._ids is None:
580-
self._make_labels()
592+
if self._was_factor:
593+
index = self._group_index
594+
self._ids = dict(zip(range(len(index)), index))
595+
else:
596+
self._make_labels()
581597
return self._ids
582598

583599
@cache_readonly
@@ -590,13 +606,21 @@ def counts(self):
590606
self._make_labels()
591607
return self._counts
592608

593-
@cache_readonly
609+
@property
594610
def group_index(self):
595-
return Index([self.ids[i] for i in range(len(self.ids))])
611+
if self._group_index is None:
612+
ids = self.ids
613+
values = np.arange(len(self.ids), dtype='O')
614+
self._group_index = Index(lib.lookup_values(values, ids))
615+
return self._group_index
596616

597617
def _make_labels(self):
598-
ids, labels, counts = _group_labels(self.grouper)
599-
sids, slabels, scounts = sort_group_labels(ids, labels, counts)
618+
if self._was_factor: # pragma: no cover
619+
raise Exception('Should not call this method grouping by level')
620+
else:
621+
ids, labels, counts = _group_labels(self.grouper)
622+
sids, slabels, scounts = sort_group_labels(ids, labels, counts)
623+
600624
self._labels = slabels
601625
self._ids = sids
602626
self._counts = scounts
@@ -768,7 +792,12 @@ def _get_index():
768792
if len(self.groupings) > 1:
769793
index = MultiIndex.from_tuples(keys, names=key_names)
770794
else:
771-
index = Index(keys, name=key_names[0])
795+
ping = self.groupings[0]
796+
if len(keys) == len(ping.counts):
797+
index = ping.group_index
798+
index.name = key_names[0]
799+
else:
800+
index = Index(keys, name=key_names[0])
772801
return index
773802

774803
if isinstance(values[0], Series):
@@ -981,7 +1010,10 @@ def _aggregate_generic(self, func, *args, **kwargs):
9811010

9821011
index_name = (self.groupings[0].name
9831012
if len(self.groupings) == 1 else None)
984-
result_index = Index(sorted(result), name=index_name)
1013+
1014+
result_index = self.groupings[0].group_index
1015+
1016+
# result_index = Index(sorted(result), name=index_name)
9851017

9861018
if result:
9871019
if axis == 0:
@@ -1062,25 +1094,36 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
10621094
not_indexed_same=not_indexed_same)
10631095
else:
10641096
if len(self.groupings) > 1:
1065-
keys = MultiIndex.from_tuples(keys, names=key_names)
1097+
key_index = MultiIndex.from_tuples(keys, names=key_names)
10661098
else:
1067-
keys = Index(keys, name=key_names[0])
1099+
ping = self.groupings[0]
1100+
if len(keys) == len(ping.counts):
1101+
key_index = ping.group_index
1102+
key_index.name = key_names[0]
1103+
1104+
key_lookup = Index(keys)
1105+
indexer = key_lookup.get_indexer(key_index)
1106+
1107+
# reorder the values
1108+
values = [values[i] for i in indexer]
1109+
else:
1110+
key_index = Index(keys, name=key_names[0])
10681111

10691112
if isinstance(values[0], np.ndarray):
10701113
if self.axis == 0:
10711114
stacked_values = np.vstack([np.asarray(x)
10721115
for x in values])
10731116
columns = values[0].index
1074-
index = keys
1117+
index = key_index
10751118
else:
10761119
stacked_values = np.vstack([np.asarray(x)
10771120
for x in values]).T
10781121
index = values[0].index
1079-
columns = keys
1122+
columns = key_index
10801123
return DataFrame(stacked_values, index=index,
10811124
columns=columns)
10821125
else:
1083-
return Series(values, index=keys)
1126+
return Series(values, index=key_index)
10841127

10851128
def transform(self, func, *args, **kwargs):
10861129
"""
@@ -1417,6 +1460,11 @@ def _group_labels(values):
14171460

14181461
def sort_group_labels(ids, labels, counts):
14191462
n = len(ids)
1463+
1464+
# corner all NA case
1465+
if n == 0:
1466+
return ids, labels, counts
1467+
14201468
rng = np.arange(n)
14211469
values = Series(ids, index=rng, dtype=object).values
14221470
indexer = values.argsort()

0 commit comments

Comments
 (0)