Skip to content

Commit def0155

Browse files
committed
BUG/TST: fix tests for groupby nth on Series (GH7559)
1 parent 4082c1a commit def0155

File tree

4 files changed

+98
-33
lines changed

4 files changed

+98
-33
lines changed

doc/source/v0.14.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ Bug Fixes
173173
- Bug in setitem with list-of-lists and single vs mixed types (:issue:`7551`)
174174
- Bug in timeops with non-aligned Series (:issue:`7500`)
175175
- Bug in timedelta inference when assigning an incomplete Series (:issue:`7592`)
176+
- Bug in groupby ``.nth`` with a Series and integer-like column name (:issue:`7559`)
176177

177178
- Bug in ``value_counts`` where ``NaT`` did not qualify as missing (``NaN``) (:issue:`7423`)
178179

pandas/core/groupby.py

+50-20
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ def _selected_obj(self):
467467
def _set_selection_from_grouper(self):
468468
""" we may need to create a selection if we have non-level groupers """
469469
grp = self.grouper
470-
if self.as_index and getattr(grp,'groupings',None) is not None:
470+
if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
471471
ax = self.obj._info_axis
472472
groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
473473
if len(groupers):
@@ -759,7 +759,7 @@ def nth(self, n, dropna=None):
759759
760760
Examples
761761
--------
762-
>>> DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
762+
>>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
763763
>>> g = df.groupby('A')
764764
>>> g.nth(0)
765765
A B
@@ -804,7 +804,10 @@ def nth(self, n, dropna=None):
804804
if self.as_index:
805805
ax = self.obj._info_axis
806806
names = self.grouper.names
807-
if all([ n in ax for n in names ]):
807+
if self.obj.ndim == 1:
808+
# this is a pass-thru
809+
pass
810+
elif all([ n in ax for n in names ]):
808811
result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
809812
elif self._group_selection is not None:
810813
result.index = self.obj._get_axis(self.axis)[is_nth]
@@ -821,17 +824,29 @@ def nth(self, n, dropna=None):
821824
"(was passed %s)." % (dropna),)
822825

823826
# old behaviour, but with all and any support for DataFrames.
824-
827+
# modified in GH 7559 to have better perf
825828
max_len = n if n >= 0 else - 1 - n
829+
dropped = self.obj.dropna(how=dropna, axis=self.axis)
826830

827-
def picker(x):
828-
x = x.dropna(how=dropna) # Note: how is ignored if Series
829-
if len(x) <= max_len:
830-
return np.nan
831-
else:
832-
return x.iloc[n]
831+
# get a new grouper for our dropped obj
832+
grouper, exclusions, obj = _get_grouper(dropped, key=self.keys, axis=self.axis,
833+
level=self.level, sort=self.sort)
834+
835+
sizes = obj.groupby(grouper).size()
836+
result = obj.groupby(grouper).nth(n)
837+
mask = (sizes<max_len).values
838+
839+
# set the results which don't meet the criteria
840+
if len(result) and mask.any():
841+
result.loc[mask] = np.nan
833842

834-
return self.agg(picker)
843+
# reset/reindex to the original groups
844+
if len(self.obj) == len(dropped):
845+
result.index = self.grouper.result_index
846+
else:
847+
result = result.reindex(self.grouper.result_index)
848+
849+
return result
835850

836851
def cumcount(self, **kwargs):
837852
"""
@@ -942,21 +957,33 @@ def tail(self, n=5):
942957
def _cumcount_array(self, arr=None, **kwargs):
943958
"""
944959
arr is where cumcount gets its values from
960+
961+
note: this currently implements sort=False (although sort=True is the
962+
default for groupby in general)
945963
"""
946964
ascending = kwargs.pop('ascending', True)
947965

948966
if arr is None:
949967
arr = np.arange(self.grouper._max_groupsize, dtype='int64')
950968

951969
len_index = len(self._selected_obj.index)
952-
cumcounts = np.empty(len_index, dtype=arr.dtype)
970+
cumcounts = np.zeros(len_index, dtype=arr.dtype)
971+
if not len_index:
972+
return cumcounts
973+
974+
indices, values = [], []
975+
for v in self.indices.values():
976+
indices.append(v)
977+
978+
if ascending:
979+
values.append(arr[:len(v)])
980+
else:
981+
values.append(arr[len(v)-1::-1])
982+
983+
indices = np.concatenate(indices)
984+
values = np.concatenate(values)
985+
cumcounts[indices] = values
953986

954-
if ascending:
955-
for v in self.indices.values():
956-
cumcounts[v] = arr[:len(v)]
957-
else:
958-
for v in self.indices.values():
959-
cumcounts[v] = arr[len(v)-1::-1]
960987
return cumcounts
961988

962989
def _index_with_as_index(self, b):
@@ -1270,6 +1297,7 @@ def group_info(self):
12701297
comp_ids = com._ensure_int64(comp_ids)
12711298
return comp_ids, obs_group_ids, ngroups
12721299

1300+
12731301
def _get_compressed_labels(self):
12741302
all_labels = [ping.labels for ping in self.groupings]
12751303
if self._overflow_possible:
@@ -1892,7 +1920,6 @@ def groups(self):
18921920
self._groups = self.index.groupby(self.grouper)
18931921
return self._groups
18941922

1895-
18961923
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
18971924
"""
18981925
create and return a BaseGrouper, which is an internal
@@ -2141,7 +2168,10 @@ def _wrap_aggregated_output(self, output, names=None):
21412168
if names is not None:
21422169
return DataFrame(output, index=index, columns=names)
21432170
else:
2144-
return Series(output, index=index, name=self.name)
2171+
name = self.name
2172+
if name is None:
2173+
name = self._selected_obj.name
2174+
return Series(output, index=index, name=name)
21452175

21462176
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
21472177
if len(keys) == 0:

pandas/tests/test_groupby.py

+22
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,28 @@ def test_nth(self):
282282
expected = df.loc[[]]
283283
assert_frame_equal(result,expected)
284284

285+
# GH 7559
286+
# from the vbench
287+
df = DataFrame(np.random.randint(1, 10, (100, 2)))
288+
s = df[1]
289+
g = df[0]
290+
expected = s.groupby(g).first()
291+
expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
292+
assert_series_equal(expected2,expected)
293+
294+
# validate first
295+
v = s[g==1].iloc[0]
296+
self.assertEqual(expected.iloc[0],v)
297+
self.assertEqual(expected2.iloc[0],v)
298+
299+
# this is NOT the same as .first (as sort=True is the groupby default!)
300+
# as it keeps the order in the series (and not the group order)
301+
# related GH 7287
302+
expected = s.groupby(g,sort=False).first()
303+
expected.index = range(1,10)
304+
result = s.groupby(g).nth(0,dropna='all')
305+
assert_series_equal(result,expected)
306+
285307
def test_grouper_index_types(self):
286308
# related GH5375
287309
# groupby misbehaving when using a Floatlike index

vb_suite/groupby.py

+25-13
Original file line numberDiff line numberDiff line change
@@ -244,11 +244,14 @@ def f():
244244
groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
245245
start_date=datetime(2013, 1, 1))
246246

247-
groupby_nth_float64 = Benchmark('data.groupby(labels).nth(0)', setup,
248-
start_date=datetime(2012, 5, 1))
249-
250-
groupby_nth_float32 = Benchmark('data2.groupby(labels).nth(0)', setup,
251-
start_date=datetime(2013, 1, 1))
247+
groupby_nth_float64_none = Benchmark('data.groupby(labels).nth(0)', setup,
248+
start_date=datetime(2012, 5, 1))
249+
groupby_nth_float32_none = Benchmark('data2.groupby(labels).nth(0)', setup,
250+
start_date=datetime(2013, 1, 1))
251+
groupby_nth_float64_any = Benchmark('data.groupby(labels).nth(0,dropna="all")', setup,
252+
start_date=datetime(2012, 5, 1))
253+
groupby_nth_float32_any = Benchmark('data2.groupby(labels).nth(0,dropna="all")', setup,
254+
start_date=datetime(2013, 1, 1))
252255

253256
# with datetimes (GH7555)
254257
setup = common_setup + """
@@ -259,8 +262,10 @@ def f():
259262
start_date=datetime(2013, 5, 1))
260263
groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup,
261264
start_date=datetime(2013, 5, 1))
262-
groupby_nth_datetimes = Benchmark('df.groupby("b").nth(0)', setup,
263-
start_date=datetime(2013, 5, 1))
265+
groupby_nth_datetimes_none = Benchmark('df.groupby("b").nth(0)', setup,
266+
start_date=datetime(2013, 5, 1))
267+
groupby_nth_datetimes_any = Benchmark('df.groupby("b").nth(0,dropna="all")', setup,
268+
start_date=datetime(2013, 5, 1))
264269

265270
# with object
266271
setup = common_setup + """
@@ -271,8 +276,10 @@ def f():
271276
start_date=datetime(2013, 5, 1))
272277
groupby_last_object = Benchmark('df.groupby("b").last()', setup,
273278
start_date=datetime(2013, 5, 1))
274-
groupby_nth_object = Benchmark('df.groupby("b").nth(0)', setup,
275-
start_date=datetime(2013, 5, 1))
279+
groupby_nth_object_none = Benchmark('df.groupby("b").nth(0)', setup,
280+
start_date=datetime(2013, 5, 1))
281+
groupby_nth_object_any = Benchmark('df.groupby("b").nth(0,dropna="any")', setup,
282+
start_date=datetime(2013, 5, 1))
276283

277284
#----------------------------------------------------------------------
278285
# groupby_indices replacement, chop up Series
@@ -351,11 +358,16 @@ def f(g):
351358
"""
352359

353360
# Not really a fair test as behaviour has changed!
354-
groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup,
355-
start_date=datetime(2014, 3, 1))
361+
groupby_frame_nth_none = Benchmark("df.groupby(0).nth(0)", setup,
362+
start_date=datetime(2014, 3, 1))
363+
364+
groupby_series_nth_none = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
365+
start_date=datetime(2014, 3, 1))
366+
groupby_frame_nth_any= Benchmark("df.groupby(0).nth(0,dropna='any')", setup,
367+
start_date=datetime(2014, 3, 1))
356368

357-
groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
358-
start_date=datetime(2014, 3, 1))
369+
groupby_series_nth_any = Benchmark("df[1].groupby(df[0]).nth(0,dropna='any')", setup,
370+
start_date=datetime(2014, 3, 1))
359371

360372

361373
#----------------------------------------------------------------------

0 commit comments

Comments
 (0)