Skip to content

Commit adc9238

Browse files
committed
BUG: groupby.first/last datetime64 type issue. close #2133
1 parent 189d04c commit adc9238

File tree

6 files changed

+69
-28
lines changed

6 files changed

+69
-28
lines changed

RELEASE.rst

+1
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ pandas 0.9.1
6969
- Fix partial integer indexing bug in DataFrame.xs (#2107)
7070
- Fix variety of cut/qcut string-bin formatting bugs (#1978, #1979)
7171
- Raise Exception when xs view not possible of MultiIndex'd DataFrame (#2117)
72+
- Fix groupby(...).first() issue with datetime64 (#2133)
7273
7374
pandas 0.9.0
7475
============

pandas/core/frame.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1437,11 +1437,12 @@ def convert_objects(self):
14371437
converted : DataFrame
14381438
"""
14391439
new_data = {}
1440+
convert_f = lambda x: lib.maybe_convert_objects(x, convert_datetime=1)
14401441

14411442
# TODO: could be more efficient taking advantage of the block
14421443
for col, s in self.iteritems():
14431444
if s.dtype == np.object_:
1444-
new_data[col] = lib.maybe_convert_objects(s)
1445+
new_data[col] = convert_f(s)
14451446
else:
14461447
new_data[col] = s
14471448

pandas/core/groupby.py

+56-25
Original file line number | Diff line number | Diff line change
@@ -27,12 +27,16 @@ class SpecificationError(GroupByError):
2727
pass
2828

2929

30-
def _groupby_function(name, alias, npfunc, numeric_only=True):
30+
def _groupby_function(name, alias, npfunc, numeric_only=True,
31+
_convert=False):
3132
def f(self):
3233
try:
3334
return self._cython_agg_general(alias, numeric_only=numeric_only)
3435
except Exception:
35-
return self.aggregate(lambda x: npfunc(x, axis=self.axis))
36+
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
37+
if _convert:
38+
result = result.convert_objects()
39+
return result
3640

3741
f.__doc__ = "Compute %s of group values" % name
3842
f.__name__ = name
@@ -41,19 +45,31 @@ def f(self):
4145

4246

4347
def _first_compat(x, axis=0):
    """Return the first non-null value of ``x``.

    Python-level stand-in for the ``first`` aggregation: it is the
    ``npfunc`` handed to ``_groupby_function`` and is called as
    ``npfunc(x, axis=...)`` when the Cython path raises.

    Parameters
    ----------
    x : array-like or DataFrame
        Values of a single group.
    axis : int, default 0
        Only used for a DataFrame, where it is forwarded to ``apply``.

    Returns
    -------
    The first non-null element (``np.nan`` when there is none); for a
    DataFrame, the result of applying that rule along ``axis``.
    """
    def _first(values):
        # Drop null entries, then take the leading survivor.
        values = np.asarray(values)
        valid = values[com.notnull(values)]
        return valid[0] if len(valid) else np.nan

    if not isinstance(x, DataFrame):
        return _first(x)
    # A DataFrame is reduced piecewise so each slice keeps its own values
    # instead of being flattened into one mixed array.
    return x.apply(_first, axis=axis)
4959

5060

5161
def _last_compat(x, axis=0):
    """Return the last non-null value of ``x``.

    Python-level stand-in for the ``last`` aggregation: it is the
    ``npfunc`` handed to ``_groupby_function`` and is called as
    ``npfunc(x, axis=...)`` when the Cython path raises.

    Parameters
    ----------
    x : array-like or DataFrame
        Values of a single group.
    axis : int, default 0
        Only used for a DataFrame, where it is forwarded to ``apply``.

    Returns
    -------
    The last non-null element (``np.nan`` when there is none); for a
    DataFrame, the result of applying that rule along ``axis``.
    """
    def _last(values):
        # Drop null entries, then take the trailing survivor.
        values = np.asarray(values)
        valid = values[com.notnull(values)]
        return valid[-1] if len(valid) else np.nan

    if not isinstance(x, DataFrame):
        return _last(x)
    # A DataFrame is reduced piecewise so each slice keeps its own values
    # instead of being flattened into one mixed array.
    return x.apply(_last, axis=axis)
5773

5874

5975
class GroupBy(object):
@@ -357,8 +373,9 @@ def size(self):
357373
min = _groupby_function('min', 'min', np.min)
358374
max = _groupby_function('max', 'max', np.max)
359375
first = _groupby_function('first', 'first', _first_compat,
360-
numeric_only=False)
361-
last = _groupby_function('last', 'last', _last_compat, numeric_only=False)
376+
numeric_only=False, _convert=True)
377+
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
378+
_convert=True)
362379

363380
def ohlc(self):
364381
"""
@@ -380,7 +397,7 @@ def picker(arr):
380397
def _cython_agg_general(self, how, numeric_only=True):
381398
output = {}
382399
for name, obj in self._iterate_slices():
383-
is_numeric = issubclass(obj.dtype.type, (np.number, np.bool_))
400+
is_numeric = _is_numeric_dtype(obj.dtype)
384401
if numeric_only and not is_numeric:
385402
continue
386403

@@ -699,12 +716,6 @@ def get_group_levels(self):
699716
_filter_empty_groups = True
700717

701718
def aggregate(self, values, how, axis=0):
702-
values = com.ensure_float(values)
703-
is_numeric = True
704-
705-
if not issubclass(values.dtype.type, (np.number, np.bool_)):
706-
values = values.astype(object)
707-
is_numeric = False
708719

709720
arity = self._cython_arity.get(how, 1)
710721

@@ -721,6 +732,16 @@ def aggregate(self, values, how, axis=0):
721732
raise NotImplementedError
722733
out_shape = (self.ngroups,) + values.shape[1:]
723734

735+
if _is_numeric_dtype(values.dtype):
736+
values = com.ensure_float(values)
737+
is_numeric = True
738+
else:
739+
if issubclass(values.dtype.type, np.datetime64):
740+
raise Exception('Cython not able to handle this case')
741+
742+
values = values.astype(object)
743+
is_numeric = False
744+
724745
# will be filled in Cython function
725746
result = np.empty(out_shape, dtype=values.dtype)
726747
counts = np.zeros(self.ngroups, dtype=np.int64)
@@ -753,10 +774,11 @@ def aggregate(self, values, how, axis=0):
753774
return result, names
754775

755776
def _aggregate(self, result, counts, values, how, is_numeric):
756-
fdict = self._cython_functions
757777
if not is_numeric:
758-
fdict = self._cython_object_functions
759-
agg_func = fdict[how]
778+
agg_func = self._cython_object_functions[how]
779+
else:
780+
agg_func = self._cython_functions[how]
781+
760782
trans_func = self._cython_transforms.get(how, lambda x: x)
761783

762784
comp_ids, _, ngroups = self.group_info
@@ -1458,12 +1480,15 @@ def _cython_agg_blocks(self, how, numeric_only=True):
14581480

14591481
for block in data.blocks:
14601482
values = block.values
1461-
is_numeric = issubclass(values.dtype.type, (np.number, np.bool_))
1483+
1484+
is_numeric = _is_numeric_dtype(values.dtype)
1485+
14621486
if numeric_only and not is_numeric:
14631487
continue
14641488

14651489
if is_numeric:
14661490
values = com.ensure_float(values)
1491+
14671492
result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
14681493
newb = make_block(result, block.items, block.ref_items)
14691494
new_blocks.append(newb)
@@ -2231,6 +2256,12 @@ def _reorder_by_uniques(uniques, labels):
22312256
}
22322257

22332258

2259+
def _is_numeric_dtype(dt):
2260+
typ = dt.type
2261+
return (issubclass(typ, (np.number, np.bool_))
2262+
and not issubclass(typ, (np.datetime64, np.timedelta64)))
2263+
2264+
22342265
def _intercept_function(func):
    """Look ``func`` up in ``_func_table``; return ``func`` itself if absent."""
    substitute = _func_table.get(func, func)
    return substitute
22362267

pandas/core/internals.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -359,7 +359,7 @@ def _try_cast(self, element):
359359
return element
360360

361361
def should_store(self, value):
    """Report whether ``value`` has an integer dtype, per ``com.is_integer_dtype``."""
    is_integer = com.is_integer_dtype(value)
    return is_integer
363363

364364
class BoolBlock(Block):
365365
_can_hold_na = False

pandas/src/inference.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -431,7 +431,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
431431
elif util.is_complex_object(val):
432432
complexes[i] = val
433433
seen_complex = 1
434-
elif PyDateTime_Check(val):
434+
elif PyDateTime_Check(val) or util.is_datetime64_object(val):
435435
if convert_datetime:
436436
seen_datetime = 1
437437
idatetimes[i] = convert_to_tsobject(val).value

pandas/tests/test_groupby.py

+8
Original file line number | Diff line number | Diff line change
@@ -2083,7 +2083,15 @@ def test_groupby_categorical_no_compress(self):
20832083
exp = data.groupby(labels).mean().reindex(cats.levels)
20842084
assert_series_equal(result, exp)
20852085

2086+
def test_groupby_first_datetime64(self):
2087+
df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
2088+
df[1] = df[1].view('M8[ns]')
20862089

2090+
self.assert_(issubclass(df[1].dtype.type, np.datetime64))
2091+
2092+
result = df.groupby(level=0).first()
2093+
got_dt = result[1].dtype
2094+
self.assert_(issubclass(got_dt.type, np.datetime64))
20872095

20882096

20892097
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):

0 commit comments

Comments
 (0)