Skip to content

Commit 3f24b87

Browse files
committed
Merge pull request #9345 from iwschris/groupby_nano_int
BUG: Fixes GH9311 groupby on datetime64
2 parents f2882b8 + 5f6cbf8 commit 3f24b87

File tree

6 files changed

+1391
-743
lines changed

6 files changed

+1391
-743
lines changed

doc/source/whatsnew/v0.16.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,8 @@ Bug Fixes
187187
- Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)
188188
- Bug in ``Categorical.__getitem__/__setitem__`` with listlike input getting incorrect results from indexer coercion (:issue:`9469`)
189189
- Bug in partial setting with a DatetimeIndex (:issue:`9478`)
190+
- Bug in groupby on integer and datetime64 columns where applying an aggregator silently changed
191+
values that were sufficiently large, because they were cast to float64 (:issue:`9311`, :issue:`6620`)
190192
- Fixed bug in ``to_sql`` when mapping a ``Timestamp`` object column (datetime
191193
column with timezone info) to the according sqlalchemy type (:issue:`9085`).
192194
- Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated

pandas/core/groupby.py

+36-24
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
2525
notnull, _DATELIKE_DTYPES, is_numeric_dtype,
2626
is_timedelta64_dtype, is_datetime64_dtype,
27-
is_categorical_dtype, _values_from_object)
27+
is_categorical_dtype, _values_from_object,
28+
_is_datetime_or_timedelta_dtype, is_bool_dtype)
2829
from pandas.core.config import option_context
2930
import pandas.lib as lib
3031
from pandas.lib import Timestamp
@@ -1444,7 +1445,9 @@ def get_func(fname):
14441445
f = getattr(_algos, "%s_%s" % (fname, dtype_str), None)
14451446
if f is not None:
14461447
return f
1447-
return getattr(_algos, fname, None)
1448+
1449+
if dtype_str == 'float64':
1450+
return getattr(_algos, fname, None)
14481451

14491452
ftype = self._cython_functions[how]
14501453

@@ -1471,7 +1474,6 @@ def wrapper(*args, **kwargs):
14711474
return func, dtype_str
14721475

14731476
def aggregate(self, values, how, axis=0):
1474-
14751477
arity = self._cython_arity.get(how, 1)
14761478

14771479
vdim = values.ndim
@@ -1487,27 +1489,44 @@ def aggregate(self, values, how, axis=0):
14871489
raise NotImplementedError
14881490
out_shape = (self.ngroups,) + values.shape[1:]
14891491

1490-
if is_numeric_dtype(values.dtype):
1491-
values = com.ensure_float(values)
1492-
is_numeric = True
1493-
out_dtype = 'f%d' % values.dtype.itemsize
1492+
is_numeric = is_numeric_dtype(values.dtype)
1493+
1494+
if _is_datetime_or_timedelta_dtype(values.dtype):
1495+
values = values.view('int64')
1496+
elif is_bool_dtype(values.dtype):
1497+
values = _algos.ensure_float64(values)
1498+
elif com.is_integer_dtype(values):
1499+
values = values.astype('int64', copy=False)
1500+
elif is_numeric:
1501+
values = _algos.ensure_float64(values)
14941502
else:
1495-
is_numeric = issubclass(values.dtype.type, (np.datetime64,
1496-
np.timedelta64))
1503+
values = values.astype(object)
1504+
1505+
try:
1506+
agg_func, dtype_str = self._get_aggregate_function(how, values)
1507+
except NotImplementedError:
14971508
if is_numeric:
1498-
out_dtype = 'float64'
1499-
values = values.view('int64')
1509+
values = _algos.ensure_float64(values)
1510+
agg_func, dtype_str = self._get_aggregate_function(how, values)
15001511
else:
1501-
out_dtype = 'object'
1502-
values = values.astype(object)
1512+
raise
1513+
1514+
if is_numeric:
1515+
out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
1516+
else:
1517+
out_dtype = 'object'
15031518

15041519
# will be filled in Cython function
15051520
result = np.empty(out_shape, dtype=out_dtype)
1506-
15071521
result.fill(np.nan)
15081522
counts = np.zeros(self.ngroups, dtype=np.int64)
15091523

1510-
result = self._aggregate(result, counts, values, how, is_numeric)
1524+
result = self._aggregate(result, counts, values, agg_func, is_numeric)
1525+
1526+
if com.is_integer_dtype(result):
1527+
if len(result[result == tslib.iNaT]) > 0:
1528+
result = result.astype('float64')
1529+
result[result == tslib.iNaT] = np.nan
15111530

15121531
if self._filter_empty_groups and not counts.all():
15131532
if result.ndim == 2:
@@ -1535,9 +1554,7 @@ def aggregate(self, values, how, axis=0):
15351554

15361555
return result, names
15371556

1538-
def _aggregate(self, result, counts, values, how, is_numeric):
1539-
agg_func, dtype = self._get_aggregate_function(how, values)
1540-
1557+
def _aggregate(self, result, counts, values, agg_func, is_numeric):
15411558
comp_ids, _, ngroups = self.group_info
15421559
if values.ndim > 3:
15431560
# punting for now
@@ -1796,9 +1813,7 @@ def size(self):
17961813
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
17971814
}
17981815

1799-
def _aggregate(self, result, counts, values, how, is_numeric=True):
1800-
1801-
agg_func, dtype = self._get_aggregate_function(how, values)
1816+
def _aggregate(self, result, counts, values, agg_func, is_numeric=True):
18021817

18031818
if values.ndim > 3:
18041819
# punting for now
@@ -2535,9 +2550,6 @@ def _cython_agg_blocks(self, how, numeric_only=True):
25352550

25362551
values = block._try_operate(block.values)
25372552

2538-
if block.is_numeric:
2539-
values = _algos.ensure_float64(values)
2540-
25412553
result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
25422554

25432555
# see if we can cast the block back to the original dtype

pandas/core/internals.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1811,10 +1811,7 @@ def _try_coerce_args(self, values, other):
18111811
def _try_coerce_result(self, result):
18121812
""" reverse of try_coerce_args """
18131813
if isinstance(result, np.ndarray):
1814-
if result.dtype == 'i8':
1815-
result = tslib.array_to_datetime(
1816-
result.astype(object).ravel()).reshape(result.shape)
1817-
elif result.dtype.kind in ['i', 'f', 'O']:
1814+
if result.dtype.kind in ['i', 'f', 'O']:
18181815
result = result.astype('M8[ns]')
18191816
elif isinstance(result, (np.integer, np.datetime64)):
18201817
result = lib.Timestamp(result)

pandas/src/generate_code.py

+102-23
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
# don't introduce a pandas/pandas.compat import
44
# or we get a bootstrapping problem
55
from StringIO import StringIO
6+
import numpy as np
7+
8+
_int64_max = np.iinfo(np.int64).max
69

710
header = """
811
cimport numpy as np
@@ -680,7 +683,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
680683
for i in range(len(counts)):
681684
for j in range(K):
682685
if nobs[i, j] == 0:
683-
out[i, j] = nan
686+
out[i, j] = %(nan_val)s
684687
else:
685688
out[i, j] = resx[i, j]
686689
"""
@@ -726,7 +729,7 @@ def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
726729
for i in range(ngroups):
727730
for j in range(K):
728731
if nobs[i, j] == 0:
729-
out[i, j] = nan
732+
out[i, j] = %(nan_val)s
730733
else:
731734
out[i, j] = resx[i, j]
732735
"""
@@ -773,7 +776,7 @@ def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
773776
for i in range(ngroups):
774777
for j in range(K):
775778
if nobs[i, j] == 0:
776-
out[i, j] = nan
779+
out[i, j] = %(nan_val)s
777780
else:
778781
out[i, j] = resx[i, j]
779782
"""
@@ -819,7 +822,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
819822
for i in range(len(counts)):
820823
for j in range(K):
821824
if nobs[i, j] == 0:
822-
out[i, j] = nan
825+
out[i, j] = %(nan_val)s
823826
else:
824827
out[i, j] = resx[i, j]
825828
"""
@@ -1278,7 +1281,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
12781281
nobs = np.zeros_like(out)
12791282
12801283
minx = np.empty_like(out)
1281-
minx.fill(np.inf)
1284+
minx.fill(%(inf_val)s)
12821285
12831286
if bins[len(bins) - 1] == len(values):
12841287
ngroups = len(bins)
@@ -1319,7 +1322,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
13191322
for i in range(ngroups):
13201323
for j in range(K):
13211324
if nobs[i, j] == 0:
1322-
out[i, j] = nan
1325+
out[i, j] = %(nan_val)s
13231326
else:
13241327
out[i, j] = minx[i, j]
13251328
"""
@@ -1344,7 +1347,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
13441347
nobs = np.zeros_like(out)
13451348
13461349
maxx = np.empty_like(out)
1347-
maxx.fill(-np.inf)
1350+
maxx.fill(-%(inf_val)s)
13481351
13491352
N, K = (<object> values).shape
13501353
@@ -1381,7 +1384,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
13811384
for i in range(len(counts)):
13821385
for j in range(K):
13831386
if nobs[i, j] == 0:
1384-
out[i, j] = nan
1387+
out[i, j] = %(nan_val)s
13851388
else:
13861389
out[i, j] = maxx[i, j]
13871390
"""
@@ -1402,7 +1405,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
14021405
14031406
nobs = np.zeros_like(out)
14041407
maxx = np.empty_like(out)
1405-
maxx.fill(-np.inf)
1408+
maxx.fill(-%(inf_val)s)
14061409
14071410
if bins[len(bins) - 1] == len(values):
14081411
ngroups = len(bins)
@@ -1443,7 +1446,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
14431446
for i in range(ngroups):
14441447
for j in range(K):
14451448
if nobs[i, j] == 0:
1446-
out[i, j] = nan
1449+
out[i, j] = %(nan_val)s
14471450
else:
14481451
out[i, j] = maxx[i, j]
14491452
"""
@@ -1469,7 +1472,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
14691472
nobs = np.zeros_like(out)
14701473
14711474
minx = np.empty_like(out)
1472-
minx.fill(np.inf)
1475+
minx.fill(%(inf_val)s)
14731476
14741477
N, K = (<object> values).shape
14751478
@@ -1506,7 +1509,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
15061509
for i in range(len(counts)):
15071510
for j in range(K):
15081511
if nobs[i, j] == 0:
1509-
out[i, j] = nan
1512+
out[i, j] = %(nan_val)s
15101513
else:
15111514
out[i, j] = minx[i, j]
15121515
"""
@@ -2286,6 +2289,70 @@ def generate_put_template(template, use_ints=True, use_floats=True,
22862289
output.write(func)
22872290
return output.getvalue()
22882291

2292+
def generate_put_min_max_template(template, use_ints=True, use_floats=True,
                                  use_objects=False, use_datelikes=False):
    """Render a group min/max template once for every enabled dtype.

    Parameters
    ----------
    template : str
        %-style template that may reference the keys ``name``,
        ``dest_type2``, ``nan_val`` and ``inf_val``.
    use_ints, use_floats, use_objects, use_datelikes : bool
        Select which dtype families are rendered.

    Returns
    -------
    str
        The rendered functions concatenated in the order floats, ints,
        objects, date-likes; an empty string when nothing is enabled.
    """
    # Each spec is (name, dest_type2, nan_val, inf_val).
    floats_list = [
        ('float64', 'float64_t', 'nan', 'np.inf'),
        ('float32', 'float32_t', 'nan', 'np.inf'),
    ]
    # int64 has no nan/inf of its own: iNaT marks "no observation" and the
    # largest int64 is used as the fill value in place of infinity.
    ints_list = [
        ('int64', 'int64_t', 'iNaT', _int64_max),
    ]
    date_like_list = [
        ('int64', 'int64_t', 'iNaT', _int64_max),
    ]
    object_list = [('object', 'object', 'nan', 'np.inf')]

    function_list = []
    for enabled, specs in ((use_floats, floats_list),
                           (use_ints, ints_list),
                           (use_objects, object_list),
                           (use_datelikes, date_like_list)):
        if not enabled:
            continue
        for spec in specs:
            # The integer and date-like specs are the same int64 entry;
            # skip repeats so a function is never rendered twice when both
            # families are enabled.
            if spec not in function_list:
                function_list.append(spec)

    return ''.join(template % {'name': name,
                               'dest_type2': dest_type,
                               'nan_val': nan_val,
                               'inf_val': inf_val}
                   for name, dest_type, nan_val, inf_val in function_list)
2323+
2324+
def generate_put_selection_template(template, use_ints=True, use_floats=True,
                                    use_objects=False, use_datelikes=False):
    """Render a group selection template once for every enabled dtype.

    Parameters
    ----------
    template : str
        %-style template that may reference the keys ``name``, ``c_type``,
        ``dest_type2`` and ``nan_val``.
    use_ints, use_floats, use_objects, use_datelikes : bool
        Select which dtype families are rendered.

    Returns
    -------
    str
        The rendered functions concatenated in the order floats, ints,
        objects, date-likes; an empty string when nothing is enabled.
    """
    # Each spec is (name, c_type, dest_type2, nan_val).
    floats_list = [
        ('float64', 'float64_t', 'float64_t', 'nan'),
        ('float32', 'float32_t', 'float32_t', 'nan'),
    ]
    # int64 results keep their integer type; iNaT stands in for nan.
    ints_list = [
        ('int64', 'int64_t', 'int64_t', 'iNaT'),
    ]
    date_like_list = [
        ('int64', 'int64_t', 'int64_t', 'iNaT'),
    ]
    object_list = [('object', 'object', 'object', 'nan')]

    function_list = []
    for enabled, specs in ((use_floats, floats_list),
                           (use_ints, ints_list),
                           (use_objects, object_list),
                           (use_datelikes, date_like_list)):
        if not enabled:
            continue
        for spec in specs:
            # The integer and date-like specs are the same int64 entry;
            # skip repeats so a function is never rendered twice when both
            # families are enabled.
            if spec not in function_list:
                function_list.append(spec)

    return ''.join(template % {'name': name,
                               'c_type': c_type,
                               'dest_type2': dest_type,
                               'nan_val': nan_val}
                   for name, c_type, dest_type, nan_val in function_list)
2355+
22892356
def generate_take_template(template, exclude=None):
22902357
# name, dest, ctypein, ctypeout, preval, postval, cancopy
22912358
function_list = [
@@ -2347,24 +2414,27 @@ def generate_from_template(template, exclude=None):
23472414
return output.getvalue()
23482415

23492416
put_2d = [diff_2d_template]
2350-
groupbys = [group_last_template,
2351-
group_last_bin_template,
2352-
group_nth_template,
2353-
group_nth_bin_template,
2354-
group_add_template,
2417+
2418+
groupbys = [group_add_template,
23552419
group_add_bin_template,
23562420
group_prod_template,
23572421
group_prod_bin_template,
23582422
group_var_template,
23592423
group_var_bin_template,
23602424
group_mean_template,
23612425
group_mean_bin_template,
2362-
group_min_template,
2363-
group_min_bin_template,
2364-
group_max_template,
2365-
group_max_bin_template,
23662426
group_ohlc_template]
23672427

2428+
groupby_selection = [group_last_template,
2429+
group_last_bin_template,
2430+
group_nth_template,
2431+
group_nth_bin_template]
2432+
2433+
groupby_min_max = [group_min_template,
2434+
group_min_bin_template,
2435+
group_max_template,
2436+
group_max_bin_template]
2437+
23682438
groupby_count = [group_count_template, group_count_bin_template]
23692439

23702440
templates_1d = [map_indices_template,
@@ -2407,9 +2477,18 @@ def generate_take_cython_file(path='generated.pyx'):
24072477
for template in groupbys:
24082478
print(generate_put_template(template, use_ints=False), file=f)
24092479

2480+
for template in groupby_selection:
2481+
print(generate_put_selection_template(template, use_ints=True),
2482+
file=f)
2483+
2484+
for template in groupby_min_max:
2485+
print(generate_put_min_max_template(template, use_ints=True),
2486+
file=f)
2487+
24102488
for template in groupby_count:
2411-
print(generate_put_template(template, use_ints=False,
2412-
use_datelikes=True, use_objects=True),
2489+
print(generate_put_selection_template(template, use_ints=True,
2490+
use_datelikes=True,
2491+
use_objects=True),
24132492
file=f)
24142493

24152494
# for template in templates_1d_datetime:

0 commit comments

Comments
 (0)