@@ -6,7 +6,7 @@
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.core.index import Index, MultiIndex
-from pandas.core.internals import BlockManager
+from pandas.core.internals import BlockManager, make_block
 from pandas.core.series import Series
 from pandas.core.panel import Panel
 from pandas.util.decorators import cache_readonly, Appender
@@ -156,7 +156,7 @@ def _group_shape(self):
         return tuple(ping.ngroups for ping in self.groupings)
 
     def __getattr__(self, attr):
-        if hasattr(self.obj, attr):
+        if hasattr(self.obj, attr) and attr != '_cache':
             return self._make_wrapper(attr)
         raise AttributeError("'%s' object has no attribute '%s'" %
                              (type(self).__name__, attr))
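For context on the `_cache` exclusion above: pandas' `cache_readonly` decorator keeps computed values in an instance-level `_cache` dict and expects an `AttributeError` when that dict does not exist yet, so `__getattr__` must not forward `_cache` lookups to the wrapped object. A minimal sketch of the idea with hypothetical `Data`/`Wrapper` classes (simplified stand-ins, not pandas code; the real method returns `self._make_wrapper(attr)` rather than delegating directly):

```python
class Data(object):
    def __init__(self):
        self._cache = {'stale': True}   # the wrapped object's own cache
        self.values = [1, 2, 3]


class Wrapper(object):
    def __init__(self, obj):
        self.obj = obj

    def __getattr__(self, attr):
        # Without the '_cache' exclusion, wrapper._cache would be answered
        # with Data's cache dict; caching helpers that keep per-wrapper state
        # in self._cache would then read and write the wrong dictionary.
        if hasattr(self.obj, attr) and attr != '_cache':
            return getattr(self.obj, attr)
        raise AttributeError(attr)


w = Wrapper(Data())
print(w.values)              # delegated -> [1, 2, 3]
print(hasattr(w, '_cache'))  # False: the wrapper's own cache slot stays free
```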
@@ -352,9 +352,7 @@ def _cython_agg_general(self, how):
             if not issubclass(obj.dtype.type, (np.number, np.bool_)):
                 continue
 
-            if obj.dtype != np.float64:
-                obj = obj.astype('f8')
-
+            obj = com._ensure_float64(obj)
             result, counts = cython_aggregate(obj, comp_ids,
                                               max_group, how=how)
             mask = counts > 0
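The `com._ensure_float64` call replaces the inline dtype check that the removed lines performed. A rough, self-contained sketch of what such an ensure-dtype helper amounts to (illustrative only; the real helper lives in `pandas.core.common` and its implementation may differ):

```python
import numpy as np

def ensure_float64(arr):
    # Cast to float64 only when needed; leave float64 input untouched.
    arr = np.asarray(arr)
    if arr.dtype != np.float64:
        arr = arr.astype(np.float64)
    return arr

ints = np.array([1, 2, 3], dtype=np.int64)
print(ensure_float64(ints).dtype)        # float64
floats = np.array([1.0, 2.0])
print(ensure_float64(floats) is floats)  # True: no copy when already float64
```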
@@ -395,10 +393,7 @@ def _python_agg_general(self, func, *args, **kwargs):
     def _group_index(self):
         result = get_group_index([ping.labels for ping in self.groupings],
                                  self._group_shape)
-
-        if result.dtype != np.int64:  # pragma: no cover
-            result = result.astype('i8')
-        return result
+        return com._ensure_int64(result)
 
     def _get_multi_index(self, mask, obs_ids):
         masked = [labels for _, labels in
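`_group_index` combines the label arrays of all groupings into one integer id per row (via `get_group_index`) and now routes the result through `com._ensure_int64` instead of a manual dtype check. As a rough illustration of what a combined group index looks like, assuming it behaves like a row-major ravel of the per-grouping labels (a simplification, not the pandas implementation):

```python
import numpy as np

labels_a = np.array([0, 0, 1, 1])   # labels from the first grouping (2 groups)
labels_b = np.array([0, 1, 0, 1])   # labels from the second grouping (2 groups)
shape = (2, 2)                      # number of groups per grouping

# One dense id per row, equivalent to raveling the (a, b) label pair row-major.
group_index = np.ravel_multi_index((labels_a, labels_b), shape).astype(np.int64)
print(group_index)  # [0 1 2 3]
```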
@@ -642,9 +637,7 @@ def _make_labels(self):
         if self._was_factor:  # pragma: no cover
             raise Exception('Should not call this method grouping by level')
         else:
-            values = self.grouper
-            if values.dtype != np.object_:
-                values = values.astype('O')
+            values = com._ensure_object(self.grouper)
 
             # khash
             rizer = lib.Factorizer(len(values))
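`lib.Factorizer` (the khash-based helper referenced by the comment) works on object-dtype input, hence the `com._ensure_object` coercion. A pure-Python sketch of the factorizing idea, mapping distinct values to dense integer labels (a stand-in, not the hash-table implementation):

```python
import numpy as np

def factorize(values):
    # Map each distinct value to the order in which it was first seen.
    seen, uniques, labels = {}, [], []
    for v in values:
        if v not in seen:
            seen[v] = len(uniques)
            uniques.append(v)
        labels.append(seen[v])
    return np.asarray(labels, dtype=np.int64), uniques

labels, uniques = factorize(np.array(['b', 'a', 'b', 'c'], dtype=object))
print(labels)   # [0 1 0 2]
print(uniques)  # ['b', 'a', 'c']
```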
@@ -955,6 +948,73 @@ def _iterate_slices(self):
 
             yield val, slicer(val)
 
+
+    def _cython_agg_general(self, how):
+
+        group_index = self._group_index
+        comp_ids, obs_group_ids = _compress_group_index(group_index)
+        max_group = len(obs_group_ids)
+
+        obj = self._obj_with_exclusions
+        if self.axis == 1:
+            obj = obj.T
+
+        new_blocks = []
+
+        for block in obj._data.blocks:
+            values = block.values.T
+            if not issubclass(values.dtype.type, (np.number, np.bool_)):
+                continue
+
+            values = com._ensure_float64(values)
+            result, counts = cython_aggregate(values, comp_ids,
+                                              max_group, how=how)
+
+            mask = counts > 0
+            if len(mask) > 0:
+                result = result[mask]
+            newb = make_block(result.T, block.items, block.ref_items)
+            new_blocks.append(newb)
+
+        if len(new_blocks) == 0:
+            raise GroupByError('No numeric types to aggregate')
+
+        agg_axis = 0 if self.axis == 1 else 1
+        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
+
+        if sum(len(x.items) for x in new_blocks) == len(agg_labels):
+            output_keys = agg_labels
+        else:
+            output_keys = []
+            for b in new_blocks:
+                output_keys.extend(b.items)
+            try:
+                output_keys.sort()
+            except TypeError:  # pragma
+                pass
+
+            if isinstance(agg_labels, MultiIndex):
+                output_keys = MultiIndex.from_tuples(output_keys,
+                                                     names=agg_labels.names)
+
+        if not self.as_index:
+            index = np.arange(new_blocks[0].values.shape[1])
+            mgr = BlockManager(new_blocks, [output_keys, index])
+            result = DataFrame(mgr)
+            group_levels = self._get_group_levels(mask, obs_group_ids)
+            for i, (name, labels) in enumerate(group_levels):
+                result.insert(i, name, labels)
+            result = result.consolidate()
+        else:
+            index = self._get_multi_index(mask, obs_group_ids)
+            mgr = BlockManager(new_blocks, [output_keys, index])
+            result = DataFrame(mgr)
+
+        if self.axis == 1:
+            result = result.T
+
+        return result
+
     @cache_readonly
     def _obj_with_exclusions(self):
         if self._column is not None:
@@ -1282,8 +1342,9 @@ def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
     -------
     generator
     """
-    indexer = lib.groupsort_indexer(group_index.astype('i4'),
-                                    ngroups)[0]
+    group_index = com._ensure_int32(group_index)
+
+    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
     group_index = group_index.take(indexer)
 
     if isinstance(data, BlockManager):
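`generate_groups` first sorts rows by group id so each group becomes a contiguous run; `lib.groupsort_indexer` (implemented in Cython) computes that ordering, and the `[0]` takes the indexer from the tuple it returns. The change only swaps the unconditional `astype('i4')` for `com._ensure_int32`, which casts to int32 just when needed. A simplified NumPy equivalent of the sort step, illustrative only:

```python
import numpy as np

group_index = np.array([2, 0, 1, 0, 2], dtype=np.int32)

# A stable argsort yields the same ordering a group-counting sort would.
indexer = np.argsort(group_index, kind='mergesort')
print(indexer)                    # [1 3 2 0 4]
print(group_index.take(indexer))  # [0 0 1 2 2]  -> each group is now contiguous
```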
@@ -1312,8 +1373,7 @@ def _get_slice(slob):
         def _get_slice(slob):
             return sorted_data[slob]
 
-    starts, ends = lib.generate_slices(group_index.astype('i4'),
-                                       ngroups)
+    starts, ends = lib.generate_slices(group_index, ngroups)
 
     for i, (start, end) in enumerate(zip(starts, ends)):
         # Since I'm now compressing the group ids, it's now not "possible" to
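With the group ids already sorted, each group occupies one contiguous `[start, end)` window, which is what `lib.generate_slices` hands back (the `astype('i4')` is likewise redundant once `group_index` is ensured int32 above). A sketch of those boundaries computed with plain NumPy, again just illustrative:

```python
import numpy as np

sorted_ids = np.array([0, 0, 1, 2, 2], dtype=np.int32)
counts = np.bincount(sorted_ids, minlength=3)
ends = counts.cumsum()
starts = ends - counts
print(list(zip(starts.tolist(), ends.tolist())))  # [(0, 2), (2, 3), (3, 5)]
```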
@@ -1385,14 +1445,27 @@ def get_key(self, comp_id):
 
 def cython_aggregate(values, group_index, ngroups, how='add'):
     agg_func = _cython_functions[how]
+    if values.ndim == 1:
+        squeeze = True
+        values = values[:, None]
+        out_shape = (ngroups, 1)
+    else:
+        squeeze = False
+        out_shape = (ngroups, values.shape[1])
+
     trans_func = _cython_transforms.get(how, lambda x: x)
 
-    result = np.empty(ngroups, dtype=np.float64)
+    result = np.empty(out_shape, dtype=np.float64)
     result.fill(np.nan)
 
     counts = np.zeros(ngroups, dtype=np.int32)
+
    agg_func(result, counts, values, group_index)
     result = trans_func(result)
+
+    if squeeze:
+        result = result.squeeze()
+
     return result, counts
 
 _cython_functions = {
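The `cython_aggregate` change lets one code path serve both the Series case (1-D values) and the new block-wise DataFrame case (2-D values): a 1-D input is promoted to a single-column 2-D array, and the extra axis is squeezed off the result. A compact sketch of that shim around a generic 2-D kernel (the `kernel` argument and `column_sums` helper are hypothetical, purely illustrative):

```python
import numpy as np

def aggregate_any_dim(values, kernel, ngroups):
    # Promote 1-D input so the kernel only ever has to handle 2-D arrays.
    squeeze = values.ndim == 1
    if squeeze:
        values = values[:, None]
    out = kernel(values, ngroups)            # shape (ngroups, ncols)
    return out.squeeze(axis=1) if squeeze else out

def column_sums(values, ngroups):
    # Trivial 2-D "kernel": one output row, summing each column.
    return values.sum(axis=0, keepdims=True)

print(aggregate_any_dim(np.array([1., 2., 3.]), column_sums, ngroups=1))
# [6.]  -- same kernel, 1-D in, 1-D out
```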