Skip to content

Commit f733f10

Browse files
Chang She authored and wesm committed
ENH: make any/all conform to sum/mean interface. Fixed bug in copy keyword in BlockManager.get_numeric_data #1416
1 parent c183dd9 commit f733f10

File tree

5 files changed

+248
-38
lines changed

5 files changed

+248
-38
lines changed

pandas/core/frame.py

Lines changed: 89 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -528,18 +528,6 @@ def shape(self):
528528
def empty(self):
529529
return not (len(self.columns) > 0 and len(self.index) > 0)
530530

531-
def any(self):
532-
if not self._is_mixed_type:
533-
if self.dtypes[0] == np.bool_:
534-
return self.values.any()
535-
raise ValueError('Cannot call any() on mixed or non-boolean DataFrame')
536-
537-
def all(self):
538-
if not self._is_mixed_type:
539-
if self.dtypes[0] == np.bool_:
540-
return self.values.all()
541-
raise ValueError('Cannot call all() on mixed or non-boolean DataFrame')
542-
543531
def __nonzero__(self):
544532
raise ValueError("Cannot call bool() on DataFrame.")
545533

@@ -4057,6 +4045,62 @@ def _count_level(self, level, axis=0, numeric_only=False):
40574045
else:
40584046
return result
40594047

4048+
def any(self, axis=0, bool_only=None, skipna=True, level=None):
    """
    Return whether any element is True over requested axis.

    Parameters
    ----------
    axis : {0, 1}
        0 for row-wise, 1 for column-wise
    bool_only : boolean, default None
        Only include boolean data. If None, the reduction is first tried
        on all of the data and falls back to the boolean columns only if
        that fails. Not forwarded when `level` is given.
    skipna : boolean, default True
        Exclude NA/null values. NA entries are treated as False, so an
        entirely NA row/column gives False
    level : int, default None
        If the axis is a MultiIndex (hierarchical), evaluate along a
        particular level, collapsing into a DataFrame

    Returns
    -------
    any : Series (or DataFrame if level specified)
    """
    if level is not None:
        # Group-wise evaluation; each group is reduced with its own any().
        return self._agg_by_level('any', axis=axis, level=level,
                                  skipna=skipna)
    # `bool_only` travels through _reduce's `numeric_only` slot;
    # filter_type='bool' makes that slot select boolean columns.
    return self._reduce(nanops.nanany, axis=axis, skipna=skipna,
                        numeric_only=bool_only, filter_type='bool')
4075+
4076+
def all(self, axis=0, bool_only=None, skipna=True, level=None):
    """
    Return whether all elements are True over requested axis.

    Parameters
    ----------
    axis : {0, 1}
        0 for row-wise, 1 for column-wise
    bool_only : boolean, default None
        Only include boolean data. If None, the reduction is first tried
        on all of the data and falls back to the boolean columns only if
        that fails. Not forwarded when `level` is given.
    skipna : boolean, default True
        Exclude NA/null values. NA entries are treated as True, so an
        entirely NA row/column gives True
    level : int, default None
        If the axis is a MultiIndex (hierarchical), evaluate along a
        particular level, collapsing into a DataFrame

    Returns
    -------
    all : Series (or DataFrame if level specified)
    """
    if level is not None:
        # Group-wise evaluation; each group is reduced with its own all().
        return self._agg_by_level('all', axis=axis, level=level,
                                  skipna=skipna)
    # `bool_only` travels through _reduce's `numeric_only` slot;
    # filter_type='bool' makes that slot select boolean columns.
    return self._reduce(nanops.nanall, axis=axis, skipna=skipna,
                        numeric_only=bool_only, filter_type='bool')
4103+
40604104
@Substitution(name='sum', shortname='sum', na_action=_doc_exclude_na,
40614105
extras=_numeric_only_doc)
40624106
@Appender(_stat_doc)
@@ -4183,20 +4227,33 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwds):
41834227
applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwds)
41844228
return grouped.aggregate(applyf)
41854229

4186-
def _reduce(self, op, axis=0, skipna=True, numeric_only=None, **kwds):
4230+
def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
4231+
filter_type=None, **kwds):
41874232
f = lambda x: op(x, axis=axis, skipna=skipna, **kwds)
41884233
labels = self._get_agg_axis(axis)
41894234
if numeric_only is None:
41904235
try:
41914236
values = self.values
41924237
result = f(values)
41934238
except Exception:
4194-
data = self._get_numeric_data()
4239+
if filter_type is None or filter_type == 'numeric':
4240+
data = self._get_numeric_data()
4241+
elif filter_type == 'bool':
4242+
data = self._get_bool_data()
4243+
else:
4244+
raise ValueError('Invalid filter_type %s ' %
4245+
str(filter_type))
41954246
result = f(data.values)
41964247
labels = data._get_agg_axis(axis)
41974248
else:
41984249
if numeric_only:
4199-
data = self._get_numeric_data()
4250+
if filter_type is None or filter_type == 'numeric':
4251+
data = self._get_numeric_data()
4252+
elif filter_type == 'bool':
4253+
data = self._get_bool_data()
4254+
else:
4255+
raise ValueError('Invalid filter_type %s ' %
4256+
str(filter_type))
42004257
values = data.values
42014258
labels = data._get_agg_axis(axis)
42024259
else:
@@ -4205,7 +4262,13 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, **kwds):
42054262

42064263
if result.dtype == np.object_:
42074264
try:
4208-
result = result.astype('f8')
4265+
if filter_type is None or filter_type == 'numeric':
4266+
result = result.astype('f8')
4267+
elif filter_type == 'bool':
4268+
result = result.astype('b')
4269+
else:
4270+
raise ValueError('Invalid dtype %s ' % str(filter_type))
4271+
42094272
except (ValueError, TypeError):
42104273
pass
42114274

@@ -4274,6 +4337,16 @@ def _get_numeric_data(self):
42744337
else:
42754338
return self.ix[:, []]
42764339

4340+
def _get_bool_data(self):
    """
    Return the boolean part of this frame.

    For a mixed-type frame the boolean blocks are pulled out of the
    block manager and wrapped in a new DataFrame without copying. For a
    single-dtype frame the frame itself is returned when its values are
    boolean, otherwise a selection with no columns is returned.
    """
    if self._is_mixed_type:
        bool_data = self._data.get_bool_data()
        return DataFrame(bool_data, copy=False)
    if self.values.dtype == np.bool_:
        return self
    # Single non-boolean dtype: keep the index, select no columns.
    return self.ix[:, []]
4349+
42774350
def quantile(self, q=0.5, axis=0):
42784351
"""
42794352
Return values at the given quantile over requested axis, a la

pandas/core/internals.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,10 @@ def __init__(self, blocks, axes, do_integrity_check=True):
492492
if do_integrity_check:
493493
self._verify_integrity()
494494

495+
@classmethod
def make_empty(cls):
    """Return a manager of this class with no blocks and two empty axes."""
    return cls([], [[], []])
498+
495499
def __nonzero__(self):
496500
return True
497501

@@ -589,10 +593,28 @@ def is_consolidated(self):
589593
dtypes = [blk.dtype.type for blk in self.blocks]
590594
return len(dtypes) == len(set(dtypes))
591595

592-
def get_numeric_data(self, copy=False):
593-
num_blocks = [b for b in self.blocks
594-
if (isinstance(b, (IntBlock, FloatBlock, ComplexBlock))
595-
and not isinstance(b, DatetimeBlock))]
596+
def get_numeric_data(self, copy=False, type_list=None):
597+
"""
598+
Parameters
599+
----------
600+
copy : boolean, default False
601+
Whether to copy the blocks
602+
type_list : tuple of type, default None
603+
Numeric types by default (Float/Complex/Int but not Datetime)
604+
"""
605+
if type_list is None:
606+
def filter_blocks(block):
607+
return (isinstance(block, (IntBlock, FloatBlock, ComplexBlock))
608+
and not isinstance(block, DatetimeBlock))
609+
else:
610+
type_list = self._get_clean_block_types(type_list)
611+
filter_blocks = lambda block: isinstance(block, type_list)
612+
613+
maybe_copy = lambda b: b.copy() if copy else b
614+
num_blocks = [maybe_copy(b) for b in self.blocks if filter_blocks(b)]
615+
616+
if len(num_blocks) == 0:
617+
return BlockManager.make_empty()
596618

597619
indexer = np.sort(np.concatenate([b.ref_locs for b in num_blocks]))
598620
new_items = self.items.take(indexer)
@@ -606,6 +628,26 @@ def get_numeric_data(self, copy=False):
606628
new_axes[0] = new_items
607629
return BlockManager(new_blocks, new_axes, do_integrity_check=False)
608630

631+
def _get_clean_block_types(self, type_list):
    """
    Normalise `type_list` into a tuple of Block classes.

    Parameters
    ----------
    type_list : type, Block class, or iterable of these
        A single entry or any iterable of entries. Python/NumPy scalar
        types listed in the mapping below are translated to their Block
        class; any other entry is passed through unchanged.

    Returns
    -------
    tuple
        Suitable as the second argument to isinstance.
    """
    if not isinstance(type_list, tuple):
        try:
            type_list = tuple(type_list)
        except TypeError:
            # A single non-iterable entry such as ``bool`` was given.
            type_list = (type_list,)

    type_map = {
        int: IntBlock,
        float: FloatBlock,
        complex: ComplexBlock,
        np.datetime64: DatetimeBlock,
        datetime: DatetimeBlock,
        bool: BoolBlock,
        object: ObjectBlock,
    }

    return tuple(type_map.get(t, t) for t in type_list)
647+
648+
def get_bool_data(self, copy=False):
    """
    Return a manager holding only the boolean blocks.

    Parameters
    ----------
    copy : boolean, default False
        Whether to copy the blocks
    """
    return self.get_numeric_data(copy=copy, type_list=(BoolBlock,))
650+
609651
def get_slice(self, slobj, axis=0):
610652
new_axes = list(self.axes)
611653
new_axes[axis] = new_axes[axis][slobj]

pandas/core/nanops.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,22 @@ def _has_infs(result):
5757
else:
5858
return np.isinf(result) or np.isneginf(result)
5959

60+
def nanany(values, axis=None, skipna=True):
    """
    Return whether any element of `values` is truthy along `axis`.

    Parameters
    ----------
    values : ndarray
    axis : int, default None
        Axis passed to ``values.any``; None reduces over every element.
    skipna : boolean, default True
        If True, NA entries are replaced with False before reducing, so
        they can never make the result True. The input is not modified.

    Returns
    -------
    Result of ``values.any(axis)``.
    """
    if skipna:
        # Fill a copy so the caller's array keeps its NA entries.
        values = values.copy()
        np.putmask(values, isnull(values), False)
    return values.any(axis)
67+
68+
def nanall(values, axis=None, skipna=True):
    """
    Return whether every element of `values` is truthy along `axis`.

    Parameters
    ----------
    values : ndarray
    axis : int, default None
        Axis passed to ``values.all``; None reduces over every element.
    skipna : boolean, default True
        If True, NA entries are replaced with True before reducing, so
        they can never make the result False. The input is not modified.

    Returns
    -------
    Result of ``values.all(axis)``.
    """
    if skipna:
        # Fill a copy so the caller's array keeps its NA entries.
        values = values.copy()
        np.putmask(values, isnull(values), True)
    return values.all(axis)
75+
6076
def _nansum(values, axis=None, skipna=True):
6177
mask = isnull(values)
6278

pandas/tests/test_frame.py

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
# pylint: disable-msg=W0612,E1101
32
from copy import deepcopy
43
from datetime import datetime, timedelta
@@ -2769,7 +2768,6 @@ def _test_seq(df, idx_ser, col_ser):
27692768
rs = df.le(df)
27702769
self.assert_(not rs.ix[0, 0])
27712770

2772-
27732771
# scalar
27742772
assert_frame_equal(df.eq(0), df == 0)
27752773
assert_frame_equal(df.ne(0), df != 0)
@@ -6246,25 +6244,73 @@ def test_bool_raises_value_error_1069(self):
62466244
self.failUnlessRaises(ValueError, lambda: bool(df))
62476245

62486246
def test_any_all(self):
    """Run the shared boolean-reduction checks for ``any`` and ``all``."""
    self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True)
    self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True)

def _check_bool_op(self, name, alternative, frame=None, has_skipna=True,
                   has_bool_only=False):
    """
    Compare the DataFrame reduction `name` against `alternative`.

    Parameters
    ----------
    name : str
        Name of the DataFrame method under test (e.g. 'any').
    alternative : callable
        Reference reduction applied per row/column via ``frame.apply``.
    frame : DataFrame, default None
        Frame to test; if None an object-dtype boolean frame with some
        NA rows is built from ``self.frame > 0``.
    has_skipna : boolean, default True
        Whether the method accepts ``skipna``.
    has_bool_only : boolean, default False
        Whether the method accepts ``bool_only``.
    """
    if frame is None:
        frame = self.frame > 0
        # set some NAs
        frame = DataFrame(frame.values.astype(object), frame.index,
                          frame.columns)
        frame.ix[5:10] = np.nan
        frame.ix[15:20, -2:] = np.nan

    f = getattr(frame, name)

    if has_skipna:
        def skipna_wrapper(x):
            nona = x.dropna().values
            return alternative(nona)

        def wrapper(x):
            return alternative(x.values)

        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        assert_series_equal(result0, frame.apply(wrapper))
        assert_series_equal(result1, frame.apply(wrapper, axis=1),
                            check_dtype=False)  # HACK: win32
    else:
        skipna_wrapper = alternative
        wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    assert_series_equal(result0, frame.apply(skipna_wrapper))
    assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
                        check_dtype=False)

    self.assertRaises(Exception, f, axis=2)

    # make sure works on mixed-type frame
    mixed = self.mixed_frame
    mixed['_bool_'] = np.random.randn(len(mixed)) > 0
    getattr(mixed, name)(axis=0)
    getattr(mixed, name)(axis=1)

    if has_bool_only:
        getattr(mixed, name)(axis=0, bool_only=True)
        getattr(mixed, name)(axis=1, bool_only=True)
        getattr(frame, name)(axis=0, bool_only=False)
        getattr(frame, name)(axis=1, bool_only=False)

    # all NA case
    if has_skipna:
        all_na = frame * np.nan
        r0 = getattr(all_na, name)(axis=0)
        r1 = getattr(all_na, name)(axis=1)
        if name == 'any':
            # NA counts as False for any(), so nothing may be True.
            self.assert_(not r0.any())
            self.assert_(not r1.any())
        else:
            # NA counts as True for all(), so everything must be True.
            self.assert_(r0.all())
            self.assert_(r1.all())
62686314

62696315
if __name__ == '__main__':
62706316
# unittest.main()

pandas/tests/test_internals.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44

55
import numpy as np
66

7-
from pandas import Index, MultiIndex, DataFrame
7+
from pandas import Index, MultiIndex, DataFrame, Series
88
from pandas.core.internals import *
99
import pandas.core.internals as internals
10+
import pandas.util.testing as tm
1011

1112
from pandas.util.testing import (assert_almost_equal, assert_frame_equal, randn)
1213

@@ -385,6 +386,38 @@ def test_xs(self):
385386

386387
assert_frame_equal(DataFrame(result), DataFrame(expected))
387388

389+
def test_get_numeric_data(self):
    """Check block filtering by type and the `copy` keyword."""
    int_ser = Series(np.array([0, 1, 2]))
    float_ser = Series(np.array([0., 1., 2.]))
    complex_ser = Series(np.array([0j, 1j, 2j]))
    str_ser = Series(np.array(['a', 'b', 'c']))
    bool_ser = Series(np.array([True, False, True]))
    obj_ser = Series(np.array([1, 'a', 5]))
    dt_ser = Series(tm.makeDateIndex(3))

    # check types
    df = DataFrame({'int': int_ser, 'float': float_ser,
                    'complex': complex_ser, 'str': str_ser,
                    'bool': bool_ser, 'obj': obj_ser,
                    'dt': dt_ser})
    xp = DataFrame({'int': int_ser, 'float': float_ser,
                    'complex': complex_ser})
    rs = DataFrame(df._data.get_numeric_data())
    assert_frame_equal(xp, rs)

    xp = DataFrame({'bool': bool_ser})
    rs = DataFrame(df._data.get_numeric_data(type_list=bool))
    assert_frame_equal(xp, rs)

    # Default copy=False: a later write to df must show up in the result.
    rs = DataFrame(df._data.get_numeric_data(type_list=bool))
    df.ix[0, 'bool'] = not df.ix[0, 'bool']

    self.assertEqual(rs.ix[0, 'bool'], df.ix[0, 'bool'])

    # copy=True: the result must keep the value from before the write.
    rs = DataFrame(df._data.get_numeric_data(type_list=bool, copy=True))
    df.ix[0, 'bool'] = not df.ix[0, 'bool']

    self.assertEqual(rs.ix[0, 'bool'], not df.ix[0, 'bool'])
420+
388421
if __name__ == '__main__':
389422
# unittest.main()
390423
import nose

0 commit comments

Comments
 (0)