diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b0b9f2815cbb9..5327e3fcbea76 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -25,22 +25,44 @@ import pandas.util._test_decorators as td -def _check_stat_op(name, alternative, main_frame, float_frame, - float_string_frame, has_skipna=True, - has_numeric_only=False, check_dtype=True, - check_dates=False, check_less_precise=False, - skipna_alternative=None): - - f = getattr(main_frame, name) +def assert_stat_op_calc(opname, alternative, frame, has_skipna=True, + check_dtype=True, check_dates=False, + check_less_precise=False, skipna_alternative=None): + """ + Check that operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + alternative : function + Function that opname is tested against; i.e. "frame.opname()" should + equal "alternative(frame)". + frame : DataFrame + The object that the tests are executed on + has_skipna : bool, default True + Whether the method "opname" has the kwarg "skipna" + check_dtype : bool, default True + Whether the dtypes of the result of "frame.opname()" and + "alternative(frame)" should be checked. 
+ check_dates : bool, default False + Whether opname should be tested on a Datetime Series + check_less_precise : bool, default False + Whether results should only be compared approximately; + passed on to tm.assert_series_equal + skipna_alternative : function, default None + NaN-safe version of alternative + """ + + f = getattr(frame, opname) if check_dates: df = DataFrame({'b': date_range('1/1/2001', periods=2)}) - _f = getattr(df, name) - result = _f() + result = getattr(df, opname)() assert isinstance(result, Series) df['a'] = lrange(len(df)) - result = getattr(df, name)() + result = getattr(df, opname)() assert isinstance(result, Series) assert len(result) @@ -52,11 +74,11 @@ def wrapper(x): skipna_alternative) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) - tm.assert_series_equal(result0, main_frame.apply(wrapper), + tm.assert_series_equal(result0, frame.apply(wrapper), check_dtype=check_dtype, check_less_precise=check_less_precise) # HACK: win32 - tm.assert_series_equal(result1, main_frame.apply(wrapper, axis=1), + tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), check_dtype=False, check_less_precise=check_less_precise) else: @@ -64,49 +86,83 @@ def wrapper(x): result0 = f(axis=0) result1 = f(axis=1) - tm.assert_series_equal(result0, main_frame.apply(skipna_wrapper), + tm.assert_series_equal(result0, frame.apply(skipna_wrapper), check_dtype=check_dtype, check_less_precise=check_less_precise) - if name in ['sum', 'prod']: - expected = main_frame.apply(skipna_wrapper, axis=1) + + if opname in ['sum', 'prod']: + expected = frame.apply(skipna_wrapper, axis=1) tm.assert_series_equal(result1, expected, check_dtype=False, check_less_precise=check_less_precise) # check dtypes if check_dtype: - lcd_dtype = main_frame.values.dtype + lcd_dtype = frame.values.dtype assert lcd_dtype == result0.dtype assert lcd_dtype == result1.dtype # bad axis tm.assert_raises_regex(ValueError, 'No axis named 2', f, axis=2) - # make sure works on 
mixed-type frame - getattr(float_string_frame, name)(axis=0) - getattr(float_string_frame, name)(axis=1) - - if has_numeric_only: - getattr(float_string_frame, name)(axis=0, numeric_only=True) - getattr(float_string_frame, name)(axis=1, numeric_only=True) - getattr(float_frame, name)(axis=0, numeric_only=False) - getattr(float_frame, name)(axis=1, numeric_only=False) # all NA case if has_skipna: - all_na = float_frame * np.NaN - r0 = getattr(all_na, name)(axis=0) - r1 = getattr(all_na, name)(axis=1) - if name in ['sum', 'prod']: - unit = int(name == 'prod') + all_na = frame * np.NaN + r0 = getattr(all_na, opname)(axis=0) + r1 = getattr(all_na, opname)(axis=1) + if opname in ['sum', 'prod']: + unit = 1 if opname == 'prod' else 0 # result for empty sum/prod expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) tm.assert_series_equal(r0, expected) expected = pd.Series(unit, index=r1.index, dtype=r1.dtype) tm.assert_series_equal(r1, expected) -def _check_bool_op(name, alternative, frame, float_string_frame, - has_skipna=True, has_bool_only=False): +def assert_stat_op_api(opname, float_frame, float_string_frame, + has_numeric_only=False): + """ + Check that API for operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + float_frame : DataFrame + DataFrame with columns of type float + float_string_frame : DataFrame + DataFrame with both float and string columns + has_numeric_only : bool, default False + Whether the method "opname" has the kwarg "numeric_only" + """ + + # make sure works on mixed-type frame + getattr(float_string_frame, opname)(axis=0) + getattr(float_string_frame, opname)(axis=1) - f = getattr(frame, name) + if has_numeric_only: + getattr(float_string_frame, opname)(axis=0, numeric_only=True) + getattr(float_string_frame, opname)(axis=1, numeric_only=True) + getattr(float_frame, opname)(axis=0, numeric_only=False) + getattr(float_frame, opname)(axis=1, numeric_only=False) + 
+ +def assert_bool_op_calc(opname, alternative, frame, has_skipna=True): + """ + Check that bool operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + alternative : function + Function that opname is tested against; i.e. "frame.opname()" should + equal "alternative(frame)". + frame : DataFrame + The object that the tests are executed on + has_skipna : bool, default True + Whether the method "opname" has the kwarg "skipna" + """ + + f = getattr(frame, opname) if has_skipna: def skipna_wrapper(x): @@ -118,6 +174,7 @@ def wrapper(x): result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) + tm.assert_series_equal(result0, frame.apply(wrapper)) tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), check_dtype=False) # HACK: win32 @@ -127,18 +184,48 @@ def wrapper(x): result0 = f(axis=0) result1 = f(axis=1) + tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False) # bad axis - pytest.raises(ValueError, f, axis=2) + tm.assert_raises_regex(ValueError, 'No axis named 2', f, axis=2) - # make sure works on mixed-type frame + # all NA case + if has_skipna: + all_na = frame * np.NaN + r0 = getattr(all_na, opname)(axis=0) + r1 = getattr(all_na, opname)(axis=1) + if opname == 'any': + assert not r0.any() + assert not r1.any() + else: + assert r0.all() + assert r1.all() + + +def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, + has_bool_only=False): + """ + Check that API for boolean operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + bool_frame_with_na : DataFrame + DataFrame on which opname is called with bool_only=False + float_string_frame : DataFrame + DataFrame with both float and string columns + has_bool_only : bool, default False + Whether the method "opname" has the kwarg "bool_only" + """ + # make sure op 
works on mixed-type frame mixed = float_string_frame - mixed['_bool_'] = np.random.randn(len(mixed)) > 0 - getattr(mixed, name)(axis=0) - getattr(mixed, name)(axis=1) + mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5 + getattr(mixed, opname)(axis=0) + getattr(mixed, opname)(axis=1) class NonzeroFail(object): @@ -148,22 +235,10 @@ def __nonzero__(self): mixed['_nonzero_fail_'] = NonzeroFail() if has_bool_only: - getattr(mixed, name)(axis=0, bool_only=True) - getattr(mixed, name)(axis=1, bool_only=True) - getattr(frame, name)(axis=0, bool_only=False) - getattr(frame, name)(axis=1, bool_only=False) - - # all NA case - if has_skipna: - all_na = frame * np.NaN - r0 = getattr(all_na, name)(axis=0) - r1 = getattr(all_na, name)(axis=1) - if name == 'any': - assert not r0.any() - assert not r1.any() - else: - assert r0.all() - assert r1.all() + getattr(mixed, opname)(axis=0, bool_only=True) + getattr(mixed, opname)(axis=1, bool_only=True) + getattr(bool_frame_with_na, opname)(axis=0, bool_only=False) + getattr(bool_frame_with_na, opname)(axis=1, bool_only=False) class TestDataFrameAnalytics(): @@ -596,10 +671,10 @@ def test_reduce_mixed_frame(self): def test_count(self, float_frame_with_na, float_frame, float_string_frame): f = lambda s: notna(s).sum() - _check_stat_op('count', f, float_frame_with_na, float_frame, - float_string_frame, has_skipna=False, - has_numeric_only=True, check_dtype=False, - check_dates=True) + assert_stat_op_calc('count', f, float_frame_with_na, has_skipna=False, + check_dtype=False, check_dates=True) + assert_stat_op_api('count', float_frame, float_string_frame, + has_numeric_only=True) # corner case frame = DataFrame() @@ -628,9 +703,10 @@ def test_count(self, float_frame_with_na, float_frame, float_string_frame): def test_nunique(self, float_frame_with_na, float_frame, float_string_frame): f = lambda s: len(algorithms.unique1d(s.dropna())) - _check_stat_op('nunique', f, float_frame_with_na, - float_frame, float_string_frame, has_skipna=False, 
- check_dtype=False, check_dates=True) + assert_stat_op_calc('nunique', f, float_frame_with_na, + has_skipna=False, check_dtype=False, + check_dates=True) + assert_stat_op_api('nunique', float_frame, float_string_frame) df = DataFrame({'A': [1, 1, 1], 'B': [1, 2, 3], @@ -644,15 +720,13 @@ def test_nunique(self, float_frame_with_na, float_frame, def test_sum(self, float_frame_with_na, mixed_float_frame, float_frame, float_string_frame): - _check_stat_op('sum', np.sum, float_frame_with_na, float_frame, - float_string_frame, has_numeric_only=True, - skipna_alternative=np.nansum) - + assert_stat_op_api('sum', float_frame, float_string_frame, + has_numeric_only=True) + assert_stat_op_calc('sum', np.sum, float_frame_with_na, + skipna_alternative=np.nansum) # mixed types (with upcasting happening) - _check_stat_op('sum', np.sum, - mixed_float_frame.astype('float32'), float_frame, - float_string_frame, has_numeric_only=True, - check_dtype=False, check_less_precise=True) + assert_stat_op_calc('sum', np.sum, mixed_float_frame.astype('float32'), + check_dtype=False, check_less_precise=True) @pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']) @@ -679,13 +753,14 @@ def test_stat_operators_attempt_obj_array(self, method): tm.assert_series_equal(result, expected) def test_mean(self, float_frame_with_na, float_frame, float_string_frame): - _check_stat_op('mean', np.mean, float_frame_with_na, - float_frame, float_string_frame, check_dates=True) + assert_stat_op_calc('mean', np.mean, float_frame_with_na, + check_dates=True) + assert_stat_op_api('mean', float_frame, float_string_frame) def test_product(self, float_frame_with_na, float_frame, float_string_frame): - _check_stat_op('product', np.prod, float_frame_with_na, - float_frame, float_string_frame) + assert_stat_op_calc('product', np.prod, float_frame_with_na) + assert_stat_op_api('product', float_frame, float_string_frame) # TODO: Ensure warning isn't emitted in the first place 
@pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning") @@ -696,18 +771,18 @@ def wrapper(x): return np.nan return np.median(x) - _check_stat_op('median', wrapper, float_frame_with_na, - float_frame, float_string_frame, check_dates=True) + assert_stat_op_calc('median', wrapper, float_frame_with_na, + check_dates=True) + assert_stat_op_api('median', float_frame, float_string_frame) def test_min(self, float_frame_with_na, int_frame, float_frame, float_string_frame): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - _check_stat_op('min', np.min, float_frame_with_na, - float_frame, float_string_frame, - check_dates=True) - _check_stat_op('min', np.min, int_frame, float_frame, - float_string_frame) + assert_stat_op_calc('min', np.min, float_frame_with_na, + check_dates=True) + assert_stat_op_calc('min', np.min, int_frame) + assert_stat_op_api('min', float_frame, float_string_frame) def test_cummin(self, datetime_frame): datetime_frame.loc[5:10, 0] = nan @@ -759,26 +834,25 @@ def test_max(self, float_frame_with_na, int_frame, float_frame, float_string_frame): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - _check_stat_op('max', np.max, float_frame_with_na, - float_frame, float_string_frame, - check_dates=True) - _check_stat_op('max', np.max, int_frame, float_frame, - float_string_frame) + assert_stat_op_calc('max', np.max, float_frame_with_na, + check_dates=True) + assert_stat_op_calc('max', np.max, int_frame) + assert_stat_op_api('max', float_frame, float_string_frame) def test_mad(self, float_frame_with_na, float_frame, float_string_frame): f = lambda x: np.abs(x - x.mean()).mean() - _check_stat_op('mad', f, float_frame_with_na, float_frame, - float_string_frame) + assert_stat_op_calc('mad', f, float_frame_with_na) + assert_stat_op_api('mad', float_frame, float_string_frame) def test_var_std(self, float_frame_with_na, datetime_frame, float_frame, float_string_frame): alt = 
lambda x: np.var(x, ddof=1) - _check_stat_op('var', alt, float_frame_with_na, float_frame, - float_string_frame) + assert_stat_op_calc('var', alt, float_frame_with_na) + assert_stat_op_api('var', float_frame, float_string_frame) alt = lambda x: np.std(x, ddof=1) - _check_stat_op('std', alt, float_frame_with_na, float_frame, - float_string_frame) + assert_stat_op_calc('std', alt, float_frame_with_na) + assert_stat_op_api('std', float_frame, float_string_frame) result = datetime_frame.std(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4)) @@ -892,8 +966,8 @@ def test_cumprod(self, datetime_frame): def test_sem(self, float_frame_with_na, datetime_frame, float_frame, float_string_frame): alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) - _check_stat_op('sem', alt, float_frame_with_na, - float_frame, float_string_frame) + assert_stat_op_calc('sem', alt, float_frame_with_na) + assert_stat_op_api('sem', float_frame, float_string_frame) result = datetime_frame.sem(ddof=4) expected = datetime_frame.apply( @@ -917,8 +991,8 @@ def alt(x): return np.nan return skew(x, bias=False) - _check_stat_op('skew', alt, float_frame_with_na, - float_frame, float_string_frame) + assert_stat_op_calc('skew', alt, float_frame_with_na) + assert_stat_op_api('skew', float_frame, float_string_frame) @td.skip_if_no_scipy def test_kurt(self, float_frame_with_na, float_frame, float_string_frame): @@ -929,8 +1003,8 @@ def alt(x): return np.nan return kurtosis(x, bias=False) - _check_stat_op('kurt', alt, float_frame_with_na, - float_frame, float_string_frame) + assert_stat_op_calc('kurt', alt, float_frame_with_na) + assert_stat_op_api('kurt', float_frame, float_string_frame) index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], labels=[[0, 0, 0, 0, 0, 0], @@ -1205,9 +1279,9 @@ def wrapper(x): return np.nan return np.median(x) - _check_stat_op('median', wrapper, int_frame, float_frame, - float_string_frame, check_dtype=False, - check_dates=True) + 
assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False, + check_dates=True) + assert_stat_op_api('median', float_frame, float_string_frame) # Miscellanea @@ -1262,13 +1336,12 @@ def test_idxmax(self, float_frame, int_frame): # ---------------------------------------------------------------------- # Logical reductions - def test_any_all(self, bool_frame_with_na, float_string_frame): - _check_bool_op('any', np.any, bool_frame_with_na, - float_string_frame, has_skipna=True, - has_bool_only=True) - _check_bool_op('all', np.all, bool_frame_with_na, - float_string_frame, has_skipna=True, - has_bool_only=True) + @pytest.mark.parametrize('opname', ['any', 'all']) + def test_any_all(self, opname, bool_frame_with_na, float_string_frame): + assert_bool_op_calc(opname, getattr(np, opname), bool_frame_with_na, + has_skipna=True) + assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, + has_bool_only=True) def test_any_all_extra(self): df = DataFrame({