Skip to content

TST: further clean up of frame/test_analytics #23016

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits on Oct 7, 2018
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 110 additions & 105 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,19 @@
import pandas.util._test_decorators as td


def _check_stat_op(name, alternative, main_frame, float_frame,
float_string_frame, has_skipna=True,
has_numeric_only=False, check_dtype=True,
check_dates=False, check_less_precise=False,
skipna_alternative=None):
def assert_stat_op_calc(opname, alternative, main_frame, has_skipna=True,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename `main_frame` -> `frame`.

check_dtype=True, check_dates=False,
check_less_precise=False, skipna_alternative=None):

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring describing what this function checks and what its parameters mean?

f = getattr(main_frame, name)
f = getattr(main_frame, opname)

if check_dates:
df = DataFrame({'b': date_range('1/1/2001', periods=2)})
_f = getattr(df, name)
result = _f()
result = getattr(df, opname)()
assert isinstance(result, Series)

df['a'] = lrange(len(df))
result = getattr(df, name)()
result = getattr(df, opname)()
assert isinstance(result, Series)
assert len(result)

Expand All @@ -67,7 +64,8 @@ def wrapper(x):
tm.assert_series_equal(result0, main_frame.apply(skipna_wrapper),
check_dtype=check_dtype,
check_less_precise=check_less_precise)
if name in ['sum', 'prod']:

if opname in ['sum', 'prod']:
expected = main_frame.apply(skipna_wrapper, axis=1)
tm.assert_series_equal(result1, expected, check_dtype=False,
check_less_precise=check_less_precise)
Expand All @@ -80,33 +78,37 @@ def wrapper(x):

# bad axis
tm.assert_raises_regex(ValueError, 'No axis named 2', f, axis=2)
# make sure works on mixed-type frame
getattr(float_string_frame, name)(axis=0)
getattr(float_string_frame, name)(axis=1)

if has_numeric_only:
getattr(float_string_frame, name)(axis=0, numeric_only=True)
getattr(float_string_frame, name)(axis=1, numeric_only=True)
getattr(float_frame, name)(axis=0, numeric_only=False)
getattr(float_frame, name)(axis=1, numeric_only=False)

# all NA case
if has_skipna:
all_na = float_frame * np.NaN
r0 = getattr(all_na, name)(axis=0)
r1 = getattr(all_na, name)(axis=1)
if name in ['sum', 'prod']:
unit = int(name == 'prod')
all_na = main_frame * np.NaN
r0 = getattr(all_na, opname)(axis=0)
r1 = getattr(all_na, opname)(axis=1)
if opname in ['sum', 'prod']:
unit = 1 if opname == 'prod' else 0 # result for empty sum/prod
expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
tm.assert_series_equal(r0, expected)
expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
tm.assert_series_equal(r1, expected)


def _check_bool_op(name, alternative, frame, float_string_frame,
has_skipna=True, has_bool_only=False):
def assert_stat_op_api(opname, float_frame, float_string_frame,
                       has_numeric_only=False):
    """
    Smoke-test the call signature of the reduction method named ``opname``.

    The method is looked up with ``getattr`` and called along both axes.
    Return values are discarded; the check passes as long as no call raises.

    Parameters
    ----------
    opname : str
        Name of the method to look up on the frames (e.g. ``'sum'``).
    float_frame : object
        Object exposing a method called ``opname``; only used when
        ``has_numeric_only`` is true, and then called with
        ``numeric_only=False``.  Presumably an all-float DataFrame
        (going by the name -- confirm against the fixture).
    float_string_frame : object
        Object exposing a method called ``opname``; always called with
        ``axis=0`` and ``axis=1``.  Presumably a DataFrame mixing float
        and string columns (going by the name -- confirm against the
        fixture).
    has_numeric_only : bool, default False
        Whether the method accepts a ``numeric_only`` keyword.  When true,
        the keyword is additionally exercised on both frames.
    """
    axes = (0, 1)

    # make sure the op works on a mixed-type frame
    for axis in axes:
        getattr(float_string_frame, opname)(axis=axis)

    if has_numeric_only:
        # exercise the keyword both ways: True on the mixed frame,
        # False on the float frame
        for axis in axes:
            getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
        for axis in axes:
            getattr(float_frame, opname)(axis=axis, numeric_only=False)


f = getattr(frame, name)
def assert_bool_op_calc(opname, alternative, main_frame, has_skipna=True):

f = getattr(main_frame, opname)

if has_skipna:
def skipna_wrapper(x):
Expand All @@ -118,27 +120,44 @@ def wrapper(x):

result0 = f(axis=0, skipna=False)
result1 = f(axis=1, skipna=False)
tm.assert_series_equal(result0, frame.apply(wrapper))
tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),

tm.assert_series_equal(result0, main_frame.apply(wrapper))
tm.assert_series_equal(result1, main_frame.apply(wrapper, axis=1),
check_dtype=False) # HACK: win32
else:
skipna_wrapper = alternative
wrapper = alternative

result0 = f(axis=0)
result1 = f(axis=1)
tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),

tm.assert_series_equal(result0, main_frame.apply(skipna_wrapper))
tm.assert_series_equal(result1, main_frame.apply(skipna_wrapper, axis=1),
check_dtype=False)

# bad axis
pytest.raises(ValueError, f, axis=2)
tm.assert_raises_regex(ValueError, 'No axis named 2', f, axis=2)

# make sure works on mixed-type frame
# all NA case
if has_skipna:
all_na = main_frame * np.NaN
r0 = getattr(all_na, opname)(axis=0)
r1 = getattr(all_na, opname)(axis=1)
if opname == 'any':
assert not r0.any()
assert not r1.any()
else:
assert r0.all()
assert r1.all()


def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
has_bool_only=False):
# make sure op works on mixed-type frame
mixed = float_string_frame
mixed['_bool_'] = np.random.randn(len(mixed)) > 0
getattr(mixed, name)(axis=0)
getattr(mixed, name)(axis=1)
mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5
getattr(mixed, opname)(axis=0)
getattr(mixed, opname)(axis=1)

class NonzeroFail(object):

Expand All @@ -148,22 +167,10 @@ def __nonzero__(self):
mixed['_nonzero_fail_'] = NonzeroFail()

if has_bool_only:
getattr(mixed, name)(axis=0, bool_only=True)
getattr(mixed, name)(axis=1, bool_only=True)
getattr(frame, name)(axis=0, bool_only=False)
getattr(frame, name)(axis=1, bool_only=False)

# all NA case
if has_skipna:
all_na = frame * np.NaN
r0 = getattr(all_na, name)(axis=0)
r1 = getattr(all_na, name)(axis=1)
if name == 'any':
assert not r0.any()
assert not r1.any()
else:
assert r0.all()
assert r1.all()
getattr(mixed, opname)(axis=0, bool_only=True)
getattr(mixed, opname)(axis=1, bool_only=True)
getattr(bool_frame_with_na, opname)(axis=0, bool_only=False)
getattr(bool_frame_with_na, opname)(axis=1, bool_only=False)


class TestDataFrameAnalytics():
Expand Down Expand Up @@ -596,10 +603,10 @@ def test_reduce_mixed_frame(self):

def test_count(self, float_frame_with_na, float_frame, float_string_frame):
f = lambda s: notna(s).sum()
_check_stat_op('count', f, float_frame_with_na, float_frame,
float_string_frame, has_skipna=False,
has_numeric_only=True, check_dtype=False,
check_dates=True)
assert_stat_op_calc('count', f, float_frame_with_na, has_skipna=False,
check_dtype=False, check_dates=True)
assert_stat_op_api('count', float_frame, float_string_frame,
has_numeric_only=True)

# corner case
frame = DataFrame()
Expand Down Expand Up @@ -628,9 +635,10 @@ def test_count(self, float_frame_with_na, float_frame, float_string_frame):
def test_nunique(self, float_frame_with_na, float_frame,
float_string_frame):
f = lambda s: len(algorithms.unique1d(s.dropna()))
_check_stat_op('nunique', f, float_frame_with_na,
float_frame, float_string_frame, has_skipna=False,
check_dtype=False, check_dates=True)
assert_stat_op_calc('nunique', f, float_frame_with_na,
has_skipna=False, check_dtype=False,
check_dates=True)
assert_stat_op_api('nunique', float_frame, float_string_frame)

df = DataFrame({'A': [1, 1, 1],
'B': [1, 2, 3],
Expand All @@ -644,15 +652,13 @@ def test_nunique(self, float_frame_with_na, float_frame,

def test_sum(self, float_frame_with_na, mixed_float_frame,
float_frame, float_string_frame):
_check_stat_op('sum', np.sum, float_frame_with_na, float_frame,
float_string_frame, has_numeric_only=True,
skipna_alternative=np.nansum)

assert_stat_op_api('sum', float_frame, float_string_frame,
has_numeric_only=True)
assert_stat_op_calc('sum', np.sum, float_frame_with_na,
skipna_alternative=np.nansum)
# mixed types (with upcasting happening)
_check_stat_op('sum', np.sum,
mixed_float_frame.astype('float32'), float_frame,
float_string_frame, has_numeric_only=True,
check_dtype=False, check_less_precise=True)
assert_stat_op_calc('sum', np.sum, mixed_float_frame.astype('float32'),
check_dtype=False, check_less_precise=True)

@pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var',
'std', 'skew', 'min', 'max'])
Expand All @@ -679,13 +685,14 @@ def test_stat_operators_attempt_obj_array(self, method):
tm.assert_series_equal(result, expected)

def test_mean(self, float_frame_with_na, float_frame, float_string_frame):
_check_stat_op('mean', np.mean, float_frame_with_na,
float_frame, float_string_frame, check_dates=True)
assert_stat_op_calc('mean', np.mean, float_frame_with_na,
check_dates=True)
assert_stat_op_api('mean', float_frame, float_string_frame)

def test_product(self, float_frame_with_na, float_frame,
float_string_frame):
_check_stat_op('product', np.prod, float_frame_with_na,
float_frame, float_string_frame)
assert_stat_op_calc('product', np.prod, float_frame_with_na)
assert_stat_op_api('product', float_frame, float_string_frame)

# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
Expand All @@ -696,18 +703,18 @@ def wrapper(x):
return np.nan
return np.median(x)

_check_stat_op('median', wrapper, float_frame_with_na,
float_frame, float_string_frame, check_dates=True)
assert_stat_op_calc('median', wrapper, float_frame_with_na,
check_dates=True)
assert_stat_op_api('median', float_frame, float_string_frame)

def test_min(self, float_frame_with_na, int_frame,
float_frame, float_string_frame):
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
_check_stat_op('min', np.min, float_frame_with_na,
float_frame, float_string_frame,
check_dates=True)
_check_stat_op('min', np.min, int_frame, float_frame,
float_string_frame)
assert_stat_op_calc('min', np.min, float_frame_with_na,
check_dates=True)
assert_stat_op_calc('min', np.min, int_frame)
assert_stat_op_api('min', float_frame, float_string_frame)

def test_cummin(self, datetime_frame):
datetime_frame.loc[5:10, 0] = nan
Expand Down Expand Up @@ -759,26 +766,25 @@ def test_max(self, float_frame_with_na, int_frame,
float_frame, float_string_frame):
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
_check_stat_op('max', np.max, float_frame_with_na,
float_frame, float_string_frame,
check_dates=True)
_check_stat_op('max', np.max, int_frame, float_frame,
float_string_frame)
assert_stat_op_calc('max', np.max, float_frame_with_na,
check_dates=True)
assert_stat_op_calc('max', np.max, int_frame)
assert_stat_op_api('max', float_frame, float_string_frame)

def test_mad(self, float_frame_with_na, float_frame, float_string_frame):
f = lambda x: np.abs(x - x.mean()).mean()
_check_stat_op('mad', f, float_frame_with_na, float_frame,
float_string_frame)
assert_stat_op_calc('mad', f, float_frame_with_na)
assert_stat_op_api('mad', float_frame, float_string_frame)

def test_var_std(self, float_frame_with_na, datetime_frame, float_frame,
float_string_frame):
alt = lambda x: np.var(x, ddof=1)
_check_stat_op('var', alt, float_frame_with_na, float_frame,
float_string_frame)
assert_stat_op_calc('var', alt, float_frame_with_na)
assert_stat_op_api('var', float_frame, float_string_frame)

alt = lambda x: np.std(x, ddof=1)
_check_stat_op('std', alt, float_frame_with_na, float_frame,
float_string_frame)
assert_stat_op_calc('std', alt, float_frame_with_na)
assert_stat_op_api('std', float_frame, float_string_frame)

result = datetime_frame.std(ddof=4)
expected = datetime_frame.apply(lambda x: x.std(ddof=4))
Expand Down Expand Up @@ -892,8 +898,8 @@ def test_cumprod(self, datetime_frame):
def test_sem(self, float_frame_with_na, datetime_frame,
float_frame, float_string_frame):
alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
_check_stat_op('sem', alt, float_frame_with_na,
float_frame, float_string_frame)
assert_stat_op_calc('sem', alt, float_frame_with_na)
assert_stat_op_api('sem', float_frame, float_string_frame)

result = datetime_frame.sem(ddof=4)
expected = datetime_frame.apply(
Expand All @@ -917,8 +923,8 @@ def alt(x):
return np.nan
return skew(x, bias=False)

_check_stat_op('skew', alt, float_frame_with_na,
float_frame, float_string_frame)
assert_stat_op_calc('skew', alt, float_frame_with_na)
assert_stat_op_api('skew', float_frame, float_string_frame)

@td.skip_if_no_scipy
def test_kurt(self, float_frame_with_na, float_frame, float_string_frame):
Expand All @@ -929,8 +935,8 @@ def alt(x):
return np.nan
return kurtosis(x, bias=False)

_check_stat_op('kurt', alt, float_frame_with_na,
float_frame, float_string_frame)
assert_stat_op_calc('kurt', alt, float_frame_with_na)
assert_stat_op_api('kurt', float_frame, float_string_frame)

index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
labels=[[0, 0, 0, 0, 0, 0],
Expand Down Expand Up @@ -1205,9 +1211,9 @@ def wrapper(x):
return np.nan
return np.median(x)

_check_stat_op('median', wrapper, int_frame, float_frame,
float_string_frame, check_dtype=False,
check_dates=True)
assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False,
check_dates=True)
assert_stat_op_api('median', float_frame, float_string_frame)

# Miscellanea

Expand Down Expand Up @@ -1262,13 +1268,12 @@ def test_idxmax(self, float_frame, int_frame):
# ----------------------------------------------------------------------
# Logical reductions

def test_any_all(self, bool_frame_with_na, float_string_frame):
_check_bool_op('any', np.any, bool_frame_with_na,
float_string_frame, has_skipna=True,
has_bool_only=True)
_check_bool_op('all', np.all, bool_frame_with_na,
float_string_frame, has_skipna=True,
has_bool_only=True)
@pytest.mark.parametrize('opname', ['any', 'all'])
def test_any_all(self, opname, bool_frame_with_na, float_string_frame):
assert_bool_op_calc(opname, getattr(np, opname), bool_frame_with_na,
has_skipna=True)
assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
has_bool_only=True)

def test_any_all_extra(self):
df = DataFrame({
Expand Down