From 6e8eea3a963b017ee6f5a3dea634062f55d6e855 Mon Sep 17 00:00:00 2001 From: Adam Klein Date: Sun, 23 Oct 2011 17:19:35 -0400 Subject: [PATCH] Added the level arguments to the series and frame math operations such as sum, etc. --- pandas/__init__.py | 2 +- pandas/core/frame.py | 85 +++++++++++++++++++++---- pandas/core/series.py | 106 ++++++++++++++++++++++++++------ pandas/tests/test_multilevel.py | 31 ++++++++++ 4 files changed, 193 insertions(+), 31 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 7d5ecf84fddf0..f0aac997d4cbe 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -9,7 +9,7 @@ try: import pandas._tseries as lib except Exception, e: # pragma: no cover - if 'No module named' in e.message: + if 'No module named' in str(e): raise ImportError('C extensions not built: if you installed already ' 'verify that you are not importing from the source ' 'directory') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c18da8e56428..4a1c6ed6b7283 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2425,7 +2425,7 @@ def _count_level(self, level, axis=0, numeric_only=False): return DataFrame(result, index=index, columns=columns) - def sum(self, axis=0, numeric_only=False, skipna=True): + def sum(self, axis=0, numeric_only=False, skipna=True, level=None): """ Return sum over requested axis @@ -2438,6 +2438,8 @@ def sum(self, axis=0, numeric_only=False, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Examples -------- @@ -2456,6 +2458,10 @@ def sum(self, axis=0, numeric_only=False, skipna=True): ------- sum : Series """ + if not level is None: + sumfunc = lambda x: x.sum(skipna=skipna) + return self.groupby(level=level).aggregate(sumfunc) + y, axis_labels = self._get_agg_data(axis, numeric_only=numeric_only) if len(axis_labels) == 0: @@ -2479,7 +2485,7 @@ def sum(self, axis=0, numeric_only=False, skipna=True): return Series(the_sum, index=axis_labels) - def min(self, axis=0, skipna=True): + def min(self, axis=0, skipna=True, level=None): """ Return minimum over requested axis. NA/null values are excluded @@ -2490,6 +2496,8 @@ def min(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- @@ -2498,9 +2506,14 @@ def min(self, axis=0, skipna=True): values = self.values.copy() if skipna and not issubclass(values.dtype.type, np.integer): np.putmask(values, -np.isfinite(values), np.inf) + + if not level is None: + minfunc = lambda x: x.min(skipna=skipna) + return self.groupby(level=level).aggregate(minfunc) + return Series(values.min(axis), index=self._get_agg_axis(axis)) - def max(self, axis=0, skipna=True): + def max(self, axis=0, skipna=True, level=None): """ Return maximum over requested axis. NA/null values are excluded @@ -2511,6 +2524,8 @@ def max(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- @@ -2519,9 +2534,14 @@ def max(self, axis=0, skipna=True): values = self.values.copy() if skipna and not issubclass(values.dtype.type, np.integer): np.putmask(values, -np.isfinite(values), -np.inf) + + if not level is None: + maxfunc = lambda x: x.max(skipna=skipna) + return self.groupby(level=level).aggregate(maxfunc) + return Series(values.max(axis), index=self._get_agg_axis(axis)) - def prod(self, axis=0, skipna=True): + def prod(self, axis=0, skipna=True, level=None): """ Return product over requested axis. NA/null values are treated as 1 @@ -2532,11 +2552,17 @@ def prod(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- product : Series """ + if not level is None: + prodfunc = lambda x: x.prod(skipna=skipna) + return self.groupby(level=level).aggregate(prodfunc) + y = np.array(self.values, subok=True) if skipna: if not issubclass(y.dtype.type, np.integer): @@ -2544,11 +2570,10 @@ def prod(self, axis=0, skipna=True): result = y.prod(axis) count = self.count(axis) result[count == 0] = nan - return Series(result, index=self._get_agg_axis(axis)) - product = prod + return Series(result, index=self._get_agg_axis(axis)) - def mean(self, axis=0, skipna=True): + def mean(self, axis=0, skipna=True, level=None): """ Return mean over requested axis. NA/null values are excluded @@ -2559,11 +2584,17 @@ def mean(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- mean : Series """ + if not level is None: + meanfunc = lambda x: x.mean(skipna=skipna) + return self.groupby(level=level).aggregate(meanfunc) + summed = self.sum(axis, numeric_only=True, skipna=skipna) count = self.count(axis, numeric_only=True).astype(float) return summed / count @@ -2599,7 +2630,7 @@ def f(arr): return self.apply(f, axis=axis) - def median(self, axis=0, skipna=True): + def median(self, axis=0, skipna=True, level=None): """ Return median over requested axis, NA/null are exluded @@ -2610,11 +2641,17 @@ def median(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- Series or TimeSeries """ + if not level is None: + medianfunc = lambda x: x.median(skipna=skipna) + return self.groupby(level=level).aggregate(medianfunc) + if axis == 0: med = [self[col].median(skipna=skipna) for col in self.columns] return Series(med, index=self.columns) @@ -2624,7 +2661,7 @@ def median(self, axis=0, skipna=True): else: raise Exception('Must have 0<= axis <= 1') - def mad(self, axis=0, skipna=True): + def mad(self, axis=0, skipna=True, level=None): """ Return mean absolute deviation over requested axis @@ -2635,18 +2672,24 @@ def mad(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- mad : Series """ + if not level is None: + madfunc = lambda x: x.mad(skipna=skipna) + return self.groupby(level=level).aggregate(madfunc) + if axis == 0: demeaned = self - self.mean(axis=0) else: demeaned = self.sub(self.mean(axis=1), axis=0) return np.abs(demeaned).mean(axis=axis, skipna=skipna) - def var(self, axis=0, skipna=True): + def var(self, axis=0, skipna=True, level=None): """ Return unbiased variance over requested axis @@ -2657,11 +2700,17 @@ def var(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- var : Series """ + if not level is None: + varfunc = lambda x: x.var(skipna=skipna) + return self.groupby(level=level).aggregate(varfunc) + y, axis_labels = self._get_agg_data(axis, numeric_only=True) mask = np.isnan(y) @@ -2677,7 +2726,7 @@ def var(self, axis=0, skipna=True): return Series(theVar, index=axis_labels) - def std(self, axis=0, skipna=True): + def std(self, axis=0, skipna=True, level=None): """ Return unbiased std deviation over requested axis @@ -2688,14 +2737,20 @@ def std(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- std : Series """ + if not level is None: + stdfunc = lambda x: x.std(skipna=skipna) + return self.groupby(level=level).aggregate(stdfunc) + return np.sqrt(self.var(axis=axis, skipna=skipna)) - def skew(self, axis=0, skipna=True): + def skew(self, axis=0, skipna=True, level=None): """ Return unbiased skewness over requested axis @@ -2706,11 +2761,17 @@ def skew(self, axis=0, skipna=True): skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA + level : integer, default None + Choose a level to groupby before applying operation Returns ------- skew : Series """ + if not level is None: + skewfunc = lambda x: x.skew(skipna=skipna) + return self.groupby(level=level).aggregate(skewfunc) + y, axis_labels = self._get_agg_data(axis, numeric_only=True) mask = np.isnan(y) diff --git a/pandas/core/series.py b/pandas/core/series.py index 295fc126e7618..bec45cc7c7511 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -601,7 +601,8 @@ def value_counts(self): counter[value] += 1 return Series(counter).order(ascending=False) - def sum(self, axis=0, dtype=None, out=None, skipna=True): + + def sum(self, axis=0, dtype=None, out=None, skipna=True, level=None): """ Sum of values @@ -610,10 +611,17 @@ def sum(self, axis=0, dtype=None, out=None, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- sum : float """ + if level is not None: + sumfunc = lambda x: x.sum(dtype=dtype,skipna=skipna) + return self.groupby(level=level).aggregate(sumfunc) + values = self.values.copy() if skipna: @@ -624,7 +632,7 @@ def sum(self, axis=0, dtype=None, out=None, skipna=True): return values.sum() - def mean(self, axis=0, dtype=None, out=None, skipna=True): + def mean(self, axis=0, dtype=None, out=None, skipna=True, level=None): """ Mean of values @@ -633,13 +641,20 @@ def mean(self, axis=0, dtype=None, out=None, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- mean : float """ - return self._ndarray_statistic('mean', dtype=dtype, skipna=skipna) + if level is None: + return self._ndarray_statistic('mean', dtype=dtype, skipna=skipna) + + meanfunc = lambda x: x.mean(dtype=dtype,skipna=skipna) + return self.groupby(level=level).aggregate(meanfunc) - def median(self, skipna=True): + def median(self, skipna=True, level=None): """ Compute median of values @@ -648,6 +663,9 @@ def median(self, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- median : float @@ -663,9 +681,13 @@ def median(self, skipna=True): if not mask.all(): return np.nan - return lib.median(arr) + if level is None: + return lib.median(arr) - def prod(self, axis=0, dtype=None, out=None, skipna=True): + medianfunc = lambda x: x.median(skipna=skipna) + return self.groupby(level=level).aggregate(medianfunc) + + def prod(self, axis=0, dtype=None, out=None, skipna=True, level=None): """ Product of all values @@ -674,13 +696,20 @@ def prod(self, axis=0, dtype=None, out=None, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- product : float """ - return self._ndarray_statistic('prod', dtype=dtype, skipna=skipna) + if level is None: + return self._ndarray_statistic('prod', dtype=dtype, skipna=skipna) - def min(self, axis=None, out=None, skipna=True): + prodfunc = lambda x: x.prod(dtype=dtype,skipna=skipna) + return self.groupby(level=level).aggregate(prodfunc) + + def min(self, axis=None, out=None, skipna=True, level=None): """ Minimum of values @@ -689,17 +718,26 @@ def min(self, axis=None, out=None, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- min : float """ arr = self.values.copy() + if skipna: if not issubclass(arr.dtype.type, np.integer): np.putmask(arr, isnull(arr), np.inf) - return arr.min() - def max(self, axis=None, out=None, skipna=True): + if level is None: + return arr.min() + + minfunc = lambda x: x.min(axis=None, out=None, skipna=True) + return self.groupby(level=level).aggregate(minfunc) + + def max(self, axis=None, out=None, skipna=True, level=None): """ Maximum of values @@ -708,17 +746,27 @@ def max(self, axis=None, out=None, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- max : float """ arr = self.values.copy() + if skipna: if not issubclass(arr.dtype.type, np.integer): np.putmask(arr, isnull(arr), -np.inf) - return arr.max() - def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True): + if level is None: + return arr.max() + + maxfunc = lambda x: x.max(axis=None, out=None, skipna=True) + return self.groupby(level=level).aggregate(maxfunc) + + def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True, + level=None): """ Unbiased standard deviation of values @@ -729,19 +777,27 @@ def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- stdev : float """ + if not level is None: + stdfunc = lambda x: x.std(axis=axis,out=out,skipna=skipna) + return self.groupby(level=level).aggregate(stdfunc) + if skipna: nona = remove_na(self.values) if len(nona) < 2: return nan return ndarray.std(nona, axis, dtype, out, ddof) - else: - return self.values.std(axis, dtype, out, ddof) - def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True): + return self.values.std(axis, dtype, out, ddof) + + def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True, + level=None): """ Unbiased variance of non-NA/null values @@ -752,19 +808,26 @@ def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- var : float """ + if not level is None: + varfunc = lambda x: x.var(axis=axis,out=out,skipna=skipna) + return self.groupby(level=level).aggregate(varfunc) + if skipna: nona = remove_na(self.values) if len(nona) < 2: return nan return ndarray.var(nona, axis, dtype, out, ddof) - else: - return self.values.var(axis, dtype, out, ddof) - def skew(self, skipna=True): + return self.values.var(axis, dtype, out, ddof) + + def skew(self, skipna=True, level=None): """ Unbiased skewness of the non-NA/null values @@ -773,10 +836,17 @@ def skew(self, skipna=True): skipna : boolean, default True Exclude NA/null values + level : integer, default None + Choose a level to groupby before applying operation + Returns ------- skew : float """ + if not level is None: + skewfunc = lambda x: x.skew(skipna=skipna) + return self.groupby(level=level).aggregate(skewfunc) + y = np.array(self.values) mask = notnull(y) count = mask.sum() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 69ad9f6996b65..8d6c9ee7ce3d2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -29,6 +29,15 @@ def setUp(self): labels=[[0, 1, 2, 3]], names=['first']) + # create test series object + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + s[3] = np.NaN + self.series = s + tm.N = 100 self.tdf = tm.makeTimeDataFrame() self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, @@ -531,6 +540,28 @@ def test_series_getitem_not_sorted(self): assert_series_equal(result, expected) assert_series_equal(result2, expected) + def test_series_group_min_max(self): + for op in ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', + 'std', 'var']: + leftside = getattr(self.series.groupby(level=0), op)() + rightside = getattr(self.series, op)(level=0) + assert_series_equal(leftside, rightside) + + leftside = getattr(self.series.groupby(level=1), op)() + rightside = getattr(self.series, op)(level=1) + assert_series_equal(leftside, rightside) + + def test_frame_group_ops(self): + for op in ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', + 'mad', 'std', 'var']: + leftside = getattr(self.frame.groupby(level=0), op)() + rightside = getattr(self.frame, op)(level=0) + assert_frame_equal(leftside, rightside) + + leftside = getattr(self.frame.groupby(level=1), op)() + rightside = getattr(self.frame, op)(level=1) + assert_frame_equal(leftside, rightside) + if __name__ == '__main__': # unittest.main()