diff --git a/doc/source/api.rst b/doc/source/api.rst index 4ecde7e05256a..79f5af74c3985 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -785,6 +785,7 @@ Attributes and underlying data Panel.axes Panel.ndim Panel.shape + Panel.dtypes Conversion ~~~~~~~~~~ @@ -1122,7 +1123,7 @@ Indexing, iteration ~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ - + GroupBy.__iter__ GroupBy.groups GroupBy.indices @@ -1141,7 +1142,7 @@ Computations / Descriptive Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ - + GroupBy.mean GroupBy.median GroupBy.std @@ -1155,7 +1156,7 @@ Computations / Descriptive Stats .. toctree:: :hidden: - + generated/pandas.core.common.isnull generated/pandas.core.common.notnull generated/pandas.core.reshape.get_dummies diff --git a/doc/source/basics.rst b/doc/source/basics.rst index eef271be74a02..bd2980c2f1c9f 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -637,6 +637,81 @@ to :ref:`merging/joining functionality `: s s.map(t) + +.. _basics.apply_panel: + +Applying with a Panel +~~~~~~~~~~~~~~~~~~~~~ + +Applying with a ``Panel`` will pass a ``Series`` to the applied function. If the applied +function returns a ``Series``, the result of the application will be a ``Panel``. If the applied function +reduces to a scalar, the result of the application will be a ``DataFrame``. + +.. note:: + + Prior to 0.13.1 ``apply`` on a ``Panel`` would only work on ``ufuncs`` (e.g. ``np.sum/np.max``). + +.. ipython:: python + + import pandas.util.testing as tm + panel = tm.makePanel(5) + panel + panel['ItemA'] + +A transformational apply. + +.. ipython:: python + + result = panel.apply(lambda x: x*2, axis='items') + result + result['ItemA'] + +A reduction operation. + +.. ipython:: python + + panel.apply(lambda x: x.dtype, axis='items') + +A similar reduction type operation + +.. ipython:: python + + panel.apply(lambda x: x.sum(), axis='major_axis') + +This last reduction is equivalent to + +.. 
ipython:: python + + panel.sum('major_axis') + +A transformation operation that returns a ``Panel``, but is computing +the z-score across the ``major_axis``. + +.. ipython:: python + + result = panel.apply(lambda x: (x-x.mean())/x.std(), axis='major_axis') + result + result['ItemA'] + +Apply can also accept multiple axes in the ``axis`` argument. This will pass a +``DataFrame`` of the cross-section to the applied function. + +.. ipython:: python + + f = lambda x: (x-x.mean(1)/x.std(1)) + + result = panel.apply(f, axis = ['items','major_axis']) + result + result.loc[:,:,'ItemA'] + +This is equivalent to the following + +.. ipython:: python + + result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) for ax in panel.minor_axis ])) + result + result.loc[:,:,'ItemA'] + .. _basics.reindexing: Reindexing and altering labels @@ -1066,7 +1141,7 @@ or match a pattern: Series(['1', '2', '3a', '3b', '03c']).str.match(pattern, as_indexer=True) -The distinction between ``match`` and ``contains`` is strictness: ``match`` +The distinction between ``match`` and ``contains`` is strictness: ``match`` relies on strict ``re.match``, while ``contains`` relies on ``re.search``. .. warning:: @@ -1078,7 +1153,7 @@ relies on strict ``re.match``, while ``contains`` relies on ``re.search``. This old, deprecated behavior of ``match`` is still the default. As demonstrated above, use the new behavior by setting ``as_indexer=True``. In this mode, ``match`` is analagous to ``contains``, returning a boolean - Series. The new behavior will become the default behavior in a future + Series. The new behavior will become the default behavior in a future release. 
Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take diff --git a/doc/source/release.rst b/doc/source/release.rst index 9f0b42dd5b741..fc9f18279087b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -73,6 +73,9 @@ Improvements to existing features - df.info() view now display dtype info per column (:issue: `5682`) - perf improvements in DataFrame ``count/dropna`` for ``axis=1`` - Series.str.contains now has a `regex=False` keyword which can be faster for plain (non-regex) string patterns. (:issue: `5879`) + - support ``dtypes`` on ``Panel`` + - extend ``Panel.apply`` to allow arbitrary functions (rather than only ufuncs) (:issue:`1148`) + allow multiple axes to be used to operate on slabs of a ``Panel`` .. _release.bug_fixes-0.13.1: diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index 250adffdadbca..76b915c519440 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -29,6 +29,60 @@ Deprecations Enhancements ~~~~~~~~~~~~ +- ``Panel.apply`` will work on non-ufuncs. See :ref:`the docs <basics.apply_panel>`. + + .. ipython:: python + + import pandas.util.testing as tm + panel = tm.makePanel(5) + panel + panel['ItemA'] + + Specifying an ``apply`` that operates on a Series (to return a single element) + + .. ipython:: python + + panel.apply(lambda x: x.dtype, axis='items') + + A similar reduction type operation + + .. ipython:: python + + panel.apply(lambda x: x.sum(), axis='major_axis') + + This is equivalent to + + .. ipython:: python + + panel.sum('major_axis') + + A transformation operation that returns a Panel, but is computing + the z-score across the major_axis + + .. ipython:: python + + result = panel.apply(lambda x: (x-x.mean())/x.std(), axis='major_axis') + result + result['ItemA'] + +- ``Panel.apply`` operating on cross-sectional slabs. (:issue:`1148`) + + ..
ipython:: python + + f = lambda x: (x-x.mean(1)/x.std(1)) + + result = panel.apply(f, axis = ['items','major_axis']) + result + result.loc[:,:,'ItemA'] + + This is equivalent to the following + + .. ipython:: python + + result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) for ax in panel.minor_axis ])) + result + result.loc[:,:,'ItemA'] + Experimental ~~~~~~~~~~~~ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b6cd643f47c5a..8c50396c503a0 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -17,8 +17,10 @@ from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks) +from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs +from pandas.tools.util import cartesian_product from pandas import compat from pandas.util.decorators import deprecate, Appender, Substitution import pandas.core.common as com @@ -333,26 +335,34 @@ def axis_pretty(a): [class_name, dims] + [axis_pretty(a) for a in self._AXIS_ORDERS]) return output - def _get_plane_axes(self, axis): + def _get_plane_axes_index(self, axis): """ - Get my plane axes: these are already + Get my plane axes indexes: these are already (as compared with higher level planes), - as we are returning a DataFrame axes + as we are returning a DataFrame axes indexes """ - axis = self._get_axis_name(axis) + axis_name = self._get_axis_name(axis) - if axis == 'major_axis': - index = self.minor_axis - columns = self.items - if axis == 'minor_axis': - index = self.major_axis - columns = self.items - elif axis == 'items': - index = self.major_axis - columns = self.minor_axis + if axis_name == 'major_axis': + index = 'minor_axis' + columns = 'items' + if axis_name == 'minor_axis': + index = 'major_axis' + columns = 'items' + elif axis_name == 'items': + index = 'major_axis' + columns = 'minor_axis' return index, columns + def _get_plane_axes(self, axis): + """ + Get my plane axes 
indexes: these are already + (as compared with higher level planes), + as we are returning a DataFrame axes + """ + return [ self._get_axis(axi) for axi in self._get_plane_axes_index(axis) ] + fromDict = from_dict def to_sparse(self, fill_value=None, kind='block'): @@ -431,6 +441,10 @@ def as_matrix(self): self._consolidate_inplace() return self._data.as_matrix() + @property + def dtypes(self): + return self.apply(lambda x: x.dtype, axis='items') + #---------------------------------------------------------------------- # Getting and setting elements @@ -827,25 +841,138 @@ def to_frame(self, filter_observations=True): to_long = deprecate('to_long', to_frame) toLong = deprecate('toLong', to_frame) - def apply(self, func, axis='major'): + def apply(self, func, axis='major', **kwargs): """ - Apply + Applies function along input axis of the Panel Parameters ---------- - func : numpy function - Signature should match numpy.{sum, mean, var, std} etc. + func : function + Function to apply to each combination of 'other' axes + e.g. 
if axis = 'items', then the combination of major_axis/minor_axis + will be passed a Series axis : {'major', 'minor', 'items'} - fill_value : boolean, default True - Replace NaN values with specified first + Additional keyword arguments will be passed as keywords to the function + + Examples + -------- + >>> p.apply(numpy.sqrt) # returns a Panel + >>> p.apply(lambda x: x.sum(), axis=0) # equiv to p.sum(0) + >>> p.apply(lambda x: x.sum(), axis=1) # equiv to p.sum(1) + >>> p.apply(lambda x: x.sum(), axis=2) # equiv to p.sum(2) Returns ------- - result : DataFrame or Panel + result : Pandas Object """ - i = self._get_axis_number(axis) - result = np.apply_along_axis(func, i, self.values) - return self._wrap_result(result, axis=axis) + + if kwargs and not isinstance(func, np.ufunc): + f = lambda x: func(x, **kwargs) + else: + f = func + + # 2d-slabs + if isinstance(axis, (tuple,list)) and len(axis) == 2: + return self._apply_2d(f, axis=axis) + + axis = self._get_axis_number(axis) + + # try ufunc like + if isinstance(f, np.ufunc): + try: + result = np.apply_along_axis(func, axis, self.values) + return self._wrap_result(result, axis=axis) + except (AttributeError): + pass + + # 1d + return self._apply_1d(f, axis=axis) + + def _apply_1d(self, func, axis): + + axis_name = self._get_axis_name(axis) + ax = self._get_axis(axis) + ndim = self.ndim + values = self.values + + # iter thru the axes + slice_axis = self._get_axis(axis) + slice_indexer = [0]*(ndim-1) + indexer = np.zeros(ndim, 'O') + indlist = list(range(ndim)) + indlist.remove(axis) + indexer[axis] = slice(None, None) + indexer.put(indlist, slice_indexer) + planes = [ self._get_axis(axi) for axi in indlist ] + shape = np.array(self.shape).take(indlist) + + # all the iteration points + points = cartesian_product(planes) + + results = [] + for i in range(np.prod(shape)): + + # construct the object + pts = tuple([ p[i] for p in points ]) + indexer.put(indlist, slice_indexer) + + obj = 
Series(values[tuple(indexer)],index=slice_axis,name=pts) + result = func(obj) + + results.append(result) + + # increment the indexer + slice_indexer[-1] += 1 + n = -1 + while (slice_indexer[n] >= shape[n]) and (n > (1-ndim)): + slice_indexer[n-1] += 1 + slice_indexer[n] = 0 + n -= 1 + + # empty object + if not len(results): + return self._constructor(**self._construct_axes_dict()) + + # same ndim as current + if isinstance(results[0],Series): + arr = np.vstack([ r.values for r in results ]) + arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape))) + tranp = np.array([axis]+indlist).argsort() + arr = arr.transpose(tuple(list(tranp))) + return self._constructor(arr,**self._construct_axes_dict()) + + # ndim-1 shape + results = np.array(results).reshape(shape) + if results.ndim == 2 and axis_name != self._info_axis_name: + results = results.T + planes = planes[::-1] + return self._construct_return_type(results,planes) + + def _apply_2d(self, func, axis): + """ handle 2-d slices, equiv to iterating over the other axis """ + + ndim = self.ndim + axis = [ self._get_axis_number(a) for a in axis ] + + # construct slabs, in 2-d this is a DataFrame result + indexer_axis = list(range(ndim)) + for a in axis: + indexer_axis.remove(a) + indexer_axis = indexer_axis[0] + + slicer = [ slice(None,None) ] * ndim + ax = self._get_axis(indexer_axis) + + results = [] + for i, e in enumerate(ax): + + slicer[indexer_axis] = i + sliced = self.iloc[tuple(slicer)] + + obj = func(sliced) + results.append((e,obj)) + + return self._construct_return_type(dict(results)) def _reduce(self, op, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): @@ -863,13 +990,33 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, def _construct_return_type(self, result, axes=None, **kwargs): """ return the type for the ndim of the result """ - ndim = result.ndim - if self.ndim == ndim: + ndim = getattr(result,'ndim',None) + + # need to assume they are the same + if ndim is None: + if 
isinstance(result,dict): + ndim = getattr(list(compat.itervalues(result))[0],'ndim',None) + + # a scalar result + if ndim is None: + ndim = 0 + + # have a dict, so top-level is +1 dim + else: + ndim += 1 + + # scalar + if ndim == 0: + return Series(result) + + # same as self + elif self.ndim == ndim: """ return the construction dictionary for these axes """ if axes is None: return self._constructor(result) return self._constructor(result, **self._construct_axes_dict()) + # sliced elif self.ndim == ndim + 1: if axes is None: return self._constructor_sliced(result) @@ -877,7 +1024,7 @@ def _construct_return_type(self, result, axes=None, **kwargs): result, **self._extract_axes_for_slice(self, axes)) raise PandasError('invalid _construct_return_type [self->%s] ' - '[result->%s]' % (self.ndim, result.ndim)) + '[result->%s]' % (self, result)) def _wrap_result(self, result, axis): axis = self._get_axis_name(axis) diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index a7cfe49484d24..3eebd51190e3d 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -56,9 +56,10 @@ def __init__(self, *args, **kwargs): self._init_data(*args, **kwargs) klass.__init__ = __init__ - def _get_plane_axes(self, axis): + def _get_plane_axes_index(self, axis): + """ return the sliced index for this object """ - axis = self._get_axis_name(axis) + axis_name = self._get_axis_name(axis) index = self._AXIS_ORDERS.index(axis) planes = [] @@ -67,8 +68,8 @@ def _get_plane_axes(self, axis): if index != self._AXIS_LEN: planes.extend(self._AXIS_ORDERS[index + 1:]) - return [getattr(self, p) for p in planes] - klass._get_plane_axes = _get_plane_axes + return planes + klass._get_plane_axes_index = _get_plane_axes_index def _combine(self, other, func, axis=0): if isinstance(other, klass): diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 30500ac57a7f6..08d3afe63ec86 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1061,6 +1061,105 @@
def test_convert_objects(self): result = p.convert_objects(convert_numeric='force') assert_panel_equal(result, expected) + def test_dtypes(self): + + result = self.panel.dtypes + expected = DataFrame(np.dtype('float64'),index=self.panel.major_axis,columns=self.panel.minor_axis) + assert_frame_equal(result, expected) + + def test_apply(self): + # GH1148 + + from pandas import Series,DataFrame + + # ufunc + applied = self.panel.apply(np.sqrt) + self.assert_(assert_almost_equal(applied.values, + np.sqrt(self.panel.values))) + + # ufunc same shape + result = self.panel.apply(lambda x: x*2, axis='items') + expected = self.panel*2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x*2, axis='major_axis') + expected = self.panel*2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x*2, axis='minor_axis') + expected = self.panel*2 + assert_panel_equal(result, expected) + + # reduction to DataFrame + result = self.panel.apply(lambda x: x.dtype, axis='items') + expected = DataFrame(np.dtype('float64'),index=self.panel.major_axis,columns=self.panel.minor_axis) + assert_frame_equal(result,expected) + result = self.panel.apply(lambda x: x.dtype, axis='major_axis') + expected = DataFrame(np.dtype('float64'),index=self.panel.minor_axis,columns=self.panel.items) + assert_frame_equal(result,expected) + result = self.panel.apply(lambda x: x.dtype, axis='minor_axis') + expected = DataFrame(np.dtype('float64'),index=self.panel.major_axis,columns=self.panel.items) + assert_frame_equal(result,expected) + + # reductions via other dims + expected = self.panel.sum(0) + result = self.panel.apply(lambda x: x.sum(), axis='items') + assert_frame_equal(result,expected) + expected = self.panel.sum(1) + result = self.panel.apply(lambda x: x.sum(), axis='major_axis') + assert_frame_equal(result,expected) + expected = self.panel.sum(2) + result = self.panel.apply(lambda x: x.sum(), axis='minor_axis') + assert_frame_equal(result,expected) + + # 
pass kwargs + result = self.panel.apply(lambda x, y: x.sum() + y, axis='items', y=5) + expected = self.panel.sum(0) + 5 + assert_frame_equal(result,expected) + + def test_apply_slabs(self): + + # same shape as original + result = self.panel.apply(lambda x: x*2, axis = ['items','major_axis']) + expected = (self.panel*2).transpose('minor_axis','major_axis','items') + assert_panel_equal(result,expected) + result = self.panel.apply(lambda x: x*2, axis = ['major_axis','items']) + assert_panel_equal(result,expected) + + result = self.panel.apply(lambda x: x*2, axis = ['items','minor_axis']) + expected = (self.panel*2).transpose('major_axis','minor_axis','items') + assert_panel_equal(result,expected) + result = self.panel.apply(lambda x: x*2, axis = ['minor_axis','items']) + assert_panel_equal(result,expected) + + result = self.panel.apply(lambda x: x*2, axis = ['major_axis','minor_axis']) + expected = self.panel*2 + assert_panel_equal(result,expected) + result = self.panel.apply(lambda x: x*2, axis = ['minor_axis','major_axis']) + assert_panel_equal(result,expected) + + # reductions + result = self.panel.apply(lambda x: x.sum(0), axis = ['items','major_axis']) + expected = self.panel.sum(1).T + assert_frame_equal(result,expected) + + result = self.panel.apply(lambda x: x.sum(1), axis = ['items','major_axis']) + expected = self.panel.sum(0) + assert_frame_equal(result,expected) + + # transforms + f = lambda x: (x-x.mean(1)/x.std(1)) + + result = self.panel.apply(f, axis = ['items','major_axis']) + expected = Panel(dict([ (ax,f(self.panel.loc[:,:,ax])) for ax in self.panel.minor_axis ])) + assert_panel_equal(result,expected) + + result = self.panel.apply(f, axis = ['major_axis','minor_axis']) + expected = Panel(dict([ (ax,f(self.panel.loc[ax])) for ax in self.panel.items ])) + assert_panel_equal(result,expected) + + result = self.panel.apply(f, axis = ['minor_axis','items']) + expected = Panel(dict([ (ax,f(self.panel.loc[:,ax])) for ax in self.panel.major_axis ])) + 
assert_panel_equal(result,expected) + def test_reindex(self): ref = self.panel['ItemB'] @@ -1989,12 +2088,6 @@ def test_get_dummies(self): dummies = get_dummies(self.panel['Label']) self.assert_(np.array_equal(dummies.values, minor_dummies.values)) - def test_apply(self): - # ufunc - applied = self.panel.apply(np.sqrt) - self.assert_(assert_almost_equal(applied.values, - np.sqrt(self.panel.values))) - def test_mean(self): means = self.panel.mean(level='minor')