diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 467ec02b55f20..20762e3fc039f 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -369,3 +369,4 @@ just a thin layer around the ``QTableView``. mw = MainWidget() mw.show() app.exec_() + diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 9418ca5265f1a..bc1189a8961d6 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -95,3 +95,155 @@ constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +.. _ref-subclassing-pandas: + +Subclassing pandas Data Structures +---------------------------------- + +.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. + + 1. Monkey-patching: See :ref:`Adding Features to your pandas Installation <ref-monkey-patching>`. + + 2. Use *composition*. See `here <http://en.wikipedia.org/wiki/Composition_over_inheritance>`_. + +This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention: + +1. Override constructor properties. +2. Define original properties. + +.. note:: You can find a nice example in the `geopandas <https://github.com/geopandas/geopandas>`_ project. + +Override Constructor Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each data structure has constructor properties to specify data constructors. By overriding these properties, you can retain subclasses through ``pandas`` data manipulations. + +There are 3 constructors to be defined: + +- ``_constructor``: Used when a manipulation result has the same dimensions as the original. +- ``_constructor_sliced``: Used when a manipulation result has one lower dimension than the original, such as ``DataFrame`` single column slicing. +- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. + +The following table shows how ``pandas`` data structures define constructor properties by default. 
+ +=========================== ======================= =================== ======================= +Property Attributes ``Series`` ``DataFrame`` ``Panel`` +=========================== ======================= =================== ======================= +``_constructor`` ``Series`` ``DataFrame`` ``Panel`` +``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame`` +``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` +=========================== ======================= =================== ======================= + +The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` by overriding constructor properties. + +.. code-block:: python + + class SubclassedSeries(Series): + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(DataFrame): + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + +.. code-block:: python + + >>> s = SubclassedSeries([1, 2, 3]) + >>> type(s) + <class '__main__.SubclassedSeries'> + + >>> to_framed = s.to_frame() + >>> type(to_framed) + <class '__main__.SubclassedDataFrame'> + + >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> type(df) + <class '__main__.SubclassedDataFrame'> + + >>> sliced1 = df[['A', 'B']] + >>> sliced1 + A B + 0 1 4 + 1 2 5 + 2 3 6 + >>> type(sliced1) + <class '__main__.SubclassedDataFrame'> + + >>> sliced2 = df['A'] + >>> sliced2 + 0 1 + 1 2 + 2 3 + Name: A, dtype: int64 + >>> type(sliced2) + <class '__main__.SubclassedSeries'> + +Define Original Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways: + +1. 
Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. +2. Define ``_metadata`` for normal properties which will be passed to manipulation results. + +Below is an example to define 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property. + +.. code-block:: python + + class SubclassedDataFrame2(DataFrame): + + # temporary properties + _internal_names = DataFrame._internal_names + ['internal_cache'] + _internal_names_set = set(_internal_names) + + # normal properties + _metadata = ['added_property'] + + @property + def _constructor(self): + return SubclassedDataFrame2 + +.. code-block:: python + + >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.internal_cache = 'cached' + >>> df.added_property = 'property' + + >>> df.internal_cache + cached + >>> df.added_property + property + + # properties defined in _internal_names are reset after manipulation + >>> df[['A', 'B']].internal_cache + AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' + + # properties defined in _metadata are retained + >>> df[['A', 'B']].added_property + property + + diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 8bd2939e2c805..7166801b3fbf0 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -56,6 +56,8 @@ Enhancements - Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) +- ``DataFrame`` and ``Series`` now have a ``_constructor_expanddim`` property as an overridable constructor for data of one higher dimension. This should be used only when it is really needed, see :ref:`here <ref-subclassing-pandas>` + .. 
_whatsnew_0161.api: API changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4f7bc11cbf03c..272c401c18761 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -191,6 +191,11 @@ def _constructor(self): _constructor_sliced = Series + @property + def _constructor_expanddim(self): + from pandas.core.panel import Panel + return Panel + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: @@ -1061,8 +1066,6 @@ def to_panel(self): ------- panel : Panel """ - from pandas.core.panel import Panel - # only support this kind for now if (not isinstance(self.index, MultiIndex) or # pragma: no cover len(self.index.levels) != 2): @@ -1100,7 +1103,7 @@ def to_panel(self): shape=shape, ref_items=selfsorted.columns) - return Panel(new_mgr) + return self._constructor_expanddim(new_mgr) to_wide = deprecate('to_wide', to_panel) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8bd85a008f077..9624b1308239c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -155,6 +155,10 @@ def _local_dir(self): def _constructor_sliced(self): raise AbstractMethodError(self) + @property + def _constructor_expanddim(self): + raise NotImplementedError + #---------------------------------------------------------------------- # Axis diff --git a/pandas/core/series.py b/pandas/core/series.py index f9c56db018639..7bcf6c6671152 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, def _constructor(self): return Series + @property + def _constructor_expanddim(self): + from pandas.core.frame import DataFrame + return DataFrame + # types @property def _can_hold_na(self): @@ -1047,11 +1052,10 @@ def to_frame(self, name=None): ------- data_frame : DataFrame """ - from pandas.core.frame import DataFrame if name is None: - df = DataFrame(self) + df = self._constructor_expanddim(self) else: - df = DataFrame({name: 
self}) + df = self._constructor_expanddim({name: self}) return df diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index bcba891ee7e9d..c001f35ab65cc 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -31,7 +31,7 @@ import pandas.core.common as com import pandas.core.format as fmt import pandas.core.datetools as datetools -from pandas import (DataFrame, Index, Series, notnull, isnull, +from pandas import (DataFrame, Index, Series, Panel, notnull, isnull, MultiIndex, DatetimeIndex, Timestamp, date_range, read_csv, timedelta_range, Timedelta, option_context) @@ -14214,6 +14214,26 @@ def _constructor(self): # GH9776 self.assertEqual(df.iloc[0:1, :].testattr, 'XXX') + def test_to_panel_expanddim(self): + # GH 9762 + + class SubclassedFrame(DataFrame): + @property + def _constructor_expanddim(self): + return SubclassedPanel + + class SubclassedPanel(Panel): + pass + + index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)]) + df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index) + result = df.to_panel() + self.assertTrue(isinstance(result, SubclassedPanel)) + expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]], + items=['X', 'Y'], major_axis=[0], + minor_axis=[0, 1, 2]) + tm.assert_panel_equal(result, expected) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index c3b43f3ec70c0..b5ada4cf39b5e 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -6851,6 +6851,22 @@ def test_searchsorted_sorter(self): e = np.array([0, 2]) tm.assert_array_equal(r, e) + def test_to_frame_expanddim(self): + # GH 9762 + + class SubclassedSeries(Series): + @property + def _constructor_expanddim(self): + return SubclassedFrame + + class SubclassedFrame(DataFrame): + pass + + s = SubclassedSeries([1, 2, 3], name='X') + result = s.to_frame() + self.assertTrue(isinstance(result, SubclassedFrame)) + expected = 
SubclassedFrame({'X': [1, 2, 3]}) + assert_frame_equal(result, expected) class TestSeriesNonUnique(tm.TestCase):