Skip to content

Commit ad2cefc

Browse files
committed
API: define _constructor_expanddim for subclassing Series and DataFrame
1 parent 9b842a0 commit ad2cefc

File tree

7 files changed

+231
-12
lines changed

7 files changed

+231
-12
lines changed

doc/source/faq.rst

+1
Original file line numberDiff line numberDiff line change
@@ -369,3 +369,4 @@ just a thin layer around the ``QTableView``.
369369
mw = MainWidget()
370370
mw.show()
371371
app.exec_()
372+

doc/source/internals.rst

+152
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,155 @@ constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
9595
if you compute the levels and labels yourself, please be careful.
9696

9797

98+
.. _ref-subclassing-pandas:
99+
100+
Subclassing pandas Data Structures
101+
----------------------------------
102+
103+
.. warning:: There are some easier alternatives to consider before subclassing ``pandas`` data structures.
104+
105+
1. Monkey-patching: See :ref:`Adding Features to your pandas Installation <ref-monkey-patching>`.
106+
107+
2. Use *composition*. See `here <http://en.wikipedia.org/wiki/Composition_over_inheritance>`_.
108+
109+
This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention:
110+
111+
1. Override constructor properties.
112+
2. Define original properties
113+
114+
.. note:: You can find a nice example in the `geopandas <https://github.com/geopandas/geopandas>`_ project.
115+
116+
Override Constructor Properties
117+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
118+
119+
Each data structure has constructor properties that specify which class is used to construct the result of an operation. By overriding these properties, you can retain your subclasses through ``pandas`` data manipulations.
120+
121+
There are 3 constructors to be defined:
122+
123+
- ``_constructor``: Used when a manipulation result has the same dimensions as the original.
124+
- ``_constructor_sliced``: Used when a manipulation result has one fewer dimension than the original, such as slicing a single column from a ``DataFrame``.
125+
- ``_constructor_expanddim``: Used when a manipulation result has one more dimension than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``.
126+
127+
The following table shows how ``pandas`` data structures define constructor properties by default.
128+
129+
=========================== ======================= =================== =======================
130+
Property Attributes ``Series`` ``DataFrame`` ``Panel``
131+
=========================== ======================= =================== =======================
132+
``_constructor`` ``Series`` ``DataFrame`` ``Panel``
133+
``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame``
134+
``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError``
135+
=========================== ======================= =================== =======================
136+
137+
The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` by overriding constructor properties.
138+
139+
.. code-block:: python
140+
141+
class SubclassedSeries(Series):
142+
143+
@property
144+
def _constructor(self):
145+
return SubclassedSeries
146+
147+
@property
148+
def _constructor_expanddim(self):
149+
return SubclassedDataFrame
150+
151+
class SubclassedDataFrame(DataFrame):
152+
153+
@property
154+
def _constructor(self):
155+
return SubclassedDataFrame
156+
157+
@property
158+
def _constructor_sliced(self):
159+
return SubclassedSeries
160+
161+
.. code-block:: python
162+
163+
>>> s = SubclassedSeries([1, 2, 3])
164+
>>> type(s)
165+
<class '__main__.SubclassedSeries'>
166+
167+
>>> to_framed = s.to_frame()
168+
>>> type(to_framed)
169+
<class '__main__.SubclassedDataFrame'>
170+
171+
>>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
172+
>>> df
173+
A B C
174+
0 1 4 7
175+
1 2 5 8
176+
2 3 6 9
177+
178+
>>> type(df)
179+
<class '__main__.SubclassedDataFrame'>
180+
181+
>>> sliced1 = df[['A', 'B']]
182+
>>> sliced1
183+
A B
184+
0 1 4
185+
1 2 5
186+
2 3 6
187+
>>> type(sliced1)
188+
<class '__main__.SubclassedDataFrame'>
189+
190+
>>> sliced2 = df['A']
191+
>>> sliced2
192+
0 1
193+
1 2
194+
2 3
195+
Name: A, dtype: int64
196+
>>> type(sliced2)
197+
<class '__main__.SubclassedSeries'>
198+
199+
Define Original Properties
200+
~~~~~~~~~~~~~~~~~~~~~~~~~~
201+
202+
To let original data structures have additional properties, you should let ``pandas`` know which properties are added. ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Original properties can be defined in one of two ways:
203+
204+
1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
205+
2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
206+
207+
Below is an example that defines two original properties: "internal_cache" as a temporary property and "added_property" as a normal property.
208+
209+
.. code-block:: python
210+
211+
class SubclassedDataFrame2(DataFrame):
212+
213+
# temporary properties
214+
_internal_names = DataFrame._internal_names + ['internal_cache']
215+
_internal_names_set = set(_internal_names)
216+
217+
# normal properties
218+
_metadata = ['added_property']
219+
220+
@property
221+
def _constructor(self):
222+
return SubclassedDataFrame2
223+
224+
.. code-block:: python
225+
226+
>>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
227+
>>> df
228+
A B C
229+
0 1 4 7
230+
1 2 5 8
231+
2 3 6 9
232+
233+
>>> df.internal_cache = 'cached'
234+
>>> df.added_property = 'property'
235+
236+
>>> df.internal_cache
237+
cached
238+
>>> df.added_property
239+
property
240+
241+
# properties defined in _internal_names are reset after manipulation
242+
>>> df[['A', 'B']].internal_cache
243+
AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'
244+
245+
# properties defined in _metadata are retained
246+
>>> df[['A', 'B']].added_property
247+
property
248+
249+

pandas/core/frame.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ def _constructor(self):
191191

192192
_constructor_sliced = Series
193193

194+
@property
195+
def _constructor_expanddim(self):
196+
from pandas.core.panel import Panel
197+
return Panel
198+
194199
def __init__(self, data=None, index=None, columns=None, dtype=None,
195200
copy=False):
196201
if data is None:
@@ -1064,8 +1069,6 @@ def to_panel(self):
10641069
-------
10651070
panel : Panel
10661071
"""
1067-
from pandas.core.panel import Panel
1068-
10691072
# only support this kind for now
10701073
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
10711074
len(self.index.levels) != 2):
@@ -1103,7 +1106,7 @@ def to_panel(self):
11031106
shape=shape,
11041107
ref_items=selfsorted.columns)
11051108

1106-
return Panel(new_mgr)
1109+
return self._constructor_expanddim(new_mgr)
11071110

11081111
to_wide = deprecate('to_wide', to_panel)
11091112

@@ -4414,12 +4417,12 @@ def mode(self, axis=0, numeric_only=False):
44144417
"""
44154418
Gets the mode(s) of each element along the axis selected. Empty if nothing
44164419
has 2+ occurrences. Adds a row for each mode per label, fills in gaps
4417-
with nan.
4418-
4420+
with nan.
4421+
44194422
Note that there could be multiple values returned for the selected
4420-
axis (when more than one item share the maximum frequency), which is the
4421-
reason why a dataframe is returned. If you want to impute missing values
4422-
with the mode in a dataframe ``df``, you can just do this:
4423+
axis (when more than one item share the maximum frequency), which is the
4424+
reason why a dataframe is returned. If you want to impute missing values
4425+
with the mode in a dataframe ``df``, you can just do this:
44234426
``df.fillna(df.mode().iloc[0])``
44244427
44254428
Parameters

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ def _local_dir(self):
154154
def _constructor_sliced(self):
155155
raise NotImplementedError
156156

157+
@property
158+
def _constructor_expanddim(self):
159+
raise NotImplementedError
160+
157161
#----------------------------------------------------------------------
158162
# Axis
159163

pandas/core/series.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
236236
def _constructor(self):
237237
return Series
238238

239+
@property
240+
def _constructor_expanddim(self):
241+
from pandas.core.frame import DataFrame
242+
return DataFrame
243+
239244
# types
240245
@property
241246
def _can_hold_na(self):
@@ -1047,11 +1052,10 @@ def to_frame(self, name=None):
10471052
-------
10481053
data_frame : DataFrame
10491054
"""
1050-
from pandas.core.frame import DataFrame
10511055
if name is None:
1052-
df = DataFrame(self)
1056+
df = self._constructor_expanddim(self)
10531057
else:
1054-
df = DataFrame({name: self})
1058+
df = self._constructor_expanddim({name: self})
10551059

10561060
return df
10571061

pandas/tests/test_frame.py

+41-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import pandas.core.common as com
3232
import pandas.core.format as fmt
3333
import pandas.core.datetools as datetools
34-
from pandas import (DataFrame, Index, Series, notnull, isnull,
34+
from pandas import (DataFrame, Index, Series, Panel, notnull, isnull,
3535
MultiIndex, DatetimeIndex, Timestamp, date_range,
3636
read_csv, timedelta_range, Timedelta,
3737
option_context)
@@ -14179,6 +14179,46 @@ def _constructor(self):
1417914179
# GH9776
1418014180
self.assertEqual(df.iloc[0:1, :].testattr, 'XXX')
1418114181

14182+
def test_to_panel_expanddim(self):
14183+
14184+
class SubclassedFrame(DataFrame):
14185+
@property
14186+
def _constructor_expanddim(self):
14187+
return SubclassedPanel
14188+
14189+
class SubclassedPanel(Panel):
14190+
pass
14191+
14192+
index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
14193+
df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index)
14194+
result = df.to_panel()
14195+
self.assertTrue(isinstance(result, SubclassedPanel))
14196+
expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
14197+
items=['X', 'Y'], major_axis=[0],
14198+
minor_axis=[0, 1, 2])
14199+
tm.assert_panel_equal(result, expected)
14200+
14201+
def test_dataframe_metadata(self):
14202+
14203+
class TestDataFrame(DataFrame):
14204+
_metadata = ['testattr']
14205+
14206+
@property
14207+
def _constructor(self):
14208+
return TestDataFrame
14209+
14210+
14211+
df = TestDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
14212+
index=['a', 'b', 'c'])
14213+
df.testattr = 'XXX'
14214+
14215+
self.assertEqual(df.testattr, 'XXX')
14216+
self.assertEqual(df[['X']].testattr, 'XXX')
14217+
self.assertEqual(df.loc[['a', 'b'], :].testattr, 'XXX')
14218+
self.assertEqual(df.iloc[[0, 1], :].testattr, 'XXX')
14219+
# GH9776
14220+
self.assertEqual(df.iloc[0:1, :].testattr, 'XXX')
14221+
1418214222

1418314223
def skip_if_no_ne(engine='numexpr'):
1418414224
if engine == 'numexpr':

pandas/tests/test_series.py

+15
Original file line numberDiff line numberDiff line change
@@ -6780,6 +6780,21 @@ def test_searchsorted_sorter(self):
67806780
e = np.array([0, 2])
67816781
tm.assert_array_equal(r, e)
67826782

6783+
def test_to_frame_expanddim(self):
6784+
6785+
class SubclassedSeries(Series):
6786+
@property
6787+
def _constructor_expanddim(self):
6788+
return SubclassedFrame
6789+
6790+
class SubclassedFrame(DataFrame):
6791+
pass
6792+
6793+
s = SubclassedSeries([1, 2, 3], name='X')
6794+
result = s.to_frame()
6795+
self.assertTrue(isinstance(result, SubclassedFrame))
6796+
expected = SubclassedFrame({'X': [1, 2, 3]})
6797+
assert_frame_equal(result, expected)
67836798

67846799

67856800
class TestSeriesNonUnique(tm.TestCase):

0 commit comments

Comments
 (0)