Skip to content

Commit 878199d

Browse files
committed
API: define _constructor_expanddim for subclassing Series and DataFrame
1 parent a2a5cec commit 878199d

File tree

6 files changed

+227
-12
lines changed

6 files changed

+227
-12
lines changed

doc/source/faq.rst

+148
Original file line numberDiff line numberDiff line change
@@ -369,3 +369,151 @@ just a thin layer around the ``QTableView``.
369369
mw = MainWidget()
370370
mw.show()
371371
app.exec_()
372+
373+
374+
375+
.. _ref-subclassing-pandas:
376+
377+
Subclassing pandas Data Structures
378+
----------------------------------
379+
380+
This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are two points which need attention:
381+
382+
1. Override constructor properties.
383+
2. Define original properties
384+
385+
.. note:: You can find an actual example in the `geopandas <https://github.com/geopandas/geopandas>`_ project.
386+
387+
Override Constructor Properties
388+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
389+
390+
Each data structure has constructor properties that specify which classes are used to construct the results of data manipulations. By overriding these properties, you can retain your subclasses through ``pandas`` data manipulations.
391+
392+
There are 3 constructors to be defined:
393+
394+
- ``_constructor``: Used when a manipulation result has the same dimensionality as the original.
395+
- ``_constructor_sliced``: Used when a manipulation result has lower dimensionality than the original, such as slicing a single column from a ``DataFrame``.
396+
- ``_constructor_expanddim``: Used when a manipulation result has higher dimensionality than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``.
397+
398+
The following table shows how ``pandas`` data structures define constructor properties by default.
399+
400+
=========================== ======================= =================== =======================
401+
Property Attributes ``Series`` ``DataFrame`` ``Panel``
402+
=========================== ======================= =================== =======================
403+
``_constructor`` ``Series`` ``DataFrame`` ``Panel``
404+
``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame``
405+
``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError``
406+
=========================== ======================= =================== =======================
407+
408+
The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` by overriding constructor properties.
409+
410+
.. code-block:: python
411+
412+
class SubclassedSeries(Series):
413+
414+
@property
415+
def _constructor(self):
416+
return SubclassedSeries
417+
418+
@property
419+
def _constructor_expanddim(self):
420+
return SubclassedDataFrame
421+
422+
class SubclassedDataFrame(DataFrame):
423+
424+
@property
425+
def _constructor(self):
426+
return SubclassedDataFrame
427+
428+
@property
429+
def _constructor_sliced(self):
430+
return SubclassedSeries
431+
432+
.. code-block:: python
433+
434+
>>> s = SubclassedSeries([1, 2, 3])
435+
>>> type(s)
436+
<class '__main__.SubclassedSeries'>
437+
438+
>>> to_framed = s.to_frame()
439+
>>> type(to_framed)
440+
<class '__main__.SubclassedDataFrame'>
441+
442+
>>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
443+
>>> df
444+
A B C
445+
0 1 4 7
446+
1 2 5 8
447+
2 3 6 9
448+
449+
>>> type(df)
450+
<class '__main__.SubclassedDataFrame'>
451+
452+
>>> sliced1 = df[['A', 'B']]
453+
>>> sliced1
454+
A B
455+
0 1 4
456+
1 2 5
457+
2 3 6
458+
>>> type(sliced1)
459+
<class '__main__.SubclassedDataFrame'>
460+
461+
>>> sliced2 = df['A']
462+
>>> sliced2
463+
0 1
464+
1 2
465+
2 3
466+
Name: A, dtype: int64
467+
>>> type(sliced2)
468+
<class '__main__.SubclassedSeries'>
469+
470+
Define Original Properties
471+
~~~~~~~~~~~~~~~~~~~~~~~~~~
472+
473+
To let original data structures have additional properties, you should let ``pandas`` know what properties are added. This is because ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in either of two ways:
474+
475+
1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
476+
2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
477+
478+
Below is an example that defines two original properties: "internal_cache" as a temporary property and "added_property" as a normal property.
479+
480+
.. code-block:: python
481+
482+
class SubclassedDataFrame2(DataFrame):
483+
484+
# temporary properties
485+
_internal_names = DataFrame._internal_names + ['internal_cache']
486+
_internal_names_set = set(_internal_names)
487+
488+
# normal properties
489+
_metadata = ['added_property']
490+
491+
@property
492+
def _constructor(self):
493+
return SubclassedDataFrame2
494+
495+
.. code-block:: python
496+
497+
>>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
498+
>>> df
499+
A B C
500+
0 1 4 7
501+
1 2 5 8
502+
2 3 6 9
503+
504+
>>> df.internal_cache = 'cached'
505+
>>> df.added_property = 'property'
506+
507+
>>> df.internal_cache
508+
cached
509+
>>> df.added_property
510+
property
511+
512+
# properties defined in _internal_names are reset after manipulation
513+
>>> df[['A', 'B']].internal_cache
514+
AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'
515+
516+
# properties defined in _metadata are retained
517+
>>> df[['A', 'B']].added_property
518+
property
519+

pandas/core/frame.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ def _constructor(self):
191191

192192
_constructor_sliced = Series
193193

194+
@property
195+
def _constructor_expanddim(self):
196+
from pandas.core.panel import Panel
197+
return Panel
198+
194199
def __init__(self, data=None, index=None, columns=None, dtype=None,
195200
copy=False):
196201
if data is None:
@@ -1064,8 +1069,6 @@ def to_panel(self):
10641069
-------
10651070
panel : Panel
10661071
"""
1067-
from pandas.core.panel import Panel
1068-
10691072
# only support this kind for now
10701073
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
10711074
len(self.index.levels) != 2):
@@ -1103,7 +1106,7 @@ def to_panel(self):
11031106
shape=shape,
11041107
ref_items=selfsorted.columns)
11051108

1106-
return Panel(new_mgr)
1109+
return self._constructor_expanddim(new_mgr)
11071110

11081111
to_wide = deprecate('to_wide', to_panel)
11091112

@@ -4413,12 +4416,12 @@ def mode(self, axis=0, numeric_only=False):
44134416
"""
44144417
Gets the mode(s) of each element along the axis selected. Empty if nothing
44154418
has 2+ occurrences. Adds a row for each mode per label, fills in gaps
4416-
with nan.
4417-
4419+
with nan.
4420+
44184421
Note that there could be multiple values returned for the selected
4419-
axis (when more than one item share the maximum frequency), which is the
4420-
reason why a dataframe is returned. If you want to impute missing values
4421-
with the mode in a dataframe ``df``, you can just do this:
4422+
axis (when more than one item share the maximum frequency), which is the
4423+
reason why a dataframe is returned. If you want to impute missing values
4424+
with the mode in a dataframe ``df``, you can just do this:
44224425
``df.fillna(df.mode().iloc[0])``
44234426
44244427
Parameters

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ def _local_dir(self):
154154
def _constructor_sliced(self):
155155
raise NotImplementedError
156156

157+
@property
158+
def _constructor_expanddim(self):
159+
raise NotImplementedError
160+
157161
#----------------------------------------------------------------------
158162
# Axis
159163

pandas/core/series.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
236236
def _constructor(self):
237237
return Series
238238

239+
@property
240+
def _constructor_expanddim(self):
241+
from pandas.core.frame import DataFrame
242+
return DataFrame
243+
239244
# types
240245
@property
241246
def _can_hold_na(self):
@@ -1047,11 +1052,10 @@ def to_frame(self, name=None):
10471052
-------
10481053
data_frame : DataFrame
10491054
"""
1050-
from pandas.core.frame import DataFrame
10511055
if name is None:
1052-
df = DataFrame(self)
1056+
df = self._constructor_expanddim(self)
10531057
else:
1054-
df = DataFrame({name: self})
1058+
df = self._constructor_expanddim({name: self})
10551059

10561060
return df
10571061

pandas/tests/test_frame.py

+42-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import pandas.core.common as com
3232
import pandas.core.format as fmt
3333
import pandas.core.datetools as datetools
34-
from pandas import (DataFrame, Index, Series, notnull, isnull,
34+
from pandas import (DataFrame, Index, Series, Panel, notnull, isnull,
3535
MultiIndex, DatetimeIndex, Timestamp, date_range,
3636
read_csv, timedelta_range, Timedelta,
3737
option_context)
@@ -14099,6 +14099,47 @@ def _constructor(self):
1409914099
# GH9776
1410014100
self.assertEqual(df.iloc[0:1, :].testattr, 'XXX')
1410114101

14102+
def test_to_panel_expanddim(self):
14103+
14104+
class SubclassedFrame(DataFrame):
14105+
@property
14106+
def _constructor_expanddim(self):
14107+
return SubclassedPanel
14108+
14109+
class SubclassedPanel(Panel):
14110+
pass
14111+
14112+
index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
14113+
df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index)
14114+
result = df.to_panel()
14115+
self.assertTrue(isinstance(result, SubclassedPanel))
14116+
expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
14117+
items=['X', 'Y'], major_axis=[0],
14118+
minor_axis=[0, 1, 2])
14119+
tm.assert_panel_equal(result, expected)
14120+
14121+
14122+
def test_dataframe_metadata(self):
14123+
14124+
class TestDataFrame(DataFrame):
14125+
_metadata = ['testattr']
14126+
14127+
@property
14128+
def _constructor(self):
14129+
return TestDataFrame
14130+
14131+
14132+
df = TestDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
14133+
index=['a', 'b', 'c'])
14134+
df.testattr = 'XXX'
14135+
14136+
self.assertEqual(df.testattr, 'XXX')
14137+
self.assertEqual(df[['X']].testattr, 'XXX')
14138+
self.assertEqual(df.loc[['a', 'b'], :].testattr, 'XXX')
14139+
self.assertEqual(df.iloc[[0, 1], :].testattr, 'XXX')
14140+
# GH9776
14141+
self.assertEqual(df.iloc[0:1, :].testattr, 'XXX')
14142+
1410214143

1410314144
def skip_if_no_ne(engine='numexpr'):
1410414145
if engine == 'numexpr':

pandas/tests/test_series.py

+15
Original file line numberDiff line numberDiff line change
@@ -6754,6 +6754,21 @@ def test_searchsorted_sorter(self):
67546754
e = np.array([0, 2])
67556755
tm.assert_array_equal(r, e)
67566756

6757+
def test_to_frame_expanddim(self):
6758+
6759+
class SubclassedSeries(Series):
6760+
@property
6761+
def _constructor_expanddim(self):
6762+
return SubclassedFrame
6763+
6764+
class SubclassedFrame(DataFrame):
6765+
pass
6766+
6767+
s = SubclassedSeries([1, 2, 3], name='X')
6768+
result = s.to_frame()
6769+
self.assertTrue(isinstance(result, SubclassedFrame))
6770+
expected = SubclassedFrame({'X': [1, 2, 3]})
6771+
assert_frame_equal(result, expected)
67576772

67586773

67596774
class TestSeriesNonUnique(tm.TestCase):

0 commit comments

Comments
 (0)