Skip to content

Commit c58b3ed

Browse files
committed
API: define _constructor_expanddim for subclassing Series and DataFrame
1 parent 0222024 commit c58b3ed

File tree

6 files changed

+207
-13
lines changed

6 files changed

+207
-13
lines changed

doc/source/faq.rst

+148
Original file line numberDiff line numberDiff line change
@@ -369,3 +369,151 @@ just a thin layer around the ``QTableView``.
369369
mw = MainWidget()
370370
mw.show()
371371
app.exec_()
372+
373+
374+
375+
.. _ref-subclassing-pandas:
376+
377+
Subclassing pandas Data Structures
378+
----------------------------------
379+
380+
This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention:
381+
382+
1. Override constructor properties.
383+
2. Define original properties.
384+
385+
.. note:: You can find actual examples in the `geopandas <https://github.com/geopandas/geopandas>`_ project.
386+
387+
Override Constructor Properties
388+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
389+
390+
Each data structure has constructor properties that specify its data constructors. By overriding these properties, you can retain your subclasses through ``pandas`` data manipulations.
391+
392+
There are 3 constructors to be defined:
393+
394+
- ``_constructor``: Used when a manipulation result has the same dimensionality as the original.
395+
- ``_constructor_sliced``: Used when a manipulation result has lower dimensionality than the original, such as slicing a single column from a ``DataFrame``.
396+
- ``_constructor_expanddim``: Used when a manipulation result has higher dimensionality than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``.
397+
398+
The following table shows how ``pandas`` data structures define constructor properties by default.
399+
400+
=========================== ======================= =================== =======================
401+
Property Attributes ``Series`` ``DataFrame`` ``Panel``
402+
=========================== ======================= =================== =======================
403+
``_constructor`` ``Series`` ``DataFrame`` ``Panel``
404+
``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame``
405+
``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError``
406+
=========================== ======================= =================== =======================
407+
408+
The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` by overriding constructor properties.
409+
410+
.. code-block:: python
411+
412+
class SubclassedSeries(Series):
413+
414+
@property
415+
def _constructor(self):
416+
return SubclassedSeries
417+
418+
@property
419+
def _constructor_expanddim(self):
420+
return SubclassedDataFrame
421+
422+
class SubclassedDataFrame(DataFrame):
423+
424+
@property
425+
def _constructor(self):
426+
return SubclassedDataFrame
427+
428+
@property
429+
def _constructor_sliced(self):
430+
return SubclassedSeries
431+
432+
.. code-block:: python
433+
434+
>>> s = SubclassedSeries([1, 2, 3])
435+
>>> type(s)
436+
<class '__main__.SubclassedSeries'>
437+
438+
>>> to_framed = s.to_frame()
439+
>>> type(to_framed)
440+
<class '__main__.SubclassedDataFrame'>
441+
442+
>>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
443+
>>> df
444+
A B C
445+
0 1 4 7
446+
1 2 5 8
447+
2 3 6 9
448+
449+
>>> type(df)
450+
<class '__main__.SubclassedDataFrame'>
451+
452+
>>> sliced1 = df[['A', 'B']]
453+
>>> sliced1
454+
A B
455+
0 1 4
456+
1 2 5
457+
2 3 6
458+
>>> type(sliced1)
459+
<class '__main__.SubclassedDataFrame'>
460+
461+
>>> sliced2 = df['A']
462+
>>> sliced2
463+
0 1
464+
1 2
465+
2 3
466+
Name: A, dtype: int64
467+
>>> type(sliced2)
468+
<class '__main__.SubclassedSeries'>
469+
470+
Define Original Properties
471+
~~~~~~~~~~~~~~~~~~~~~~~~~~
472+
473+
To let original data structures have additional properties, you should let ``pandas`` know what properties are added. This is because ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in either of two ways:
474+
475+
1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
476+
2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
477+
478+
Below is an example that defines 2 original properties: ``internal_cache`` as a temporary property and ``added_property`` as a normal property.
479+
480+
.. code-block:: python
481+
482+
class SubclassedDataFrame2(DataFrame):
483+
484+
# temporary properties
485+
_internal_names = DataFrame._internal_names + ['internal_cache']
486+
_internal_names_set = set(_internal_names)
487+
488+
# normal properties
489+
_metadata = ['added_property']
490+
491+
@property
492+
def _constructor(self):
493+
return SubclassedDataFrame2
494+
495+
.. code-block:: python
496+
497+
>>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
498+
>>> df
499+
A B C
500+
0 1 4 7
501+
1 2 5 8
502+
2 3 6 9
503+
504+
>>> df.internal_cache = 'cached'
505+
>>> df.added_property = 'property'
506+
507+
>>> df.internal_cache
508+
cached
509+
>>> df.added_property
510+
property
511+
512+
# properties defined in _internal_names are reset after manipulation
513+
>>> df[['A', 'B']].internal_cache
514+
AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'
515+
516+
# properties defined in _metadata are retained
517+
>>> df[['A', 'B']].added_property
518+
property
519+

pandas/core/frame.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ def _constructor(self):
191191

192192
_constructor_sliced = Series
193193

194+
@property
195+
def _constructor_expanddim(self):
196+
from pandas.core.panel import Panel
197+
return Panel
198+
194199
def __init__(self, data=None, index=None, columns=None, dtype=None,
195200
copy=False):
196201
if data is None:
@@ -1064,8 +1069,6 @@ def to_panel(self):
10641069
-------
10651070
panel : Panel
10661071
"""
1067-
from pandas.core.panel import Panel
1068-
10691072
# only support this kind for now
10701073
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
10711074
len(self.index.levels) != 2):
@@ -1103,7 +1106,7 @@ def to_panel(self):
11031106
shape=shape,
11041107
ref_items=selfsorted.columns)
11051108

1106-
return Panel(new_mgr)
1109+
return self._constructor_expanddim(new_mgr)
11071110

11081111
to_wide = deprecate('to_wide', to_panel)
11091112

@@ -4413,12 +4416,12 @@ def mode(self, axis=0, numeric_only=False):
44134416
"""
44144417
Gets the mode(s) of each element along the axis selected. Empty if nothing
44154418
has 2+ occurrences. Adds a row for each mode per label, fills in gaps
4416-
with nan.
4417-
4419+
with nan.
4420+
44184421
Note that there could be multiple values returned for the selected
4419-
axis (when more than one item share the maximum frequency), which is the
4420-
reason why a dataframe is returned. If you want to impute missing values
4421-
with the mode in a dataframe ``df``, you can just do this:
4422+
axis (when more than one item share the maximum frequency), which is the
4423+
reason why a dataframe is returned. If you want to impute missing values
4424+
with the mode in a dataframe ``df``, you can just do this:
44224425
``df.fillna(df.mode().iloc[0])``
44234426
44244427
Parameters

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ def _local_dir(self):
154154
def _constructor_sliced(self):
155155
raise NotImplementedError
156156

157+
@property
158+
def _constructor_expanddim(self):
159+
raise NotImplementedError
160+
157161
#----------------------------------------------------------------------
158162
# Axis
159163

pandas/core/series.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
236236
def _constructor(self):
237237
return Series
238238

239+
@property
240+
def _constructor_expanddim(self):
241+
from pandas.core.frame import DataFrame
242+
return DataFrame
243+
239244
# types
240245
@property
241246
def _can_hold_na(self):
@@ -1047,11 +1052,10 @@ def to_frame(self, name=None):
10471052
-------
10481053
data_frame : DataFrame
10491054
"""
1050-
from pandas.core.frame import DataFrame
10511055
if name is None:
1052-
df = DataFrame(self)
1056+
df = self._constructor_expanddim(self)
10531057
else:
1054-
df = DataFrame({name: self})
1058+
df = self._constructor_expanddim({name: self})
10551059

10561060
return df
10571061

pandas/tests/test_frame.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import pandas.core.common as com
3232
import pandas.core.format as fmt
3333
import pandas.core.datetools as datetools
34-
from pandas import (DataFrame, Index, Series, notnull, isnull,
34+
from pandas import (DataFrame, Index, Series, Panel, notnull, isnull,
3535
MultiIndex, DatetimeIndex, Timestamp, date_range,
3636
read_csv, timedelta_range, Timedelta,
3737
option_context)
@@ -14057,6 +14057,26 @@ def test_assign_bad(self):
1405714057
with tm.assertRaises(KeyError):
1405814058
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
1405914059

14060+
def test_to_panel_expanddim(self):
14061+
14062+
class SubclassedFrame(DataFrame):
14063+
@property
14064+
def _constructor_expanddim(self):
14065+
return SubclassedPanel
14066+
14067+
class SubclassedPanel(Panel):
14068+
pass
14069+
14070+
index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
14071+
df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index)
14072+
result = df.to_panel()
14073+
self.assertTrue(isinstance(result, SubclassedPanel))
14074+
expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
14075+
items=['X', 'Y'], major_axis=[0],
14076+
minor_axis=[0, 1, 2])
14077+
tm.assert_panel_equal(result, expected)
14078+
14079+
1406014080
def skip_if_no_ne(engine='numexpr'):
1406114081
if engine == 'numexpr':
1406214082
try:

pandas/tests/test_series.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -6692,6 +6692,21 @@ def test_searchsorted_sorter(self):
66926692
e = np.array([0, 2])
66936693
tm.assert_array_equal(r, e)
66946694

6695+
def test_to_frame_expanddim(self):
6696+
6697+
class SubclassedSeries(Series):
6698+
@property
6699+
def _constructor_expanddim(self):
6700+
return SubclassedFrame
6701+
6702+
class SubclassedFrame(DataFrame):
6703+
pass
6704+
6705+
s = SubclassedSeries([1, 2, 3], name='X')
6706+
result = s.to_frame()
6707+
self.assertTrue(isinstance(result, SubclassedFrame))
6708+
expected = SubclassedFrame({'X': [1, 2, 3]})
6709+
assert_frame_equal(result, expected)
66956710

66966711

66976712
class TestSeriesNonUnique(tm.TestCase):
@@ -6845,7 +6860,7 @@ def test_repeat(self):
68456860
def test_unique_data_ownership(self):
68466861
# it works! #1807
68476862
Series(Series(["a", "c", "b"]).unique()).sort()
6848-
6863+
68496864
def test_datetime_timedelta_quantiles(self):
68506865
# covers #9694
68516866
self.assertTrue(pd.isnull(Series([],dtype='M8[ns]').quantile(.5)))

0 commit comments

Comments
 (0)