Skip to content

Subclassed reshape #15564

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions doc/source/internals.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel
``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError``
=========================== ======================= =================== =======================

Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties.
The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties:

.. code-block:: python

Expand All @@ -152,6 +152,8 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
def _constructor_sliced(self):
return SubclassedSeries

Overriding constructor properties allows subclass families to be preserved across slice and reshape operations:

.. code-block:: python

>>> s = SubclassedSeries([1, 2, 3])
Expand All @@ -162,7 +164,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
>>> type(to_framed)
<class '__main__.SubclassedDataFrame'>

>>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
>>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
>>> df
A B C
0 1 4 7
Expand Down Expand Up @@ -190,6 +192,21 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
>>> type(sliced2)
<class '__main__.SubclassedSeries'>

>>> stacked = df.stack()
>>> stacked
0 A 1
B 4
C 7
1 A 2
B 5
C 8
2 A 3
B 6
C 9
dtype: int64
>>> type(stacked)
<class '__main__.SubclassedSeries'>

Define Original Properties
~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ Other enhancements
- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
- ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
- ``stack``, ``unstack``, and ``pivot`` operations now preserve subclass family (:issue:`15563`)

.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

Expand Down
32 changes: 23 additions & 9 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,11 @@ def pivot(self, index=None, columns=None, values=None):
index = self.index
else:
index = self[index]
indexed = Series(self[values].values,
index=MultiIndex.from_arrays([index, self[columns]]))

indexed = self._constructor_sliced(
self[values].values,
index=MultiIndex.from_arrays([index, self[columns]]))

return indexed.unstack(columns)


Expand Down Expand Up @@ -448,13 +451,24 @@ def unstack(obj, level, fill_value=None):

if isinstance(obj, DataFrame):
if isinstance(obj.index, MultiIndex):
return _unstack_frame(obj, level, fill_value=fill_value)
unstacked = _unstack_frame(obj, level, fill_value=fill_value)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I think this should be done inside the ``.stack`` and ``_unstack_frame`` functions, no?

unstacked = obj.T.stack(dropna=False)

if len(unstacked.shape) == 1:
return obj._constructor_sliced(unstacked)
else:
return obj.T.stack(dropna=False)
return obj._constructor(unstacked)

else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
fill_value=fill_value)
return unstacker.get_result()
unstacked = unstacker.get_result()

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: this now has logic in two places — the creation of the unstacked result itself, and then the choice of return class. This should be part of ``get_result`` (though it certainly could be another method that is called by ``get_result``).

if len(unstacked.shape) == 1:
return obj._constructor(unstacked)
else:
return obj._constructor_expanddim(unstacked)


def _unstack_frame(obj, level, fill_value=None):
Expand Down Expand Up @@ -553,7 +567,7 @@ def factorize(index):
mask = notnull(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return Series(new_values, index=new_index)
return frame._constructor_sliced(new_values, index=new_index)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this is good



def stack_multiple(frame, level, dropna=True):
Expand Down Expand Up @@ -692,7 +706,7 @@ def _convert_level_number(level_num, columns):
new_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)

result = DataFrame(new_data, index=new_index, columns=new_columns)
result = frame._constructor(new_data, index=new_index, columns=new_columns)

# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
Expand Down Expand Up @@ -852,7 +866,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None,
mdata[col] = np.asanyarray(frame.columns
._get_level_values(i)).repeat(N)

return DataFrame(mdata, columns=mcolumns)
return frame._constructor(mdata, columns=mcolumns)


def lreshape(data, groups, dropna=True, label=None):
Expand Down Expand Up @@ -921,7 +935,7 @@ def lreshape(data, groups, dropna=True, label=None):
if not mask.all():
mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata))

return DataFrame(mdata, columns=id_cols + pivot_cols)
return data._constructor(mdata, columns=id_cols + pivot_cols)


def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
Expand Down
205 changes: 204 additions & 1 deletion pandas/tests/frame/test_subclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import numpy as np

from pandas import DataFrame, Series, MultiIndex, Panel
from pandas import DataFrame, Series, MultiIndex, Panel, Index
import pandas as pd
import pandas.util.testing as tm

Expand Down Expand Up @@ -125,6 +125,209 @@ def test_indexing_sliced(self):
tm.assert_series_equal(res, exp)
tm.assertIsInstance(res, tm.SubclassedSeries)

def test_subclass_stack(self):
    """Stacking a subclassed DataFrame should return a SubclassedSeries."""
    # GH 15564
    df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                index=['a', 'b', 'c'],
                                columns=['X', 'Y', 'Z'])

    res = df.stack()
    # Row labels become the outer level, column labels the inner level.
    exp = tm.SubclassedSeries(
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
        index=[list('aaabbbccc'), list('XYZXYZXYZ')])

    tm.assert_series_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedSeries)

def test_subclass_stack_multi(self):
    """Stacking one level of MultiIndex columns should keep the subclass.

    Covers the default level, an explicit inner level ('yyy') and an
    explicit outer level ('www'); each result must be a
    SubclassedDataFrame.
    """
    # GH 15564
    # zip() is materialised with list() so the tuples are a real sequence
    # even where zip returns a lazy iterator (Python 3).
    df = tm.SubclassedDataFrame([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
        [30, 31, 32, 33],
        [40, 41, 42, 43]],
        index=MultiIndex.from_tuples(
            list(zip(list('AABB'), list('cdcd'))),
            names=['aaa', 'ccc']),
        columns=MultiIndex.from_tuples(
            list(zip(list('WWXX'), list('yzyz'))),
            names=['www', 'yyy']))

    # Expected result of moving the 'yyy' column level into the index;
    # the remaining single column level is expected as a plain Index.
    exp = tm.SubclassedDataFrame([
        [10, 12],
        [11, 13],
        [20, 22],
        [21, 23],
        [30, 32],
        [31, 33],
        [40, 42],
        [41, 43]],
        index=MultiIndex.from_tuples(list(zip(
            list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
            names=['aaa', 'ccc', 'yyy']),
        columns=Index(['W', 'X'], name='www'))

    res = df.stack()
    tm.assert_frame_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedDataFrame)

    res = df.stack('yyy')
    tm.assert_frame_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedDataFrame)

    # Expected result of moving the 'www' column level into the index.
    exp = tm.SubclassedDataFrame([
        [10, 11],
        [12, 13],
        [20, 21],
        [22, 23],
        [30, 31],
        [32, 33],
        [40, 41],
        [42, 43]],
        index=MultiIndex.from_tuples(list(zip(
            list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
            names=['aaa', 'ccc', 'www']),
        columns=Index(['y', 'z'], name='yyy'))

    res = df.stack('www')
    tm.assert_frame_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedDataFrame)

def test_subclass_unstack(self):
    """Unstacking a flat-indexed subclassed DataFrame gives a SubclassedSeries."""
    # GH 15564
    df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                index=['a', 'b', 'c'],
                                columns=['X', 'Y', 'Z'])

    res = df.unstack()
    # Column labels become the outer level, row labels the inner level.
    exp = tm.SubclassedSeries(
        [1, 4, 7, 2, 5, 8, 3, 6, 9],
        index=[list('XXXYYYZZZ'), list('abcabcabc')])

    tm.assert_series_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedSeries)

def test_subclass_unstack_multi(self):
    """Unstacking one level of a MultiIndex index should keep the subclass.

    Covers the default level, an explicit inner level ('ccc') and an
    explicit outer level ('aaa'); each result must be a
    SubclassedDataFrame.
    """
    # GH 15564
    # zip() is materialised with list() so the tuples are a real sequence
    # even where zip returns a lazy iterator (Python 3).
    df = tm.SubclassedDataFrame([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
        [30, 31, 32, 33],
        [40, 41, 42, 43]],
        index=MultiIndex.from_tuples(
            list(zip(list('AABB'), list('cdcd'))),
            names=['aaa', 'ccc']),
        columns=MultiIndex.from_tuples(
            list(zip(list('WWXX'), list('yzyz'))),
            names=['www', 'yyy']))

    # Expected result of moving the 'ccc' index level into the columns;
    # the remaining single index level is expected as a plain Index.
    exp = tm.SubclassedDataFrame([
        [10, 20, 11, 21, 12, 22, 13, 23],
        [30, 40, 31, 41, 32, 42, 33, 43]],
        index=Index(['A', 'B'], name='aaa'),
        columns=MultiIndex.from_tuples(list(zip(
            list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
            names=['www', 'yyy', 'ccc']))

    res = df.unstack()
    tm.assert_frame_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedDataFrame)

    res = df.unstack('ccc')
    tm.assert_frame_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedDataFrame)

    # Expected result of moving the 'aaa' index level into the columns.
    exp = tm.SubclassedDataFrame([
        [10, 30, 11, 31, 12, 32, 13, 33],
        [20, 40, 21, 41, 22, 42, 23, 43]],
        index=Index(['c', 'd'], name='ccc'),
        columns=MultiIndex.from_tuples(list(zip(
            list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
            names=['www', 'yyy', 'aaa']))

    res = df.unstack('aaa')
    tm.assert_frame_equal(res, exp)
    tm.assertIsInstance(res, tm.SubclassedDataFrame)

def test_subclass_pivot(self):
    """Pivoting a subclassed DataFrame should return a SubclassedDataFrame."""
    # GH 15564
    df = tm.SubclassedDataFrame({
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]})

    pivoted = df.pivot(
        index='index', columns='columns', values='values')

    expected = tm.SubclassedDataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}})

    # The expected frame's axes are named after the source columns passed
    # to pivot as index= and columns=.
    expected.index.name, expected.columns.name = 'index', 'columns'

    tm.assert_frame_equal(pivoted, expected)
    tm.assertIsInstance(pivoted, tm.SubclassedDataFrame)

def test_to_panel_expanddim(self):
# GH 9762

Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/series/test_subclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ def test_to_frame(self):
tm.assert_frame_equal(res, exp)
tm.assertIsInstance(res, tm.SubclassedDataFrame)

def test_subclass_unstack(self):
    """Unstacking a subclassed Series should yield a SubclassedDataFrame."""
    # GH 15564
    outer_labels, inner_labels = list('aabb'), list('xyxy')
    ser = tm.SubclassedSeries(
        [1, 2, 3, 4], index=[outer_labels, inner_labels])

    unstacked = ser.unstack()

    # Inner-level labels become the columns; outer-level labels stay as rows.
    expected = tm.SubclassedDataFrame(
        {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b'])

    tm.assert_frame_equal(unstacked, expected)
    tm.assertIsInstance(unstacked, tm.SubclassedDataFrame)


class TestSparseSeriesSubclassing(tm.TestCase):

Expand Down Expand Up @@ -76,6 +88,7 @@ def test_subclass_sparse_addition(self):
tm.assert_sp_series_equal(s1 + s2, exp)

def test_subclass_sparse_to_frame(self):
# GH 15564
s = tm.SubclassedSparseSeries([1, 2], index=list('abcd'), name='xxx')
res = s.to_frame()

Expand Down