diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index dc305f36f32ec..ec106ff2b2f61 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -449,6 +449,8 @@ Reshaping - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) +- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) +- Numeric ^^^^^^^ diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index b648c426a877f..28e9694681912 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -80,8 +80,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) - from pandas import DataFrame - return DataFrame(mdata, columns=mcolumns) + return frame._constructor(mdata, columns=mcolumns) def lreshape(data, groups, dropna=True, label=None): @@ -152,8 +151,7 @@ def lreshape(data, groups, dropna=True, label=None): if not mask.all(): mdata = {k: v[mask] for k, v in compat.iteritems(mdata)} - from pandas import DataFrame - return DataFrame(mdata, columns=id_cols + pivot_cols) + return data._constructor(mdata, columns=id_cols + pivot_cols) def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d6aed064e49f8..7a34044f70c34 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -37,8 +37,23 @@ class _Unstacker(object): Parameters ---------- + values : ndarray + Values of DataFrame to "Unstack" + index : object + Pandas ``Index`` 
level : int or str, default last level Level to "unstack". Accepts a name for the level. + value_columns : Index, optional + Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame + fill_value : scalar, optional + Default value to fill in missing values if subgroups do not have the + same set of labels. By default, missing values will be replaced with + the default fill value for that data type, NaN for float, NaT for + datetimelike, etc. For integer types, by default data will be converted to + float and missing values will be set to NaN. + constructor : object + Pandas ``DataFrame`` or subclass used to create unstacked + response. If None, DataFrame or SparseDataFrame will be used. Examples -------- @@ -69,7 +84,7 @@ class _Unstacker(object): """ def __init__(self, values, index, level=-1, value_columns=None, - fill_value=None): + fill_value=None, constructor=None): self.is_categorical = None self.is_sparse = is_sparse(values) @@ -86,6 +101,14 @@ def __init__(self, values, index, level=-1, value_columns=None, self.value_columns = value_columns self.fill_value = fill_value + if constructor is None: + if self.is_sparse: + self.constructor = SparseDataFrame + else: + self.constructor = DataFrame + else: + self.constructor = constructor + if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') @@ -173,8 +196,7 @@ def get_result(self): ordered=ordered) for i in range(values.shape[-1])] - klass = SparseDataFrame if self.is_sparse else DataFrame - return klass(values, index=index, columns=columns) + return self.constructor(values, index=index, columns=columns) def get_new_values(self): values = self.values @@ -374,8 +396,9 @@ def pivot(self, index=None, columns=None, values=None): index = self.index else: index = self[index] - indexed = Series(self[values].values, - index=MultiIndex.from_arrays([index, self[columns]])) + indexed = self._constructor_sliced( + self[values].values, + 
index=MultiIndex.from_arrays([index, self[columns]])) return indexed.unstack(columns) @@ -461,7 +484,8 @@ def unstack(obj, level, fill_value=None): return obj.T.stack(dropna=False) else: unstacker = _Unstacker(obj.values, obj.index, level=level, - fill_value=fill_value) + fill_value=fill_value, + constructor=obj._constructor_expanddim) return unstacker.get_result() @@ -470,12 +494,12 @@ def _unstack_frame(obj, level, fill_value=None): unstacker = partial(_Unstacker, index=obj.index, level=level, fill_value=fill_value) blocks = obj._data.unstack(unstacker) - klass = type(obj) - return klass(blocks) + return obj._constructor(blocks) else: unstacker = _Unstacker(obj.values, obj.index, level=level, value_columns=obj.columns, - fill_value=fill_value) + fill_value=fill_value, + constructor=obj._constructor) return unstacker.get_result() @@ -528,8 +552,7 @@ def factorize(index): new_values = new_values[mask] new_index = new_index[mask] - klass = type(frame)._constructor_sliced - return klass(new_values, index=new_index) + return frame._constructor_sliced(new_values, index=new_index) def stack_multiple(frame, level, dropna=True): @@ -676,7 +699,7 @@ def _convert_level_number(level_num, columns): new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) - result = DataFrame(new_data, index=new_index, columns=new_columns) + result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... 
diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 52c591e4dcbb0..c52b512c2930a 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -5,7 +5,7 @@ from warnings import catch_warnings import numpy as np -from pandas import DataFrame, Series, MultiIndex, Panel +from pandas import DataFrame, Series, MultiIndex, Panel, Index import pandas as pd import pandas.util.testing as tm @@ -247,3 +247,270 @@ def test_subclass_sparse_transpose(self): [2, 5], [3, 6]]) tm.assert_sp_frame_equal(ossdf.T, essdf) + + def test_subclass_stack(self): + # GH 15564 + df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'b', 'c'], + columns=['X', 'Y', 'Z']) + + res = df.stack() + exp = tm.SubclassedSeries( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[list('aaabbbccc'), list('XYZXYZXYZ')]) + + tm.assert_series_equal(res, exp) + + def test_subclass_stack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12, 13], + [20, 21, 22, 23], + [30, 31, 32, 33], + [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 12], + [11, 13], + [20, 22], + [21, 23], + [30, 32], + [31, 33], + [40, 42], + [41, 43]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), + names=['aaa', 'ccc', 'yyy']), + columns=Index(['W', 'X'], name='www')) + + res = df.stack() + tm.assert_frame_equal(res, exp) + + res = df.stack('yyy') + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame([ + [10, 11], + [12, 13], + [20, 21], + [22, 23], + [30, 31], + [32, 33], + [40, 41], + [42, 43]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), + names=['aaa', 'ccc', 'www']), + columns=Index(['y', 'z'], name='yyy')) 
+ + res = df.stack('www') + tm.assert_frame_equal(res, exp) + + def test_subclass_stack_multi_mixed(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 12.0], + [11, 13.0], + [20, 22.0], + [21, 23.0], + [30, 32.0], + [31, 33.0], + [40, 42.0], + [41, 43.0]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), + names=['aaa', 'ccc', 'yyy']), + columns=Index(['W', 'X'], name='www')) + + res = df.stack() + tm.assert_frame_equal(res, exp) + + res = df.stack('yyy') + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame([ + [10.0, 11.0], + [12.0, 13.0], + [20.0, 21.0], + [22.0, 23.0], + [30.0, 31.0], + [32.0, 33.0], + [40.0, 41.0], + [42.0, 43.0]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), + names=['aaa', 'ccc', 'www']), + columns=Index(['y', 'z'], name='yyy')) + + res = df.stack('www') + tm.assert_frame_equal(res, exp) + + def test_subclass_unstack(self): + # GH 15564 + df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'b', 'c'], + columns=['X', 'Y', 'Z']) + + res = df.unstack() + exp = tm.SubclassedSeries( + [1, 4, 7, 2, 5, 8, 3, 6, 9], + index=[list('XXXYYYZZZ'), list('abcabcabc')]) + + tm.assert_series_equal(res, exp) + + def test_subclass_unstack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12, 13], + [20, 21, 22, 23], + [30, 31, 32, 33], + [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = 
tm.SubclassedDataFrame([ + [10, 20, 11, 21, 12, 22, 13, 23], + [30, 40, 31, 41, 32, 42, 33, 43]], + index=Index(['A', 'B'], name='aaa'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), + names=['www', 'yyy', 'ccc'])) + + res = df.unstack() + tm.assert_frame_equal(res, exp) + + res = df.unstack('ccc') + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame([ + [10, 30, 11, 31, 12, 32, 13, 33], + [20, 40, 21, 41, 22, 42, 23, 43]], + index=Index(['c', 'd'], name='ccc'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), + names=['www', 'yyy', 'aaa'])) + + res = df.unstack('aaa') + tm.assert_frame_equal(res, exp) + + def test_subclass_unstack_multi_mixed(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0], + [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]], + index=Index(['A', 'B'], name='aaa'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), + names=['www', 'yyy', 'ccc'])) + + res = df.unstack() + tm.assert_frame_equal(res, exp) + + res = df.unstack('ccc') + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame([ + [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0], + [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]], + index=Index(['c', 'd'], name='ccc'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), + names=['www', 'yyy', 'aaa'])) + + res = df.unstack('aaa') + tm.assert_frame_equal(res, exp) + + def test_subclass_pivot(self): + # GH 15564 + df = tm.SubclassedDataFrame({ + 'index': ['A', 'B', 'C', 
'C', 'B', 'A'], + 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], + 'values': [1., 2., 3., 3., 2., 1.]}) + + pivoted = df.pivot( + index='index', columns='columns', values='values') + + expected = tm.SubclassedDataFrame({ + 'One': {'A': 1., 'B': 2., 'C': 3.}, + 'Two': {'A': 1., 'B': 2., 'C': 3.}}) + + expected.index.name, expected.columns.name = 'index', 'columns' + + tm.assert_frame_equal(pivoted, expected) + + def test_subclassed_melt(self): + # GH 15564 + cheese = tm.SubclassedDataFrame({ + 'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) + + melted = pd.melt(cheese, id_vars=['first', 'last']) + + expected = tm.SubclassedDataFrame([ + ['John', 'Doe', 'height', 5.5], + ['Mary', 'Bo', 'height', 6.0], + ['John', 'Doe', 'weight', 130], + ['Mary', 'Bo', 'weight', 150]], + columns=['first', 'last', 'variable', 'value']) + + tm.assert_frame_equal(melted, expected) + + def test_subclassed_wide_to_long(self): + # GH 9762 + + np.random.seed(123) + x = np.random.randn(3) + df = tm.SubclassedDataFrame({ + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: .7}, + "B1980": {0: 3.2, 1: 1.3, 2: .1}, + "X": dict(zip(range(3), x))}) + + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2]} + expected = tm.SubclassedDataFrame(exp_data) + expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") + + tm.assert_frame_equal(long_frame, expected) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 37c8d7343f7f1..60afaa3b821e1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -13,24 +13,31 @@ def test_indexing_sliced(self): res = s.loc[['a', 'b']] exp = 
tm.SubclassedSeries([1, 2], index=list('ab')) tm.assert_series_equal(res, exp) - assert isinstance(res, tm.SubclassedSeries) res = s.iloc[[2, 3]] exp = tm.SubclassedSeries([3, 4], index=list('cd')) tm.assert_series_equal(res, exp) - assert isinstance(res, tm.SubclassedSeries) res = s.loc[['a', 'b']] exp = tm.SubclassedSeries([1, 2], index=list('ab')) tm.assert_series_equal(res, exp) - assert isinstance(res, tm.SubclassedSeries) def test_to_frame(self): s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'), name='xxx') res = s.to_frame() exp = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd')) tm.assert_frame_equal(res, exp) - assert isinstance(res, tm.SubclassedDataFrame) + + def test_subclass_unstack(self): + # GH 15564 + s = tm.SubclassedSeries( + [1, 2, 3, 4], index=[list('aabb'), list('xyxy')]) + + res = s.unstack() + exp = tm.SubclassedDataFrame( + {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b']) + + tm.assert_frame_equal(res, exp) class TestSparseSeriesSubclassing(object):