From 642bfced686c2d6e088efd0bd562a7a3b61a6eb4 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Tue, 7 Mar 2017 19:08:41 -0800 Subject: [PATCH 01/24] add subclassed stack/unstack/pivot tests --- pandas/tests/frame/test_subclass.py | 143 +++++++++++++++++++++++++++ pandas/tests/series/test_subclass.py | 12 +++ 2 files changed, 155 insertions(+) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 52c591e4dcbb0..0cc4a2d3efbbc 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -126,6 +126,149 @@ def test_indexing_sliced(self): tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) + def test_subclass_stack(self): + # GH 15564 + df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'b', 'c'], + columns=['X', 'Y', 'Z']) + + res = df.stack() + exp = tm.SubclassedSeries( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[list('aaabbbccc'), list('XYZXYZXYZ')]) + + tm.assert_series_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedSeries) + + def test_subclass_stack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12, 13], + [20, 21, 22, 23], + [30, 31, 32, 33], + [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 12], + [11, 13], + [20, 22], + [21, 23], + [30, 32], + [31, 33], + [40, 42], + [41, 43]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), + names=['aaa', 'ccc', 'yyy']), + columns=Index(['W', 'X'], name='www')) + + res = df.stack() + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + res = df.stack('yyy') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + exp = tm.SubclassedDataFrame([ + [10, 
11], + [12, 13], + [20, 21], + [22, 23], + [30, 31], + [32, 33], + [40, 41], + [42, 43]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), + names=['aaa', 'ccc', 'www']), + columns=Index(['y', 'z'], name='yyy')) + + res = df.stack('www') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + def test_subclass_unstack(self): + # GH 15564 + df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'b', 'c'], + columns=['X', 'Y', 'Z']) + + res = df.unstack() + exp = tm.SubclassedSeries( + [1, 4, 7, 2, 5, 8, 3, 6, 9], + index=[list('XXXYYYZZZ'), list('abcabcabc')]) + + tm.assert_series_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedSeries) + + def test_subclass_unstack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12, 13], + [20, 21, 22, 23], + [30, 31, 32, 33], + [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 20, 11, 21, 12, 22, 13, 23], + [30, 40, 31, 41, 32, 42, 33, 43]], + index=Index(['A', 'B'], name='aaa'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), + names=['www', 'yyy', 'ccc'])) + + res = df.unstack() + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + res = df.unstack('ccc') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + exp = tm.SubclassedDataFrame([ + [10, 30, 11, 31, 12, 32, 13, 33], + [20, 40, 21, 41, 22, 42, 23, 43]], + index=Index(['c', 'd'], name='ccc'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), + names=['www', 'yyy', 'aaa'])) + + res = df.unstack('aaa') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, 
tm.SubclassedDataFrame) + + def test_subclass_pivot(self): + # GH 15564 + df = tm.SubclassedDataFrame({ + 'index': ['A', 'B', 'C', 'C', 'B', 'A'], + 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], + 'values': [1., 2., 3., 3., 2., 1.]}) + + pivoted = df.pivot( + index='index', columns='columns', values='values') + + expected = tm.SubclassedDataFrame({ + 'One': {'A': 1., 'B': 2., 'C': 3.}, + 'Two': {'A': 1., 'B': 2., 'C': 3.}}) + + expected.index.name, expected.columns.name = 'index', 'columns' + + tm.assert_frame_equal(pivoted, expected) + tm.assertIsInstance(pivoted, tm.SubclassedDataFrame) + def test_to_panel_expanddim(self): # GH 9762 diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 37c8d7343f7f1..577c8f60e5356 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -32,6 +32,18 @@ def test_to_frame(self): tm.assert_frame_equal(res, exp) assert isinstance(res, tm.SubclassedDataFrame) + def test_subclass_unstack(self): + # GH 15564 + s = tm.SubclassedSeries( + [1, 2, 3, 4], index=[list('aabb'), list('xyxy')]) + + res = s.unstack() + exp = tm.SubclassedDataFrame( + {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b']) + + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + class TestSparseSeriesSubclassing(object): From 02956040dba2c7711c41c8a00e323c4652e08f7d Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Tue, 7 Mar 2017 19:34:20 -0800 Subject: [PATCH 02/24] add melt test --- pandas/tests/frame/test_subclass.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 0cc4a2d3efbbc..712b0f68a7fc8 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -5,7 +5,7 @@ from warnings import catch_warnings import numpy as np -from pandas import DataFrame, Series, MultiIndex, Panel +from pandas import 
DataFrame, Series, MultiIndex, Panel, Index import pandas as pd import pandas.util.testing as tm @@ -269,6 +269,26 @@ def test_subclass_pivot(self): tm.assert_frame_equal(pivoted, expected) tm.assertIsInstance(pivoted, tm.SubclassedDataFrame) + def test_subclassed_melt(self): + # GH 15564 + cheese = tm.SubclassedDataFrame({ + 'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) + + melted = pd.melt(cheese, id_vars=['first', 'last']) + + expected = tm.SubclassedDataFrame([ + ['John', 'Doe', 'height', 5.5], + ['Mary', 'Bo', 'height', 6.0], + ['John', 'Doe', 'weight', 130], + ['Mary', 'Bo', 'weight', 150]], + columns=['first', 'last', 'variable', 'value']) + + tm.assert_frame_equal(melted, expected) + tm.assertIsInstance(melted, tm.SubclassedDataFrame) + def test_to_panel_expanddim(self): # GH 9762 From 60a2cfd95b8337991bc64e895143e203e8d180ff Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Tue, 7 Mar 2017 23:56:20 -0800 Subject: [PATCH 03/24] use _constructor* properties to create Series and DataFrame objects to preserve subclasses --- pandas/core/reshape/reshape.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b0ed6d4c4b84d..a868ec4627f39 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -72,7 +72,8 @@ class _Unstacker(object): """ def __init__(self, values, index, level=-1, value_columns=None, - fill_value=None): + fill_value=None, + constructor=DataFrame): self.is_categorical = None if values.ndim == 1: @@ -83,6 +84,7 @@ def __init__(self, values, index, level=-1, value_columns=None, self.values = values self.value_columns = value_columns self.fill_value = fill_value + self.constructor = constructor if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') @@ -177,7 +179,7 @@ 
def get_result(self): ordered=ordered) for i in range(values.shape[-1])] - return DataFrame(values, index=index, columns=columns) + return self.constructor(values, index=index, columns=columns) def get_new_values(self): values = self.values @@ -373,8 +375,9 @@ def pivot(self, index=None, columns=None, values=None): index = self.index else: index = self[index] - indexed = Series(self[values].values, - index=MultiIndex.from_arrays([index, self[columns]])) + indexed = self._constructor_sliced( + self[values].values, + index=MultiIndex.from_arrays([index, self[columns]])) return indexed.unstack(columns) @@ -455,7 +458,8 @@ def unstack(obj, level, fill_value=None): return obj.T.stack(dropna=False) else: unstacker = _Unstacker(obj.values, obj.index, level=level, - fill_value=fill_value) + fill_value=fill_value, + constructor=obj._constructor_expanddim) return unstacker.get_result() @@ -487,13 +491,14 @@ def _unstack_frame(obj, level, fill_value=None): newb = make_block(new_values.T, placement=new_placement) new_blocks.append(newb) - result = DataFrame(BlockManager(new_blocks, new_axes)) - mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) + result = obj._constructor(BlockManager(new_blocks, new_axes)) + mask_frame = obj._constructor(BlockManager(mask_blocks, new_axes)) return result.loc[:, mask_frame.sum(0) > 0] else: unstacker = _Unstacker(obj.values, obj.index, level=level, value_columns=obj.columns, - fill_value=fill_value) + fill_value=fill_value, + constructor=obj._constructor) return unstacker.get_result() @@ -550,7 +555,7 @@ def factorize(index): mask = notnull(new_values) new_values = new_values[mask] new_index = new_index[mask] - return Series(new_values, index=new_index) + return frame._constructor_sliced(new_values, index=new_index) def stack_multiple(frame, level, dropna=True): @@ -696,7 +701,7 @@ def _convert_level_number(level_num, columns): new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) - 
result = DataFrame(new_data, index=new_index, columns=new_columns) + result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... @@ -770,7 +775,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) - return DataFrame(mdata, columns=mcolumns) + return frame._constructor(mdata, columns=mcolumns) def lreshape(data, groups, dropna=True, label=None): @@ -839,7 +844,7 @@ def lreshape(data, groups, dropna=True, label=None): if not mask.all(): mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) - return DataFrame(mdata, columns=id_cols + pivot_cols) + return data._constructor(mdata, columns=id_cols + pivot_cols) def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): From d65cff5215e08690fee58faff9e33ab5de9ff954 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Wed, 8 Mar 2017 00:52:20 -0800 Subject: [PATCH 04/24] document _Unstacker --- pandas/core/reshape/reshape.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a868ec4627f39..612d3b80b5be2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -40,9 +40,28 @@ class _Unstacker(object): Parameters ---------- + values : ndarray + Values of DataFrame to "Unstack" + + index : object + Pandas ``Index`` or ``MultiIndex`` + level : int or str, default last level Level to "unstack". Accepts a name for the level. + value_columns : object, optional + Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame + + fill_value : scalar, optional + Default value to fill in missing values if subgroups do not have the + same set of labels. By default, missing values will be replaced with + the default fill value for that data type, NaN for float, NaT for + datetimelike, etc. 
For integer types, by default data will be converted to + float and missing values will be set to NaN. + + constructor : object, default DataFrame + ``Series``, ``DataFrame``, or subclass used to create unstacked response + Examples -------- >>> import pandas as pd From 9d1cf63561e5667c074b39d7dfe74ecc9471dcba Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 09:53:53 -0800 Subject: [PATCH 05/24] fix bug in wide_to_long_test, add GH issue numbers --- pandas/tests/frame/test_subclass.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 712b0f68a7fc8..4921f0cc4e863 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -289,6 +289,31 @@ def test_subclassed_melt(self): tm.assert_frame_equal(melted, expected) tm.assertIsInstance(melted, tm.SubclassedDataFrame) + def test_subclassed_wide_to_long(self): + # GH 9762 + + np.random.seed(123) + x = np.random.randn(3) + df = tm.SubclassedDataFrame({ + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: .7}, + "B1980": {0: 3.2, 1: 1.3, 2: .1}, + "X": dict(zip(range(3), x))}) + + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + expected = tm.SubclassedDataFrame(exp_data) + expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") + + tm.assert_frame_equal(long_frame, expected) + tm.assertIsInstance(long_frame, tm.SubclassedDataFrame) + def test_to_panel_expanddim(self): # GH 9762 From 3efb82f4bfe0842b8b9356d902a332db7e71c21b Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:08:10 -0800 Subject: [PATCH 06/24] add whatsnew entry --- 
doc/source/whatsnew/v0.20.0.txt | 1327 ++++++------------------------- 1 file changed, 244 insertions(+), 1083 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a0bf2f9b3758a..f6d5e3df814fc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1,39 +1,23 @@ .. _whatsnew_0200: -v0.20.1 (May 5, 2017) ---------------------- +v0.20.0 (????, 2017) +-------------------- -This is a major release from 0.19.2 and includes a number of API changes, deprecations, new features, +This is a major release from 0.19 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: -- New ``.agg()`` API for Series/DataFrame similar to the groupby-rolling-resample API's, see :ref:`here ` -- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. 
+- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) - The ``.ix`` indexer has been deprecated, see :ref:`here ` -- ``Panel`` has been deprecated, see :ref:`here ` -- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here ` -- Improved user API when grouping by index levels in ``.groupby()``, see :ref:`here ` -- Improved support for ``UInt64`` dtypes, see :ref:`here ` -- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec and that gives the possibility for a more interactive repr in the Jupyter Notebook, see :ref:`here ` -- Experimental support for exporting styled DataFrames (``DataFrame.style``) to Excel, see :ref:`here ` -- Window binary corr/cov operations now return a MultiIndexed ``DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, see :ref:`here ` -- Support for S3 handling now uses ``s3fs``, see :ref:`here ` -- Google BigQuery support now uses the ``pandas-gbq`` library, see :ref:`here ` +- Switched the test framework to `pytest`_ (:issue:`13097`) +- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here ` -.. warning:: - - Pandas has changed the internal structure and layout of the codebase. - This can affect imports that are not from the top-level ``pandas.*`` namespace, please see the changes :ref:`here `. +.. _pytest: http://doc.pytest.org/en/latest/ Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. -.. note:: - - This is a combined release for 0.20.0 and and 0.20.1. - Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) - .. contents:: What's new in v0.20.0 :local: :backlinks: none @@ -43,81 +27,22 @@ Check the :ref:`API Changes ` and :ref:`deprecations New features ~~~~~~~~~~~~ -.. 
_whatsnew_0200.enhancements.agg: - -``agg`` API for DataFrame/Series -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Series & DataFrame have been enhanced to support the aggregation API. This is a familiar API -from groupby, window operations, and resampling. This allows aggregation operations in a concise way -by using :meth:`~DataFrame.agg` and :meth:`~DataFrame.transform`. The full documentation -is :ref:`here ` (:issue:`1623`). - -Here is a sample - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) - df.iloc[3:7] = np.nan - df - -One can operate using string function names, callables, lists, or dictionaries of these. - -Using a single function is equivalent to ``.apply``. - -.. ipython:: python - - df.agg('sum') - -Multiple aggregations with a list of functions. - -.. ipython:: python - - df.agg(['sum', 'min']) - -Using a dict provides the ability to apply specific aggregations per column. -You will get a matrix-like output of all of the aggregators. The output has one column -per unique function. Those functions applied to a particular column will be ``NaN``: - -.. ipython:: python - - df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) - -The API also supports a ``.transform()`` function for broadcasting results. - -.. ipython:: python - :okwarning: - - df.transform(['abs', lambda x: x - x.min()]) - -When presented with mixed dtypes that cannot be aggregated, ``.agg()`` will only take the valid -aggregations. This is similiar to how groupby ``.agg()`` works. (:issue:`15015`) - -.. ipython:: python - - df = pd.DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) - df.dtypes - -.. ipython:: python - - df.agg(['min', 'sum']) .. 
_whatsnew_0200.enhancements.dataio_dtype: ``dtype`` keyword for data IO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``'python'`` engine for :func:`read_csv`, as well as the :func:`read_fwf` function for parsing -fixed-width text files and :func:`read_excel` for parsing Excel files, now accept the ``dtype`` keyword argument for specifying the types of specific columns (:issue:`14295`). See the :ref:`io docs ` for more information. +The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs ` for more information. .. ipython:: python - :suppress: - from pandas.compat import StringIO + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes + +The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing +fixed-width text files, and :func:`read_excel` for parsing Excel files. .. ipython:: python @@ -125,34 +50,12 @@ fixed-width text files and :func:`read_excel` for parsing Excel files, now accep pd.read_fwf(StringIO(data)).dtypes pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes -.. _whatsnew_0120.enhancements.datetime_origin: - -``.to_datetime()`` has gained an ``origin`` parameter -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`to_datetime` has gained a new parameter, ``origin``, to define a reference date -from where to compute the resulting timestamps when parsing numerical values with a specific ``unit`` specified. (:issue:`11276`, :issue:`11745`) - -For example, with 1960-01-01 as the starting date: - -.. ipython:: python - - pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) - -The default is set at ``origin='unix'``, which defaults to ``1970-01-01 00:00:00``, which is -commonly called 'unix epoch' or POSIX time. 
This was the previous default, so this is a backward compatible change. - -.. ipython:: python - - pd.to_datetime([1, 2, 3], unit='D') - - .. _whatsnew_0200.enhancements.groupby_access: Groupby Enhancements ^^^^^^^^^^^^^^^^^^^^ -Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names. Previously, only column names could be referenced. This allows to easily group by a column and index level at the same time. (:issue:`5677`) +Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`) .. ipython:: python @@ -168,7 +71,6 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere df.groupby(['second', 'A']).sum() - .. _whatsnew_0200.enhancements.compressed_urls: Better support for compressed URLs in ``read_csv`` @@ -178,8 +80,8 @@ The compression code was refactored (:issue:`12688`). As a result, reading dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`). Previously, only ``gzip`` compression was supported. By default, compression of -URLs and paths are now inferred using their file extensions. Additionally, -support for bz2 compression in the python 2 C-engine improved (:issue:`14874`). +URLs and paths are now both inferred using their file extensions. Additionally, +support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). .. ipython:: python @@ -197,10 +99,10 @@ support for bz2 compression in the python 2 C-engine improved (:issue:`14874`). Pickle file I/O now supports compression ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` +:func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` can now read from and write to compressed pickle files. 
Compression methods can be an explicit parameter or be inferred from the file extension. -See :ref:`the docs here. ` +See :ref:`the docs here ` .. ipython:: python @@ -215,24 +117,33 @@ Using an explicit compression type df.to_pickle("data.pkl.compress", compression="gzip") rt = pd.read_pickle("data.pkl.compress", compression="gzip") - rt.head() + rt + +Inferring compression type from the extension + +.. ipython:: python + + df.to_pickle("data.pkl.xz", compression="infer") + rt = pd.read_pickle("data.pkl.xz", compression="infer") + rt -The default is to infer the compression type from the extension (``compression='infer'``): +The default is to 'infer .. ipython:: python df.to_pickle("data.pkl.gz") rt = pd.read_pickle("data.pkl.gz") - rt.head() + rt df["A"].to_pickle("s1.pkl.bz2") rt = pd.read_pickle("s1.pkl.bz2") - rt.head() + rt .. ipython:: python :suppress: import os os.remove("data.pkl.compress") + os.remove("data.pkl.xz") os.remove("data.pkl.gz") os.remove("s1.pkl.bz2") @@ -278,7 +189,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr ordered=True)}) df -**Previous Behavior**: +Previous Behavior: .. code-block:: ipython @@ -286,7 +197,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr --------------------------------------------------------------------------- ValueError: items in new_categories are not the same as in old categories -**New Behavior**: +New Behavior: .. ipython:: python @@ -312,7 +223,7 @@ the data. df.to_json(orient='table') -See :ref:`IO: Table Schema for more information `. +See :ref:`IO: Table Schema for more`. Additionally, the repr for ``DataFrame`` and ``Series`` can now publish this JSON Table schema representation of the Series or DataFrame if you are @@ -321,7 +232,7 @@ protocol). This gives frontends like the Jupyter notebook and `nteract`_ more flexiblity in how they display pandas objects, since they have more information about the data. 
-You must enable this by setting the ``display.html.table_schema`` option to ``True``. +You must enable this by setting the ``display.html.table_schema`` option to True. .. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/ .. _nteract: http://nteract.io/ @@ -352,184 +263,113 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you sdf.to_coo() -.. _whatsnew_0200.enhancements.style_excel: - -Excel output for styled DataFrames -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Experimental support has been added to export ``DataFrame.style`` formats to Excel using the ``openpyxl`` engine. (:issue:`15530`) +.. _whatsnew_0200.enhancements.other: -For example, after running the following, ``styled.xlsx`` renders as below: +Other enhancements +^^^^^^^^^^^^^^^^^^ -.. ipython:: python - :okwarning: - - np.random.seed(24) - df = pd.DataFrame({'A': np.linspace(1, 10, 10)}) - df = pd.concat([df, pd.DataFrame(np.random.RandomState(24).randn(10, 4), - columns=list('BCDE'))], - axis=1) - df.iloc[0, 2] = np.nan - df - styled = df.style.\ - applymap(lambda val: 'color: %s' % 'red' if val < 0 else 'black').\ - highlight_max() - styled.to_excel('styled.xlsx', engine='openpyxl') +- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. +- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) -.. image:: _static/style-excel.png -.. ipython:: python - :suppress: +- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - import os - os.remove('styled.xlsx') +- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). 
+- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`). -See the :ref:`Style documentation ` for more detail. +- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) +- ``.isnull()`` and ``.notnull()`` have been added to ``Index`` object to make them more consistent with the ``Series`` API (:issue:`15300`) -.. _whatsnew_0200.enhancements.intervalindex: +- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an + unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack + of sorting or an incorrect key. See :ref:`here ` +- ``MultiIndex`` has gained a ``.to_frame()`` method to convert to a ``DataFrame`` (:issue:`12397`) +- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) +- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) +- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) +- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) +- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`) +- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`) +- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) +- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. 
See the :ref:`Timedelta docs ` (:issue:`15136`) +- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) +- ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) +- The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements +- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) +- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). +- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). +- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) +- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) +- ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) +- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) +- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) +- ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. +- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) +- ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) -IntervalIndex -^^^^^^^^^^^^^ +.. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations -pandas has gained an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. These allow first-class support for interval -notation, specifically as a return type for the categories in :func:`cut` and :func:`qcut`. The ``IntervalIndex`` allows some unique indexing, see the -:ref:`docs `. (:issue:`7640`, :issue:`8625`) -.. warning:: +.. _whatsnew_0200.api_breaking: - These indexing behaviors of the IntervalIndex are provisional and may change in a future version of pandas. Feedback on usage is welcome. +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0200.api_breaking.deprecate_ix: -Previous behavior: +Deprecate .ix +^^^^^^^^^^^^^ -The returned categories were strings, representing Intervals +The ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*, depending on the data type of the index. This has caused quite a bit of user confusion over the years. The full indexing documentation are :ref:`here `. (:issue:`14218`) -.. code-block:: ipython - In [1]: c = pd.cut(range(4), bins=2) +The recommended methods of indexing are: - In [2]: c - Out[2]: - [(-0.003, 1.5], (-0.003, 1.5], (1.5, 3], (1.5, 3]] - Categories (2, object): [(-0.003, 1.5] < (1.5, 3]] +- ``.loc`` if you want to *label* index +- ``.iloc`` if you want to *positionally* index. - In [3]: c.categories - Out[3]: Index(['(-0.003, 1.5]', '(1.5, 3]'], dtype='object') +Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `. -New behavior: .. 
ipython:: python - c = pd.cut(range(4), bins=2) - c - c.categories - -Furthermore, this allows one to bin *other* data with these same bins, with ``NaN`` representing a missing -value similar to other dtypes. - -.. ipython:: python + df = pd.DataFrame({'A': [1, 2, 3], + 'B': [4, 5, 6]}, + index=list('abc')) - pd.cut([0, 3, 5, 1], bins=c.categories) + df -An ``IntervalIndex`` can also be used in ``Series`` and ``DataFrame`` as the index. +Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame({'A': range(4), - 'B': pd.cut([0, 3, 1, 1], bins=c.categories)} - ).set_index('B') - df + In [3]: df.ix[[0, 2], 'A'] + Out[3]: + a 1 + c 3 + Name: A, dtype: int64 -Selecting via a specific interval: +Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing. .. ipython:: python - df.loc[pd.Interval(1.5, 3.0)] + df.loc[df.index[[0, 2]], 'A'] -Selecting via a scalar value that is contained *in* the intervals. +Using ``.iloc``. Here we will get the location of the 'A' column, then use *positional* indexing to select things. .. ipython:: python - df.loc[0] - -.. _whatsnew_0200.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ - -- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`) -- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. 
-- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) -- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) -- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) -- ``DataFrame`` and ``DataFrame.groupby()`` have gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`, :issue:`15197`). -- ``DataFrame`` has gained a ``melt()`` method, equivalent to ``pd.melt()``, for unpivoting from a wide to long format (:issue:`12640`). -- ``pd.read_excel()`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -- Multiple offset aliases with decimal points are now supported (e.g. ``0.5min`` is parsed as ``30s``) (:issue:`8419`) -- ``.isnull()`` and ``.notnull()`` have been added to ``Index`` object to make them more consistent with the ``Series`` API (:issue:`15300`) -- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an - unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack - of sorting or an incorrect key. 
See :ref:`here ` -- ``MultiIndex`` has gained a ``.to_frame()`` method to convert to a ``DataFrame`` (:issue:`12397`) -- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) -- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) -- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) -- The ``usecols`` argument in ``pd.read_csv()`` now accepts a callable function as a value (:issue:`14154`) -- The ``skiprows`` argument in ``pd.read_csv()`` now accepts a callable function as a value (:issue:`10882`) -- The ``nrows`` and ``chunksize`` arguments in ``pd.read_csv()`` are supported if both are passed (:issue:`6774`, :issue:`15755`) -- ``DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) -- ``DataFrame.plot`` can pass the matplotlib 2.0 default color cycle as a single string as color parameter, see `here `__. (:issue:`15516`) -- ``Series.interpolate()`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) -- Addition of a ``level`` keyword to ``DataFrame/Series.rename`` to rename - labels in the specified level of a MultiIndex (:issue:`4160`). -- ``DataFrame.reset_index()`` will now interpret a tuple ``index.name`` as a key spanning across levels of ``columns``, if this is a ``MultiIndex`` (:issue:`16164`) -- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. 
See the :ref:`Timedelta docs ` (:issue:`15136`) -- ``.select_dtypes()`` now allows the string ``datetimetz`` to generically select datetimes with tz (:issue:`14910`) -- The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements -- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) -- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). -- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). -- :func:`pandas.util.hash_pandas_object` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) -- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) -- ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) -- ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`). -- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) -- :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, :ref:`see the example notebook ` (:issue:`15649`) -- :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`) -- Compatibility with Jupyter notebook 5.0; MultiIndex column labels are left-aligned and MultiIndex row-labels are top-aligned (:issue:`15379`) -- ``TimedeltaIndex`` now has a custom date-tick formatter specifically designed for nanosecond level precision (:issue:`8711`) -- ``pd.api.types.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. 
-- ``DataFrame.to_latex()`` and ``DataFrame.to_string()`` now allow optional header aliases. (:issue:`15536`) -- Re-enable the ``parse_dates`` keyword of ``pd.read_excel()`` to parse string columns as dates (:issue:`14326`) -- Added ``.empty`` property to subclasses of ``Index``. (:issue:`15270`) -- Enabled floor division for ``Timedelta`` and ``TimedeltaIndex`` (:issue:`15828`) -- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) -- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) -- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) -- :meth:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`) -- ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`) -- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`) -- The ``display.show_dimensions`` option can now also be used to specify - whether the length of a ``Series`` should be shown in its repr (:issue:`7117`). -- ``parallel_coordinates()`` has gained a ``sort_labels`` keyword argument that sorts class labels and the colors assigned to them (:issue:`15908`) -- Options added to allow one to turn on/off using ``bottleneck`` and ``numexpr``, see :ref:`here ` (:issue:`16157`) -- ``DataFrame.style.bar()`` now accepts two more options to further customize the bar chart. Bar alignment is set with ``align='left'|'mid'|'zero'``, the default is "left", which is backward compatible; You can now pass a list of ``color=[color_negative, color_positive]``. (:issue:`14757`) - - -.. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations - - -.. _whatsnew_0200.api_breaking: + df.iloc[[0, 2], df.columns.get_loc('A')] -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. _whatsnew.api_breaking.io_compat: -Possible incompatibility for HDF5 formats created with pandas < 0.13.0 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Possible incompat for HDF5 formats for pandas < 0.13.0 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``pd.TimeSeries`` was deprecated officially in 0.17.0, though has already been an alias since 0.13.0. It has +``pd.TimeSeries`` was deprecated officially in 0.17.0, though has only been an alias since 0.13.0. It has been dropped in favor of ``pd.Series``. (:issue:`15098`). This *may* cause HDF5 files that were created in prior versions to become unreadable if ``pd.TimeSeries`` @@ -630,115 +470,13 @@ New Behavior: s.map(lambda x: x.hour) - -.. _whatsnew_0200.api_breaking.index_dt_field: - -Accessing datetime fields of Index now return Index -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The datetime-related attributes (see :ref:`here ` -for an overview) of ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex`` previously -returned numpy arrays. They will now return a new ``Index`` object, except -in the case of a boolean field, where the result will still be a boolean ndarray. (:issue:`15022`) - -Previous behaviour: - -.. code-block:: ipython - - In [1]: idx = pd.date_range("2015-01-01", periods=5, freq='10H') - - In [2]: idx.hour - Out[2]: array([ 0, 10, 20, 6, 16], dtype=int32) - -New Behavior: - -.. ipython:: python - - idx = pd.date_range("2015-01-01", periods=5, freq='10H') - idx.hour - -This has the advantage that specific ``Index`` methods are still available on the -result. On the other hand, this might have backward incompatibilities: e.g. -compared to numpy arrays, ``Index`` objects are not mutable. 
To get the original -ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``. - -.. _whatsnew_0200.api_breaking.unique: - -pd.unique will now be consistent with extension types -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In prior versions, using :meth:`Series.unique` and :func:`pandas.unique` on ``Categorical`` and tz-aware -data-types would yield different return types. These are now made consistent. (:issue:`15903`) - -- Datetime tz-aware - - Previous behaviour: - - .. code-block:: ipython - - # Series - In [5]: pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]).unique() - Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) - - In [6]: pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) - Out[6]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') - - # Index - In [7]: pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]).unique() - Out[7]: DatetimeIndex(['2016-01-01 00:00:00-05:00'], dtype='datetime64[ns, US/Eastern]', freq=None) - - In [8]: pd.unique([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]) - Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') - - New Behavior: - - .. ipython:: python - - # Series, returns an array of Timestamp tz-aware - pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]).unique() - pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) - - # Index, returns a DatetimeIndex - pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]).unique() - pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) - -- Categoricals - - Previous behaviour: - - .. 
code-block:: ipython - - In [1]: pd.Series(list('baabc'), dtype='category').unique() - Out[1]: - [b, a, c] - Categories (3, object): [b, a, c] - - In [2]: pd.unique(pd.Series(list('baabc'), dtype='category')) - Out[2]: array(['b', 'a', 'c'], dtype=object) - - New Behavior: - - .. ipython:: python - - # returns a Categorical - pd.Series(list('baabc'), dtype='category').unique() - pd.unique(pd.Series(list('baabc'), dtype='category')) - .. _whatsnew_0200.api_breaking.s3: S3 File Handling ^^^^^^^^^^^^^^^^ pandas now uses `s3fs `_ for handling S3 connections. This shouldn't break -any code. However, since ``s3fs`` is not a required dependency, you will need to install it separately, like ``boto`` +any code. However, since s3fs is not a required dependency, you will need to install it separately, like ``boto`` in prior versions of pandas. (:issue:`11915`). .. _whatsnew_0200.api_breaking.partial_string_indexing: @@ -746,7 +484,7 @@ in prior versions of pandas. (:issue:`11915`). Partial String Indexing Changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:ref:`DatetimeIndex Partial String Indexing ` now works as an exact match, provided that string resolution coincides with index resolution, including a case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match ` for details. +:ref:`DatetimeIndex Partial String Indexing ` now works as exact match, provided that string resolution coincides with index resolution, including a case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match ` for details. .. ipython:: python @@ -778,45 +516,14 @@ New Behavior: In [5]: df['a']['2011-12-31 23:59:59'] Out[5]: 1 -.. _whatsnew_0200.api_breaking.concat_dtypes: - -Concat of different float dtypes will not automatically upcast -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, ``concat`` of multiple objects with different ``float`` dtypes would automatically upcast results to a dtype of ``float64``. 
-Now the smallest acceptable dtype will be used (:issue:`13247`) - -.. ipython:: python - - df1 = pd.DataFrame(np.array([1.0], dtype=np.float32, ndmin=2)) - df1.dtypes - - df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2)) - df2.dtypes - -Previous Behavior: - -.. code-block:: ipython - - In [7]: pd.concat([df1, df2]).dtypes - Out[7]: - 0 float64 - dtype: object - -New Behavior: - -.. ipython:: python - - pd.concat([df1, df2]).dtypes - .. _whatsnew_0200.api_breaking.gbq: Pandas Google BigQuery support has moved ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``conda install pandas-gbq -c conda-forge`` or -``pip install pandas-gbq`` to get it. The functionality of :func:`read_gbq` and :meth:`DataFrame.to_gbq` remain the same with the -currently released version of ``pandas-gbq=0.1.4``. Documentation is now hosted `here `__ (:issue:`15347`) +pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``pip install pandas-gbq`` to get it. +The functionality of :func:`read_gbq` and :meth:`DataFrame.to_gbq` remain the same with the currently released version of ``pandas-gbq=0.1.3``. +Documentation is now hosted `here `__ (:issue:`15347`) .. _whatsnew_0200.api_breaking.memory_usage: @@ -855,73 +562,33 @@ New Behavior: In [11]: index.memory_usage(deep=True) Out[11]: 260 -.. _whatsnew_0200.api_breaking.sort_index: - -DataFrame.sort_index changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort. -This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`, :issue:`15797`) - -This is *unchanged* from prior versions, but shown for illustration purposes: - -.. 
ipython:: python - - df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)])) - df - -.. ipython:: python - - df.index.is_lexsorted() - df.index.is_monotonic - -Sorting works as expected - -.. ipython:: python - - df.sort_index() - -.. ipython:: python - - df.sort_index().index.is_lexsorted() - df.sort_index().index.is_monotonic - -However, this example, which has a non-monotonic 2nd level, -doesn't behave as desired. - -.. ipython:: python - - df = pd.DataFrame( - {'value': [1, 2, 3, 4]}, - index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) - df - -Previous Behavior: +.. _whatsnew_0200.api_breaking.extensions: -.. code-block:: python - - In [11]: df.sort_index() - Out[11]: - value - a bb 1 - aa 2 - b bb 3 - aa 4 - - In [14]: df.sort_index().index.is_lexsorted() - Out[14]: True - - In [15]: df.sort_index().index.is_monotonic - Out[15]: False +Extension Modules Moved +^^^^^^^^^^^^^^^^^^^^^^^ -New Behavior: +Some formerly public c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API. +If indicated, a deprecation warning will be issued if you reference that module. (:issue:`12588`) -.. ipython:: python +.. 
csv-table:: + :header: "Previous Location", "New Location", "Deprecated" + :widths: 30, 30, 4 - df.sort_index() - df.sort_index().index.is_lexsorted() - df.sort_index().index.is_monotonic + "pandas.lib", "pandas._libs.lib", "X" + "pandas.tslib", "pandas._libs.tslib", "X" + "pandas._join", "pandas._libs.join", "" + "pandas._period", "pandas._libs.period", "" + "pandas.msgpack", "pandas.io.msgpack", "" + "pandas.index", "pandas._libs.index", "" + "pandas.algos", "pandas._libs.algos", "" + "pandas.hashtable", "pandas._libs.hashtable", "" + "pandas.json", "pandas.io.json.libjson", "X" + "pandas.parser", "pandas.io.libparsers", "X" + "pandas.io.sas.saslib", "pandas.io.sas.libsas", "" + "pandas._testing", "pandas.util.libtesting", "" + "pandas._sparse", "pandas.sparse.libsparse", "" + "pandas._hash", "pandas.tools.libhash", "" + "pandas._window", "pandas.core.libwindow", "" .. _whatsnew_0200.api_breaking.groupby_describe: @@ -977,79 +644,27 @@ New Behavior: df.groupby('A').agg([np.mean, np.std, np.min, np.max]) -.. _whatsnew_0200.api_breaking.rolling_pairwise: +.. _whatsnew_0200.api_breaking.hdfstore_where: -Window Binary Corr/Cov operations return a MultiIndex DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object, -will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, -see :ref:`here `. These are equivalent in function, -but a MultiIndexed ``DataFrame`` enjoys more support in pandas. -See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`) - -.. ipython:: python - - np.random.seed(1234) - df = pd.DataFrame(np.random.rand(100, 2), - columns=pd.Index(['A', 'B'], name='bar'), - index=pd.date_range('20160101', - periods=100, freq='D', name='foo')) - df.tail() - -Previous Behavior: - -.. 
code-block:: ipython - - In [2]: df.rolling(12).corr() - Out[2]: - - Dimensions: 100 (items) x 2 (major_axis) x 2 (minor_axis) - Items axis: 2016-01-01 00:00:00 to 2016-04-09 00:00:00 - Major_axis axis: A to B - Minor_axis axis: A to B - -New Behavior: - -.. ipython:: python - - res = df.rolling(12).corr() - res.tail() - -Retrieving a correlation matrix for a cross-section - -.. ipython:: python - - df.rolling(12).corr().loc['2016-04-07'] - -.. _whatsnew_0200.api_breaking.hdfstore_where: - -HDFStore where string comparison -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFStore where string comparison +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions most types could be compared to string column in a ``HDFStore`` -usually resulting in an invalid comparison, returning an empty result frame. These comparisons will now raise a +usually resulting in an invalid comparison. These comparisons will now raise a ``TypeError`` (:issue:`15492`) -.. ipython:: python - - df = pd.DataFrame({'unparsed_date': ['2014-01-01', '2014-01-01']}) - df.to_hdf('store.h5', 'key', format='table', data_columns=True) - df.dtypes - -Previous Behavior: +New Behavior: .. code-block:: ipython - In [4]: pd.read_hdf('store.h5', 'key', where='unparsed_date > ts') - File "", line 1 - (unparsed_date > 1970-01-01 00:00:01.388552400) - ^ - SyntaxError: invalid token + In [15]: df = pd.DataFrame({'unparsed_date': ['2014-01-01', '2014-01-01']}) -New Behavior: + In [16]: df.dtypes + Out[16]: + unparsed_date object + dtype: object -.. code-block:: ipython + In [17]: df.to_hdf('store.h5', 'key', format='table', data_columns=True) In [18]: ts = pd.Timestamp('2014-01-01') @@ -1057,100 +672,6 @@ New Behavior: TypeError: Cannot compare 2014-01-01 00:00:00 of type to string column -.. ipython:: python - :suppress: - - import os - os.remove('store.h5') - -..
_whatsnew_0200.api_breaking.index_order: - -Index.intersection and inner join now preserve the order of the left Index -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:meth:`Index.intersection` now preserves the order of the calling ``Index`` (left) -instead of the other ``Index`` (right) (:issue:`15582`). This affects inner -joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method. - -- ``Index.intersection`` - - .. ipython:: python - - left = pd.Index([2, 1, 0]) - left - right = pd.Index([1, 2, 3]) - right - - Previous Behavior: - - .. code-block:: ipython - - In [4]: left.intersection(right) - Out[4]: Int64Index([1, 2], dtype='int64') - - New Behavior: - - .. ipython:: python - - left.intersection(right) - -- ``DataFrame.join`` and ``pd.merge`` - - .. ipython:: python - - left = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - left - right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - right - - Previous Behavior: - - .. code-block:: ipython - - In [4]: left.join(right, how='inner') - Out[4]: - a b - 1 10 100 - 2 20 200 - - New Behavior: - - .. ipython:: python - - left.join(right, how='inner') - -.. _whatsnew_0200.api_breaking.pivot_table: - -Pivot Table always returns a DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The documentation for :meth:`pivot_table` states that a ``DataFrame`` is *always* returned. Here a bug -is fixed that allowed this to return a ``Series`` under certain circumstance. (:issue:`4386`) - -.. ipython:: python - - df = DataFrame({'col1': [3, 4, 5], - 'col2': ['C', 'D', 'E'], - 'col3': [1, 3, 9]}) - df - -Previous Behavior: - -.. code-block:: ipython - - In [2]: df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum) - Out[2]: - col3 col2 - 1 C 3 - 3 D 4 - 9 E 5 - Name: col1, dtype: int64 - -New Behavior: - -.. ipython:: python - - df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum) .. 
_whatsnew_0200.api: @@ -1158,10 +679,9 @@ Other API Changes ^^^^^^^^^^^^^^^^^ - ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`). -- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv()`` and will be removed in the future (:issue:`12665`) +- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) -- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`) - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. 
(:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) @@ -1169,331 +689,19 @@ Other API Changes - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) +- Reorganization of timeseries development tests (:issue:`14854`) - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`) - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) - ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`) -- ``SparseDataFrame.default_fill_value`` will be 0, previously was ``nan`` in the return from ``pd.get_dummies(..., sparse=True)`` (:issue:`15594`) -- The default behaviour of ``Series.str.match`` has changed from extracting - groups to matching the pattern. The extracting behaviour was deprecated - since pandas version 0.13.0 and can be done with the ``Series.str.extract`` - method (:issue:`5224`). As a consequence, the ``as_indexer`` keyword is - ignored (no longer needed to specify the new behaviour) and is deprecated. 
-- ``NaT`` will now correctly report ``False`` for datetimelike boolean operations such as ``is_month_start`` (:issue:`15781`) -- ``NaT`` will now correctly return ``np.nan`` for ``Timedelta`` and ``Period`` accessors such as ``days`` and ``quarter`` (:issue:`15782`) -- ``NaT`` will now returns ``NaT`` for ``tz_localize`` and ``tz_convert`` - methods (:issue:`15830`) -- ``DataFrame`` and ``Panel`` constructors with invalid input will now raise ``ValueError`` rather than ``PandasError``, if called with scalar inputs and not axes (:issue:`15541`) -- ``DataFrame`` and ``Panel`` constructors with invalid input will now raise ``ValueError`` rather than ``pandas.core.common.PandasError``, if called with scalar inputs and not axes; The exception ``PandasError`` is removed as well. (:issue:`15541`) -- The exception ``pandas.core.common.AmbiguousIndexError`` is removed as it is not referenced (:issue:`15541`) - - -.. _whatsnew_0200.privacy: - -Reorganization of the library: Privacy Changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew_0200.privacy.extensions: - -Modules Privacy Has Changed -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Some formerly public python/c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API. -Furthermore, the ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are now considered to be PRIVATE. -If indicated, a deprecation warning will be issued if you reference theses modules. (:issue:`12588`) - -.. 
csv-table:: - :header: "Previous Location", "New Location", "Deprecated" - :widths: 30, 30, 4 - - "pandas.lib", "pandas._libs.lib", "X" - "pandas.tslib", "pandas._libs.tslib", "X" - "pandas.computation", "pandas.core.computation", "X" - "pandas.msgpack", "pandas.io.msgpack", "" - "pandas.index", "pandas._libs.index", "" - "pandas.algos", "pandas._libs.algos", "" - "pandas.hashtable", "pandas._libs.hashtable", "" - "pandas.indexes", "pandas.core.indexes", "" - "pandas.json", "pandas._libs.json / pandas.io.json", "X" - "pandas.parser", "pandas._libs.parsers", "X" - "pandas.formats", "pandas.io.formats", "" - "pandas.sparse", "pandas.core.sparse", "" - "pandas.tools", "pandas.core.reshape", "X" - "pandas.types", "pandas.core.dtypes", "X" - "pandas.io.sas.saslib", "pandas.io.sas._sas", "" - "pandas._join", "pandas._libs.join", "" - "pandas._hash", "pandas._libs.hashing", "" - "pandas._period", "pandas._libs.period", "" - "pandas._sparse", "pandas._libs.sparse", "" - "pandas._testing", "pandas._libs.testing", "" - "pandas._window", "pandas._libs.window", "" - - -Some new subpackages are created with public functionality that is not directly -exposed in the top-level namespace: ``pandas.errors``, ``pandas.plotting`` and -``pandas.testing`` (more details below). Together with ``pandas.api.types`` and -certain functions in the ``pandas.io`` and ``pandas.tseries`` submodules, -these are now the public subpackages. 
- -Further changes: - -- The function :func:`~pandas.api.types.union_categoricals` is now importable from ``pandas.api.types``, formerly from ``pandas.types.concat`` (:issue:`15998`) -- The type import ``pandas.tslib.NaTType`` is deprecated and can be replaced by using ``type(pandas.NaT)`` (:issue:`16146`) -- The public functions in ``pandas.tools.hashing`` deprecated from that locations, but are now importable from ``pandas.util`` (:issue:`16223`) -- The modules in ``pandas.util``: ``decorators``, ``print_versions``, ``doctools``, ``validators``, ``depr_module`` are now private. Only the functions exposed in ``pandas.util`` itself are public (:issue:`16223`) - -.. _whatsnew_0200.privacy.errors: - -``pandas.errors`` -^^^^^^^^^^^^^^^^^ - -We are adding a standard public module for all pandas exceptions & warnings ``pandas.errors``. (:issue:`14800`). Previously -these exceptions & warnings could be imported from ``pandas.core.common`` or ``pandas.io.common``. These exceptions and warnings -will be removed from the ``*.common`` locations in a future release. (:issue:`15541`) - -The following are now part of this API: - -.. code-block:: python - - ['DtypeWarning', - 'EmptyDataError', - 'OutOfBoundsDatetime', - 'ParserError', - 'ParserWarning', - 'PerformanceWarning', - 'UnsortedIndexError', - 'UnsupportedFunctionCall'] - - -.. _whatsnew_0200.privacy.testing: - -``pandas.testing`` -^^^^^^^^^^^^^^^^^^ - -We are adding a standard module that exposes the public testing functions in ``pandas.testing`` (:issue:`9895`). Those functions can be used when writing tests for functionality using pandas objects. - -The following testing functions are now part of this API: - -- :func:`testing.assert_frame_equal` -- :func:`testing.assert_series_equal` -- :func:`testing.assert_index_equal` - - -.. 
_whatsnew_0200.privacy.plotting: - -``pandas.plotting`` -^^^^^^^^^^^^^^^^^^^ - -A new public ``pandas.plotting`` module has been added that holds plotting functionality that was previously in either ``pandas.tools.plotting`` or in the top-level namespace. See the :ref:`deprecations sections ` for more details. - -.. _whatsnew_0200.privacy.development: - -Other Development Changes -^^^^^^^^^^^^^^^^^^^^^^^^^ - -- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) -- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) -- Switched the test framework to use `pytest `__ (:issue:`13097`) -- Reorganization of tests directory layout (:issue:`14854`, :issue:`15707`). - .. _whatsnew_0200.deprecations: Deprecations -~~~~~~~~~~~~ - -.. _whatsnew_0200.api_breaking.deprecate_ix: - -Deprecate ``.ix`` -^^^^^^^^^^^^^^^^^ - -The ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide to index *positionally* OR via *labels*, depending on the data type of the index. This has caused quite a bit of user confusion over the years. The full indexing documentation is :ref:`here `. (:issue:`14218`) - -The recommended methods of indexing are: - -- ``.loc`` if you want to *label* index -- ``.iloc`` if you want to *positionally* index. - -Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `. - - -.. ipython:: python - - df = pd.DataFrame({'A': [1, 2, 3], - 'B': [4, 5, 6]}, - index=list('abc')) - - df - -Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. - -.. code-block:: ipython - - In [3]: df.ix[[0, 2], 'A'] - Out[3]: - a 1 - c 3 - Name: A, dtype: int64 - -Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing. - -.. 
ipython:: python - - df.loc[df.index[[0, 2]], 'A'] - -Using ``.iloc``. Here we will get the location of the 'A' column, then use *positional* indexing to select things. - -.. ipython:: python - - df.iloc[[0, 2], df.columns.get_loc('A')] - - -.. _whatsnew_0200.api_breaking.deprecate_panel: - -Deprecate Panel -^^^^^^^^^^^^^^^ - -``Panel`` is deprecated and will be removed in a future version. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas -provides a :meth:`~Panel.to_xarray` method to automate this conversion. For more details see :ref:`Deprecate Panel ` documentation. (:issue:`13563`). - -.. ipython:: python - :okwarning: - - p = tm.makePanel() - p - -Convert to a MultiIndex DataFrame - -.. ipython:: python - - p.to_frame() - -Convert to an xarray DataArray - -.. ipython:: python - - p.to_xarray() - -.. _whatsnew_0200.api_breaking.deprecate_group_agg_dict: - -Deprecate groupby.agg() with a dictionary when renaming -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``.groupby(..).agg(..)``, ``.rolling(..).agg(..)``, and ``.resample(..).agg(..)`` syntax can accept a variable of inputs, including scalars, -list, and a dict of column names to scalars or lists. This provides a useful syntax for constructing multiple -(potentially different) aggregations. - -However, ``.agg(..)`` can *also* accept a dict that allows 'renaming' of the result columns. This is a complicated and confusing syntax, as well as not consistent -between ``Series`` and ``DataFrame``. We are deprecating this 'renaming' functionaility. - -- We are deprecating passing a dict to a grouped/rolled/resampled ``Series``. This allowed - one to ``rename`` the resulting aggregation, but this had a completely different - meaning than passing a dictionary to a grouped ``DataFrame``, which accepts column-to-aggregations. 
-- We are deprecating passing a dict-of-dicts to a grouped/rolled/resampled ``DataFrame`` in a similar manner. - -This is an illustrative example: - -.. ipython:: python - - df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - 'B': range(5), - 'C': range(5)}) - df - -Here is a typical useful syntax for computing different aggregations for different columns. This -is a natural, and useful syntax. We aggregate from the dict-to-list by taking the specified -columns and applying the list of functions. This returns a ``MultiIndex`` for the columns (this is *not* deprecated). - -.. ipython:: python - - df.groupby('A').agg({'B': 'sum', 'C': 'min'}) - -Here's an example of the first deprecation, passing a dict to a grouped ``Series``. This -is a combination aggregation & renaming: - -.. code-block:: ipython - - In [6]: df.groupby('A').B.agg({'foo': 'count'}) - FutureWarning: using a dict on a Series for aggregation - is deprecated and will be removed in a future version - - Out[6]: - foo - A - 1 3 - 2 2 - -You can accomplish the same operation, more idiomatically by: - -.. ipython:: python - - df.groupby('A').B.agg(['count']).rename(columns={'count': 'foo'}) - - -Here's an example of the second deprecation, passing a dict-of-dict to a grouped ``DataFrame``: - -.. code-block:: python - - In [23]: (df.groupby('A') - .agg({'B': {'foo': 'sum'}, 'C': {'bar': 'min'}}) - ) - FutureWarning: using a dict with renaming is deprecated and - will be removed in a future version - - Out[23]: - B C - foo bar - A - 1 3 0 - 2 7 3 - - -You can accomplish nearly the same by: - -.. ipython:: python - - (df.groupby('A') - .agg({'B': 'sum', 'C': 'min'}) - .rename(columns={'B': 'foo', 'C': 'bar'}) - ) - - - -.. _whatsnew_0200.privacy.deprecate_plotting: - -Deprecate .plotting -^^^^^^^^^^^^^^^^^^^ - -The ``pandas.tools.plotting`` module has been deprecated, in favor of the top level ``pandas.plotting`` module. All the public plotting functions are now available -from ``pandas.plotting`` (:issue:`12548`). 
- -Furthermore, the top-level ``pandas.scatter_matrix`` and ``pandas.plot_params`` are deprecated. -Users can import these from ``pandas.plotting`` as well. - -Previous script: - -.. code-block:: python - - pd.tools.plotting.scatter_matrix(df) - pd.scatter_matrix(df) - -Should be changed to: - -.. code-block:: python - - pd.plotting.scatter_matrix(df) - - - -.. _whatsnew_0200.deprecations.other: - -Other Deprecations -^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^ - ``SparseArray.to_dense()`` has deprecated the ``fill`` parameter, as that parameter was not being respected (:issue:`14647`) - ``SparseSeries.to_dense()`` has deprecated the ``sparse_only`` parameter (:issue:`14647`) - ``Series.repeat()`` has deprecated the ``reps`` parameter in favor of ``repeats`` (:issue:`12662`) -- The ``Series`` constructor and ``.astype`` method have deprecated accepting timestamp dtypes without a frequency (e.g. ``np.datetime64``) for the ``dtype`` parameter (:issue:`15524`) - ``Index.repeat()`` and ``MultiIndex.repeat()`` have deprecated the ``n`` parameter in favor of ``repeats`` (:issue:`12662`) - ``Categorical.searchsorted()`` and ``Series.searchsorted()`` have deprecated the ``v`` parameter in favor of ``value`` (:issue:`12662`) - ``TimedeltaIndex.searchsorted()``, ``DatetimeIndex.searchsorted()``, and ``PeriodIndex.searchsorted()`` have deprecated the ``key`` parameter in favor of ``value`` (:issue:`12662`) @@ -1501,21 +709,17 @@ Other Deprecations - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`) - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explict imports (:issue:`15358`) - ``Series/DataFrame/Panel.consolidate()`` been deprecated as a public method. (:issue:`15483`) -- The ``as_indexer`` keyword of ``Series.str.match()`` has been deprecated (ignored keyword) (:issue:`15257`). 
-- The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`, :issue:`15940`) - +- The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`) * ``pd.pnow()``, replaced by ``Period.now()`` * ``pd.Term``, is removed, as it is not applicable to user code. Instead use in-line string expressions in the where clause when searching in HDFStore * ``pd.Expr``, is removed, as it is not applicable to user code. * ``pd.match()``, is removed. * ``pd.groupby()``, replaced by using the ``.groupby()`` method directly on a ``Series/DataFrame`` - * ``pd.get_store()``, replaced by a direct call to ``pd.HDFStore(...)`` -- ``is_any_int_dtype``, ``is_floating_dtype``, and ``is_sequence`` are deprecated from ``pandas.api.types`` (:issue:`16042`) .. _whatsnew_0200.prior_deprecations: Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The ``pandas.rpy`` module is removed. Similar functionality can be accessed through the `rpy2 `__ project. @@ -1531,11 +735,6 @@ Removal of prior version deprecations/changes in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). 
- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) - The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`) -- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`) -- The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`) -- ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`) -- Where clauses in ``pytables`` are only accepted as strings and expressions types and not other data-types (:issue:`12027`) -- ``DataFrame`` has dropped the ``combineAdd`` and ``combineMult`` methods in favor of ``add`` and ``mul`` respectively (:issue:`10735`) .. _whatsnew_0200.performance: @@ -1543,7 +742,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`) -- Improved performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`, :issue:`16057`) +- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). 
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`, :issue:`15635`) @@ -1552,176 +751,138 @@ Performance Improvements - Improved performance of ``.rank()`` for categorical data (:issue:`15498`) - Improved performance when using ``.unstack()`` (:issue:`15503`) - Improved performance of merge/join on ``category`` columns (:issue:`10409`) -- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`) -- Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied - function used the ``.name`` attribute of the group DataFrame (:issue:`15062`). -- Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`). -- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`) -- Improved performance in ``pd.read_csv()`` on some platforms with buffered reads (:issue:`16039`) + .. _whatsnew_0200.bug_fixes: Bug Fixes ~~~~~~~~~ -Conversion -^^^^^^^^^^ - - Bug in ``Timestamp.replace`` now raises ``TypeError`` when incorrect argument names are given; previously this raised ``ValueError`` (:issue:`15240`) -- Bug in ``Timestamp.replace`` with compat for passing long integers (:issue:`15030`) -- Bug in ``Timestamp`` returning UTC based time/date attributes when a timezone was provided (:issue:`13303`, :issue:`6538`) -- Bug in ``Timestamp`` incorrectly localizing timezones during construction (:issue:`11481`, :issue:`15777`) +- Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``TimedeltaIndex`` raising a ``ValueError`` when boolean indexing with ``loc`` (:issue:`14946`) -- Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`) - Bug in ``DatetimeIndex.round()`` and ``Timestamp.round()`` floating point accuracy when rounding by milliseconds or less 
(:issue:`14440`, :issue:`15578`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) +- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) +- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) +- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`) - Cleaned up ``PeriodIndex`` constructor, including raising on floats more consistently (:issue:`13277`) +- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) +- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) +- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) +- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`) +- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) +- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`) - Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`) -- Bug in ``.replace()`` may result in incorrect dtypes. 
(:issue:`12747`, :issue:`15765`) -- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) -- Bug in ``Series.replace`` which replaced a numeric by string (:issue:`15743`) -- Bug in ``Index`` construction with ``NaN`` elements and integer dtype specified (:issue:`15187`) -- Bug in ``Series`` construction with a datetimetz (:issue:`14928`) -- Bug in ``Series.dt.round()`` inconsistent behaviour on ``NaT`` 's with different arguments (:issue:`14940`) -- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) -- Incorrect dtyped ``Series`` was returned by comparison methods (e.g., ``lt``, ``gt``, ...) against a constant for an empty ``DataFrame`` (:issue:`15077`) -- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) -- Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) +- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`) +- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`) +- Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) +- Bug in ``pd.cut()`` with a single bin on an all 0s array (:issue:`15428`) +- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) + + + +- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) + - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) -- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`) -- Bug in ``DataFrame.fillna()`` with tz-aware datetimes (:issue:`15855`) -- Bug in ``is_string_dtype``, 
``is_timedelta64_ns_dtype``, and ``is_string_like_dtype`` in which an error was raised when ``None`` was passed in (:issue:`15941`) -- Bug in the return type of ``pd.unique`` on a ``Categorical``, which was returning an ndarray and not a ``Categorical`` (:issue:`15903`) -- Bug in ``Index.to_series()`` where the index was not copied (and so mutating later would change the original), (:issue:`15949`) -- Bug in indexing with partial string indexing with a len-1 DataFrame (:issue:`16071`) -- Bug in ``Series`` construction where passing invalid dtype didn't raise an error. (:issue:`15520`) -Indexing -^^^^^^^^ +- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) +- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) + +- Bug in ``DataFrame.isin`` comparing datetimelike to empty frame (:issue:`15473`) -- Bug in ``Index`` power operations with reversed operands (:issue:`14973`) -- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) -- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) -- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`) -- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`) -- Bug in ``Series.asof`` which raised if the series contained all ``np.nan`` (:issue:`15713`) -- Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`) - Bug in ``Series.where()`` and ``DataFrame.where()`` where array-like conditionals were being rejected (:issue:`15414`) -- Bug in ``Series.where()`` where TZ-aware data was converted to float representation (:issue:`15701`) -- Bug in ``.loc`` that would not return the correct dtype for scalar access for a DataFrame (:issue:`11617`) +- Bug in ``Index`` construction with 
``NaN`` elements and integer dtype specified (:issue:`15187`) +- Bug in ``Series`` construction with a datetimetz (:issue:`14928`) - Bug in output formatting of a ``MultiIndex`` when names are integers (:issue:`12223`, :issue:`15262`) -- Bug in ``Categorical.searchsorted()`` where alphabetical instead of the provided categorical order was used (:issue:`14522`) -- Bug in ``Series.iloc`` where a ``Categorical`` object for list-like indexes input was returned, where a ``Series`` was expected. (:issue:`14580`) -- Bug in ``DataFrame.isin`` comparing datetimelike to empty frame (:issue:`15473`) -- Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) -- Bug in ``.reset_index()`` when raising error for index name already present in ``MultiIndex`` columns (:issue:`16120`) -- Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) -- Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) -- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) -- Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) -- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`) -- Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) -- Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`) -I/O -^^^ +- Bug in compat for passing long integers to ``Timestamp.replace`` (:issue:`15030`) +- Bug in ``.loc`` that would not return the correct dtype for scalar access for a DataFrame (:issue:`11617`) +- Bug in ``GroupBy.get_group()`` failing with a categorical 
grouper (:issue:`15155`) +- Bug in ``pandas.tools.utils.cartesian_product()`` with large input can cause overflow on windows (:issue:`15265`) -- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`) -- Bug in ``pd.read_fwf()`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) -- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) -- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) -- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`) -- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) -- Bug in ``pd.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) -- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) -- Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`) -- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`) -- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`) -- Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`) -- Bug in ``pd.read_csv()`` in which the ``skipfooter`` parameter was not being properly validated (:issue:`15925`) -- Bug in ``pd.to_csv()`` in which there was numeric overflow when a timestamp index was being written (:issue:`15982`) -- Bug in ``pd.util.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the 
ordering of categories, instead of just their values. (:issue:`15143`) -- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) -- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`) -- Bug in ``.to_json()`` for the C engine where rollover was not correctly handled for case where frac is odd and diff is exactly 0.5 (:issue:`15716`, :issue:`15864`) -- Bug in ``pd.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) -- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) -- Bug in ``pd.read_msgpack()`` which did not allow loading of a dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) -- Bug in ``pd.read_msgpack()`` when deserializing a ``CategoricalIndex`` (:issue:`15487`) - Bug in ``DataFrame.to_records()`` with converting a ``DatetimeIndex`` with a timezone (:issue:`13937`) -- Bug in ``DataFrame.to_records()`` which failed with unicode characters in column names (:issue:`11879`) -- Bug in ``.to_sql()`` when writing a DataFrame with numeric index names (:issue:`15404`). -- Bug in ``DataFrame.to_html()`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`) -- Bug in ``pd.read_hdf()`` passing a ``Timestamp`` to the ``where`` parameter with a non date column (:issue:`15492`) -- Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) -- Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`) -- Bug in the ``Series`` repr not showing the length when the output was truncated (:issue:`15962`). 
-Plotting -^^^^^^^^ -- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`) -- Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) -- Bug in the date and time converters pandas registers with matplotlib not handling multiple dimensions (:issue:`16026`) -- Bug in ``pd.scatter_matrix()`` could accept either ``color`` or ``c``, but not both (:issue:`14855`) +- Bug in ``.groupby(...).rolling(...)`` when ``on`` is specified and using a ``DatetimeIndex`` (:issue:`15130`) -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`) -- Properly set ``__name__`` and ``__qualname__`` for ``Groupby.*`` functions (:issue:`14620`) -- Bug in ``GroupBy.get_group()`` failing with a categorical grouper (:issue:`15155`) -- Bug in ``.groupby(...).rolling(...)`` when ``on`` is specified and using a ``DatetimeIndex`` (:issue:`15130`, :issue:`13966`) -- Bug in groupby operations with ``timedelta64`` when passing ``numeric_only=False`` (:issue:`5724`) -- Bug in ``groupby.apply()`` coercing ``object`` dtypes to numeric types, when not all values were numeric (:issue:`14423`, :issue:`15421`, :issue:`15670`) -- Bug in ``resample``, where a non-string ``loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`) -- Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) -- Bug in ``groupby().nunique()`` with a datetimelike-grouper where bins counts were incorrect (:issue:`13453`) -- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`, :issue:`11444`) -- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`) +- Bug in ``to_sql`` when writing a DataFrame with numeric index names 
(:issue:`15404`). +- Bug in ``Series.iloc`` where a ``Categorical`` object for list-like indexes input was returned, where a ``Series`` was expected. (:issue:`14580`) +- Bug in repr-formatting a ``SparseDataFrame`` after a value was set on (a copy of) one of its series (:issue:`15488`) +- Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) + + +- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) + + +- Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`) + +- Bug in ``Categorical.searchsorted()`` where alphabetical instead of the provided categorical order was used (:issue:`14522`) + + + +- Bug in ``resample``, where a non-string ``loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`) + + + +- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`) +- Bug in ``.corr()`` and ``.cov()`` where the column and index were the same object (:issue:`14617`) + + +- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) +- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) + +- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) +- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`) +- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`) - Bug in ``.rolling/expanding()`` functions where ``count()`` was not counting ``np.Inf``, nor handling ``object`` dtypes (:issue:`12541`) - Bug in ``.rolling()`` where ``pd.Timedelta`` or ``datetime.timedelta`` was not accepted as a ``window`` argument (:issue:`15440`) -- Bug in ``Rolling.quantile`` function that caused a segmentation fault when called
with a quantile value outside of the range [0, 1] (:issue:`15463`) - Bug in ``DataFrame.resample().median()`` if duplicate column names are present (:issue:`14233`) -Sparse -^^^^^^ +- Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) +- Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) +- Bug in ``groupby().nunique()`` with a datetimelike-grouper where bins counts were incorrect (:issue:`13453`) -- Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) -- Bug in repr-formatting a ``SparseDataFrame`` after a value was set on (a copy of) one of its series (:issue:`15488`) -- Bug in ``SparseDataFrame`` construction with lists not coercing to dtype (:issue:`15682`) -- Bug in sparse array indexing in which indices were not being validated (:issue:`15863`) +- Bug in catching an overflow in ``Timestamp`` + ``Timedelta/Offset`` operations (:issue:`15126`) +- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`) -Reshaping -^^^^^^^^^ -- Bug in ``pd.merge_asof()`` where ``left_index`` or ``right_index`` caused a failure when multiple ``by`` was specified (:issue:`15676`) - Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`) - Bug in ``DataFrame.pivot_table()`` where ``dropna=True`` would not drop all-NaN columns when the columns was a ``category`` dtype (:issue:`15193`) -- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) -- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) -- Bug in ``pd.concat()`` in which concatenating with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) -- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge``
when joining on indexes (:issue:`15582`) -- Bug in ``DataFrame.nsmallest`` and ``DataFrame.nlargest`` where identical values resulted in duplicated rows (:issue:`15297`) -Numeric -^^^^^^^ -- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`) -- Bug in ``.corr()`` and ``.cov()`` where the column and index were the same object (:issue:`14617`) -- Bug in ``.mode()`` where ``mode`` was not returned if was only a single value (:issue:`15714`) -- Bug in ``pd.cut()`` with a single bin on an all 0s array (:issue:`15428`) -- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) -- Bug in ``pandas.tools.utils.cartesian_product()`` with large input can cause overflow on windows (:issue:`15265`) -- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) +- Bug in ``pd.read_hdf()`` passing a ``Timestamp`` to the ``where`` parameter with a non date column (:issue:`15492`) + + +- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) +- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) +- Incorrect dtyped ``Series`` was returned by comparison methods (e.g., ``lt``, ``gt``, ...) 
against a constant for an empty ``DataFrame`` (:issue:`15077`) +- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) +- Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) +- Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) + +- Bug in ``pd.read_msgpack()`` when deserializing a ``CategoricalIndex`` (:issue:`15487`) +- Bug in ``pd.DataFrame.to_records()`` which failed with unicode characters in column names (:issue:`11879`) + -Other -^^^^^ +- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) -- Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`) -- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`) -- Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`) -- Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`) +- Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) +- Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) +- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`) + + + +- Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) +- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`, :issue:`11444`) + +- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 0.2.0``) (:issue:`9351`) +- Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to 
the tick labels on both axes (:issue:`15108`) +- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) +- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) +- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) +- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) From 200c752e9ce8cd0ee97d8e552d0aa059f91b095d Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:08:18 -0800 Subject: [PATCH 07/24] flake8 cleanup --- pandas/core/reshape/reshape.py | 3 ++- pandas/tests/frame/test_subclass.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 612d3b80b5be2..64d2e89f2234f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -60,7 +60,8 @@ class _Unstacker(object): float and missing values will be set to NaN. 
constructor : object, default DataFrame - ``Series``, ``DataFrame``, or subclass used to create unstacked response + ``Series``, ``DataFrame``, or subclass used to create unstacked + response Examples -------- diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 4921f0cc4e863..637f8e2910928 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -272,10 +272,10 @@ def test_subclass_pivot(self): def test_subclassed_melt(self): # GH 15564 cheese = tm.SubclassedDataFrame({ - 'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + 'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) melted = pd.melt(cheese, id_vars=['first', 'last']) @@ -310,7 +310,7 @@ def test_subclassed_wide_to_long(self): expected = tm.SubclassedDataFrame(exp_data) expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") - + tm.assert_frame_equal(long_frame, expected) tm.assertIsInstance(long_frame, tm.SubclassedDataFrame) From 5e480c64f3a73a8c4547581a149694942a3431ef Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:15:09 -0800 Subject: [PATCH 08/24] fix bug in existing docs ``internals.rst:220`` ``{A, [`` --> ``{A: [`` --- doc/source/internals.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 3d96b93de4cc9..a321b4202296f 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -217,7 +217,7 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo .. 
code-block:: python - >>> df = SubclassedDataFrame2({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) >>> df A B C 0 1 4 7 From 4f3319c8a112277128626c9e9b42180feb3e2b54 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:26:35 -0800 Subject: [PATCH 09/24] clarify language and add subclassed reshape and math examples to doc/source/internals.rst --- doc/source/internals.rst | 111 +++++++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 16 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index a321b4202296f..5849ae60cf101 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -110,7 +110,7 @@ This section describes how to subclass ``pandas`` data structures to meet more s Override Constructor Properties ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Each data structure has constructor properties to specifying data constructors. By overriding these properties, you can retain defined-classes through ``pandas`` data manipulations. +Each data structure has constructor properties for specifying data constructors. By overriding these properties, you can retain defined subclass families through ``pandas`` data manipulations. There are 3 constructors to be defined: @@ -118,7 +118,7 @@ There are 3 constructors to be defined: - ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. - ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. -Following table shows how ``pandas`` data structures define constructor properties by default. +The following table shows how ``pandas`` data structures define constructor properties by default.
=========================== ======================= =================== ======================= Property Attributes ``Series`` ``DataFrame`` ``Panel`` @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel ``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` =========================== ======================= =================== ======================= -Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. +The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. .. code-block:: python @@ -152,22 +152,47 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame def _constructor_sliced(self): return SubclassedSeries + @property + def _constructor_expanddim(self): + return SubclassedPanel + + class SubclassedPanel(Panel): + + @property + def _constructor(self): + return SubclassedPanel + + @property + def _constructor_sliced(self): + return SubclassedDataFrame + +Overriding constructor properties allows subclass families to be preserved across slice and reshape operations: + .. code-block:: python - >>> s = SubclassedSeries([1, 2, 3]) - >>> type(s) + >>> ser = SubclassedSeries([1, 2, 3]) + >>> ser + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> type(ser) >>> to_framed = s.to_frame() >>> type(to_framed) - >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({ + ... 'A': ['a', 'a', 'b', 'b'], + ... 'B': ['x', 'y', 'x', 'y'], + ... 
'C': [1, 2, 3, 4]}) >>> df A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 + 0 a x 0 + 1 a y 1 + 2 b x 2 + 3 b y 3 >>> type(df) @@ -175,21 +200,75 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> sliced1 = df[['A', 'B']] >>> sliced1 A B - 0 1 4 - 1 2 5 - 2 3 6 + 0 a x + 1 a y + 2 b x + 3 b y >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df['C'] >>> sliced2 - 0 1 - 1 2 - 2 3 + 0 0 + 1 1 + 2 2 + 3 3 Name: A, dtype: int64 >>> type(sliced2) + >>> stacked = df.stack() + >>> stacked + 0 A a + B x + C 1 + 1 A a + B y + C 2 + 2 A b + B x + C 3 + 3 A b + B y + C 4 + dtype: object + >>> type(stacked) + + + >>> pivoted = df.pivot(index='A', columns='B', values='C') + >>> pivoted + B x y + A + a 1 2 + b 3 4 + >>> type(pivoted) + + +Most data operations also preserve the class: + +.. code-block:: python + + >>> squared = pivoted**2 + >>> squared + B x y + A + a 1 4 + b 9 16 + >>> type(pivoted) + + + >>> interped = ser.loc[[0, 0.5, 1, 1.5, 2]].interpolate() + >>> interped + 0.0 1.0 + 0.5 1.5 + 1.0 2.0 + 1.5 2.5 + 2.0 3.0 + dtype: float64 + >>> type(interped) + + + Define Original Properties ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 8af21c1149007653a9d151c9f3d524e4e4204e4b Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 12:18:59 -0800 Subject: [PATCH 10/24] additional clarification in doc/source/internals.rst --- doc/source/internals.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 5849ae60cf101..79b5e0b9714d1 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel ``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` =========================== ======================= =================== ======================= -The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. 
+The below example shows how to define ``SubclassedSeries``, ``SubclassedDataFrame``, and ``SubclassedPanel`` classes, overriding the default constructor properties. .. code-block:: python From 027f36a3651a665394d7596d2e2b8e7d9d2c0805 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 12:52:25 -0800 Subject: [PATCH 11/24] remove references to Panel from doc/source/internals.rst subclassing examples --- doc/source/internals.rst | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 79b5e0b9714d1..15a7b3fbcef39 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -116,7 +116,7 @@ There are 3 constructors to be defined: - ``_constructor``: Used when a manipulation result has the same dimesions as the original. - ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. +- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``. The following table shows how ``pandas`` data structures define constructor properties by default. @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel ``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` =========================== ======================= =================== ======================= -The below example shows how to define ``SubclassedSeries``, ``SubclassedDataFrame``, and ``SubclassedPanel`` classes, overriding the default constructor properties. +The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` classes, overriding the default constructor properties. .. 
code-block:: python @@ -152,19 +152,6 @@ The below example shows how to define ``SubclassedSeries``, ``SubclassedDataFram def _constructor_sliced(self): return SubclassedSeries - @property - def _constructor_expanddim(self): - return SubclassedPanel - - class SubclassedPanel(Panel): - - @property - def _constructor(self): - return SubclassedPanel - - @property - def _constructor_sliced(self): - return SubclassedDataFrame Overriding constructor properties allows subclass families to be preserved across slice and reshape operations: From 8a61374b5aa6ad402064734f987ac5e75c02d3a5 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sun, 12 Mar 2017 16:27:10 -0700 Subject: [PATCH 12/24] change code-block to ipython directives in doc/source/internals.rst --- doc/source/internals.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 15a7b3fbcef39..a622e4e98eedc 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -130,7 +130,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` classes, overriding the default constructor properties. -.. code-block:: python +.. ipython:: python class SubclassedSeries(Series): @@ -152,10 +152,9 @@ The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataF def _constructor_sliced(self): return SubclassedSeries - Overriding constructor properties allows subclass families to be preserved across slice and reshape operations: -.. code-block:: python +.. ipython:: python >>> ser = SubclassedSeries([1, 2, 3]) >>> ser @@ -266,7 +265,7 @@ To let original data structures have additional properties, you should let ``pan Below is an example to define 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property -.. code-block:: python +.. 
ipython:: python class SubclassedDataFrame2(DataFrame): @@ -281,7 +280,7 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo def _constructor(self): return SubclassedDataFrame2 -.. code-block:: python +.. ipython:: python >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) >>> df From 6715a257e5e95143688bb6e23ec2e9a059aaa00e Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 13 May 2017 14:14:07 -0700 Subject: [PATCH 13/24] change from python to ipython code blocks in docs --- doc/source/internals.rst | 166 +++++++++++++++++++++++---------------- 1 file changed, 97 insertions(+), 69 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index a622e4e98eedc..4375a7d6e0b26 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -9,6 +9,7 @@ np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) import pandas as pd + from pandas import Series, DataFrame pd.options.display.max_rows = 15 ********* @@ -132,79 +133,92 @@ The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataF .. 
ipython:: python - class SubclassedSeries(Series): - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - class SubclassedDataFrame(DataFrame): - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries + In [1]: class SubclassedSeries(Series): + ...: + ...: @property + ...: def _constructor(self): + ...: return SubclassedSeries + ...: + ...: @property + ...: def _constructor_expanddim(self): + ...: return SubclassedDataFrame + ...: + + In [1]: class SubclassedDataFrame(DataFrame): + ...: + ...: @property + ...: def _constructor(self): + ...: return SubclassedDataFrame + ...: + ...: @property + ...: def _constructor_sliced(self): + ...: return SubclassedSeries + ...: Overriding constructor properties allows subclass families to be preserved across slice and reshape operations: .. ipython:: python - >>> ser = SubclassedSeries([1, 2, 3]) - >>> ser + In [1]: ser = SubclassedSeries([1, 2, 3]) + In [1]: ser + Out[1]: 0 1 1 2 2 3 dtype: int64 - >>> type(ser) + In [1]: type(ser) + Out[1]: - >>> to_framed = s.to_frame() - >>> type(to_framed) + In [1]: to_framed = ser.to_frame() + In [1]: type(to_framed) + Out[1]: - >>> df = SubclassedDataFrame({ - ... 'A': ['a', 'a', 'b', 'b'], - ... 'B': ['x', 'y', 'x', 'y'], - ...
'C': [1, 2, 3, 4]}) - >>> df + In [1]: df = SubclassedDataFrame({ + ...: 'A': ['a', 'a', 'b', 'b'], + ...: 'B': ['x', 'y', 'x', 'y'], + ...: 'C': [1, 2, 3, 4]}) + In [1]: df + Out[1]: A B C 0 a x 0 1 a y 1 2 b x 2 3 b y 3 - >>> type(df) + In [1]: type(df) + Out[1]: - >>> sliced1 = df[['A', 'B']] - >>> sliced1 + In [1]: sliced1 = df[['A', 'B']] + In [1]: sliced1 + Out[1]: A B 0 a x 1 a y 2 b x 3 b y - >>> type(sliced1) + In [1]: type(sliced1) + Out[1]: - >>> sliced2 = df['C'] - >>> sliced2 + In [1]: sliced2 = df['C'] + In [1]: sliced2 + Out[1]: 0 0 1 1 2 2 3 3 Name: A, dtype: int64 - >>> type(sliced2) + + In [1]: type(sliced2) + Out[1]: - >>> stacked = df.stack() - >>> stacked + In [1]: stacked = df.stack() + In [1]: stacked + Out[1]: 0 A a B x C 1 1 A a B y C 2 2 A b B x C 3 3 A b B y C 4 dtype: object - >>> type(stacked) + In [1]: type(stacked) + Out[1]: - >>> pivoted = df.pivot(index='A', columns='B', values='C') - >>> pivoted + In [1]: pivoted = df.pivot(index='A', columns='B', values='C') + In [1]: pivoted + Out[1]: B x y A a 1 2 b 3 4 - >>> type(pivoted) + In [1]: type(pivoted) + Out[1]: Most data operations also preserve the class: -.. code-block:: python - >>> squared = pivoted**2 - >>> squared +.. ipython:: python + + In [1]: squared = pivoted**2 + In [1]: squared + Out[1]: B x y A a 1 4 b 9 16 - >>> type(pivoted) + In [1]: type(squared) + Out[1]: - >>> interped = ser.loc[[0, 0.5, 1, 1.5, 2]].interpolate() - >>> interped + In [1]: interped = ser.loc[[0, 0.5, 1, 1.5, 2]].interpolate() + In [1]: interped + Out[1]: 0.0 1.0 0.5 1.5 1.0 2.0 1.5 2.5 2.0 3.0 dtype: float64 - >>> type(interped) + In [1]: type(interped) + Out[1]: @@ -267,40 +289,46 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo ..
ipython:: python - class SubclassedDataFrame2(DataFrame): - - # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] - _internal_names_set = set(_internal_names) - - # normal properties - _metadata = ['added_property'] - - @property - def _constructor(self): - return SubclassedDataFrame2 + In [1]: class SubclassedDataFrame2(DataFrame): + ...: + ...: # temporary properties + ...: _internal_names = DataFrame._internal_names + ['internal_cache'] + ...: _internal_names_set = set(_internal_names) + ...: + ...: # normal properties + ...: _metadata = ['added_property'] + ...: + ...: @property + ...: def _constructor(self): + ...: return SubclassedDataFrame2 .. ipython:: python - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df + In [1]: df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + In [1]: df + Out[1]: A B C 0 1 4 7 1 2 5 8 2 3 6 9 - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' + In [1]: df.internal_cache = 'cached' + In [1]: df.added_property = 'property' + Out[1]: - >>> df.internal_cache + In [1]: df.internal_cache + Out[1]: cached - >>> df.added_property + In [1]: df.added_property + Out[1]: property # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache + In [1]: df[['A', 'B']].internal_cache + Out[1]: AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property + In [1]: df[['A', 'B']].added_property + Out[1]: property From f751a856c6d7131e987626522cafff135398ffaa Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 13 May 2017 14:14:30 -0700 Subject: [PATCH 14/24] reformat docstrings --- pandas/core/reshape/reshape.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 
64d2e89f2234f..64cd0fa4548cc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -42,23 +42,18 @@ class _Unstacker(object): ---------- values : ndarray Values of DataFrame to "Unstack" - index : object - Pandas ``Index`` or ``MultiIndex`` - + Pandas ``Index`` level : int or str, default last level Level to "unstack". Accepts a name for the level. - - value_columns : object, optional + value_columns : Index, optional Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame - fill_value : scalar, optional Default value to fill in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with the default fill value for that data type, NaN for float, NaT for datetimelike, etc. For integer types, by default data will converted to float and missing values will be set to NaN. - constructor : object, default DataFrame ``Series``, ``DataFrame``, or subclass used to create unstacked response From ca85796ad7eec2572d8ab069b9ecb282c9e92439 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Tue, 7 Mar 2017 19:08:41 -0800 Subject: [PATCH 15/24] add subclassed stack/unstack/pivot tests --- pandas/tests/frame/test_subclass.py | 143 +++++++++++++++++++++++++++ pandas/tests/series/test_subclass.py | 12 +++ 2 files changed, 155 insertions(+) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 52c591e4dcbb0..0cc4a2d3efbbc 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -126,6 +126,149 @@ def test_indexing_sliced(self): tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) + def test_subclass_stack(self): + # GH 15564 + df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'b', 'c'], + columns=['X', 'Y', 'Z']) + + res = df.stack() + exp = tm.SubclassedSeries( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[list('aaabbbccc'), list('XYZXYZXYZ')]) + + 
tm.assert_series_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedSeries) + + def test_subclass_stack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12, 13], + [20, 21, 22, 23], + [30, 31, 32, 33], + [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 12], + [11, 13], + [20, 22], + [21, 23], + [30, 32], + [31, 33], + [40, 42], + [41, 43]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), + names=['aaa', 'ccc', 'yyy']), + columns=Index(['W', 'X'], name='www')) + + res = df.stack() + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + res = df.stack('yyy') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + exp = tm.SubclassedDataFrame([ + [10, 11], + [12, 13], + [20, 21], + [22, 23], + [30, 31], + [32, 33], + [40, 41], + [42, 43]], + index=MultiIndex.from_tuples(list(zip( + list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), + names=['aaa', 'ccc', 'www']), + columns=Index(['y', 'z'], name='yyy')) + + res = df.stack('www') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + def test_subclass_unstack(self): + # GH 15564 + df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'b', 'c'], + columns=['X', 'Y', 'Z']) + + res = df.unstack() + exp = tm.SubclassedSeries( + [1, 4, 7, 2, 5, 8, 3, 6, 9], + index=[list('XXXYYYZZZ'), list('abcabcabc')]) + + tm.assert_series_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedSeries) + + def test_subclass_unstack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame([ + [10, 11, 12, 13], + [20, 21, 22, 23], + [30, 31, 32, 33], + [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list('AABB'), 
list('cdcd'))), + names=['aaa', 'ccc']), + columns=MultiIndex.from_tuples( + list(zip(list('WWXX'), list('yzyz'))), + names=['www', 'yyy'])) + + exp = tm.SubclassedDataFrame([ + [10, 20, 11, 21, 12, 22, 13, 23], + [30, 40, 31, 41, 32, 42, 33, 43]], + index=Index(['A', 'B'], name='aaa'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), + names=['www', 'yyy', 'ccc'])) + + res = df.unstack() + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + res = df.unstack('ccc') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + exp = tm.SubclassedDataFrame([ + [10, 30, 11, 31, 12, 32, 13, 33], + [20, 40, 21, 41, 22, 42, 23, 43]], + index=Index(['c', 'd'], name='ccc'), + columns=MultiIndex.from_tuples(list(zip( + list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), + names=['www', 'yyy', 'aaa'])) + + res = df.unstack('aaa') + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + + def test_subclass_pivot(self): + # GH 15564 + df = tm.SubclassedDataFrame({ + 'index': ['A', 'B', 'C', 'C', 'B', 'A'], + 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], + 'values': [1., 2., 3., 3., 2., 1.]}) + + pivoted = df.pivot( + index='index', columns='columns', values='values') + + expected = tm.SubclassedDataFrame({ + 'One': {'A': 1., 'B': 2., 'C': 3.}, + 'Two': {'A': 1., 'B': 2., 'C': 3.}}) + + expected.index.name, expected.columns.name = 'index', 'columns' + + tm.assert_frame_equal(pivoted, expected) + tm.assertIsInstance(pivoted, tm.SubclassedDataFrame) + def test_to_panel_expanddim(self): # GH 9762 diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 37c8d7343f7f1..577c8f60e5356 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -32,6 +32,18 @@ def test_to_frame(self): tm.assert_frame_equal(res, exp) assert isinstance(res, 
tm.SubclassedDataFrame) + def test_subclass_unstack(self): + # GH 15564 + s = tm.SubclassedSeries( + [1, 2, 3, 4], index=[list('aabb'), list('xyxy')]) + + res = s.unstack() + exp = tm.SubclassedDataFrame( + {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b']) + + tm.assert_frame_equal(res, exp) + tm.assertIsInstance(res, tm.SubclassedDataFrame) + class TestSparseSeriesSubclassing(object): From b0bc8f4fc1b78745dcd768af31f1a358ae5f06b2 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Tue, 7 Mar 2017 19:34:20 -0800 Subject: [PATCH 16/24] add melt test --- pandas/tests/frame/test_subclass.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 0cc4a2d3efbbc..712b0f68a7fc8 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -5,7 +5,7 @@ from warnings import catch_warnings import numpy as np -from pandas import DataFrame, Series, MultiIndex, Panel +from pandas import DataFrame, Series, MultiIndex, Panel, Index import pandas as pd import pandas.util.testing as tm @@ -269,6 +269,26 @@ def test_subclass_pivot(self): tm.assert_frame_equal(pivoted, expected) tm.assertIsInstance(pivoted, tm.SubclassedDataFrame) + def test_subclassed_melt(self): + # GH 15564 + cheese = tm.SubclassedDataFrame({ + 'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) + + melted = pd.melt(cheese, id_vars=['first', 'last']) + + expected = tm.SubclassedDataFrame([ + ['John', 'Doe', 'height', 5.5], + ['Mary', 'Bo', 'height', 6.0], + ['John', 'Doe', 'weight', 130], + ['Mary', 'Bo', 'weight', 150]], + columns=['first', 'last', 'variable', 'value']) + + tm.assert_frame_equal(melted, expected) + tm.assertIsInstance(melted, tm.SubclassedDataFrame) + def test_to_panel_expanddim(self): # GH 9762 From 246a464973764d02914ff29bf9188ef09acba1af Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Tue, 7 Mar 
2017 23:56:20 -0800 Subject: [PATCH 17/24] use _constructor* properties to create Series and DataFrame objects to preserve subclasses --- pandas/core/reshape/reshape.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b0ed6d4c4b84d..a868ec4627f39 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -72,7 +72,8 @@ class _Unstacker(object): """ def __init__(self, values, index, level=-1, value_columns=None, - fill_value=None): + fill_value=None, + constructor=DataFrame): self.is_categorical = None if values.ndim == 1: @@ -83,6 +84,7 @@ def __init__(self, values, index, level=-1, value_columns=None, self.values = values self.value_columns = value_columns self.fill_value = fill_value + self.constructor = constructor if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') @@ -177,7 +179,7 @@ def get_result(self): ordered=ordered) for i in range(values.shape[-1])] - return DataFrame(values, index=index, columns=columns) + return self.constructor(values, index=index, columns=columns) def get_new_values(self): values = self.values @@ -373,8 +375,9 @@ def pivot(self, index=None, columns=None, values=None): index = self.index else: index = self[index] - indexed = Series(self[values].values, - index=MultiIndex.from_arrays([index, self[columns]])) + indexed = self._constructor_sliced( + self[values].values, + index=MultiIndex.from_arrays([index, self[columns]])) return indexed.unstack(columns) @@ -455,7 +458,8 @@ def unstack(obj, level, fill_value=None): return obj.T.stack(dropna=False) else: unstacker = _Unstacker(obj.values, obj.index, level=level, - fill_value=fill_value) + fill_value=fill_value, + constructor=obj._constructor_expanddim) return unstacker.get_result() @@ -487,13 +491,14 @@ def _unstack_frame(obj, level, fill_value=None): newb =
make_block(new_values.T, placement=new_placement) new_blocks.append(newb) - result = DataFrame(BlockManager(new_blocks, new_axes)) - mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) + result = obj._constructor(BlockManager(new_blocks, new_axes)) + mask_frame = obj._constructor(BlockManager(mask_blocks, new_axes)) return result.loc[:, mask_frame.sum(0) > 0] else: unstacker = _Unstacker(obj.values, obj.index, level=level, value_columns=obj.columns, - fill_value=fill_value) + fill_value=fill_value, + constructor=obj._constructor) return unstacker.get_result() @@ -550,7 +555,7 @@ def factorize(index): mask = notnull(new_values) new_values = new_values[mask] new_index = new_index[mask] - return Series(new_values, index=new_index) + return frame._constructor_sliced(new_values, index=new_index) def stack_multiple(frame, level, dropna=True): @@ -696,7 +701,7 @@ def _convert_level_number(level_num, columns): new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) - result = DataFrame(new_data, index=new_index, columns=new_columns) + result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... 
@@ -770,7 +775,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) - return DataFrame(mdata, columns=mcolumns) + return frame._constructor(mdata, columns=mcolumns) def lreshape(data, groups, dropna=True, label=None): @@ -839,7 +844,7 @@ def lreshape(data, groups, dropna=True, label=None): if not mask.all(): mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) - return DataFrame(mdata, columns=id_cols + pivot_cols) + return data._constructor(mdata, columns=id_cols + pivot_cols) def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): From 1c672a9d3f7f896ef2fa877ac33cd6c391b52a3b Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Wed, 8 Mar 2017 00:52:20 -0800 Subject: [PATCH 18/24] document _Unstacker --- pandas/core/reshape/reshape.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a868ec4627f39..612d3b80b5be2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -40,9 +40,28 @@ class _Unstacker(object): Parameters ---------- + values : ndarray + Values of DataFrame to "Unstack" + + index : object + Pandas ``Index`` or ``MultiIndex`` + level : int or str, default last level Level to "unstack". Accepts a name for the level. + value_columns : object, optional + Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame + + fill_value : scalar, optional + Default value to fill in missing values if subgroups do not have the + same set of labels. By default, missing values will be replaced with + the default fill value for that data type, NaN for float, NaT for + datetimelike, etc. For integer types, by default data will be converted to + float and missing values will be set to NaN.
+ + constructor : object, default DataFrame + ``Series``, ``DataFrame``, or subclass used to create unstacked response + Examples -------- >>> import pandas as pd From eff151e74acc32b54beffc5f4600ca20c3e9673b Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 09:53:53 -0800 Subject: [PATCH 19/24] fix bug in wide_to_long_test, add GH issue numbers --- pandas/tests/frame/test_subclass.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 712b0f68a7fc8..4921f0cc4e863 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -289,6 +289,31 @@ def test_subclassed_melt(self): tm.assert_frame_equal(melted, expected) tm.assertIsInstance(melted, tm.SubclassedDataFrame) + def test_subclassed_wide_to_long(self): + # GH 9762 + + np.random.seed(123) + x = np.random.randn(3) + df = tm.SubclassedDataFrame({ + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: .7}, + "B1980": {0: 3.2, 1: 1.3, 2: .1}, + "X": dict(zip(range(3), x))}) + + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + expected = tm.SubclassedDataFrame(exp_data) + expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") + + tm.assert_frame_equal(long_frame, expected) + tm.assertIsInstance(long_frame, tm.SubclassedDataFrame) + def test_to_panel_expanddim(self): # GH 9762 From 16dae8e0415413b4d0e0581adba08dc5ef512f53 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:08:18 -0800 Subject: [PATCH 20/24] flake8 cleanup --- pandas/core/reshape/reshape.py | 3 ++- pandas/tests/frame/test_subclass.py | 10 +++++----- 2 files changed, 7 
insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 612d3b80b5be2..64d2e89f2234f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -60,7 +60,8 @@ class _Unstacker(object): float and missing values will be set to NaN. constructor : object, default DataFrame - ``Series``, ``DataFrame``, or subclass used to create unstacked response + ``Series``, ``DataFrame``, or subclass used to create unstacked + response Examples -------- diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 4921f0cc4e863..637f8e2910928 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -272,10 +272,10 @@ def test_subclass_pivot(self): def test_subclassed_melt(self): # GH 15564 cheese = tm.SubclassedDataFrame({ - 'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + 'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) melted = pd.melt(cheese, id_vars=['first', 'last']) @@ -310,7 +310,7 @@ def test_subclassed_wide_to_long(self): expected = tm.SubclassedDataFrame(exp_data) expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") - + tm.assert_frame_equal(long_frame, expected) tm.assertIsInstance(long_frame, tm.SubclassedDataFrame) From 66b2e42a232efda7a27fa8f344164d64e706fb4e Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:15:09 -0800 Subject: [PATCH 21/24] fix bug in existing docs ``internals.rst:220`` ``{A, [`` --> ``{A: [`` --- doc/source/internals.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 3d96b93de4cc9..a321b4202296f 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -217,7 +217,7 @@ Below is an example to define 2 original 
properties, "internal_cache" as a tempo .. code-block:: python - >>> df = SubclassedDataFrame2({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) >>> df A B C 0 1 4 7 From 764181283c5a7eaf45d4b1c16742f54265b53073 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 10:26:35 -0800 Subject: [PATCH 22/24] clarify language and add subclassed reshape and math examples to doc/source/internals.rst --- doc/source/internals.rst | 111 +++++++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 16 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index a321b4202296f..5849ae60cf101 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -110,7 +110,7 @@ This section describes how to subclass ``pandas`` data structures to meet more s Override Constructor Properties ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Each data structure has constructor properties to specifying data constructors. By overriding these properties, you can retain defined-classes through ``pandas`` data manipulations. +Each data structure has constructor properties for specifying data constructors. By overriding these properties, you can retain defined subclass families through ``pandas`` data manipulations. There are 3 constructors to be defined: @@ -118,7 +118,7 @@ There are 3 constructors to be defined: - ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. - ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. -Following table shows how ``pandas`` data structures define constructor properties by default. +The following table shows how ``pandas`` data structures define constructor properties by default.
=========================== ======================= =================== ======================= Property Attributes ``Series`` ``DataFrame`` ``Panel`` @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel ``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` =========================== ======================= =================== ======================= -Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. +The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. .. code-block:: python @@ -152,22 +152,47 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame def _constructor_sliced(self): return SubclassedSeries + @property + def _constructor_expanddim(self): + return SubclassedPanel + + class SubclassedPanel(Panel): + + @property + def _constructor(self): + return SubclassedPanel + + @property + def _constructor_sliced(self): + return SubclassedDataFrame + +Overriding constructor properties allows subclass families to be preserved across slice and reshape operations: + .. code-block:: python - >>> s = SubclassedSeries([1, 2, 3]) - >>> type(s) + >>> ser = SubclassedSeries([1, 2, 3]) + >>> ser + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> type(ser) >>> to_framed = s.to_frame() >>> type(to_framed) - >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({ + ... 'A': ['a', 'a', 'b', 'b'], + ... 'B': ['x', 'y', 'x', 'y'], + ... 
'C': [1, 2, 3, 4]}) >>> df A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 + 0 a x 1 + 1 a y 2 + 2 b x 3 + 3 b y 4 >>> type(df) @@ -175,21 +200,75 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> sliced1 = df[['A', 'B']] >>> sliced1 A B - 0 1 4 - 1 2 5 - 2 3 6 + 0 a x + 1 a y + 2 b x + 3 b y >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df['C'] >>> sliced2 - 0 1 - 1 2 - 2 3 + 0 1 + 1 2 + 2 3 + 3 4 Name: A, dtype: int64 >>> type(sliced2) + >>> stacked = df.stack() + >>> stacked + 0 A a + B x + C 1 + 1 A a + B y + C 2 + 2 A b + B x + C 3 + 3 A b + B y + C 4 + dtype: object + >>> type(stacked) + + + >>> pivoted = df.pivot(index='A', columns='B', values='C') + >>> pivoted + B x y + A + a 1 2 + b 3 4 + >>> type(pivoted) + + +Most data operations also preserve the class: + +.. code-block:: python + + >>> squared = pivoted**2 + >>> squared + B x y + A + a 1 4 + b 9 16 + >>> type(squared) + + + >>> interped = ser.loc[[0, 0.5, 1, 1.5, 2]].interpolate() + >>> interped + 0.0 1.0 + 0.5 1.5 + 1.0 2.0 + 1.5 2.5 + 2.0 3.0 + dtype: float64 + >>> type(interped) + + + Define Original Properties ~~~~~~~~~~~~~~~~~~~~~~~~~~ From be66ce07f15d06c54a7f9f0d64529bf5e9db9615 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 12:18:59 -0800 Subject: [PATCH 23/24] additional clarification in doc/source/internals.rst --- doc/source/internals.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 5849ae60cf101..79b5e0b9714d1 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel ``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` =========================== ======================= =================== ======================= -The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties.
+The below example shows how to define ``SubclassedSeries``, ``SubclassedDataFrame``, and ``SubclassedPanel`` classes, overriding the default constructor properties. .. code-block:: python From d27034d5ba3ac6f309177d6456425005310bca91 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Sat, 11 Mar 2017 12:52:25 -0800 Subject: [PATCH 24/24] remove references to Panel from doc/source/internals.rst subclassing examples --- doc/source/internals.rst | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 79b5e0b9714d1..15a7b3fbcef39 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -116,7 +116,7 @@ There are 3 constructors to be defined: - ``_constructor``: Used when a manipulation result has the same dimesions as the original. - ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. +- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``. The following table shows how ``pandas`` data structures define constructor properties by default. @@ -128,7 +128,7 @@ Property Attributes ``Series`` ``DataFrame`` ``Panel ``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` =========================== ======================= =================== ======================= -The below example shows how to define ``SubclassedSeries``, ``SubclassedDataFrame``, and ``SubclassedPanel`` classes, overriding the default constructor properties. +The below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` classes, overriding the default constructor properties. .. 
code-block:: python @@ -152,19 +152,6 @@ The below example shows how to define ``SubclassedSeries``, ``SubclassedDataFram def _constructor_sliced(self): return SubclassedSeries - @property - def _constructor_expanddim(self): - return SubclassedPanel - - class SubclassedPanel(Panel): - - @property - def _constructor(self): - return SubclassedPanel - - @property - def _constructor_sliced(self): - return SubclassedDataFrame Overriding constructor properties allows subclass families to be preserved across slice and reshape operations: