From 2dbf9f2392d6eab6b36f5485abefad31758e23df Mon Sep 17 00:00:00 2001 From: daminisatya Date: Thu, 7 Jun 2018 11:21:55 -0700 Subject: [PATCH 01/12] Fix #21356: JSON nested_to_record Silently Drops Top-Level None Values --- pandas/io/json/normalize.py | 2 -- pandas/tests/io/json/test_normalize.py | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 17393d458e746..b845a43b9ca9e 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -80,8 +80,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v - elif v is None: # pop the key if the value is None - new_d.pop(k) continue else: v = new_d.pop(k) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index dc34ba81f679d..c37c4da9bbdd6 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -238,15 +238,16 @@ def test_non_ascii_key(self): tm.assert_frame_equal(result, expected) def test_missing_field(self, author_missing_data): - # GH20030: Checks for robustness of json_normalize - should - # unnest records where only the first record has a None value + # GH20030: Checks for robustness of json_normalize result = json_normalize(author_missing_data) ex_data = [ - {'author_name.first': np.nan, + {'info': np.nan, + 'author_name.first': np.nan, 'author_name.last_name': np.nan, 'info.created_at': np.nan, 'info.last_updated': np.nan}, - {'author_name.first': 'Jane', + {'info': None, + 'author_name.first': 'Jane', 'author_name.last_name': 'Doe', 'info.created_at': '11/08/1993', 'info.last_updated': '26/05/2012'} @@ -351,9 +352,8 @@ def test_json_normalize_errors(self): errors='raise' ) - def test_nonetype_dropping(self): - # GH20030: Checks that None values are dropped in nested_to_record - # to prevent additional columns of nans when passed to DataFrame + def test_nonetype(self): + # GH21356 data = [ {'info': None, 'author_name': @@ -367,7 +367,8 @@ def test_nonetype_dropping(self): ] result = nested_to_record(data) expected = [ - {'author_name.first': 'Smith', + {'info': None, + 'author_name.first': 'Smith', 'author_name.last_name': 'Appleseed'}, {'author_name.first': 'Jane', 'author_name.last_name': 'Doe', @@ -395,6 +396,7 @@ def test_nonetype_top_level_bottom_level(self): } result = nested_to_record(data) expected = { + 'id': None, 'location.country.state.id': None, 'location.country.state.town.info.id': None, 'location.country.state.town.info.region': None, @@ -423,6 +425,7 @@ def test_nonetype_multiple_levels(self): } result = nested_to_record(data) expected = { + 'id': None, 'location.id': None, 'location.country.id': None, 'location.country.state.id': None, From 2ac19b5cd3187f40bc7e93999faad428093589a6 Mon Sep 17 00:00:00 2001 From: daminisatya Date: Thu, 7 Jun 2018 12:13:55 -0700 Subject: [PATCH 02/12] Added bugfix #21356 to whatsnew doc under 0.24v --- doc/source/whatsnew/v0.24.0.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 54eab36a8a571..2c34d7e905c7e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -80,6 +80,8 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +- The top level None value is not dropped but rather preserved along with lower levels for consistency (:issue:`21356`) + Categorical ^^^^^^^^^^^ From 
0de0cd438768772ce35c2566de76f198a3b93ee5 Mon Sep 17 00:00:00 2001 From: daminisatya Date: Fri, 8 Jun 2018 09:03:15 -0700 Subject: [PATCH 03/12] changes requested --- pandas/tests/io/json/test_normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index c37c4da9bbdd6..aac60e9327787 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -238,7 +238,7 @@ def test_non_ascii_key(self): tm.assert_frame_equal(result, expected) def test_missing_field(self, author_missing_data): - # GH20030: Checks for robustness of json_normalize + # GH20030: result = json_normalize(author_missing_data) ex_data = [ {'info': np.nan, From 832cc565a3f362f884b841a3e7997184caf18aee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 22:42:11 +0200 Subject: [PATCH 04/12] DOC: move whatsnew file for #21116 (index droplevel) (#21367) --- doc/source/whatsnew/v0.23.1.txt | 2 -- doc/source/whatsnew/v0.24.0.txt | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index e29cb0a5a2626..048a429136f0c 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -15,8 +15,6 @@ and bug fixes. We recommend that all users upgrade to this version. New features ~~~~~~~~~~~~ -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`) - .. _whatsnew_0231.deprecations: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 2c34d7e905c7e..71d7fbc7438d9 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -15,7 +15,8 @@ Other Enhancements - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) -- +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`) + .. _whatsnew_0240.api_breaking: From a1858c22f8e9d47edb4cdeaa1a3ac53de6a492c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 23:20:16 +0200 Subject: [PATCH 05/12] DOC: clean-up 0.23.1 whatsnew (#21368) --- doc/source/whatsnew/v0.23.1.txt | 67 +++++++-------------------------- 1 file changed, 14 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 048a429136f0c..09b711c80910c 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -10,19 +10,22 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none -.. _whatsnew_0231.enhancements: -New features -~~~~~~~~~~~~ +.. _whatsnew_0231.fixed_regressions: +Fixed Regressions +~~~~~~~~~~~~~~~~~ -.. _whatsnew_0231.deprecations: - -Deprecations -~~~~~~~~~~~~ +- Fixed regression in the :attr:`DatetimeIndex.date` and :attr:`DatetimeIndex.time` + attributes in case of timezone-aware data: :attr:`DatetimeIndex.time` returned + a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` + returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). 
+- Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values + in nested levels in JSON (:issue:`21158`). +- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) +- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) -- -- .. _whatsnew_0231.performance: @@ -31,14 +34,7 @@ Performance Improvements - Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) - Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`) -- -- - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ -- -- .. _whatsnew_0231.bug_fixes: @@ -46,74 +42,39 @@ Bug Fixes ~~~~~~~~~ Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` -Strings -^^^^^^^ +Data-type specific - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - -Timedelta -^^^^^^^^^ - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - -Categorical -^^^^^^^^^^^ - -- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) -- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) Sparse -^^^^^^ - Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`) -Conversion -^^^^^^^^^^ - -- -- - Indexing -^^^^^^^^ - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) -- Bug in :attr:`DatetimeIndex.date` where an incorrect date is returned when the input date has a non-UTC timezone (:issue:`21230`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment 
in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) -- Bug in :attr:`DatetimeIndex.time` where given a tz-aware Timestamp, a tz-aware Time is returned instead of tz-naive (:issue:`21267`) -- I/O -^^^ - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) -- Bug when :meth:`pandas.io.json.json_normalize` was called with ``None`` values in nested levels in JSON (:issue:`21158`) -- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) -- - -Plotting -^^^^^^^^ - -- -- Reshaping -^^^^^^^^^ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) -- Other -^^^^^ - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) -- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) From e64acdfb6a7b7721c7ea24c74a4411971bd32409 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 16:21:08 -0500 Subject: [PATCH 06/12] BUG: Fixed concat warning message (#21362) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/indexes/api.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 09b711c80910c..ead4fac14182d 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -74,6 +74,7 @@ I/O Reshaping - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) +- Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) Other diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f9501cd2f9ddf..6f4fdfe5bf5cd 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -24,9 +24,9 @@ Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. -To accept the future behavior, pass 'sort=True'. +To accept the future behavior, pass 'sort=False'. -To retain the current behavior and silence the warning, pass sort=False +To retain the current behavior and silence the warning, pass 'sort=True'. """) From 2fa78182d596840ea672906bf9f13887d04910ed Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 23:25:37 +0200 Subject: [PATCH 07/12] Revert "enable multivalues insert (#19664)" (#21355) This reverts commit 7c7bd569ce8e0f117c618d068e3d2798134dbc73. --- doc/source/io.rst | 6 ------ doc/source/whatsnew/v0.23.1.txt | 4 ++++ pandas/io/sql.py | 28 +++------------------------- pandas/tests/io/test_sql.py | 26 -------------------------- 4 files changed, 7 insertions(+), 57 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 7bd56d52b3492..32129147ee281 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4719,12 +4719,6 @@ writes ``data`` to the database in batches of 1000 rows at a time: data.to_sql('data_chunked', engine, chunksize=1000) -.. 
note:: - - The function :func:`~pandas.DataFrame.to_sql` will perform a multi-value - insert if the engine dialect ``supports_multivalues_insert``. This will - greatly speed up the insert in some cases. - SQL data types ++++++++++++++ diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index ead4fac14182d..2b64ef32c1eb6 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -16,6 +16,10 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ + +- Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue + inserts as this caused regression in certain cases (:issue:`21103`). + In the future this will be made configurable. - Fixed regression in the :attr:`DatetimeIndex.date` and :attr:`DatetimeIndex.time` attributes in case of timezone-aware data: :attr:`DatetimeIndex.time` returned a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ccb8d2d99d734..a582d32741ae9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -572,29 +572,8 @@ def create(self): else: self._execute_create() - def insert_statement(self, data, conn): - """ - Generate tuple of SQLAlchemy insert statement and any arguments - to be executed by connection (via `_execute_insert`). - - Parameters - ---------- - conn : SQLAlchemy connectable(engine/connection) - Connection to recieve the data - data : list of dict - The data to be inserted - - Returns - ------- - SQLAlchemy statement - insert statement - *, optional - Additional parameters to be passed when executing insert statement - """ - dialect = getattr(conn, 'dialect', None) - if dialect and getattr(dialect, 'supports_multivalues_insert', False): - return self.table.insert(data), - return self.table.insert(), data + def insert_statement(self): + return self.table.insert() def insert_data(self): if self.index is not None: @@ -633,9 +612,8 @@ def insert_data(self): return column_names, data_list def _execute_insert(self, conn, keys, data_iter): - """Insert data into this table with database connection""" data = [{k: v for k, v in zip(keys, row)} for row in data_iter] - conn.execute(*self.insert_statement(data, conn)) + conn.execute(self.insert_statement(), data) def insert(self, chunksize=None): keys, data_list = self.insert_data() diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4530cc9d2fba9..f3ab74d37a2bc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1665,29 +1665,6 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) - def test_insert_multivalues(self): - # issues addressed - # https://github.com/pandas-dev/pandas/issues/14315 - # https://github.com/pandas-dev/pandas/issues/8953 - - db = sql.SQLDatabase(self.conn) - df = DataFrame({'A': [1, 0, 0], 'B': [1.1, 0.2, 4.3]}) - table = sql.SQLTable("test_table", db, frame=df) - data = [ - {'A': 1, 'B': 0.46}, - {'A': 0, 'B': -2.06} - ] - statement = table.insert_statement(data, conn=self.conn)[0] - - if self.supports_multivalues_insert: - assert statement.parameters == data, ( - 'insert statement should be multivalues' - ) - else: - assert statement.parameters is None, ( - 'insert statement should not be multivalues' - ) - class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): @@ -1702,7 +1679,6 @@ class _TestSQLiteAlchemy(object): """ flavor = 'sqlite' - supports_multivalues_insert = True @classmethod def connect(cls): 
@@ -1751,7 +1727,6 @@ class _TestMySQLAlchemy(object): """ flavor = 'mysql' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1821,7 +1796,6 @@ class _TestPostgreSQLAlchemy(object): """ flavor = 'postgresql' - supports_multivalues_insert = True @classmethod def connect(cls): From f3f7eb9f26ecadd13b8ff0c982f761f4c0a94eed Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 7 Jun 2018 18:05:57 -0400 Subject: [PATCH 08/12] BUG: dropna incorrect with categoricals in pivot_table (#21252) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ pandas/core/reshape/pivot.py | 20 ++++++++++++++++++-- pandas/tests/reshape/test_pivot.py | 26 +++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 2b64ef32c1eb6..97a5975dad9a6 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -29,6 +29,8 @@ Fixed Regressions - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing + values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) .. _whatsnew_0231.performance: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e02420323704e..9a2ad5d13d77a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,8 +1,10 @@ # pylint: disable=E1103 -from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_list_like, is_scalar, is_integer_dtype) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -79,8 +81,22 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys, observed=dropna) + # group by the cartesian product of the grouper + # if we have a categorical + grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how='all') + + # gh-21133 + # we want to down cast if + # the original values are ints + # as we grouped with a NaN value + # and then dropped, coercing to floats + for v in [v for v in values if v in data and v in agged]: + if (is_integer_dtype(data[v]) and + not is_integer_dtype(agged[v])): + agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d2cf3fc11e165..3ec60d50f2792 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import datetime, date, timedelta @@ -16,6 +17,11 @@ from pandas.api.types import CategoricalDtype as CDT +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + class TestPivotTable(object): def setup_method(self, method): @@ -109,7 +115,6 @@ def test_pivot_table_categorical(self): 
index=exp_index) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -137,6 +142,25 @@ def test_pivot_table_dropna_categoricals(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_with_non_observable_dropna(self, dropna): + # gh-21133 + df = pd.DataFrame( + {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], + categories=['low', 'high'], + ordered=True), + 'B': range(5)}) + + result = df.pivot_table(index='A', values='B', dropna=dropna) + expected = pd.DataFrame( + {'B': [2, 3]}, + index=pd.Index( + pd.Categorical.from_codes([0, 1], + categories=['low', 'high'], + ordered=True), + name='A')) + + tm.assert_frame_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) From 7471f3eac5bc3097c68c7faa1660e1ce8914d399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren?= Date: Fri, 8 Jun 2018 05:27:29 -0600 Subject: [PATCH 09/12] Sharey keyword for boxplot (#20968) --- doc/source/whatsnew/v0.23.1.txt | 13 +++++++ pandas/plotting/_core.py | 12 +++++- pandas/tests/plotting/test_frame.py | 59 +++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 97a5975dad9a6..5a1bcce9b5970 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -48,22 +48,26 @@ Bug Fixes ~~~~~~~~~ Groupby/Resample/Rolling +~~~~~~~~~~~~~~~~~~~~~~~~ - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` Data-type specific +~~~~~~~~~~~~~~~~~~ - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) Sparse +~~~~~~ - Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`) Indexing +~~~~~~~~ - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) @@ -71,17 +75,26 @@ Indexing - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) +Plotting +~~~~~~~~ + +- New keywords (sharex, sharey) to turn 
on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue: `20968`) + I/O +~~~ - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) + Reshaping +~~~~~~~~~ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) Other +~~~~~ - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index c555991ab01c0..8c713548d1ede 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2548,7 +2548,7 @@ def plot_group(group, ax): def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, rot=0, grid=True, ax=None, figsize=None, - layout=None, **kwds): + layout=None, sharex=False, sharey=True, **kwds): """ Make box plots from DataFrameGroupBy data. @@ -2567,6 +2567,14 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, figsize : A tuple (width, height) in inches layout : tuple (optional) (rows, columns) for the layout of the plot + sharex : bool, default False + Whether x-axes will be shared among subplots + + .. versionadded:: 0.23.1 + sharey : bool, default True + Whether y-axes will be shared among subplots + + .. 
versionadded:: 0.23.1 `**kwds` : Keyword Arguments All other plotting keyword arguments to be passed to matplotlib's boxplot function @@ -2598,7 +2606,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, if subplots is True: naxes = len(grouped) fig, axes = _subplots(naxes=naxes, squeeze=False, - ax=ax, sharex=False, sharey=True, + ax=ax, sharex=sharex, sharey=sharey, figsize=figsize, layout=layout) axes = _flatten(axes) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ac02f5f4e4283..101713b06df8c 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -40,6 +40,14 @@ def setup_method(self, method): "C": np.arange(20) + np.random.uniform( size=20)}) + def _assert_ytickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_yticklabels(), visible=exp) + + def _assert_xtickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_xticklabels(), visible=exp) + @pytest.mark.slow def test_plot(self): df = self.tdf @@ -367,6 +375,57 @@ def test_subplots(self): for ax in axes: assert ax.get_legend() is None + def test_groupby_boxplot_sharey(self): + # https://github.com/pandas-dev/pandas/issues/20968 + # sharey can now be switched check whether the right + # pair of axes is turned on or off + + df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14], + 'b': [0.56, 0.84, 0.29, 0.56, 0.85], + 'c': [0, 1, 2, 3, 1]}, + index=[0, 1, 2, 3, 4]) + + # behavior without keyword + axes = df.groupby('c').boxplot() + expected = [True, False, True, False] + self._assert_ytickslabels_visibility(axes, expected) + + # set sharey=True should be identical + axes = df.groupby('c').boxplot(sharey=True) + expected = [True, False, True, False] + self._assert_ytickslabels_visibility(axes, expected) + + # sharey=False, all yticklabels should be visible + axes = df.groupby('c').boxplot(sharey=False) + expected = [True, True, True, True] + self._assert_ytickslabels_visibility(axes, expected) + + def test_groupby_boxplot_sharex(self): + # https://github.com/pandas-dev/pandas/issues/20968 + # sharex can now be switched check whether the right + # pair of axes is turned on or off + + df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14], + 'b': [0.56, 0.84, 0.29, 0.56, 0.85], + 'c': [0, 1, 2, 3, 1]}, + index=[0, 1, 2, 3, 4]) + + # behavior without keyword + axes = df.groupby('c').boxplot() + expected = [True, True, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + # set sharex=False should be identical + axes = df.groupby('c').boxplot(sharex=False) + expected = [True, True, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + # sharex=True, yticklabels should be visible + # only for bottom plots + axes = df.groupby('c').boxplot(sharex=True) + expected = [False, False, True, True] + self._assert_xtickslabels_visibility(axes, expected) + @pytest.mark.slow def test_subplots_timeseries(self): idx = date_range(start='2014-07-01', freq='M', periods=10) From 1027a1c01374881b0edf2e78a25cbf3b218db94f Mon Sep 17 00:00:00 2001 From: "Dr. 
Irv" Date: Fri, 8 Jun 2018 07:34:33 -0400 Subject: [PATCH 10/12] BUG: Series.combine() fails with ExtensionArray inside of Series (#21183) --- doc/source/whatsnew/v0.24.0.txt | 9 +++++ pandas/core/series.py | 30 +++++++++++++--- pandas/tests/extension/base/methods.py | 34 +++++++++++++++++++ .../extension/category/test_categorical.py | 26 ++++++++++++++ pandas/tests/extension/conftest.py | 9 +++++ pandas/tests/extension/decimal/array.py | 4 ++- .../tests/extension/decimal/test_decimal.py | 8 +++++ pandas/tests/extension/json/test_json.py | 8 +++++ pandas/tests/series/test_combine_concat.py | 13 +++++++ 9 files changed, 135 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 71d7fbc7438d9..15924e4fcb701 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -181,9 +181,18 @@ Reshaping - - +ExtensionArray +^^^^^^^^^^^^^^ + +- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- +- + Other ^^^^^ - :meth: `~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) - - +- diff --git a/pandas/core/series.py b/pandas/core/series.py index 8bd48c629ffef..2ba1f15044952 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2204,7 +2204,7 @@ def _binop(self, other, func, level=None, fill_value=None): result.name = None return result - def combine(self, other, func, fill_value=np.nan): + def combine(self, other, func, fill_value=None): """ Perform elementwise binary operation on two Series using given function with optional fill value when an index is missing from one Series or @@ -2216,6 +2216,8 @@ def combine(self, other, func, fill_value=np.nan): func : function Function that takes two scalars as inputs and return a scalar fill_value : scalar value + The default specifies to use the appropriate NaN value for + the underlying dtype of the Series Returns ------- @@ -2235,20 +2237,38 @@ def combine(self, other, func, fill_value=np.nan): Series.combine_first : Combine Series values, choosing the calling Series's values first """ + if fill_value is None: + fill_value = na_value_for_dtype(self.dtype, compat=False) + if isinstance(other, Series): + # If other is a Series, result is based on union of Series, + # so do this element by element new_index = self.index.union(other.index) new_name = ops.get_op_result_name(self, other) - new_values = np.empty(len(new_index), dtype=self.dtype) - for i, idx in enumerate(new_index): + new_values = [] + for idx in new_index: lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) with np.errstate(all='ignore'): - new_values[i] = func(lv, rv) + new_values.append(func(lv, rv)) else: + # Assume that other is a scalar, so apply the function for + # each element in the Series new_index = self.index with np.errstate(all='ignore'): - new_values = func(self._values, other) + new_values = [func(lv, other) for lv in self._values] new_name = self.name + + if is_categorical_dtype(self.values): + pass + elif is_extension_array_dtype(self.values): + # The function can return something of any type, so check + # if the type is compatible 
with the calling EA + try: + new_values = self._values._from_sequence(new_values) + except TypeError: + pass + return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c5436aa731d50..23227867ee4d7 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -103,3 +103,37 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel): tm.assert_numpy_array_equal(l1, l2) self.assert_extension_array_equal(u1, u2) + + def test_combine_le(self, data_repeated): + # GH 20825 + # Test that combine works when doing a <= (le) comparison + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= b for (a, b) in + zip(list(orig_data1), list(orig_data2))]) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)]) + self.assert_series_equal(result, expected) + + def test_combine_add(self, data_repeated): + # GH 20825 + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 + x2) + expected = pd.Series( + orig_data1._from_sequence([a + b for (a, b) in + zip(list(orig_data1), + list(orig_data2))])) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 + x2) + expected = pd.Series( + orig_data1._from_sequence([a + val for a in list(orig_data1)])) + self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 530a4e7a22a7a..61fdb8454b542 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -1,6 +1,7 @@ import string import pytest +import pandas as pd import numpy as np from pandas.api.types import CategoricalDtype @@ -29,6 +30,15 @@ def data_missing(): return Categorical([np.nan, 'A']) +@pytest.fixture +def data_repeated(): + """Return different versions of data for count times""" + def gen(count): + for _ in range(count): + yield Categorical(make_data()) + yield gen + + @pytest.fixture def data_for_sorting(): return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'], @@ -154,6 +164,22 @@ class TestMethods(base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): pass + def test_combine_add(self, data_repeated): + # GH 20825 + # When adding categoricals in combine, result is a string + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 + x2) + expected = pd.Series(([a + b for (a, b) in + zip(list(orig_data1), list(orig_data2))])) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 + x2) + expected = pd.Series([a + val for a in list(orig_data1)]) + self.assert_series_equal(result, expected) + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index bbd31c4071b91..4bbbb7df2f399 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -30,6 +30,15 @@ def all_data(request, data, 
data_missing): return data_missing +@pytest.fixture +def data_repeated(): + """Return different versions of data for count times""" + def gen(count): + for _ in range(count): + yield NotImplementedError + yield gen + + @pytest.fixture def data_for_sorting(): """Length-3 array with a known sort order. diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 90f0181beab0d..cc6fadc483d5e 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -28,7 +28,9 @@ class DecimalArray(ExtensionArray): dtype = DecimalDtype() def __init__(self, values): - assert all(isinstance(v, decimal.Decimal) for v in values) + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError values = np.asarray(values, dtype=object) self._data = values diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 1f8cf0264f62f..f74b4d7e94f11 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -25,6 +25,14 @@ def data_missing(): return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) +@pytest.fixture +def data_repeated(): + def gen(count): + for _ in range(count): + yield DecimalArray(make_data()) + yield gen + + @pytest.fixture def data_for_sorting(): return DecimalArray([decimal.Decimal('1'), diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index b7ac8033f3f6d..85a282ae4007f 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -187,6 +187,14 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending): super(TestMethods, self).test_sort_values_missing( data_missing_for_sorting, ascending) + @pytest.mark.skip(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + pass + + @pytest.mark.skip(reason="combine for JSONArray not supported") + def test_combine_add(self, data_repeated): + pass + class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.xfail diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 6cf60e818c845..f35cce6ac9d71 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -60,6 +60,19 @@ def test_append_duplicates(self): with tm.assert_raises_regex(ValueError, msg): pd.concat([s1, s2], verify_integrity=True) + def test_combine_scalar(self): + # GH 21248 + # Note - combine() with another Series is tested elsewhere because + # it is used when testing operators + s = pd.Series([i * 10 for i in range(5)]) + result = s.combine(3, lambda x, y: x + y) + expected = pd.Series([i * 10 + 3 for i in range(5)]) + tm.assert_series_equal(result, expected) + + result = s.combine(22, lambda x, y: min(x, y)) + expected = pd.Series([min(i * 10, 22) for i in range(5)]) + tm.assert_series_equal(result, expected) + def test_combine_first(self): values = tm.makeIntIndex(20).values.astype(float) series = Series(values, index=tm.makeIntIndex(20)) From 7bc21248f1a7ac0c269ba58baebf8407a9c096dc Mon Sep 17 00:00:00 2001 From: daminisatya Date: Fri, 8 Jun 2018 09:21:11 -0700 Subject: [PATCH 11/12] requested changes added --- doc/source/whatsnew/v0.23.1.txt | 1 + doc/source/whatsnew/v0.24.0.txt | 14 +------------- pandas/tests/io/json/test_normalize.py | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git 
a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 5a1bcce9b5970..439b58dbb72a8 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -31,6 +31,7 @@ Fixed Regressions - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) - Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) +- Fixed regression in :func:`nested_to_record`, which now flattens lists of dictionaries and does not drop keys whose value is ``None`` (:issue:`21356`) .. _whatsnew_0231.performance: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 15924e4fcb701..54eab36a8a571 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -15,8 +15,7 @@ Other Enhancements - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`) - +- .. _whatsnew_0240.api_breaking: @@ -81,8 +80,6 @@ Documentation Changes Bug Fixes ~~~~~~~~~ -- The top level None value is not dropped but rather preserved along with lower levels for consistency (:issue:`21356`) - Categorical ^^^^^^^^^^^ @@ -181,18 +178,9 @@ Reshaping - - -ExtensionArray -^^^^^^^^^^^^^^ - -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) -- -- - Other ^^^^^ - :meth: `~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) - - -- diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index aac60e9327787..395c2c90767d3 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -352,7 +352,7 @@ def test_json_normalize_errors(self): errors='raise' ) - def test_nonetype(self): + def test_donot_drop_nonevalues(self): # GH21356 data = [ {'info': None, From b7bda8676abec2ee5672726b426aacb1b4c3ab0d Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 8 Jun 2018 09:38:16 -0700 Subject: [PATCH 12/12] Update v0.23.1.txt --- doc/source/whatsnew/v0.23.1.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 419ec13951c30..8ce2824d779f4 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -10,7 +10,6 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none - .. _whatsnew_0231.fixed_regressions: Fixed Regressions
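A minimal usage sketch of the behaviour preserved by PATCH 01/12, reconstructed from the expectations in ``test_donot_drop_nonevalues`` above. It assumes a pandas build with this patch series applied and calls the private ``nested_to_record`` helper the same way the test suite does; it is an illustration, not part of the patches themselves::

    from pandas.io.json.normalize import nested_to_record  # private helper exercised by the tests

    data = [
        {'info': None,
         'author_name': {'first': 'Smith', 'last_name': 'Appleseed'}},
        {'author_name': {'first': 'Jane', 'last_name': 'Doe'},
         'info': {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}},
    ]

    # With the fix, the top-level None key survives flattening instead of
    # being silently dropped from the first record.
    result = nested_to_record(data)
    # [{'info': None,
    #   'author_name.first': 'Smith', 'author_name.last_name': 'Appleseed'},
    #  {'author_name.first': 'Jane', 'author_name.last_name': 'Doe',
    #   'info.created_at': '11/08/1993', 'info.last_updated': '26/05/2012'}]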