From 1b8c04110e72999c65b13147b5d960ca4ebaa29c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Jun 2018 11:53:06 +0200 Subject: [PATCH 01/24] CI: revert skip of geopandas downstream test (#21217) (cherry picked from commit 88c3f08d9b031f6559b9db6574ec02da5f81f6a8) --- pandas/tests/test_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c2d09c6d49e86..afd7993fefc70 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -103,7 +103,6 @@ def test_pandas_datareader(): 'F', 'quandl', '2017-01-01', '2017-02-01') -@pytest.mark.xfail(reaason="downstream install issue") def test_geopandas(): geopandas = import_module('geopandas') # noqa From 21775b6150d84894afcc1228b2070a4b1a044b1c Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 4 Jun 2018 22:43:16 +0100 Subject: [PATCH 02/24] Improve performance of CategoricalIndex.is_unique (#21107) (cherry picked from commit 9f95f7dbffef7752175ca9ed918314cb6f0b9b18) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/indexes/category.py | 2 +- pandas/tests/indexes/test_category.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index b3c1dbc86525d..64a98de9c2bf7 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -30,6 +30,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) +- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`) - - diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 78b7ae7054248..150eca32e229d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -378,7 +378,7 @@ def _engine(self): # 
introspection @cache_readonly def is_unique(self): - return not self.duplicated().any() + return self._engine.is_unique @property def is_monotonic_increasing(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 0e630f69b1a32..a2a4170256088 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -581,6 +581,15 @@ def test_is_monotonic(self, data, non_lexsorted_data): assert c.is_monotonic_increasing assert not c.is_monotonic_decreasing + @pytest.mark.parametrize('values, expected', [ + ([1, 2, 3], True), + ([1, 3, 1], False), + (list('abc'), True), + (list('aba'), False)]) + def test_is_unique(self, values, expected): + ci = CategoricalIndex(values) + assert ci.is_unique is expected + def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') From 9b837629a99ac5fcbe6d0be68a21fe79fd84a027 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 5 Jun 2018 02:04:02 -0700 Subject: [PATCH 03/24] DOC: whatsnew note for MultiIndex Sorting Fix (#21316) (cherry picked from commit 15b39cdb2ee521964a00308f09d45f92be2feaf5) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 64a98de9c2bf7..1d7ef963d1153 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -84,6 +84,8 @@ Indexing - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) +- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) +- Bug in 
:meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) - I/O From 929699582225b1cc218ad21044edd99c1c82304a Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Wed, 6 Jun 2018 17:08:22 +0200 Subject: [PATCH 04/24] DOC: fix mistake in Series.str.cat (#21330) (cherry picked from commit 0c65c57a279e755ab7093db925d1e580f9878dae) --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5d50c45fe7eca..44811781837bc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2172,9 +2172,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Returns ------- - concat : str if `other is None`, Series/Index of objects if `others is - not None`. In the latter case, the result will remain categorical - if the calling Series/Index is categorical. + concat : str or Series/Index of objects + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. 
See Also -------- From 3cd496b86906ba34ed9c9350cacd03ea3971de38 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Thu, 7 Jun 2018 05:39:24 -0400 Subject: [PATCH 05/24] BUG: Using DatetimeIndex.date with timezone returns incorrect date (#21281) * BUG: Using DatetimeIndex.date with timezone returns incorrect date #21230 * Fix bug where DTI.time returns a tz-aware Time instead of tz-naive #21267 (cherry picked from commit a363e1a920d93d41bc87cb70afe35d030cc6bf9a) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ pandas/_libs/tslib.pyx | 2 +- pandas/core/indexes/datetimes.py | 22 +++++++++++++-- .../tests/indexes/datetimes/test_timezones.py | 28 ++++++++++++++++++- 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 1d7ef963d1153..6b7ca4ca0ca7e 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -84,8 +84,10 @@ Indexing - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) +- Bug in :attr:`DatetimeIndex.date` where an incorrect date is returned when the input date has a non-UTC timezone (:issue:`21230`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) +- Bug in :attr:`DatetimeIndex.time` where given a tz-aware Timestamp, a tz-aware Time is returned instead of tz-naive (:issue:`21267`) - I/O diff --git 
a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 17453d8af1297..0f58cfa761f21 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -77,7 +77,7 @@ cdef inline object create_time_from_ts( int64_t value, pandas_datetimestruct dts, object tz, object freq): """ convenience routine to construct a datetime.time from its parts """ - return time(dts.hour, dts.min, dts.sec, dts.us, tz) + return time(dts.hour, dts.min, dts.sec, dts.us) def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 83950f1d71633..0ddf33cdcae73 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -2032,7 +2032,16 @@ def time(self): """ Returns numpy array of datetime.time. The time part of the Timestamps. """ - return libts.ints_to_pydatetime(self.asi8, self.tz, box="time") + + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if (self.tz is not None and self.tz is not utc): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + + return libts.ints_to_pydatetime(timestamps, box="time") @property def date(self): @@ -2040,7 +2049,16 @@ def date(self): Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). 
""" - return libts.ints_to_pydatetime(self.normalize().asi8, box="date") + + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if (self.tz is not None and self.tz is not utc): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + + return libts.ints_to_pydatetime(timestamps, box="date") def normalize(self): """ diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 09210d8b64d1b..573940edaa08f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -2,7 +2,7 @@ """ Tests for DatetimeIndex timezone-related methods """ -from datetime import datetime, timedelta, tzinfo +from datetime import datetime, timedelta, tzinfo, date, time from distutils.version import LooseVersion import pytest @@ -706,6 +706,32 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz.zone == 'UTC' + @pytest.mark.parametrize("dtype", [ + None, 'datetime64[ns, CET]', + 'datetime64[ns, EST]', 'datetime64[ns, UTC]' + ]) + def test_date_accessor(self, dtype): + # Regression test for GH#21230 + expected = np.array([date(2018, 6, 4), pd.NaT]) + + index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype) + result = index.date + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", [ + None, 'datetime64[ns, CET]', + 'datetime64[ns, EST]', 'datetime64[ns, UTC]' + ]) + def test_time_accessor(self, dtype): + # Regression test for GH#21267 + expected = np.array([time(10, 20, 30), pd.NaT]) + + index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype) + result = index.time + + tm.assert_numpy_array_equal(result, expected) + def test_dti_drop_dont_lose_tz(self): # GH#2621 ind = date_range("2012-12-01", periods=10, tz="utc") From 
12e9ef6a991ede4b2c6222c289bcca33cb1fab54 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 16:21:08 -0500 Subject: [PATCH 06/24] BUG: Fixed concat warning message (#21362) (cherry picked from commit 649bfae90f70e8ee7181aba31b0f0b44f09b76e6) --- doc/source/whatsnew/v0.23.1.txt | 2 +- pandas/core/indexes/api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 6b7ca4ca0ca7e..cb44bec9ed092 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -107,7 +107,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) -- +- Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) Other ^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f9501cd2f9ddf..6f4fdfe5bf5cd 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -24,9 +24,9 @@ Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. -To accept the future behavior, pass 'sort=True'. +To accept the future behavior, pass 'sort=False'. -To retain the current behavior and silence the warning, pass sort=False +To retain the current behavior and silence the warning, pass 'sort=True'. """) From 222dff8ccc26bfe70892764ff1e08a9e167e6b3a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 23:25:37 +0200 Subject: [PATCH 07/24] Revert "enable multivalues insert (#19664)" (#21355) This reverts commit 7c7bd569ce8e0f117c618d068e3d2798134dbc73. 
(cherry picked from commit c460710f32193c65e33d366921f9eaf919bc8da4) --- doc/source/io.rst | 8 -------- doc/source/whatsnew/v0.23.1.txt | 29 ++++++++++++++++------------- pandas/io/sql.py | 28 +++------------------------- pandas/tests/io/test_sql.py | 26 -------------------------- 4 files changed, 19 insertions(+), 72 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index aa2484b0cb5c3..d818f486ad62d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4719,14 +4719,6 @@ writes ``data`` to the database in batches of 1000 rows at a time: data.to_sql('data_chunked', engine, chunksize=1000) -.. note:: - - The function :func:`~pandas.DataFrame.to_sql` will perform a multivalue - insert if the engine dialect ``supports_multivalues_insert``. This will - greatly speed up the insert in some cases. - -SQL data types -++++++++++++++ :func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate SQL data type based on the dtype of the data. When you have columns of dtype diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index cb44bec9ed092..c5334338176aa 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -10,19 +10,22 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none -.. _whatsnew_0231.enhancements: - -New features -~~~~~~~~~~~~ - - -.. _whatsnew_0231.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- -- +.. _whatsnew_0231.fixed_regressions: + +Fixed Regressions + +- Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue + inserts as this caused regression in certain cases (:issue:`21103`). + In the future this will be made configurable. 
+- Fixed regression in the :attr:`DatetimeIndex.date` and :attr:`DatetimeIndex.time` + attributes in case of timezone-aware data: :attr:`DatetimeIndex.time` returned + a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` + returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). +- Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values + in nested levels in JSON (:issue:`21158`). +- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) +- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) .. _whatsnew_0231.performance: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ccb8d2d99d734..a582d32741ae9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -572,29 +572,8 @@ def create(self): else: self._execute_create() - def insert_statement(self, data, conn): - """ - Generate tuple of SQLAlchemy insert statement and any arguments - to be executed by connection (via `_execute_insert`). 
- - Parameters - ---------- - conn : SQLAlchemy connectable(engine/connection) - Connection to recieve the data - data : list of dict - The data to be inserted - - Returns - ------- - SQLAlchemy statement - insert statement - *, optional - Additional parameters to be passed when executing insert statement - """ - dialect = getattr(conn, 'dialect', None) - if dialect and getattr(dialect, 'supports_multivalues_insert', False): - return self.table.insert(data), - return self.table.insert(), data + def insert_statement(self): + return self.table.insert() def insert_data(self): if self.index is not None: @@ -633,9 +612,8 @@ def insert_data(self): return column_names, data_list def _execute_insert(self, conn, keys, data_iter): - """Insert data into this table with database connection""" data = [{k: v for k, v in zip(keys, row)} for row in data_iter] - conn.execute(*self.insert_statement(data, conn)) + conn.execute(self.insert_statement(), data) def insert(self, chunksize=None): keys, data_list = self.insert_data() diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4530cc9d2fba9..f3ab74d37a2bc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1665,29 +1665,6 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) - def test_insert_multivalues(self): - # issues addressed - # https://github.com/pandas-dev/pandas/issues/14315 - # https://github.com/pandas-dev/pandas/issues/8953 - - db = sql.SQLDatabase(self.conn) - df = DataFrame({'A': [1, 0, 0], 'B': [1.1, 0.2, 4.3]}) - table = sql.SQLTable("test_table", db, frame=df) - data = [ - {'A': 1, 'B': 0.46}, - {'A': 0, 'B': -2.06} - ] - statement = table.insert_statement(data, conn=self.conn)[0] - - if self.supports_multivalues_insert: - assert statement.parameters == data, ( - 'insert statement should be multivalues' - ) - else: - assert statement.parameters is None, ( - 'insert statement should not be multivalues' - ) - class _TestSQLAlchemyConn(_EngineToConnMixin, 
_TestSQLAlchemy): @@ -1702,7 +1679,6 @@ class _TestSQLiteAlchemy(object): """ flavor = 'sqlite' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1751,7 +1727,6 @@ class _TestMySQLAlchemy(object): """ flavor = 'mysql' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1821,7 +1796,6 @@ class _TestPostgreSQLAlchemy(object): """ flavor = 'postgresql' - supports_multivalues_insert = True @classmethod def connect(cls): From 85c01779e76971c23b5a691efe0ae047feb7634f Mon Sep 17 00:00:00 2001 From: Uddeshya Singh Date: Fri, 8 Jun 2018 21:55:51 +0530 Subject: [PATCH 08/24] BUG: invalid rolling window on empty input (#21291) (cherry picked from commit 93be27d6c5354f2a1daa10ac9cbe8f78934ea455) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/window.py | 4 ++-- pandas/tests/test_window.py | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index c5334338176aa..f41dd61d392ae 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -54,6 +54,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` +- Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) Strings ^^^^^^^ diff --git a/pandas/core/window.py b/pandas/core/window.py index 015e7f7913ed0..9d0f9dc4f75f9 100644 --- a/pandas/core/window.py +++ 
b/pandas/core/window.py @@ -602,8 +602,8 @@ def validate(self): if isinstance(window, (list, tuple, np.ndarray)): pass elif is_integer(window): - if window < 0: - raise ValueError("window must be non-negative") + if window <= 0: + raise ValueError("window must be > 0 ") try: import scipy.signal as sig except ImportError: diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 74f2c977e0db2..cfd88f41f855e 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -389,8 +389,8 @@ def test_constructor(self, which): c(window=2, min_periods=1, center=False) # GH 13383 - c(0) with pytest.raises(ValueError): + c(0) c(-1) # not valid @@ -409,7 +409,6 @@ def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling - c(0, win_type='boxcar') with pytest.raises(ValueError): c(-1, win_type='boxcar') From 7688534ab779d6624d8f7444e6312572ed38c3a0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 23:20:16 +0200 Subject: [PATCH 09/24] DOC: clean-up 0.23.1 whatsnew (#21368) (cherry picked from commit 5bbbaf6ae48681699cfbdf8f4a726661118e0dcb) --- doc/source/whatsnew/v0.23.1.txt | 45 +++------------------------------ 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index f41dd61d392ae..12608f677d22c 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -13,6 +13,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. _whatsnew_0231.fixed_regressions: Fixed Regressions +~~~~~~~~~~~~~~~~~ - Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue inserts as this caused regression in certain cases (:issue:`21103`). 
@@ -34,14 +35,7 @@ Performance Improvements - Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) - Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`) -- -- -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- -- .. _whatsnew_0231.bug_fixes: @@ -49,72 +43,41 @@ Bug Fixes ~~~~~~~~~ Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` - Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) -Strings -^^^^^^^ +Data-type specific - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - -Timedelta -^^^^^^^^^ - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - -Categorical -^^^^^^^^^^^ - -- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) -- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Bug in :func:`pandas.testing.assert_index_equal` which raised 
``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) Sparse -^^^^^^ - Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`) -Conversion -^^^^^^^^^^ - -- -- - Indexing -^^^^^^^^ - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) -- Bug in :attr:`DatetimeIndex.date` where an incorrect date is returned when the input date has a non-UTC timezone (:issue:`21230`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) -- Bug in :attr:`DatetimeIndex.time` where given a tz-aware Timestamp, a tz-aware Time is returned instead of tz-naive (:issue:`21267`) -- I/O -^^^ - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) -- - -Plotting -^^^^^^^^ - -- -- +- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) Reshaping -^^^^^^^^^ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - 
Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) Other -^^^^^ - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) -- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) From 17dad0dae88d915416ca6bfda853cfe679840f79 Mon Sep 17 00:00:00 2001 From: Damini Satya Date: Fri, 8 Jun 2018 09:50:20 -0700 Subject: [PATCH 10/24] Fix #21356: JSON nested_to_record Silently Drops Top-Level None Values (#21363) (cherry picked from commit ff2663247c2445677f27f3f46fe14f3ef265ce2d) --- doc/source/whatsnew/v0.23.1.txt | 5 ++ pandas/io/json/normalize.py | 2 - pandas/tests/io/json/test_normalize.py | 75 +++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 12608f677d22c..020eebd414ac7 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -27,6 +27,11 @@ Fixed Regressions - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) +- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing + values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) +- Fixed Regression in :func:`nested_to_record` which now flattens list of dictionaries and doesnot drop keys with value as `None` (:issue:`21356`) + .. 
_whatsnew_0231.performance: diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 549204abd3caf..b845a43b9ca9e 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -80,8 +80,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v - if v is None: # pop the key if the value is None - new_d.pop(k) continue else: v = new_d.pop(k) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 0fabaf747b6de..395c2c90767d3 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -238,15 +238,16 @@ def test_non_ascii_key(self): tm.assert_frame_equal(result, expected) def test_missing_field(self, author_missing_data): - # GH20030: Checks for robustness of json_normalize - should - # unnest records where only the first record has a None value + # GH20030: result = json_normalize(author_missing_data) ex_data = [ - {'author_name.first': np.nan, + {'info': np.nan, + 'author_name.first': np.nan, 'author_name.last_name': np.nan, 'info.created_at': np.nan, 'info.last_updated': np.nan}, - {'author_name.first': 'Jane', + {'info': None, + 'author_name.first': 'Jane', 'author_name.last_name': 'Doe', 'info.created_at': '11/08/1993', 'info.last_updated': '26/05/2012'} @@ -351,9 +352,8 @@ def test_json_normalize_errors(self): errors='raise' ) - def test_nonetype_dropping(self): - # GH20030: Checks that None values are dropped in nested_to_record - # to prevent additional columns of nans when passed to DataFrame + def test_donot_drop_nonevalues(self): + # GH21356 data = [ {'info': None, 'author_name': @@ -367,7 +367,8 @@ def test_nonetype_dropping(self): ] result = nested_to_record(data) expected = [ - {'author_name.first': 'Smith', + {'info': None, + 'author_name.first': 'Smith', 'author_name.last_name': 'Appleseed'}, {'author_name.first': 'Jane', 
'author_name.last_name': 'Doe', @@ -375,3 +376,61 @@ def test_nonetype_dropping(self): 'info.last_updated': '26/05/2012'}] assert result == expected + + def test_nonetype_top_level_bottom_level(self): + # GH21158: If inner level json has a key with a null value + # make sure it doesnt do a new_d.pop twice and except + data = { + "id": None, + "location": { + "country": { + "state": { + "id": None, + "town.info": { + "id": None, + "region": None, + "x": 49.151580810546875, + "y": -33.148521423339844, + "z": 27.572303771972656}}} + } + } + result = nested_to_record(data) + expected = { + 'id': None, + 'location.country.state.id': None, + 'location.country.state.town.info.id': None, + 'location.country.state.town.info.region': None, + 'location.country.state.town.info.x': 49.151580810546875, + 'location.country.state.town.info.y': -33.148521423339844, + 'location.country.state.town.info.z': 27.572303771972656} + assert result == expected + + def test_nonetype_multiple_levels(self): + # GH21158: If inner level json has a key with a null value + # make sure it doesnt do a new_d.pop twice and except + data = { + "id": None, + "location": { + "id": None, + "country": { + "id": None, + "state": { + "id": None, + "town.info": { + "region": None, + "x": 49.151580810546875, + "y": -33.148521423339844, + "z": 27.572303771972656}}} + } + } + result = nested_to_record(data) + expected = { + 'id': None, + 'location.id': None, + 'location.country.id': None, + 'location.country.state.id': None, + 'location.country.state.town.info.region': None, + 'location.country.state.town.info.x': 49.151580810546875, + 'location.country.state.town.info.y': -33.148521423339844, + 'location.country.state.town.info.z': 27.572303771972656} + assert result == expected From 5ceba747a2fa462a5abcec627001c2aba071e2ac Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 8 Jun 2018 11:54:36 -0500 Subject: [PATCH 11/24] Revert change to comparison op with datetime.date objects (#21361) (cherry picked 
from commit d79203af0552e73933e6f80f4284ac2697372eaa) --- doc/source/whatsnew/v0.23.1.txt | 42 ++++++++++++++++++++++++++ pandas/core/ops.py | 30 ++++++++++++++++++ pandas/tests/series/test_arithmetic.py | 40 ++++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 020eebd414ac7..80526358f1d3d 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -15,6 +15,48 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ +**Comparing Series with datetime.date** + +We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`). +In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing. +This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal. + +In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning. + +We've temporarily restored the 0.22.0 behavior, so datetimes and dates may again compare equal, but will restore the 0.23.0 behavior in a future release. + +To summarize, here's the behavior in 0.22.0, 0.23.0, 0.23.1: + +.. code-block:: python + + # 0.22.0... Silently coerce the datetime.date + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + 0 True + 1 False + dtype: bool + + # 0.23.0... Do not coerce the datetime.date + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + 0 False + 1 False + dtype: bool + + # 0.23.1... 
Coerce the datetime.date with a warning + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + /bin/python:1: FutureWarning: Comparing Series of datetimes with 'datetime.date'. Currently, the + 'datetime.date' is coerced to a datetime. In the future pandas will + not coerce, and the values will not compare equal to the 'datetime.date'. + To retain the current behavior, convert the 'datetime.date' to a + datetime with 'pd.Timestamp'. + #!/bin/python3 + 0 True + 1 False + dtype: bool + +In addition, ordering comparisons will raise a ``TypeError`` in the future. + +**Other Fixes** + - Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue inserts as this caused regression in certain cases (:issue:`21103`). In the future this will be made configurable. diff --git a/pandas/core/ops.py b/pandas/core/ops.py index e14f82906cd06..540ebeee438f6 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -5,7 +5,10 @@ """ # necessary to enforce truediv in Python 2.X from __future__ import division +import datetime import operator +import textwrap +import warnings import numpy as np import pandas as pd @@ -1197,8 +1200,35 @@ def wrapper(self, other, axis=None): if is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical # Series/Index behavior + if (isinstance(other, datetime.date) and + not isinstance(other, datetime.datetime)): + # https://github.com/pandas-dev/pandas/issues/21152 + # Compatibility for difference between Series comparison w/ + # datetime and date + msg = ( + "Comparing Series of datetimes with 'datetime.date'. " + "Currently, the 'datetime.date' is coerced to a " + "datetime. In the future pandas will not coerce, " + "and {future}. " + "To retain the current behavior, " + "convert the 'datetime.date' to a datetime with " + "'pd.Timestamp'." 
+ ) + + if op in {operator.lt, operator.le, operator.gt, operator.ge}: + future = "a TypeError will be raised" + else: + future = ( + "'the values will not compare equal to the " + "'datetime.date'" + ) + msg = '\n'.join(textwrap.wrap(msg.format(future=future))) + warnings.warn(msg, FutureWarning, stacklevel=2) + other = pd.Timestamp(other) + res_values = dispatch_to_index_op(op, self, other, pd.DatetimeIndex) + return self._constructor(res_values, index=self.index, name=res_name) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index ec0d7296e540e..95836f046195a 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -88,6 +88,46 @@ def test_ser_cmp_result_names(self, names, op): class TestTimestampSeriesComparison(object): + def test_dt64_ser_cmp_date_warning(self): + # https://github.com/pandas-dev/pandas/issues/21359 + # Remove this test and enable invalid test below + ser = pd.Series(pd.date_range('20010101', periods=10), name='dates') + date = ser.iloc[0].to_pydatetime().date() + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser == date + expected = pd.Series([True] + [False] * 9, name='dates') + tm.assert_series_equal(result, expected) + assert "Comparing Series of datetimes " in str(m[0].message) + assert "will not compare equal" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser != date + tm.assert_series_equal(result, ~expected) + assert "will not compare equal" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser <= date + tm.assert_series_equal(result, expected) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser < date + tm.assert_series_equal(result, pd.Series([False] * 10, name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + with 
tm.assert_produces_warning(FutureWarning) as m: + result = ser >= date + tm.assert_series_equal(result, pd.Series([True] * 10, name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser > date + tm.assert_series_equal(result, pd.Series([False] + [True] * 9, + name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + @pytest.mark.skip(reason="GH-21359") def test_dt64ser_cmp_date_invalid(self): # GH#19800 datetime.date comparison raises to # match DatetimeIndex/Timestamp. This also matches the behavior From c92d2f92a7d1d3a04867fadee0d8c5585da335c4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 8 Jun 2018 11:27:13 -0500 Subject: [PATCH 12/24] REGR: NA-values in ctors with string dtype (#21366) (cherry picked from commit 636dd01fdacba0c8f0e7b5aaa726165983fc861d) --- pandas/conftest.py | 11 +++++++ pandas/core/dtypes/cast.py | 42 ++++++++++++++++++++++++ pandas/core/series.py | 4 ++- pandas/tests/dtypes/test_cast.py | 13 ++++++++ pandas/tests/frame/test_constructors.py | 11 +++++++ pandas/tests/frame/test_dtypes.py | 16 +++++---- pandas/tests/series/test_constructors.py | 26 +++++++++++---- 7 files changed, 110 insertions(+), 13 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b09cb872a12fb..e6c1b1b171045 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -149,3 +149,14 @@ def tz_aware_fixture(request): Fixture for trying explicit timezones: {0} """ return request.param + + +@pytest.fixture(params=[str, 'str', 'U']) +def string_dtype(request): + """Parametrized fixture for string dtypes. 
+ + * str + * 'str' + * 'U' + """ + return request.param diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e4ed6d544d42e..ebc7a13234a98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values): result = np.empty(len(values), dtype='object') result[:] = values return result + + +def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): + """ + Construct a new ndarray, coercing `values` to `dtype`, preserving NA. + + Parameters + ---------- + values : Sequence + dtype : numpy.dtype, optional + copy : bool, default False + Note that copies may still be made with ``copy=False`` if casting + is required. + + Returns + ------- + arr : ndarray[dtype] + + Examples + -------- + >>> np.array([1.0, 2.0, None], dtype='str') + array(['1.0', '2.0', 'None'], dtype='<U4') + + >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str') + + + """ + subarr = np.array(values, dtype=dtype, copy=copy) + + if dtype is not None and dtype.kind in ("U", "S"): + # GH-21083 + # We can't just return np.array(subarr, dtype='str') since + # NumPy will convert the non-string objects into strings + # Including NA values. So we have to go + # string -> object -> update NA, which requires an + # additional pass over the data. 
+ na_values = isna(values) + subarr2 = subarr.astype(object) + subarr2[na_values] = np.asarray(values, dtype=object)[na_values] + subarr = subarr2 + + return subarr diff --git a/pandas/core/series.py b/pandas/core/series.py index c5caafa07fb8e..6975dd8fc918e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -40,6 +40,7 @@ maybe_convert_platform, maybe_cast_to_datetime, maybe_castable, construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, construct_1d_object_array_from_listlike) from pandas.core.dtypes.missing import ( isna, @@ -4047,7 +4048,8 @@ def _try_cast(arr, take_fast_path): isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = np.array(subarr, dtype=dtype, copy=copy) + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, + copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 20cd8b43478d2..4a19682e2c558 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -23,6 +23,7 @@ maybe_convert_scalar, find_common_type, construct_1d_object_array_from_listlike, + construct_1d_ndarray_preserving_na, construct_1d_arraylike_from_scalar) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self): tm.assert_categorical_equal(result, expected, check_category_order=True, check_dtype=True) + + +@pytest.mark.parametrize('values, dtype, expected', [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (['1', '2', None], None, np.array(['1', '2', None])), + (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])), + ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])), +]) +def test_construct_1d_ndarray_preserving_na(values, dtype, 
expected): + result = construct_1d_ndarray_preserving_na(values, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6dd38187f7277..70dd358248bc4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self): assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + df = DataFrame({'A': ['x', None]}, dtype=string_dtype) + result = df.isna() + expected = DataFrame({"A": [False, True]}) + tm.assert_frame_equal(result, expected) + assert df.iloc[1, 0] is None + + df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) + assert np.isnan(df.iloc[1, 0]) + def test_constructor_rec(self): rec = self.frame.to_records(index=False) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 4c9f8c2ea0980..1eeeec0be3b8b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' - for dtype in ['str', str, 'U']: - result = DataFrame({'A': input_vals}, dtype=dtype) - expected = DataFrame({'A': input_vals}).astype({'A': dtype}) - assert_frame_equal(result, expected) + result = DataFrame({'A': input_vals}, dtype=string_dtype) + expected = DataFrame({'A': 
input_vals}).astype({'A': string_dtype}) + assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) + assert_frame_equal(result, expected) class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7e59325c32ddc..906d2aacd5586 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + ser = Series(['x', None], dtype=string_dtype) + result = ser.isna() + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + assert ser.iloc[1] is None + + ser = Series(['x', np.nan], dtype=string_dtype) + assert np.isnan(ser.iloc[1]) + def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) @@ -164,22 +175,25 @@ def test_constructor_list_like(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' + result = Series(input_vals, dtype=string_dtype) + expected = Series(input_vals).astype(string_dtype) + assert_series_equal(result, expected) - for dtype in ['str', str, 'U']: - result = Series(input_vals, dtype=dtype) - expected = 
Series(input_vals).astype(dtype) - assert_series_equal(result, expected) + def test_constructor_list_str_na(self, string_dtype): + result = Series([1.0, 2.0, np.nan], dtype=string_dtype) + expected = Series(['1.0', '2.0', np.nan], dtype=object) + assert_series_equal(result, expected) + assert np.isnan(result[2]) def test_constructor_generator(self): gen = (i for i in range(10)) From c64d52f9226217658272bb656f37b31d6509c4e3 Mon Sep 17 00:00:00 2001 From: Pyry Kovanen Date: Sat, 9 Jun 2018 02:40:03 +0300 Subject: [PATCH 13/24] BUG: Fix empty Data frames to JSON round-trippable back to data frames (#21318) (cherry picked from commit 415012f4f38ca0cf41717c51e49bd2349cba09a8) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/io/json/table_schema.py | 2 +- pandas/tests/io/json/test_json_table_schema.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 80526358f1d3d..3bbacd909c603 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -119,6 +119,7 @@ I/O - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) +- Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) Reshaping diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 01f7db7d68664..5cea64388bdd7 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -296,7 +296,7 @@ def parse_table_schema(json, precise_float): """ table = loads(json, precise_float=precise_float) col_order = 
[field['name'] for field in table['schema']['fields']] - df = DataFrame(table['data'])[col_order] + df = DataFrame(table['data'], columns=col_order)[col_order] dtypes = {field['name']: convert_json_field_to_pandas_type(field) for field in table['schema']['fields']} diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 49b39c17238ae..b6483d0e978ba 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -560,3 +560,16 @@ def test_multiindex(self, index_names): out = df.to_json(orient="table") result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize("strict_check", [ + pytest.param(True, marks=pytest.mark.xfail), False]) + def test_empty_frame_roundtrip(self, strict_check): + # GH 21287 + df = pd.DataFrame([], columns=['a', 'b', 'c']) + expected = df.copy() + out = df.to_json(orient='table') + result = pd.read_json(out, orient='table') + # TODO: When DF coercion issue (#21345) is resolved tighten type checks + tm.assert_frame_equal(expected, result, + check_dtype=strict_check, + check_index_type=strict_check) From 6eea28a363834a809a7493b9c347e228e0bd133d Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Fri, 8 Jun 2018 18:32:20 -0500 Subject: [PATCH 14/24] BLD: include dll in package_data on Windows (#21321) (cherry picked from commit 324b324f91021e57106ffc7937f35d54279aac5c) --- doc/source/whatsnew/v0.23.1.txt | 1 + setup.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 3bbacd909c603..db9a23dc66ef0 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -129,3 +129,4 @@ Reshaping Other - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) +- Bug preventing pandas being used on Windows without C++ redistributable installed 
(:issue:`21106`) diff --git a/setup.py b/setup.py index 6febe674fb2a1..90ec8e91a0700 100755 --- a/setup.py +++ b/setup.py @@ -453,10 +453,10 @@ def pxd(name): return pjoin('pandas', name + '.pxd') -# args to ignore warnings if is_platform_windows(): extra_compile_args = [] else: + # args to ignore warnings extra_compile_args = ['-Wno-unused-function'] lib_depends = lib_depends + ['pandas/_libs/src/numpy_helper.h', @@ -733,7 +733,7 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['data/*', 'templates/*'], + package_data={'': ['data/*', 'templates/*', '_libs/*.dll'], 'pandas.tests.io': ['data/legacy_hdf/*.h5', 'data/legacy_pickle/*/*.pickle', 'data/legacy_msgpack/*/*.msgpack', From 64409d97165217908e6ee190a3b230bb72076de4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jun 2018 19:44:17 +0200 Subject: [PATCH 15/24] REGR: allow merging on object boolean columns (#21310) (cherry picked from commit 8d5032a8c7b00d47fe5d0886145e1ad9dd17e0d3) --- doc/source/whatsnew/v0.23.1.txt | 5 ++--- pandas/core/reshape/merge.py | 10 ++++++++-- pandas/tests/reshape/merge/test_merge.py | 23 +++++++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db9a23dc66ef0..0017372add683 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -65,15 +65,14 @@ In addition, ordering comparisons will raise a ``TypeError`` in the future. a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). - Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values - in nested levels in JSON (:issue:`21158`). + in nested levels in JSON, and to not drop keys with value as `None` (:issue:`21158`, :issue:`21356`). 
- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) - Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) - Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) -- Fixed Regression in :func:`nested_to_record` which now flattens list of dictionaries and doesnot drop keys with value as `None` (:issue:`21356`) - +- Fixed regression in merging on boolean index/columns (:issue:`21119`). .. _whatsnew_0231.performance: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4d8897fb7c811..d69d79ca9b098 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,6 +28,7 @@ is_int_or_datetime_dtype, is_dtype_equal, is_bool, + is_bool_dtype, is_list_like, is_datetimelike, _ensure_int64, @@ -974,9 +975,14 @@ def _maybe_coerce_merge_keys(self): # Check if we are trying to merge on obviously # incompatible dtypes GH 9780, GH 15800 - elif is_numeric_dtype(lk) and not is_numeric_dtype(rk): + + # boolean values are considered as numeric, but are still allowed + # to be merged on object boolean values + elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk)) + and not is_numeric_dtype(rk)): raise ValueError(msg) - elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): + elif (not is_numeric_dtype(lk) + and (is_numeric_dtype(rk) and not is_bool_dtype(rk))): raise ValueError(msg) elif is_datetimelike(lk) and not is_datetimelike(rk): raise ValueError(msg) diff --git a/pandas/tests/reshape/merge/test_merge.py 
b/pandas/tests/reshape/merge/test_merge.py index 8e639edd34b18..037bd9cc7cd18 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1526,6 +1526,27 @@ def test_merge_on_ints_floats_warning(self): result = B.merge(A, left_on='Y', right_on='X') assert_frame_equal(result, expected[['Y', 'X']]) + def test_merge_incompat_infer_boolean_object(self): + # GH21119: bool + object bool merge OK + df1 = DataFrame({'key': Series([True, False], dtype=object)}) + df2 = DataFrame({'key': [True, False]}) + + expected = DataFrame({'key': [True, False]}, dtype=object) + result = pd.merge(df1, df2, on='key') + assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on='key') + assert_frame_equal(result, expected) + + # with missing value + df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)}) + df2 = DataFrame({'key': [True, False]}) + + expected = DataFrame({'key': [True, False]}, dtype=object) + result = pd.merge(df1, df2, on='key') + assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on='key') + assert_frame_equal(result, expected) + @pytest.mark.parametrize('df1_vals, df2_vals', [ ([0, 1, 2], ["0", "1", "2"]), ([0.0, 1.0, 2.0], ["0", "1", "2"]), @@ -1538,6 +1559,8 @@ def test_merge_on_ints_floats_warning(self): pd.date_range('20130101', periods=3, tz='US/Eastern')), ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), + # TODO ([0, 1], pd.Series([False, True], dtype=bool)), + ([0, 1], pd.Series([False, True], dtype=object)) ]) def test_merge_incompat_dtypes(self, df1_vals, df2_vals): # GH 9780, GH 15800 From c5850c1fe5ea7760289a021b95a7c5a906d8d3ea Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 7 Jun 2018 18:05:57 -0400 Subject: [PATCH 16/24] BUG: dropna incorrect with categoricals in pivot_table (#21252) (cherry picked from commit abfac97b2d22447d41bfccaa53e0a264ca34d6d4) --- pandas/core/reshape/pivot.py | 20 
++++++++++++++++++-- pandas/tests/reshape/test_pivot.py | 26 +++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e02420323704e..9a2ad5d13d77a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,8 +1,10 @@ # pylint: disable=E1103 -from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_list_like, is_scalar, is_integer_dtype) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -79,8 +81,22 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys, observed=dropna) + # group by the cartesian product of the grouper + # if we have a categorical + grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how='all') + + # gh-21133 + # we want to down cast if + # the original values are ints + # as we grouped with a NaN value + # and then dropped, coercing to floats + for v in [v for v in values if v in data and v in agged]: + if (is_integer_dtype(data[v]) and + not is_integer_dtype(agged[v])): + agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d2cf3fc11e165..3ec60d50f2792 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import datetime, date, timedelta @@ -16,6 +17,11 @@ from pandas.api.types import CategoricalDtype as CDT +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + 
class TestPivotTable(object): def setup_method(self, method): @@ -109,7 +115,6 @@ def test_pivot_table_categorical(self): index=exp_index) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -137,6 +142,25 @@ def test_pivot_table_dropna_categoricals(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_with_non_observable_dropna(self, dropna): + # gh-21133 + df = pd.DataFrame( + {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], + categories=['low', 'high'], + ordered=True), + 'B': range(5)}) + + result = df.pivot_table(index='A', values='B', dropna=dropna) + expected = pd.DataFrame( + {'B': [2, 3]}, + index=pd.Index( + pd.Categorical.from_codes([0, 1], + categories=['low', 'high'], + ordered=True), + name='A')) + + tm.assert_frame_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) From 5359aead5f992e2d1b7c724ffadc5e4083b922cf Mon Sep 17 00:00:00 2001 From: ssikdar1 Date: Thu, 7 Jun 2018 11:58:47 -0400 Subject: [PATCH 17/24] Fix nested_to_record with None values in nested levels (#21164) (cherry picked from commit ab6aaf73a848a8725a23bb880be5221dd5ef5b3d) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 0017372add683..9cb21e8760262 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -117,6 +117,8 @@ I/O - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) +- Bug when :meth:`pandas.io.json.json_normalize` was called with ``None`` values in nested levels in JSON (:issue:`21158`) 
+- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) From 182a16d239ca643c1b9f6f8b93fea19b6175fcf2 Mon Sep 17 00:00:00 2001 From: Stefano Cianciulli Date: Thu, 7 Jun 2018 12:23:32 +0100 Subject: [PATCH 18/24] Fix typo in error message in the PlanePlot class (#21350) (cherry picked from commit cea0a81b3d1ade61a5c662458dd8edc135dc94f6) --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 87b7d13251f28..d1a2121597dd6 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -811,7 +811,7 @@ class PlanePlot(MPLPlot): def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: - raise ValueError(self._kind + ' requires and x and y column') + raise ValueError(self._kind + ' requires an x and y column') if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): From 964658727f160fae32468686d1d794c40d6a2060 Mon Sep 17 00:00:00 2001 From: Max Kanter Date: Tue, 5 Jun 2018 07:08:30 -0400 Subject: [PATCH 19/24] Add Featuretools to Pandas Ecosystem Page (#21297) (cherry picked from commit 67e6e6fcd19d1d89cb60abc3a78372bc85fd8e29) --- doc/source/ecosystem.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 30cdb06b28487..6714398084186 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the 
underlying data container for comput Use pandas DataFrames in your `scikit-learn `__ ML pipeline. +`Featuretools `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. .. _ecosystem.visualization: From 8350429b74f591a2841c0e4328d62afa526ffcb1 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Tue, 5 Jun 2018 05:54:30 +0100 Subject: [PATCH 20/24] BUG: Fix encoding error in to_csv compression (#21300) (cherry picked from commit b32fdc44206c38aecbbe5fdb4ed543a5d213ebb9) --- doc/source/whatsnew/v0.23.1.txt | 8 +++++++ pandas/io/formats/csvs.py | 36 ++++++++++++++++------------- pandas/tests/frame/test_to_csv.py | 38 ++++++++++++++++++++++--------- pandas/tests/series/test_io.py | 36 +++++++++++++++++++++-------- pandas/tests/test_common.py | 23 +++++++++++++++++++ 5 files changed, 104 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 9cb21e8760262..05fad4b99919e 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -121,6 +121,14 @@ I/O - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) +- + +Plotting +^^^^^^^^ + +- +- +>>>>>>> b32fdc442... 
BUG: Fix encoding error in to_csv compression (#21300) Reshaping diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 29b8d29af0808..7f660e2644fa4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -9,6 +9,7 @@ import numpy as np from pandas.core.dtypes.missing import notna +from pandas.core.dtypes.inference import is_file_like from pandas.core.index import Index, MultiIndex from pandas import compat from pandas.compat import (StringIO, range, zip) @@ -127,14 +128,19 @@ def save(self): else: encoding = self.encoding - if hasattr(self.path_or_buf, 'write'): - f = self.path_or_buf - close = False + # PR 21300 uses string buffer to receive csv writing and dump into + # file-like output with compression as option. GH 21241, 21118 + f = StringIO() + if not is_file_like(self.path_or_buf): + # path_or_buf is path + path_or_buf = self.path_or_buf + elif hasattr(self.path_or_buf, 'name'): + # path_or_buf is file handle + path_or_buf = self.path_or_buf.name else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=encoding, - compression=None) - close = True if self.compression is None else False + # path_or_buf is file-like IO objects. + f = self.path_or_buf + path_or_buf = None try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -151,18 +157,16 @@ def save(self): self._save() finally: - # GH 17778 handles compression for byte strings. - if not close and self.compression: - f.close() - with open(self.path_or_buf, 'r') as f: - data = f.read() - f, handles = _get_handle(self.path_or_buf, self.mode, + # GH 17778 handles zip compression for byte strings separately. 
+ buf = f.getvalue() + if path_or_buf: + f, handles = _get_handle(path_or_buf, self.mode, encoding=encoding, compression=self.compression) - f.write(data) - close = True - if close: + f.write(buf) f.close() + for _fh in handles: + _fh.close() def _save_header(self): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e4829ebf48561..60dc336a85388 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - def test_to_csv_compression(self, compression): - - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + @pytest.mark.parametrize('df,encoding', [ + (DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']), None), + # GH 21241, 21118 + (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'), + (DataFrame(5 * [[123, u"你好", u"世界"]], + columns=['X', 'Y', 'Z']), 'gb2312'), + (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]], + columns=['X', 'Y', 'Z']), 'cp737') + ]) + def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: - df.to_csv(filename, compression=compression) + df.to_csv(filename, compression=compression, encoding=encoding) # test the round trip - to_csv -> read_csv - rs = read_csv(filename, compression=compression, - index_col=0) - assert_frame_equal(df, rs) + result = read_csv(filename, compression=compression, + index_col=0, encoding=encoding) + + with open(filename, 'w') as fh: + df.to_csv(fh, compression=compression, encoding=encoding) + + result_fh = read_csv(filename, compression=compression, + index_col=0, encoding=encoding) + assert_frame_equal(df, result) + assert_frame_equal(df, result_fh) # explicitly make sure file is compressed with 
tm.decompress_file(filename, compression) as fh: - text = fh.read().decode('utf8') + text = fh.read().decode(encoding or 'utf8') for col in df.columns: assert col in text with tm.decompress_file(filename, compression) as fh: - assert_frame_equal(df, read_csv(fh, index_col=0)) + assert_frame_equal(df, read_csv(fh, + index_col=0, + encoding=encoding)) def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0b0d4334c86a3..76dd4bc1f3d4a 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -138,29 +138,45 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def test_to_csv_compression(self, compression): - - s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X') + @pytest.mark.parametrize('s,encoding', [ + (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], + name='X'), None), + # GH 21241, 21118 + (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'), + (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'), + (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737') + ]) + def test_to_csv_compression(self, s, encoding, compression): with ensure_clean() as filename: - s.to_csv(filename, compression=compression, header=True) + s.to_csv(filename, compression=compression, encoding=encoding, + header=True) # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression=compression, - index_col=0, squeeze=True) - assert_series_equal(s, rs) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) + + with open(filename, 'w') as fh: + s.to_csv(fh, compression=compression, encoding=encoding, + header=True) + + result_fh = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, + squeeze=True) + assert_series_equal(s, result) + assert_series_equal(s, 
result_fh) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode('utf8') + text = fh.read().decode(encoding or 'utf8') assert s.name in text with tm.decompress_file(filename, compression) as fh: assert_series_equal(s, pd.read_csv(fh, index_col=0, - squeeze=True)) + squeeze=True, + encoding=encoding)) class TestSeriesIO(TestData): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index bb7ee1b911fee..3443331e3d4ba 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -241,3 +241,26 @@ def test_compression_size(obj, method, compression): getattr(obj, method)(filename, compression=None) uncompressed = os.path.getsize(filename) assert uncompressed > compressed + + +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_csv']) +def test_compression_size_fh(obj, method, compression_only): + + with tm.ensure_clean() as filename: + with open(filename, 'w') as fh: + getattr(obj, method)(fh, compression=compression_only) + assert not fh.closed + assert fh.closed + compressed = os.path.getsize(filename) + with tm.ensure_clean() as filename: + with open(filename, 'w') as fh: + getattr(obj, method)(fh, compression=None) + assert not fh.closed + assert fh.closed + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed From c65c1247a8ca3b9764cc7d88c51c5c7acddb90ed Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 4 Jun 2018 15:28:50 -0600 Subject: [PATCH 21/24] BUG: Allow IntervalIndex to be constructed from categorical data with appropriate dtype (#21254) (cherry picked from commit 686f6047312fe7671d8a5e1b2ffd1866f7c7a766) --- pandas/core/indexes/interval.py | 4 ++++ .../indexes/interval/test_construction.py | 23 ++++++++++++++++++- 2 files changed, 26 
insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 8f8d8760583ce..eb9d7efc06c27 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values): ------- array """ + if is_categorical_dtype(values): + # GH 21243/21253 + values = np.array(values) + if isinstance(values, (list, tuple)) and len(values) == 0: # GH 19016 # empty lists/tuples get object dtype by default, but this is not diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 5fdf92dcb2044..b1711c3444586 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -6,8 +6,9 @@ from pandas import ( Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical, - date_range, timedelta_range, period_range, notna) + CategoricalIndex, date_range, timedelta_range, period_range, notna) from pandas.compat import lzip +from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.dtypes.dtypes import IntervalDtype import pandas.core.common as com import pandas.util.testing as tm @@ -111,6 +112,22 @@ def test_constructor_string(self, constructor, breaks): with tm.assert_raises_regex(TypeError, msg): constructor(**self.get_kwargs_from_breaks(breaks)) + @pytest.mark.parametrize('cat_constructor', [ + Categorical, CategoricalIndex]) + def test_constructor_categorical_valid(self, constructor, cat_constructor): + # GH 21243/21253 + if isinstance(constructor, partial) and constructor.func is Index: + # Index is defined to create CategoricalIndex from categorical data + pytest.skip() + + breaks = np.arange(10, dtype='int64') + expected = IntervalIndex.from_breaks(breaks) + + cat_breaks = cat_constructor(breaks) + result_kwargs = self.get_kwargs_from_breaks(cat_breaks) + result = constructor(**result_kwargs) + 
tm.assert_index_equal(result, expected) + def test_generic_errors(self, constructor): # filler input data to be used when supplying invalid kwargs filler = self.get_kwargs_from_breaks(range(10)) @@ -238,6 +255,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'): tuples = lzip(breaks[:-1], breaks[1:]) if isinstance(breaks, (list, tuple)): return {'data': tuples} + elif is_categorical_dtype(breaks): + return {'data': breaks._constructor(tuples)} return {'data': com._asarray_tuplesafe(tuples)} def test_constructor_errors(self): @@ -286,6 +305,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'): if isinstance(breaks, list): return {'data': ivs} + elif is_categorical_dtype(breaks): + return {'data': breaks._constructor(ivs)} return {'data': np.array(ivs, dtype=object)} def test_generic_errors(self, constructor): From 8c0d56ba76d2c1a9c48301df680f27570b42cfbc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jun 2018 18:41:49 +0200 Subject: [PATCH 22/24] DOC: update whatsnew 0.23.1 (#21387) (cherry picked from commit 0f521ab8eb6c78be92607beadbf6f2c1cbf681b7) --- doc/source/whatsnew/v0.23.1.txt | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 05fad4b99919e..b4d19e24ad392 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -88,24 +88,24 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -Groupby/Resample/Rolling +**Groupby/Resample/Rolling** - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in 
:func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` - Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) -Data-type specific +**Data-type specific** - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) -Sparse +**Sparse** - Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`) -Indexing +**Indexing** - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) @@ -113,7 +113,11 @@ Indexing - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) -I/O +**Plotting** + +- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue: `20968`) + +**I/O** - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting 
DataFrames to buffers and most file-like objects (:issue:`21041`) @@ -121,21 +125,13 @@ I/O - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) -- - -Plotting -^^^^^^^^ - -- -- ->>>>>>> b32fdc442... BUG: Fix encoding error in to_csv compression (#21300) -Reshaping +**Reshaping** - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) -Other +**Other** - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) - Bug preventing pandas being used on Windows without C++ redistributable installed (:issue:`21106`) From a32c4e4f0d19dee676f873d741ba5f3ce9c1c611 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Jun 2018 10:10:57 -0500 Subject: [PATCH 23/24] Fixup whatsnew --- doc/source/whatsnew/v0.23.1.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index b4d19e24ad392..db25bcf8113f5 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -121,8 +121,6 @@ Bug Fixes - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) -- Bug when :meth:`pandas.io.json.json_normalize` was called with ``None`` values in nested levels in JSON 
(:issue:`21158`) -- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) From 47dc5e8692f92d3a4c74cf587dd3d6f831e8f28a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Jun 2018 10:41:44 -0500 Subject: [PATCH 24/24] Backport fixture --- pandas/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index e6c1b1b171045..d5f399c7cd63d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -105,6 +105,16 @@ def compression(request): return request.param +@pytest.fixture(params=['gzip', 'bz2', 'zip', + pytest.param('xz', marks=td.skip_if_no_lzma)]) +def compression_only(request): + """ + Fixture for trying common compression types in compression tests excluding + uncompressed case + """ + return request.param + + @pytest.fixture(scope='module') def datetime_tz_utc(): from datetime import timezone