From e82b128bb0cb3f49a1f1d644d408e46ff344730e Mon Sep 17 00:00:00 2001 From: sfoo Date: Thu, 30 Nov 2017 07:37:28 -0500 Subject: [PATCH 1/6] Copy data.values when ranking --- doc/source/whatsnew/v0.22.0.txt | 6 ++++-- pandas/core/generic.py | 2 +- pandas/tests/series/test_rank.py | 10 +++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c2da0c420f643..285fc51e9970a 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -335,5 +335,7 @@ Categorical Other ^^^^^ -- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) -- +- Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) +- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) +- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) +- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index da550dccc9c89..adc5760a5c71b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5940,7 +5940,7 @@ def rank(self, axis=0, method='average', numeric_only=None, raise NotImplementedError(msg) def ranker(data): - ranks = algos.rank(data.values, axis=axis, method=method, + ranks = algos.rank(data.values.copy(), axis=axis, method=method, ascending=ascending, na_option=na_option, pct=pct) ranks = self._constructor(ranks, **data._construct_axes_dict()) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 311d14e928caa..c564f572ea3bb 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from pandas import compat +from pandas import compat, Timestamp import pytest @@ -368,3 +368,11 @@ def test_rank_object_bug(self): # smoke tests Series([np.nan] * 32).astype(object).rank(ascending=True) Series([np.nan] * 32).astype(object).rank(ascending=False) + + def test_rank_modify_inplace(self): + # GH 18521 + df = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) + pre_rank_df = df.copy() + + df.rank() + assert_series_equal(df, pre_rank_df) From d1cc880df1ffdcc750619789836def49d9f693d0 Mon Sep 17 00:00:00 2001 From: sfoo Date: Mon, 4 Dec 2017 19:27:09 -0500 Subject: [PATCH 2/6] WIP: Thinking of better way than a copy --- pandas/core/generic.py | 2 +- pandas/tests/frame/test_analytics.py | 8 ++++++++ pandas/tests/series/test_rank.py | 9 +++++---- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index adc5760a5c71b..da550dccc9c89 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5940,7 +5940,7 @@ def rank(self, axis=0, method='average', numeric_only=None, raise NotImplementedError(msg) def ranker(data): - ranks = algos.rank(data.values.copy(), axis=axis, method=method, + ranks = algos.rank(data.values, axis=axis, method=method, ascending=ascending, na_option=na_option, pct=pct) ranks = self._constructor(ranks, **data._construct_axes_dict()) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4bba6d7601ae8..10573558d13cd 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2214,3 +2214,11 @@ def test_series_broadcasting(self): df_nan.clip_lower(s, axis=0) for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']: getattr(df, op)(s_nan, axis=0) + + def test_series_nat_conversion(self): + # GH 18521 + df = DataFrame(np.random.randn(10, 3), dtype='float64') + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index c564f572ea3bb..bdc98f7c2e9a8 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -371,8 +371,9 @@ def test_rank_object_bug(self): def test_rank_modify_inplace(self): # GH 18521 - df = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) - pre_rank_df = df.copy() + s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) + expected = s.copy() - df.rank() - assert_series_equal(df, pre_rank_df) + s.rank() + result = s + assert_series_equal(result, expected) From 9c071536c9fd7ec9d4ea490a44f6ae40e1e29ad1 Mon Sep 17 00:00:00 2001 From: sfoo Date: Mon, 11 Dec 2017 00:58:30 -0500 Subject: [PATCH 3/6] Rebased; copy if datetime --- pandas/core/algorithms.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0ceb8966fd3c8..b511035113515 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -219,6 +219,10 @@ def _get_data_algo(values, func_map): if is_categorical_dtype(values): values = values._values_for_rank() + # Create copy in case NaT converts to asi8 + if is_datetime64_any_dtype(values): + values = values.copy() + values, dtype, ndtype = _ensure_data(values) if ndtype == 'object': From ca5d28c09f9ba5bb508b55396f50584ad7ddb1f9 Mon Sep 17 00:00:00 2001 From: sfoo Date: Tue, 12 Dec 2017 19:06:00 -0500 Subject: [PATCH 4/6] Fixed whatsnew; copy value in rank_1d --- doc/source/whatsnew/v0.22.0.txt | 4 +--- pandas/_libs/algos_rank_helper.pxi.in | 3 +++ pandas/core/algorithms.py | 4 ---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 285fc51e9970a..1e8aa6eaf2ef4 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -335,7 +335,5 @@ Categorical Other ^^^^^ -- Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) -- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) -- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) +- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 0e46530e20d1c..cc31ab1cd4a2d 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -84,6 +84,9 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, mask = np.isnan(values) {{elif dtype == 'int64'}} mask = values == iNaT + # create copy in case of iNaT + if mask.any(): + values = values.copy() {{endif}} # double sort first by mask and then by values to ensure nan values are diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b511035113515..0ceb8966fd3c8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -219,10 +219,6 @@ def _get_data_algo(values, func_map): if is_categorical_dtype(values): values = values._values_for_rank() - # Create copy in case NaT converts to asi8 - if is_datetime64_any_dtype(values): - values = values.copy() - values, dtype, ndtype = _ensure_data(values) if ndtype == 'object': From aad8fb8e0e8fef8754f860183867a45fe1b5c8d2 Mon Sep 17 00:00:00 2001 From: sfoo Date: Wed, 13 Dec 2017 23:36:15 -0500 Subject: [PATCH 5/6] Added comments about mutating inplace per review --- doc/source/whatsnew/v0.22.0.txt | 6 +++--- pandas/_libs/algos_rank_helper.pxi.in | 1 + pandas/tests/frame/test_analytics.py | 1 + pandas/tests/series/test_rank.py | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1e8aa6eaf2ef4..6a6534f4a800f 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -315,7 +315,8 @@ Reshaping - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - +- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) +- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - Numeric @@ -335,5 +336,4 @@ Categorical Other ^^^^^ -- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) -- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) +- diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index cc31ab1cd4a2d..c7b75727e5ae8 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -85,6 +85,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, {{elif dtype == 'int64'}} mask = values == iNaT # create copy in case of iNaT + # values are mutated inplace if mask.any(): values = values.copy() {{endif}} diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 10573558d13cd..7014929db4c2d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2217,6 +2217,7 @@ def test_series_broadcasting(self): def test_series_nat_conversion(self): # GH 18521 + # Check rank does not mutate DataFrame df = DataFrame(np.random.randn(10, 3), dtype='float64') expected = df.copy() df.rank() diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index bdc98f7c2e9a8..bccc46f1e0ca8 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -371,6 +371,7 @@ def test_rank_object_bug(self): def test_rank_modify_inplace(self): # GH 18521 + # Check rank does not mutate series s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) expected = s.copy() From caa3fa96c5aec3525c91e6f85c3fcdbe9eb22eb4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 14 Dec 2017 06:29:45 -0500 Subject: [PATCH 6/6] doc updates --- doc/source/whatsnew/v0.22.0.txt | 3 +-- pandas/_libs/algos_rank_helper.pxi.in | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 6a6534f4a800f..3e9e2bf329674 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -315,7 +315,6 @@ Reshaping - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) -- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - @@ -336,4 +335,4 @@ Categorical Other ^^^^^ -- +- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index c7b75727e5ae8..8ccc6e036da80 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -84,6 +84,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, mask = np.isnan(values) {{elif dtype == 'int64'}} mask = values == iNaT + # create copy in case of iNaT # values are mutated inplace if mask.any():