From e51ea02221bb90e4c1f5fb8c992018221b2971f0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 13 Apr 2019 18:56:31 +0300 Subject: [PATCH 01/13] replaced InvalidIndexError class --- pandas/core/base.py | 4 ++++ pandas/core/indexes/base.py | 6 +----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9a0a4e3e9ca03..2ede121f1d1ab 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -33,6 +33,10 @@ unique='IndexOpsMixin', duplicated='IndexOpsMixin') +class InvalidIndexError(Exception): + pass + + class StringMixin(object): """implements string methods so long as object defines a `__unicode__` method. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1875976a8f931..dda1b31f17c1d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -37,7 +37,7 @@ from pandas.core.accessor import CachedAccessor, DirNamesMixin import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray -from pandas.core.base import IndexOpsMixin, PandasObject +from pandas.core.base import IndexOpsMixin, PandasObject, InvalidIndexError import pandas.core.common as com from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing @@ -143,10 +143,6 @@ def index_arithmetic_method(self, other): return set_function_name(index_arithmetic_method, name, cls) -class InvalidIndexError(Exception): - pass - - _o_dtype = np.dtype(object) _Identity = object From 319a7131f4afe851ae159db0a8f245e6814039c6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 13 Apr 2019 18:58:00 +0300 Subject: [PATCH 02/13] fixed GH 22305 issue --- pandas/core/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 2ede121f1d1ab..7cb89ad8d111b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1205,7 +1205,13 @@ def _map_values(self, mapper, na_action=None): else: values = self.values - indexer = mapper.index.get_indexer(values) + try: + indexer = mapper.index.get_indexer(values) + except InvalidIndexError: + from pandas import Series + mapper = Series(algorithms.unique(mapper)) + indexer = mapper.index.get_indexer(values) + new_values = algorithms.take_1d(mapper._values, indexer) return new_values From 0f433e7f3fdff4cba0bc74a54f02077114cb8c87 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 12 Apr 2019 15:47:53 +0300 Subject: [PATCH 03/13] added test, written by realead --- pandas/tests/series/test_timeseries.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 3a82339375699..5812da2d075ed 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -33,6 +33,19 @@ def _simple_ts(start, end, freq='D'): return Series(np.random.randn(len(rng)), index=rng) +def test_NA_values_with_cache(): + # GH 22305 + na_values = [None, np.nan, pd.NaT] + # check pairwise, that no pair of na values + # is mangled + for f in na_values: + for s in na_values: + if f is not s: # otherwise not unique + expected = Index([NaT, NaT], dtype='datetime64[ns]') + result = to_datetime([f, s], cache=True) + tm.assert_index_equal(result, expected) + + def assert_range_equal(left, right): assert (left.equals(right)) assert (left.freq == right.freq) From 2a090431303a52bb913db29f7cd12efcb699e386 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 13 Apr 2019 20:25:25 +0300 Subject: [PATCH 04/13] fixed indent --- pandas/tests/series/test_timeseries.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 5812da2d075ed..4a22cdc94ec87 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -34,16 +34,16 @@ def _simple_ts(start, end, freq='D'): def test_NA_values_with_cache(): - # GH 22305 - na_values = [None, np.nan, pd.NaT] - # check pairwise, that no pair of na values - # is mangled - for f in na_values: - for s in na_values: - if f is not s: # otherwise not unique - expected = Index([NaT, NaT], dtype='datetime64[ns]') - result = to_datetime([f, s], cache=True) - tm.assert_index_equal(result, expected) + # GH 22305 + na_values = [None, np.nan, pd.NaT] + # check pairwise, that no pair of na values + # is mangled + for f in na_values: + for s in na_values: + if f is not s: # otherwise not unique + expected = Index([NaT, NaT], dtype='datetime64[ns]') + result = to_datetime([f, s], cache=True) + tm.assert_index_equal(result, expected) def assert_range_equal(left, right): From d0ec00c03273f24d1fb8d3807045fda0b80da516 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 13 Apr 2019 20:27:09 +0300 Subject: [PATCH 05/13] fixed isort error --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dda1b31f17c1d..6e9487a2450c1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -37,7 +37,7 @@ from pandas.core.accessor import CachedAccessor, DirNamesMixin import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray -from pandas.core.base import IndexOpsMixin, PandasObject, InvalidIndexError +from pandas.core.base import IndexOpsMixin, InvalidIndexError, PandasObject import pandas.core.common as com from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing From 3ee90c89fa2c33c14d2bdcd889ce74c5a16a0a07 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 15 Apr 2019 11:29:37 +0300 Subject: [PATCH 06/13] using unique_nulls_fixture now --- pandas/tests/series/test_timeseries.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 4a22cdc94ec87..0eefd5c5b8c82 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -33,19 +33,6 @@ def _simple_ts(start, end, freq='D'): return Series(np.random.randn(len(rng)), index=rng) -def test_NA_values_with_cache(): - # GH 22305 - na_values = [None, np.nan, pd.NaT] - # check pairwise, that no pair of na values - # is mangled - for f in na_values: - for s in na_values: - if f is not s: # otherwise not unique - expected = Index([NaT, NaT], dtype='datetime64[ns]') - result = to_datetime([f, s], cache=True) - tm.assert_index_equal(result, expected) - - def assert_range_equal(left, right): assert (left.equals(right)) assert (left.freq == right.freq) @@ -1113,3 +1100,15 @@ def test_asarray_tz_aware(self): result = np.asarray(ser, dtype=object) tm.assert_numpy_array_equal(result, expected) + + def test_NA_values_with_cache(self, unique_nulls_fixture, + unique_nulls_fixture2): + # GH 22305 + # check pairwise, that no pair of na values + # is mangled + if unique_nulls_fixture is not unique_nulls_fixture2: + # otherwise not unique + expected = Index([NaT, NaT], dtype='datetime64[ns]') + result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], + cache=True) + tm.assert_index_equal(result, expected) From c0a18ce97a2e5dec005b236deb2a633a31e7114a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 16 Apr 2019 14:02:20 +0300 Subject: [PATCH 07/13] changed test_na_values_with_cache location; also to_datetime is tested with duplicate nan value now --- pandas/tests/indexes/datetimes/test_tools.py | 9 +++++++++ pandas/tests/series/test_timeseries.py | 12 ------------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index eaf689cfa1c21..a592ef941484e 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1630,6 +1630,15 @@ def test_parsers(self, date_str, expected, cache): yearfirst=yearfirst) assert result7 == expected + @pytest.mark.parametrize('cache', [True, False]) + def test_na_values_with_cache(self, cache, unique_nulls_fixture, + unique_nulls_fixture2): + # GH22305 + expected = Index([NaT, NaT], dtype='datetime64[ns]') + result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], + cache=cache) + tm.assert_index_equal(result, expected) + def test_parsers_nat(self): # Test that each of several string-accepting methods return pd.NaT result1, _, _ = parsing.parse_time_string('NaT') diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 0eefd5c5b8c82..3a82339375699 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1100,15 +1100,3 @@ def test_asarray_tz_aware(self): result = np.asarray(ser, dtype=object) tm.assert_numpy_array_equal(result, expected) - - def test_NA_values_with_cache(self, unique_nulls_fixture, - unique_nulls_fixture2): - # GH 22305 - # check pairwise, that no pair of na values - # is mangled - if unique_nulls_fixture is not unique_nulls_fixture2: - # otherwise not unique - expected = Index([NaT, NaT], dtype='datetime64[ns]') - result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], - cache=True) - tm.assert_index_equal(result, expected) From 90b4510176647e4c5ec172531f7607263ee22034 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 16 Apr 2019 10:37:35 -0500 Subject: [PATCH 08/13] Revert changes to indexers --- pandas/core/base.py | 12 +----------- pandas/core/indexes/base.py | 6 +++++- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7cb89ad8d111b..9a0a4e3e9ca03 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -33,10 +33,6 @@ unique='IndexOpsMixin', duplicated='IndexOpsMixin') -class InvalidIndexError(Exception): - pass - - class StringMixin(object): """implements string methods so long as object defines a `__unicode__` method. @@ -1205,13 +1201,7 @@ def _map_values(self, mapper, na_action=None): else: values = self.values - try: - indexer = mapper.index.get_indexer(values) - except InvalidIndexError: - from pandas import Series - mapper = Series(algorithms.unique(mapper)) - indexer = mapper.index.get_indexer(values) - + indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) return new_values diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6e9487a2450c1..1875976a8f931 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -37,7 +37,7 @@ from pandas.core.accessor import CachedAccessor, DirNamesMixin import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray -from pandas.core.base import IndexOpsMixin, InvalidIndexError, PandasObject +from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing @@ -143,6 +143,10 @@ def index_arithmetic_method(self, other): return set_function_name(index_arithmetic_method, name, cls) +class InvalidIndexError(Exception): + pass + + _o_dtype = np.dtype(object) _Identity = object From 80e819d180d577bf23dbe42fb40df1653819d3e5 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 16 Apr 2019 10:55:35 -0500 Subject: [PATCH 09/13] Fix caching for some corner cases in pd.to_datetime --- pandas/core/tools/datetimes.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 66d563a7c6f85..df17f20afffd3 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -50,11 +50,10 @@ def _maybe_cache(arg, format, cache, convert_listlike): from pandas import Series cache_array = Series() if cache: - # Perform a quicker unique check from pandas import Index - if not Index(arg).is_unique: - unique_dates = algorithms.unique(arg) - cache_dates = convert_listlike(unique_dates, True, format) + unique_dates = Index(arg).unique() + if len(unique_dates) < len(arg): + cache_dates = convert_listlike(unique_dates.to_numpy(), True, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array From 4da7b99335cfe2ad942af0e79721cf6923fe9fa6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 16 Apr 2019 19:34:47 +0300 Subject: [PATCH 10/13] fixed PEP8 issue; added reference to PR --- pandas/core/tools/datetimes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index df17f20afffd3..506ac00946109 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -51,9 +51,11 @@ def _maybe_cache(arg, format, cache, convert_listlike): cache_array = Series() if cache: from pandas import Index + # GH26078 unique_dates = Index(arg).unique() if len(unique_dates) < len(arg): - cache_dates = convert_listlike(unique_dates.to_numpy(), True, format) + cache_dates = convert_listlike(unique_dates.to_numpy(), + True, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array From 243e61ce52130034a1f4d2a6a41d53458376495b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 16 Apr 2019 19:46:43 +0300 Subject: [PATCH 11/13] added whatsnew entry --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c441244b4415d..f88c2923686cf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -266,7 +266,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) -- +- Bug in :func:`to_datetime` which would raise ``InvalidIndexError`` when called with ``cache=True``, with ``arg`` including several different "not a numbers" (:issue:`22305`) - - From c55825a250f6e10319ed563574495982bd1eae21 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 17 Apr 2019 13:29:20 +0300 Subject: [PATCH 12/13] more understandable 'whatsnew' note --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f88c2923686cf..856fbd1237a03 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -266,7 +266,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) -- Bug in :func:`to_datetime` which would raise ``InvalidIndexError`` when called with ``cache=True``, with ``arg`` including several different "not a numbers" (:issue:`22305`) +- Bug in :func:`to_datetime` which would raise ``InvalidIndexError: Reindexing only valid with uniquely valued Index objects`` when called with ``cache=True``, with ``arg`` including at least two different elements from the set {None, numpy.nan, pandas.NaT} (:issue:`22305`) - - From a3c66b36e6150c8d56080924441d79ee6fc9a354 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 17 Apr 2019 13:55:50 +0300 Subject: [PATCH 13/13] restored comment --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 506ac00946109..1ad39e7ad357a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -50,8 +50,8 @@ def _maybe_cache(arg, format, cache, convert_listlike): from pandas import Series cache_array = Series() if cache: + # Perform a quicker unique check from pandas import Index - # GH26078 unique_dates = Index(arg).unique() if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates.to_numpy(),