From e8256cc3c85788114825e4f1acaa4a49e9c3415a Mon Sep 17 00:00:00 2001
From: Chin Hwee
Date: Sat, 5 Oct 2019 10:37:22 +0800
Subject: [PATCH 001/119] DOC: added docs for MultiIndex.set_levels (#28294)

shortened lines to pass checks
---
 doc/source/user_guide/advanced.rst | 31 ++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index 62a9b6396404a..6547c46163e1b 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -177,6 +177,37 @@ completely analogous way to selecting a column in a regular DataFrame:
 See :ref:`Cross-section with hierarchical index <advanced.xs>` for how to select
 on a deeper level.
 
+.. _advanced.set_levels:
+
+Set values in levels
+~~~~~~~~~~~~~~~~~~~~~
+
+The method :meth:`~MultiIndex.set_levels` replaces the ``levels`` attribute of a
+``MultiIndex`` with the values you pass, one for each position in the level. It
+assumes a value is supplied for every code that refers to the level.
+For example:
+
+.. ipython:: python
+
+    df.columns  # original MultiIndex columns
+
+    df.columns.levels  # original MultiIndex column levels
+
+    df.columns.set_levels([1, 3, 5, 7], level=0)
+
+    df.columns.set_levels([1, 3, 5, 7], level=0).levels
+
+If you pass more values than there are codes in the level, ``set_levels`` still
+assigns all of them to the level: the extra values are stored in the
+``FrozenList`` of levels even though they do not appear in the index tuples
+that ``set_levels`` returns.
+
+.. ipython:: python
+
+    df.columns.set_levels([1, 3, 5, 7], level=1)
+
+    df.columns.set_levels([1, 3, 5, 7], level=1).levels
+
 .. _advanced.shown_levels:
 
 Defined levels

From d9130f2d86a3985fd6573bc2b4f3f6858b6c9899 Mon Sep 17 00:00:00 2001
From: Chin Hwee
Date: Tue, 8 Oct 2019 23:48:32 +0800
Subject: [PATCH 002/119] edit set_levels docstring with additional examples

edit set_levels docstring with additional examples

edit set_levels docstring with additional examples
---
 doc/source/user_guide/advanced.rst | 31 --------------------
 pandas/core/indexes/multi.py       | 45 ++++++++++++++++++++++++------
 2 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index 6547c46163e1b..62a9b6396404a 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -177,37 +177,6 @@ completely analogous way to selecting a column in a regular DataFrame:
 See :ref:`Cross-section with hierarchical index <advanced.xs>` for how to select
 on a deeper level.
 
-.. _advanced.set_levels:
-
-Set values in levels
-~~~~~~~~~~~~~~~~~~~~~
-
-The method :meth:`~MultiIndex.set_levels` replaces the ``levels`` attribute of a
-``MultiIndex`` with the values you pass, one for each position in the level. It
-assumes a value is supplied for every code that refers to the level.
-For example:
-
-.. ipython:: python
-
-    df.columns  # original MultiIndex columns
-
-    df.columns.levels  # original MultiIndex column levels
-
-    df.columns.set_levels([1, 3, 5, 7], level=0)
-
-    df.columns.set_levels([1, 3, 5, 7], level=0).levels
-
-If you pass more values than there are codes in the level, ``set_levels`` still
-assigns all of them to the level: the extra values are stored in the
-``FrozenList`` of levels even though they do not appear in the index tuples
-that ``set_levels`` returns.
-
-.. ipython:: python
-
-    df.columns.set_levels([1, 3, 5, 7], level=1)
-
-    df.columns.set_levels([1, 3, 5, 7], level=1).levels
-
 ..
_advanced.shown_levels: Defined levels diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b2bb50939551d..6b7019f4685c7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -719,8 +719,16 @@ def _set_levels( def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ - Set new levels on MultiIndex. Defaults to returning - new index. + Set levels on MultiIndex by passing a new value for each + index in the level. Defaults to returning new + index. + + It is assumed that a new value is provided for each code describing + values in the level. If the number of values passed is more than + the number of index values in the level, ``set_levels`` will still + pass the values to the level. The passed values are stored in the + MultiIndex FrozenList even though the index values may be truncated + in the MultiIndex output from set_levels. Parameters ---------- @@ -740,32 +748,51 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')], + (2, 'one'), (2, 'two'), + (3, 'one'), (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), - ('b', 2)], + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level=0) + >>> idx.set_levels(['a', 'b', 'c'], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')], + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], names=['foo', 'bar']) >>> idx.set_levels(['a', 'b'], level='bar') MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), - (2, 'b')], + (2, 'b'), + (3, 'a'), + (3, 'b')], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]], level=[0, 1]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) From 46d88c182eb13cd55b716117abab6125b425426b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Oct 2019 11:59:07 -0700 Subject: [PATCH 003/119] CLN: Exception*2 in groupby wrapper (#28771) --- pandas/core/groupby/groupby.py | 57 +++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cb56f7b8d535b..61a04431f99cb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,6 +11,8 @@ class providing the base-class of operations. 
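# A minimal sketch of the set_levels semantics documented in the patches
# above, written against a 0.25-era pandas; treat the printed reprs as
# illustrative rather than guaranteed to be byte-identical.
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"]
)
# Four replacement values for a level that only has two codes: all four are
# kept in .levels, but only the first two appear in the index tuples.
new = idx.set_levels([1, 3, 5, 7], level=0)
print(new.levels)    # FrozenList([[1, 3, 5, 7], ['one', 'two']])
print(new.tolist())  # [(1, 'one'), (1, 'two'), (3, 'one'), (3, 'two')]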
from contextlib import contextmanager import datetime from functools import partial, wraps +import inspect +import re import types from typing import FrozenSet, List, Optional, Tuple, Type, Union @@ -613,23 +615,21 @@ def _make_wrapper(self, name): return self.apply(lambda self: getattr(self, name)) f = getattr(type(self._selected_obj), name) + sig = inspect.signature(f) def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis # argument - kwargs_with_axis = kwargs.copy() - if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None: - kwargs_with_axis["axis"] = self.axis - - def curried_with_axis(x): - return f(x, *args, **kwargs_with_axis) + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None: + kwargs["axis"] = self.axis def curried(x): return f(x, *args, **kwargs) # preserve the name so we can detect it when calling plot methods, # to avoid duplicates - curried.__name__ = curried_with_axis.__name__ = name + curried.__name__ = name # special case otherwise extra plots are created when catching the # exception below @@ -637,24 +637,31 @@ def curried(x): return self.apply(curried) try: - return self.apply(curried_with_axis) - except Exception: - try: - return self.apply(curried) - except Exception: - - # related to : GH3688 - # try item-by-item - # this can be called recursively, so need to raise - # ValueError - # if we don't have this method to indicated to aggregate to - # mark this column as an error - try: - return self._aggregate_item_by_item(name, *args, **kwargs) - except AttributeError: - # e.g. SparseArray has no flags attr - raise ValueError - + return self.apply(curried) + except TypeError as err: + if not re.search( + "reduction operation '.*' not allowed for this dtype", str(err) + ): + # We don't have a cython implementation + # TODO: is the above comment accurate? + raise + + # related to : GH3688 + # try item-by-item + # this can be called recursively, so need to raise + # ValueError + # if we don't have this method to indicated to aggregate to + # mark this column as an error + try: + return self._aggregate_item_by_item(name, *args, **kwargs) + except AttributeError: + # e.g. 
SparseArray has no flags attr + # FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item' + # occurs in idxmax() case + # in tests.groupby.test_function.test_non_cython_api + raise ValueError + + wrapper.__name__ = name return wrapper def get_group(self, name, obj=None): From 1ef3637f16fc82f87353a5bc76850fd0c1e33764 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Oct 2019 13:55:01 -0700 Subject: [PATCH 004/119] TST: un-xfail 1 passing maybe_promote test (#28850) --- pandas/tests/dtypes/cast/test_promote.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index da2b4c28a02a5..45dbdf72209b6 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -1038,14 +1038,7 @@ def test_maybe_promote_any_numpy_dtype_with_na( dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if ( - dtype == bytes - and not boxed - and fill_value is not None - and fill_value is not NaT - ): - pytest.xfail("does not upcast to object") - elif is_integer_dtype(dtype) and fill_value is not NaT: + if is_integer_dtype(dtype) and fill_value is not NaT: # integer + other missing value (np.nan / None) casts to float expected_dtype = np.float64 exp_val_for_scalar = np.nan From f625730eb1e729a3f58b213cb31ccf625307b198 Mon Sep 17 00:00:00 2001 From: Tommy Lynch Date: Tue, 8 Oct 2019 13:56:31 -0700 Subject: [PATCH 005/119] =?UTF-8?q?remove=20doc=20note=20about=20apply=20a?= =?UTF-8?q?pplying=20a=20function=20to=20the=20first=20element=20=E2=80=A6?= =?UTF-8?q?=20(#28854)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/core/frame.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a19910a0957c..5200ad0ba0d23 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6729,14 +6729,6 @@ def apply( DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. - Notes - ----- - In the current implementation apply calls `func` twice on the - first column/row to decide whether it can take a fast or slow - code path. This can lead to unexpected behavior if `func` has - side-effects, as they will take effect twice for the first - column/row. 
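# The note removed above described behaviour that is easy to check directly:
# count how often apply() invokes func. A sketch; the exact call count
# depends on the pandas version in use.
import pandas as pd

calls = []
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.apply(lambda col: calls.append(col.name) or col.sum())
# Releases with the fast/slow path probing report ["a", "a", "b"] (the first
# column is evaluated twice); without it this is just ["a", "b"].
print(calls)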
- Examples -------- From c9b62610b1f7405a2b4c825f0b4b254e27e2280e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Oct 2019 13:57:59 -0700 Subject: [PATCH 006/119] TST: un-xfail 22 tests (#28856) --- pandas/tests/dtypes/cast/test_promote.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 45dbdf72209b6..b07d6e72d1420 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -515,14 +515,6 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) else: if boxed and box_dtype is None: pytest.xfail("does not upcast to object") - if ( - is_integer_dtype(fill_dtype) - or is_float_dtype(fill_dtype) - or is_complex_dtype(fill_dtype) - or is_object_dtype(fill_dtype) - or is_timedelta64_dtype(fill_dtype) - ) and not boxed: - pytest.xfail("does not upcast to object") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -557,15 +549,12 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box) else: pytest.xfail("wrong missing value marker") else: - pass if ( boxed and (box_dtype == "bytes" or box_dtype is None) and not (is_string_dtype(dtype) or dtype == bool) ): pytest.xfail("does not upcast to object") - if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail("raises error") # create array of given dtype fill_value = b"abc" From 25b18596c83e2a1f0ee47446b3dba0c1ad339ed4 Mon Sep 17 00:00:00 2001 From: Ronan Lamy Date: Tue, 8 Oct 2019 23:00:22 +0200 Subject: [PATCH 007/119] TST: Improve compatibility with pypy error messages (#28844) --- pandas/tests/arithmetic/test_datetime64.py | 10 +++++----- pandas/tests/arithmetic/test_timedelta64.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 6f7222f523579..d239687a37757 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -887,7 +887,7 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): result = dtarr - tdarr tm.assert_equal(result, expected) - msg = "cannot subtract|bad operand type for unary -" + msg = "cannot subtract|(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): tdarr - dtarr @@ -1126,7 +1126,7 @@ def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): result2 = -pd.offsets.Second(5) + ser tm.assert_equal(result2, expected) - msg = "bad operand type for unary" + msg = "(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): pd.offsets.Second(5) - ser @@ -1220,7 +1220,7 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): expected = DatetimeIndex([x - off for x in vec_items]) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - off) - msg = "bad operand type for unary" + msg = "(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): off - vec @@ -1336,7 +1336,7 @@ def test_dt64arr_add_sub_DateOffsets( expected = DatetimeIndex([offset + x for x in vec_items]) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, offset + vec) - msg = "bad operand type for unary" + msg = "(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): offset - vec @@ -1920,7 +1920,7 @@ def 
test_operators_datetimelike_with_timezones(self): result = dt1 - td1[0] exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz) tm.assert_series_equal(result, exp) - msg = "bad operand type for unary" + msg = "(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): td1[0] - dt1 diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index d480b26e30fff..ecb07fa49036a 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -245,7 +245,7 @@ def test_subtraction_ops(self): with pytest.raises(TypeError, match=msg): td - dt - msg = "bad operand type for unary -: 'DatetimeArray'" + msg = "(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): td - dti From 29e56df08887e3e66d4894cb9a895c900fa8cba5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Oct 2019 14:21:12 -0700 Subject: [PATCH 008/119] TST: Fix not-boxed maybe_promote test (#28852) --- pandas/core/dtypes/cast.py | 2 ++ pandas/tests/dtypes/cast/test_promote.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4435b2518e90b..b439a3e2dfbc8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -359,6 +359,8 @@ def maybe_promote(dtype, fill_value=np.nan): if isinstance(fill_value, datetime) and fill_value.tzinfo is not None: # Trying to insert tzaware into tznaive, have to cast to object dtype = np.dtype(np.object_) + elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)): + dtype = np.dtype(np.object_) else: try: fill_value = tslibs.Timestamp(fill_value).to_datetime64() diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index b07d6e72d1420..7c926b30766c5 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -592,8 +592,6 @@ def test_maybe_promote_datetime64_with_any( else: if boxed and box_dtype is None: pytest.xfail("does not upcast to object") - if not boxed: - pytest.xfail("does not upcast to object or raises") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] From acde02bbcee409c30e032517825d9f9aafc65dd0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Oct 2019 14:21:49 -0700 Subject: [PATCH 009/119] TST: fix 24 xfails in maybe_promote (#28833) --- pandas/core/dtypes/cast.py | 22 +++++++++++++++------- pandas/core/internals/concat.py | 3 ++- pandas/tests/dtypes/cast/test_promote.py | 2 -- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b439a3e2dfbc8..5b13e13bb20ba 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -367,16 +367,24 @@ def maybe_promote(dtype, fill_value=np.nan): except (TypeError, ValueError): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.timedelta64): - try: - fv = tslibs.Timedelta(fill_value) - except ValueError: + if ( + is_integer(fill_value) + or (is_float(fill_value) and not np.isnan(fill_value)) + or isinstance(fill_value, str) + ): + # TODO: What about str that can be a timedelta? 
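# A rough check of the promotion changes in the two casting patches above.
# maybe_promote is internal API (pandas.core.dtypes.cast), so this is a
# sketch against this source tree rather than a stable public contract.
import numpy as np
from pandas.core.dtypes.cast import maybe_promote

# An integer fill value no longer coerces silently into datetime64 or
# timedelta64 data; both combinations now upcast to object.
dtype, fill = maybe_promote(np.dtype("M8[ns]"), fill_value=1)
print(dtype)  # object
dtype, fill = maybe_promote(np.dtype("m8[ns]"), fill_value=1)
print(dtype)  # object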
dtype = np.dtype(np.object_) else: - if fv is NaT: - # NaT has no `to_timedelta64` method - fill_value = np.timedelta64("NaT", "ns") + try: + fv = tslibs.Timedelta(fill_value) + except ValueError: + dtype = np.dtype(np.object_) else: - fill_value = fv.to_timedelta64() + if fv is NaT: + # NaT has no `to_timedelta64` method + fill_value = np.timedelta64("NaT", "ns") + else: + fill_value = fv.to_timedelta64() elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 121c61d8d3623..aa372af4aceb8 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -338,6 +338,7 @@ def get_empty_dtype_and_na(join_units): if not upcast_classes: upcast_classes = null_upcast_classes + # TODO: de-duplicate with maybe_promote? # create the result if "object" in upcast_classes: return np.dtype(np.object_), np.nan @@ -356,7 +357,7 @@ def get_empty_dtype_and_na(join_units): elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), tslibs.iNaT elif "timedelta" in upcast_classes: - return np.dtype("m8[ns]"), tslibs.iNaT + return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: g = np.find_common_type(upcast_classes, []) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 7c926b30766c5..e4e5a22ea6ca0 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -821,8 +821,6 @@ def test_maybe_promote_timedelta64_with_any( else: if boxed and box_dtype is None: pytest.xfail("does not upcast to object") - if not boxed: - pytest.xfail("does not upcast to object or raises") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] From 5122241de2bd4c74abdd78f82eaa112c556c2796 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Oct 2019 14:23:31 -0700 Subject: [PATCH 010/119] CLN: Assorted cleanups (#28848) --- ci/code_checks.sh | 2 +- pandas/core/internals/concat.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/core/ops/__init__.py | 12 +- pandas/core/ops/array_ops.py | 21 +- pandas/tests/groupby/test_bin_groupby.py | 7 +- pandas/tests/groupby/test_timegrouper.py | 7 +- pandas/tests/io/excel/test_writers.py | 11 +- pandas/tests/io/parser/test_textreader.py | 10 +- pandas/tests/reshape/merge/test_merge.py | 23 +- .../tests/reshape/merge/test_merge_ordered.py | 12 +- pandas/tests/reshape/merge/test_multi.py | 9 +- pandas/tests/reshape/test_melt.py | 21 +- pandas/tests/reshape/test_reshape.py | 17 +- .../tests/series/indexing/test_alter_index.py | 5 +- pandas/tests/series/test_analytics.py | 35 +- pandas/tests/series/test_combine_concat.py | 5 +- pandas/tests/series/test_constructors.py | 21 +- pandas/tests/series/test_missing.py | 37 +- pandas/tests/series/test_rank.py | 13 +- pandas/tests/test_algos.py | 7 +- pandas/tests/test_multilevel.py | 11 +- pandas/tests/test_sorting.py | 5 +- pandas/tests/test_strings.py | 645 ++++++++++++------ 24 files changed, 576 insertions(+), 364 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2d78ce7db8090..0b9aae6676710 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then MSG='Check for non-standard imports' ; echo $MSG invgrep -R --include="*.py*" -E "from pandas.core.common import " pandas invgrep -R --include="*.py*" -E "from collections.abc import " pandas - # invgrep -R --include="*.py*" -E 
"from numpy import nan " pandas # GH#24822 not yet implemented since the offending imports have not all been removed + invgrep -R --include="*.py*" -E "from numpy import nan " pandas RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for use of exec' ; echo $MSG diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index aa372af4aceb8..85bce9450d12d 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -287,7 +287,7 @@ def get_empty_dtype_and_na(join_units): return np.float64, np.nan if is_uniform_reindex(join_units): - # XXX: integrate property + # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1c31542daa5de..5f4c9d41b340b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2035,7 +2035,7 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): values = b.values if copy: values = values.copy() - elif not copy: + else: values = values.view() b = b.make_block_same_class(values, placement=placement) elif is_uniform_join_units(join_units): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 42f1d4e99108f..398fa9b0c1fc0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -766,9 +766,9 @@ def f(self, other, axis=default_axis, level=None): return f -def _comp_method_FRAME(cls, func, special): - str_rep = _get_opstr(func) - op_name = _get_op_name(func, special) +def _comp_method_FRAME(cls, op, special): + str_rep = _get_opstr(op) + op_name = _get_op_name(op, special) @Appender("Wrapper for comparison method {name}".format(name=op_name)) def f(self, other): @@ -781,18 +781,18 @@ def f(self, other): raise ValueError( "Can only compare identically-labeled DataFrame objects" ) - new_data = dispatch_to_series(self, other, func, str_rep) + new_data = dispatch_to_series(self, other, op, str_rep) return self._construct_result(new_data) elif isinstance(other, ABCSeries): return _combine_series_frame( - self, other, func, fill_value=None, axis=None, level=None + self, other, op, fill_value=None, axis=None, level=None ) else: # straight boolean comparisons we want to allow all columns # (regardless of dtype to pass thru) See #4537 for discussion. - new_data = dispatch_to_series(self, other, func) + new_data = dispatch_to_series(self, other, op) return self._construct_result(new_data) f.__name__ = op_name diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 55b4b1a899f65..a225eec93b27e 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -118,14 +118,14 @@ def masked_arith_op(x, y, op): return result -def define_na_arithmetic_op(op, str_rep, eval_kwargs): +def define_na_arithmetic_op(op, str_rep: str, eval_kwargs): def na_op(x, y): return na_arithmetic_op(x, y, op, str_rep, eval_kwargs) return na_op -def na_arithmetic_op(left, right, op, str_rep, eval_kwargs): +def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): """ Return the result of evaluating op on the passed in values. @@ -173,6 +173,7 @@ def arithmetic_op( Cannot be a DataFrame or Index. Series is *not* excluded. op : {operator.add, operator.sub, ...} Or one of the reversed variants from roperator. 
+ str_rep : str Returns ------- @@ -279,8 +280,16 @@ def comparison_op( return res_values -def na_logical_op(x, y, op): +def na_logical_op(x: np.ndarray, y, op): try: + # For exposition, write: + # yarr = isinstance(y, np.ndarray) + # yint = is_integer(y) or (yarr and y.dtype.kind == "i") + # ybool = is_bool(y) or (yarr and y.dtype.kind == "b") + # xint = x.dtype.kind == "i" + # xbool = x.dtype.kind == "b" + # Then Cases where this goes through without raising include: + # (xint or xbool) and (yint or bool) result = op(x, y) except TypeError: if isinstance(y, np.ndarray): @@ -304,9 +313,9 @@ def na_logical_op(x, y, op): NotImplementedError, ): raise TypeError( - "cannot compare a dtyped [{dtype}] array " - "with a scalar of type [{typ}]".format( - dtype=x.dtype, typ=type(y).__name__ + "Cannot perform '{op}' with a dtyped [{dtype}] array " + "and scalar of type [{typ}]".format( + op=op.__name__, dtype=x.dtype, typ=type(y).__name__ ) ) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index b8f9ecd42bae3..8da03a7f61029 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -1,5 +1,4 @@ import numpy as np -from numpy import nan import pytest from pandas._libs import groupby, lib, reduction as libreduction @@ -96,7 +95,7 @@ def _check(dtype): def _ohlc(group): if isna(group).all(): - return np.repeat(nan, 4) + return np.repeat(np.nan, 4) return [group[0], group.max(), group.min(), group[-1]] expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) @@ -104,9 +103,9 @@ def _ohlc(group): assert_almost_equal(out, expected) tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) - obj[:6] = nan + obj[:6] = np.nan func(out, counts, obj[:, None], labels) - expected[0] = nan + expected[0] = np.nan assert_almost_equal(out, expected) _check("float32") diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index e1e35d8eb7d18..7acddec002d98 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -4,7 +4,6 @@ from io import StringIO import numpy as np -from numpy import nan import pytest import pytz @@ -699,13 +698,13 @@ def test_first_last_max_min_on_time_data(self): df_test = DataFrame( { "dt": [ - nan, + np.nan, "2015-07-24 10:10", "2015-07-25 11:11", "2015-07-23 12:12", - nan, + np.nan, ], - "td": [nan, td(days=1), td(days=2), td(days=3), nan], + "td": [np.nan, td(days=1), td(days=2), td(days=3), np.nan], } ) df_test.dt = pd.to_datetime(df_test.dt) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8ad09549f3cbe..9feec424389e7 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -4,7 +4,6 @@ import os import numpy as np -from numpy import nan import pytest from pandas.compat import PY36 @@ -323,7 +322,7 @@ def test_excel_writer_context_manager(self, frame, engine, ext): def test_roundtrip(self, engine, ext, frame): frame = frame.copy() - frame["A"][:5] = nan + frame["A"][:5] = np.nan frame.to_excel(self.path, "test1") frame.to_excel(self.path, "test1", columns=["A", "B"]) @@ -388,7 +387,7 @@ def test_ts_frame(self, tsframe, engine, ext): def test_basics_with_nan(self, engine, ext, frame): frame = frame.copy() - frame["A"][:5] = nan + frame["A"][:5] = np.nan frame.to_excel(self.path, "test1") frame.to_excel(self.path, "test1", columns=["A", "B"]) frame.to_excel(self.path, "test1", header=False) @@ 
-450,7 +449,7 @@ def test_inf_roundtrip(self, engine, ext): def test_sheets(self, engine, ext, frame, tsframe): frame = frame.copy() - frame["A"][:5] = nan + frame["A"][:5] = np.nan frame.to_excel(self.path, "test1") frame.to_excel(self.path, "test1", columns=["A", "B"]) @@ -473,7 +472,7 @@ def test_sheets(self, engine, ext, frame, tsframe): def test_colaliases(self, engine, ext, frame): frame = frame.copy() - frame["A"][:5] = nan + frame["A"][:5] = np.nan frame.to_excel(self.path, "test1") frame.to_excel(self.path, "test1", columns=["A", "B"]) @@ -491,7 +490,7 @@ def test_colaliases(self, engine, ext, frame): def test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): frame = frame.copy() - frame["A"][:5] = nan + frame["A"][:5] = np.nan frame.to_excel(self.path, "test1") frame.to_excel(self.path, "test1", columns=["A", "B"]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 73638fe8ab7c8..9afeaf75f4da3 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -6,7 +6,6 @@ import os import numpy as np -from numpy import nan import pytest import pandas._libs.parsers as parser @@ -309,10 +308,15 @@ def test_empty_field_eof(self): assert_array_dicts_equal(result, expected) # GH5664 - a = DataFrame([["b"], [nan]], columns=["a"], index=["a", "c"]) + a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"]) b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) c = DataFrame( - [[1, 2, 3, 4], [6, nan, nan, nan], [8, 9, 10, 11], [13, 14, nan, nan]], + [ + [1, 2, 3, 4], + [6, np.nan, np.nan, np.nan], + [8, 9, 10, 11], + [13, 14, np.nan, np.nan], + ], columns=list("abcd"), index=[0, 5, 7, 12], ) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4de8bba169438..08698133e360d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1,10 +1,9 @@ from collections import OrderedDict -from datetime import date, datetime +from datetime import date, datetime, timedelta import random import re import numpy as np -from numpy import nan import pytest from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype @@ -565,9 +564,7 @@ def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): assert_frame_equal(actual, expected) def test_merge_nosort(self): - # #2098, anything to do? - - from datetime import datetime + # GH#2098, TODO: anything to do? 
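# What test_merge_nosort exercises, in miniature: with sort=False an inner
# merge keeps the left frame's key order instead of sorting the keys. A
# sketch using made-up frames rather than the test's random data.
import pandas as pd

left = pd.DataFrame({"key": ["b", "c", "a"], "lval": [1, 2, 3]})
right = pd.DataFrame({"key": ["a", "b", "c"], "rval": [4, 5, 6]})
result = pd.merge(left, right, on="key", sort=False)
print(result["key"].tolist())  # ['b', 'c', 'a'], left order preserved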
d = { "var1": np.random.randint(0, 10, size=10), @@ -621,9 +618,9 @@ def test_merge_nan_right(self): expected = DataFrame( { "i1": {0: 0, 1: 1}, - "i1_": {0: 0.0, 1: nan}, + "i1_": {0: 0.0, 1: np.nan}, "i2": {0: 0.5, 1: 1.5}, - "i3": {0: 0.69999999999999996, 1: nan}, + "i3": {0: 0.69999999999999996, 1: np.nan}, } )[["i1", "i2", "i1_", "i3"]] assert_frame_equal(result, expected) @@ -640,21 +637,17 @@ def _constructor(self): assert isinstance(result, NotADataFrame) def test_join_append_timedeltas(self): - - import datetime as dt - from pandas import NaT - # timedelta64 issues with join/merge # GH 5695 - d = {"d": dt.datetime(2013, 11, 5, 5, 56), "t": dt.timedelta(0, 22500)} + d = {"d": datetime(2013, 11, 5, 5, 56), "t": timedelta(0, 22500)} df = DataFrame(columns=list("dt")) df = df.append(d, ignore_index=True) result = df.append(d, ignore_index=True) expected = DataFrame( { - "d": [dt.datetime(2013, 11, 5, 5, 56), dt.datetime(2013, 11, 5, 5, 56)], - "t": [dt.timedelta(0, 22500), dt.timedelta(0, 22500)], + "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], + "t": [timedelta(0, 22500), timedelta(0, 22500)], } ) assert_frame_equal(result, expected) @@ -667,7 +660,7 @@ def test_join_append_timedeltas(self): expected = DataFrame( { "0": Series([td, td], index=list("AB")), - "0r": Series([td, NaT], index=list("AB")), + "0r": Series([td, pd.NaT], index=list("AB")), } ) assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 2b79548be7b59..a9f23313a83b9 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -1,4 +1,4 @@ -from numpy import nan +import numpy as np import pytest import pandas as pd @@ -17,8 +17,8 @@ def test_basic(self): expected = DataFrame( { "key": ["a", "b", "c", "d", "e", "f"], - "lvalue": [1, nan, 2, nan, 3, nan], - "rvalue": [nan, 1, 2, 3, nan, 4], + "lvalue": [1, np.nan, 2, np.nan, 3, np.nan], + "rvalue": [np.nan, 1, 2, 3, np.nan, 4], } ) @@ -30,7 +30,7 @@ def test_ffill(self): { "key": ["a", "b", "c", "d", "e", "f"], "lvalue": [1.0, 1, 2, 2, 3, 3.0], - "rvalue": [nan, 1, 2, 3, 3, 4], + "rvalue": [np.nan, 1, 2, 3, 3, 4], } ) assert_frame_equal(result, expected) @@ -47,7 +47,7 @@ def test_multigroup(self): { "key": ["a", "b", "c", "d", "e", "f"] * 2, "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2, - "rvalue": [nan, 1, 2, 3, 3, 4] * 2, + "rvalue": [np.nan, 1, 2, 3, 3, 4] * 2, } ) expected["group"] = ["a"] * 6 + ["b"] * 6 @@ -110,7 +110,7 @@ def test_doc_example(self): "group": list("aaaaabbbbb"), "key": ["a", "b", "c", "d", "e"] * 2, "lvalue": [1, 1, 2, 2, 3] * 2, - "rvalue": [nan, 1, 2, 3, 3] * 2, + "rvalue": [np.nan, 1, 2, 3, 3] * 2, } ) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 7aea85153d908..1d8d2add3840c 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,7 +1,6 @@ from collections import OrderedDict import numpy as np -from numpy import nan from numpy.random import randn import pytest @@ -311,11 +310,11 @@ def test_left_join_index_multi_match_multiindex(self): [ ["X", "Y", "C", "a", 6], ["X", "Y", "C", "a", 9], - ["W", "Y", "C", "e", nan], + ["W", "Y", "C", "e", np.nan], ["V", "Q", "A", "h", -3], ["V", "R", "D", "i", 2], ["V", "R", "D", "i", -1], - ["X", "Y", "D", "b", nan], + ["X", "Y", "D", "b", np.nan], ["X", "Y", "A", "c", 1], ["X", "Y", "A", "c", 4], ["W", "Q", "B", "f", 3], @@ -365,10 
+364,10 @@ def test_left_join_index_multi_match(self): ["c", 0, "x"], ["c", 0, "r"], ["c", 0, "s"], - ["b", 1, nan], + ["b", 1, np.nan], ["a", 2, "v"], ["a", 2, "z"], - ["b", 3, nan], + ["b", 3, np.nan], ], columns=["tag", "val", "char"], index=[2, 2, 2, 2, 0, 1, 1, 3], diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 5b1f151daf219..b1d790644bbfb 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1,5 +1,4 @@ import numpy as np -from numpy import nan import pytest import pandas as pd @@ -329,11 +328,11 @@ def test_pairs(self): "29dec2008", "20jan2009", ], - "visitdt2": ["21jan2009", nan, "22jan2009", "31dec2008", "03feb2009"], - "visitdt3": ["05feb2009", nan, nan, "02jan2009", "15feb2009"], + "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"], + "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"], "wt1": [1823, 3338, 1549, 3298, 4306], - "wt2": [2011.0, nan, 1892.0, 3338.0, 4575.0], - "wt3": [2293.0, nan, nan, 3377.0, 4805.0], + "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0], + "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0], } df = DataFrame(data) @@ -497,13 +496,13 @@ def test_pairs(self): "29dec2008", "20jan2009", "21jan2009", - nan, + np.nan, "22jan2009", "31dec2008", "03feb2009", "05feb2009", - nan, - nan, + np.nan, + np.nan, "02jan2009", "15feb2009", ], @@ -514,13 +513,13 @@ def test_pairs(self): 3298.0, 4306.0, 2011.0, - nan, + np.nan, 1892.0, 3338.0, 4575.0, 2293.0, - nan, - nan, + np.nan, + np.nan, 3377.0, 4805.0, ], diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 5e80c317a587b..e2c6f7d1c8feb 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -1,7 +1,6 @@ from collections import OrderedDict import numpy as np -from numpy import nan import pytest from pandas.core.dtypes.common import is_integer_dtype @@ -140,19 +139,19 @@ def test_include_na(self, sparse, dtype): # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) exp_na = DataFrame( - {nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]}, + {np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype), ) - exp_na = exp_na.reindex(["a", "b", nan], axis=1) + exp_na = exp_na.reindex(["a", "b", np.nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype) + res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype) exp_just_na = DataFrame( - Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype) + Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype) ) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) @@ -464,14 +463,16 @@ def test_basic_drop_first_NA(self, sparse): assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) - exp_na = DataFrame({"b": [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8).reindex( - ["b", nan], axis=1 + exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex( + ["b", np.nan], axis=1 ) if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, sparse=sparse) + 
res_just_na = get_dummies( + [np.nan], dummy_na=True, drop_first=True, sparse=sparse + ) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index c93a000f5e7ce..b25fee0435da0 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -1,7 +1,6 @@ from datetime import datetime import numpy as np -from numpy import nan import pytest import pandas as pd @@ -195,9 +194,9 @@ def test_reindex(test_data): def test_reindex_nan(): - ts = Series([2, 3, 5, 7], index=[1, 4, nan, 8]) + ts = Series([2, 3, 5, 7], index=[1, 4, np.nan, 8]) - i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2] + i, j = [np.nan, 1, np.nan, 8, 4, np.nan], [2, 0, 2, 3, 1, 2] assert_series_equal(ts.reindex(i), ts.iloc[j]) ts.index = ts.index.astype("object") diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 08aa3ad02e0ed..d60cd3029e5a8 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -2,7 +2,6 @@ import operator import numpy as np -from numpy import nan import pytest import pandas.util._test_decorators as td @@ -236,7 +235,7 @@ def test_np_diff(self): s = Series(np.arange(5)) r = np.diff(s) - assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + assert_series_equal(Series([np.nan, 0, 0, 0, np.nan]), r) def test_int_diff(self): # int dtype @@ -283,7 +282,7 @@ def test_tz_diff(self): @pytest.mark.parametrize( "input,output,diff", - [([False, True, True, False, False], [nan, True, False, True, False], 1)], + [([False, True, True, False, False], [np.nan, True, False, True, False], 1)], ) def test_bool_diff(self, input, output, diff): # boolean series (test for fixing #17294) @@ -294,7 +293,7 @@ def test_bool_diff(self, input, output, diff): def test_obj_diff(self): # object series - s = Series([False, True, 5.0, nan, True, False]) + s = Series([False, True, 5.0, np.nan, True, False]) result = s.diff() expected = s - s.shift(1) assert_series_equal(result, expected) @@ -538,14 +537,14 @@ def test_count(self, datetime_series): assert datetime_series.count() == np.isfinite(datetime_series).sum() - mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, nan, 1, 2]]) + mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]]) ts = Series(np.arange(len(mi)), index=mi) left = ts.count(level=1) - right = Series([2, 3, 1], index=[1, 2, nan]) + right = Series([2, 3, 1], index=[1, 2, np.nan]) assert_series_equal(left, right) - ts.iloc[[0, 3, 5]] = nan + ts.iloc[[0, 3, 5]] = np.nan assert_series_equal(ts.count(level=1), right - 1) def test_dot(self): @@ -770,11 +769,11 @@ def test_cummethods_bool(self): result = getattr(s, method)() assert_series_equal(result, expected) - e = pd.Series([False, True, nan, False]) - cse = pd.Series([0, 1, nan, 1], dtype=object) - cpe = pd.Series([False, 0, nan, 0]) - cmin = pd.Series([False, False, nan, False]) - cmax = pd.Series([False, True, nan, True]) + e = pd.Series([False, True, np.nan, False]) + cse = pd.Series([0, 1, np.nan, 1], dtype=object) + cpe = pd.Series([False, 0, np.nan, 0]) + cmin = pd.Series([False, False, np.nan, False]) + cmax = pd.Series([False, True, np.nan, True]) expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} for method in methods: @@ -1042,7 +1041,6 @@ def test_shift_categorical(self): assert_index_equal(s.values.categories, 
sn2.values.categories) def test_unstack(self): - from numpy import nan index = MultiIndex( levels=[["bar", "foo"], ["one", "three", "two"]], @@ -1053,7 +1051,7 @@ def test_unstack(self): unstacked = s.unstack() expected = DataFrame( - [[2.0, nan, 3.0], [0.0, 1.0, nan]], + [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], index=["bar", "foo"], columns=["one", "three", "two"], ) @@ -1080,7 +1078,9 @@ def test_unstack(self): idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) ts = pd.Series([1, 2], index=idx) left = ts.unstack() - right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], columns=[nan, 3.5]) + right = DataFrame( + [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] + ) assert_frame_equal(left, right) idx = pd.MultiIndex.from_arrays( @@ -1092,9 +1092,10 @@ def test_unstack(self): ) ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) right = DataFrame( - [[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], columns=["cat", "dog"] + [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], + columns=["cat", "dog"], ) - tpls = [("a", 1), ("a", 2), ("b", nan), ("b", 1)] + tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] right.index = pd.MultiIndex.from_tuples(tpls) assert_frame_equal(ts.unstack(level=0), right) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 819b9228219aa..78d666720c091 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -1,7 +1,6 @@ from datetime import datetime import numpy as np -from numpy import nan import pytest import pandas as pd @@ -114,8 +113,8 @@ def test_combine_first(self): assert_series_equal(s, result) def test_update(self): - s = Series([1.5, nan, 3.0, 4.0, nan]) - s2 = Series([nan, 3.5, nan, 5.0]) + s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) + s2 = Series([np.nan, 3.5, np.nan, 5.0]) s.update(s2) expected = Series([1.5, 3.5, 3.0, 5.0, np.nan]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 2f09d777e719c..65cbf5fcf91d2 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2,7 +2,6 @@ from datetime import datetime, timedelta import numpy as np -from numpy import nan import numpy.ma as ma import pytest @@ -456,14 +455,14 @@ def test_unordered_compare_equal(self): def test_constructor_maskedarray(self): data = ma.masked_all((3,), dtype=float) result = Series(data) - expected = Series([nan, nan, nan]) + expected = Series([np.nan, np.nan, np.nan]) assert_series_equal(result, expected) data[0] = 0.0 data[2] = 2.0 index = ["a", "b", "c"] result = Series(data, index=index) - expected = Series([0.0, nan, 2.0], index=index) + expected = Series([0.0, np.nan, 2.0], index=index) assert_series_equal(result, expected) data[1] = 1.0 @@ -473,14 +472,14 @@ def test_constructor_maskedarray(self): data = ma.masked_all((3,), dtype=int) result = Series(data) - expected = Series([nan, nan, nan], dtype=float) + expected = Series([np.nan, np.nan, np.nan], dtype=float) assert_series_equal(result, expected) data[0] = 0 data[2] = 2 index = ["a", "b", "c"] result = Series(data, index=index) - expected = Series([0, nan, 2], index=index, dtype=float) + expected = Series([0, np.nan, 2], index=index, dtype=float) assert_series_equal(result, expected) data[1] = 1 @@ -490,14 +489,14 @@ def test_constructor_maskedarray(self): data = ma.masked_all((3,), dtype=bool) result = Series(data) - expected = Series([nan, nan, nan], dtype=object) + expected = 
Series([np.nan, np.nan, np.nan], dtype=object) assert_series_equal(result, expected) data[0] = True data[2] = False index = ["a", "b", "c"] result = Series(data, index=index) - expected = Series([True, nan, False], index=index, dtype=object) + expected = Series([True, np.nan, False], index=index, dtype=object) assert_series_equal(result, expected) data[1] = True @@ -534,7 +533,7 @@ def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 data = ma.masked_all((3,), dtype=float).harden_mask() result = pd.Series(data) - expected = pd.Series([nan, nan, nan]) + expected = pd.Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) def test_series_ctor_plus_datetimeindex(self): @@ -736,14 +735,14 @@ def test_constructor_dtype_datetime64(self): s = Series(iNaT, index=range(5)) assert not isna(s).all() - s = Series(nan, dtype="M8[ns]", index=range(5)) + s = Series(np.nan, dtype="M8[ns]", index=range(5)) assert isna(s).all() s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]") assert isna(s[1]) assert s.dtype == "M8[ns]" - s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype="M8[ns]") + s = Series([datetime(2001, 1, 2, 0, 0), np.nan], dtype="M8[ns]") assert isna(s[1]) assert s.dtype == "M8[ns]" @@ -1026,7 +1025,7 @@ def test_constructor_periodindex(self): def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} result = Series(d, index=["b", "c", "d", "a"]) - expected = Series([1, 2, nan, 0], index=["b", "c", "d", "a"]) + expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index f459ae9e7845d..835514ea724ab 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1,7 +1,6 @@ from datetime import datetime, timedelta import numpy as np -from numpy import nan import pytest import pytz @@ -760,17 +759,17 @@ def test_fillna(self, datetime_series): assert_series_equal(result, expected) def test_fillna_bug(self): - x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) filled = x.fillna(method="ffill") - expected = Series([nan, 1.0, 1.0, 3.0, 3.0], x.index) + expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], x.index) assert_series_equal(filled, expected) filled = x.fillna(method="bfill") - expected = Series([1.0, 1.0, 3.0, 3.0, nan], x.index) + expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], x.index) assert_series_equal(filled, expected) def test_fillna_inplace(self): - x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) y = x.copy() y.fillna(value=0, inplace=True) @@ -916,20 +915,20 @@ def test_valid(self, datetime_series): tm.assert_series_equal(result, ts[pd.notna(ts)]) def test_isna(self): - ser = Series([0, 5.4, 3, nan, -0.001]) + ser = Series([0, 5.4, 3, np.nan, -0.001]) expected = Series([False, False, False, True, False]) tm.assert_series_equal(ser.isna(), expected) - ser = Series(["hi", "", nan]) + ser = Series(["hi", "", np.nan]) expected = Series([False, False, True]) tm.assert_series_equal(ser.isna(), expected) def test_notna(self): - ser = Series([0, 5.4, 3, nan, -0.001]) + ser = Series([0, 5.4, 3, np.nan, -0.001]) expected = Series([True, True, True, False, True]) tm.assert_series_equal(ser.notna(), expected) - ser = Series(["hi", 
"", nan]) + ser = Series(["hi", "", np.nan]) expected = Series([True, True, False]) tm.assert_series_equal(ser.notna(), expected) @@ -1357,35 +1356,39 @@ def test_interp_limit_bad_direction(self): # limit_area introduced GH #16284 def test_interp_limit_area(self): # These tests are for issue #9218 -- fill NaNs in both directions. - s = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan]) + s = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan]) - expected = Series([nan, nan, 3.0, 4.0, 5.0, 6.0, 7.0, nan, nan]) + expected = Series([np.nan, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan, np.nan]) result = s.interpolate(method="linear", limit_area="inside") assert_series_equal(result, expected) - expected = Series([nan, nan, 3.0, 4.0, nan, nan, 7.0, nan, nan]) + expected = Series( + [np.nan, np.nan, 3.0, 4.0, np.nan, np.nan, 7.0, np.nan, np.nan] + ) result = s.interpolate(method="linear", limit_area="inside", limit=1) - expected = Series([nan, nan, 3.0, 4.0, nan, 6.0, 7.0, nan, nan]) + expected = Series([np.nan, np.nan, 3.0, 4.0, np.nan, 6.0, 7.0, np.nan, np.nan]) result = s.interpolate( method="linear", limit_area="inside", limit_direction="both", limit=1 ) assert_series_equal(result, expected) - expected = Series([nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, 7.0]) + expected = Series([np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0]) result = s.interpolate(method="linear", limit_area="outside") assert_series_equal(result, expected) - expected = Series([nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, nan]) + expected = Series( + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan] + ) result = s.interpolate(method="linear", limit_area="outside", limit=1) - expected = Series([nan, 3.0, 3.0, nan, nan, nan, 7.0, 7.0, nan]) + expected = Series([np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan]) result = s.interpolate( method="linear", limit_area="outside", limit_direction="both", limit=1 ) assert_series_equal(result, expected) - expected = Series([3.0, 3.0, 3.0, nan, nan, nan, 7.0, nan, nan]) + expected = Series([3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan]) result = s.interpolate( method="linear", limit_area="outside", direction="backward" ) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index f93e1651c8b10..5dd27e4c20dcf 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -1,7 +1,6 @@ from itertools import chain, product import numpy as np -from numpy import nan import pytest from pandas._libs.algos import Infinity, NegInfinity @@ -16,14 +15,14 @@ class TestSeriesRank(TestData): - s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) results = { - "average": np.array([1.5, 5.5, 7.0, 3.5, nan, 3.5, 1.5, 8.0, nan, 5.5]), - "min": np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), - "max": np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - "first": np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), - "dense": np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]), + "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), + "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), + "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), + "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), } def test_rank(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d81ee79418e9c..a5706d8baa614 100644 --- a/pandas/tests/test_algos.py +++ 
b/pandas/tests/test_algos.py @@ -3,7 +3,6 @@ import struct import numpy as np -from numpy import nan from numpy.random import RandomState import pytest @@ -1623,11 +1622,11 @@ def _check(arr): result = libalgos.rank_1d_float64(arr) arr[mask] = np.inf exp = rankdata(arr) - exp[mask] = nan + exp[mask] = np.nan assert_almost_equal(result, exp) - _check(np.array([nan, nan, 5.0, 5.0, 5.0, nan, 1, 2, 3, nan])) - _check(np.array([4.0, nan, 5.0, 5.0, 5.0, nan, 1, 2, 4.0, nan])) + _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan])) + _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan])) def test_basic(self): exp = np.array([1, 2], dtype=np.float64) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 4a60d3966a9bb..b9a33d130a99c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1475,17 +1475,14 @@ def test_frame_dict_constructor_empty_series(self): def test_multiindex_na_repr(self): # only an issue with long columns - - from numpy import nan - df3 = DataFrame( { "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, - "B" * 30: {("A", "A0006000", "nuit"): nan}, - "C" * 30: {("A", "A0006000", "nuit"): nan}, - "D" * 30: {("A", "A0006000", "nuit"): nan}, + "B" * 30: {("A", "A0006000", "nuit"): np.nan}, + "C" * 30: {("A", "A0006000", "nuit"): np.nan}, + "D" * 30: {("A", "A0006000", "nuit"): np.nan}, "E" * 30: {("A", "A0006000", "nuit"): "A"}, - "F" * 30: {("A", "A0006000", "nuit"): nan}, + "F" * 30: {("A", "A0006000", "nuit"): np.nan}, } ) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index f64ad8edafbd7..9be35198a5592 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -3,7 +3,6 @@ from itertools import product import numpy as np -from numpy import nan import pytest from pandas import DataFrame, MultiIndex, Series, array, concat, merge @@ -103,7 +102,7 @@ def aggr(func): assert_frame_equal(gr.median(), aggr(np.median)) def test_lexsort_indexer(self): - keys = [[nan] * 5 + list(range(100)) + [nan] * 5] + keys = [[np.nan] * 5 + list(range(100)) + [np.nan] * 5] # orders=True, na_position='last' result = lexsort_indexer(keys, orders=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) @@ -126,7 +125,7 @@ def test_lexsort_indexer(self): def test_nargsort(self): # np.argsort(items) places NaNs last - items = [nan] * 5 + list(range(100)) + [nan] * 5 + items = [np.nan] * 5 + list(range(100)) + [np.nan] * 5 # np.argsort(items2) may not place NaNs first items2 = np.array(items, dtype="O") diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index b50f1a0fd2f2a..53d74f74dc439 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2,7 +2,6 @@ import re import numpy as np -from numpy import nan as NA from numpy.random import randint import pytest @@ -719,40 +718,42 @@ def test_cat_on_filtered_index(self): assert str_multiple.loc[1] == "2011 2 2" def test_count(self): - values = np.array(["foo", "foofoo", NA, "foooofooofommmfoo"], dtype=np.object_) + values = np.array( + ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_ + ) result = strings.str_count(values, "f[o]+") - exp = np.array([1, 2, NA, 4]) + exp = np.array([1, 2, np.nan, 4]) tm.assert_numpy_array_equal(result, exp) result = Series(values).str.count("f[o]+") - exp = Series([1, 2, NA, 4]) + exp = Series([1, 2, np.nan, 4]) assert isinstance(result, Series) tm.assert_series_equal(result, exp) # mixed 
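# The pattern the string tests above and below rely on: missing values
# propagate through the .str accessor as NaN instead of raising. A minimal
# sketch using the same values as test_count:
import numpy as np
import pandas as pd

s = pd.Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"])
print(s.str.count("f[o]+").tolist())       # [1.0, 2.0, nan, 4.0]
print(s.str.contains("mmm[_]+").tolist())  # [False, False, nan, False]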
- mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] rs = strings.str_count(mixed, "a") - xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA]) + xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.count("a") - xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA]) + xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_contains(self): values = np.array( - ["foo", NA, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ + ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) pat = "mmm[_]+" result = strings.str_contains(values, pat) - expected = np.array([False, NA, True, True, False], dtype=np.object_) + expected = np.array([False, np.nan, True, True, False], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, regex=False) - expected = np.array([False, NA, False, False, True], dtype=np.object_) + expected = np.array([False, np.nan, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) values = ["foo", "xyz", "fooommm__foo", "mmm_"] @@ -773,18 +774,23 @@ def test_contains(self): tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] rs = strings.str_contains(mixed, "o") - xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) + xp = np.array( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + dtype=np.object_, + ) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.contains("o") - xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) + xp = Series( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] + ) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode - values = np.array(["foo", NA, "fooommm__foo", "mmm_"], dtype=np.object_) + values = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) pat = "mmm[_]+" result = strings.str_contains(values, pat) @@ -825,10 +831,10 @@ def test_contains_for_object_category(self): tm.assert_series_equal(result, expected) def test_startswith(self): - values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"]) + values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) result = values.str.startswith("foo") - exp = Series([False, NA, True, False, False, NA, True]) + exp = Series([False, np.nan, True, False, False, np.nan, True]) tm.assert_series_equal(result, exp) result = values.str.startswith("foo", na=True) @@ -836,92 +842,114 @@ def test_startswith(self): # mixed mixed = np.array( - ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0], + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=np.object_, ) rs = strings.str_startswith(mixed, "f") - xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) + xp = np.array( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + dtype=np.object_, + ) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.startswith("f") assert isinstance(rs, Series) - xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) + xp = Series( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] + ) 
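# The na= argument seen in test_startswith gives callers a concrete boolean
# in place of that NaN propagation; a quick sketch:
import numpy as np
import pandas as pd

values = pd.Series(["om", np.nan, "foo_nom", "bar_foo", "foo"])
print(values.str.startswith("foo").tolist())            # [False, nan, True, False, True]
print(values.str.startswith("foo", na=False).tolist())  # [False, False, True, False, True]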
tm.assert_series_equal(rs, xp) def test_endswith(self): - values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"]) + values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) result = values.str.endswith("foo") - exp = Series([False, NA, False, False, True, NA, True]) + exp = Series([False, np.nan, False, False, True, np.nan, True]) tm.assert_series_equal(result, exp) result = values.str.endswith("foo", na=False) tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] rs = strings.str_endswith(mixed, "f") - xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA], dtype=np.object_) + xp = np.array( + [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], + dtype=np.object_, + ) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.endswith("f") - xp = Series([False, NA, False, NA, NA, False, NA, NA, NA]) + xp = Series( + [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan] + ) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_title(self): - values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) + values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) result = values.str.title() - exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"]) + exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.0]) + mixed = Series( + ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] + ) mixed = mixed.str.title() - exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) + exp = Series( + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] + ) tm.assert_almost_equal(mixed, exp) def test_lower_upper(self): - values = Series(["om", NA, "nom", "nom"]) + values = Series(["om", np.nan, "nom", "nom"]) result = values.str.upper() - exp = Series(["OM", NA, "NOM", "NOM"]) + exp = Series(["OM", np.nan, "NOM", "NOM"]) tm.assert_series_equal(result, exp) result = result.str.lower() tm.assert_series_equal(result, values) # mixed - mixed = Series(["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]) + mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) mixed = mixed.str.upper() rs = Series(mixed).str.lower() - xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) + xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_capitalize(self): - values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) + values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) result = values.str.capitalize() - exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"]) + exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.0]) + mixed = Series( + ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] + ) mixed = mixed.str.capitalize() - exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) + exp = Series( + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] + ) tm.assert_almost_equal(mixed, exp) def test_swapcase(self): - values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) + values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) result = values.str.swapcase() - exp = 
Series(["foo", "bar", NA, "bLAH", "BLURG"]) + exp = Series(["foo", "bar", np.nan, "bLAH", "BLURG"]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) + mixed = Series( + ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0] + ) mixed = mixed.str.swapcase() - exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA]) + exp = Series( + ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan] + ) tm.assert_almost_equal(mixed, exp) def test_casemethods(self): @@ -934,23 +962,23 @@ def test_casemethods(self): assert s.str.swapcase().tolist() == [v.swapcase() for v in values] def test_replace(self): - values = Series(["fooBAD__barBAD", NA]) + values = Series(["fooBAD__barBAD", np.nan]) result = values.str.replace("BAD[_]*", "") - exp = Series(["foobar", NA]) + exp = Series(["foobar", np.nan]) tm.assert_series_equal(result, exp) result = values.str.replace("BAD[_]*", "", n=1) - exp = Series(["foobarBAD", NA]) + exp = Series(["foobarBAD", np.nan]) tm.assert_series_equal(result, exp) # mixed mixed = Series( - ["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) rs = Series(mixed).str.replace("BAD[_]*", "") - xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) + xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -971,12 +999,12 @@ def test_replace(self): def test_replace_callable(self): # GH 15055 - values = Series(["fooBAD__barBAD", NA]) + values = Series(["fooBAD__barBAD", np.nan]) # test with callable repl = lambda m: m.group(0).swapcase() result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) - exp = Series(["foObaD__baRbaD", NA]) + exp = Series(["foObaD__baRbaD", np.nan]) tm.assert_series_equal(result, exp) # test with wrong number of arguments, raising an error @@ -998,34 +1026,34 @@ def test_replace_callable(self): values.str.replace("a", repl) # test regex named groups - values = Series(["Foo Bar Baz", NA]) + values = Series(["Foo Bar Baz", np.nan]) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl) - exp = Series(["bAR", NA]) + exp = Series(["bAR", np.nan]) tm.assert_series_equal(result, exp) def test_replace_compiled_regex(self): # GH 15446 - values = Series(["fooBAD__barBAD", NA]) + values = Series(["fooBAD__barBAD", np.nan]) # test with compiled regex pat = re.compile(r"BAD[_]*") result = values.str.replace(pat, "") - exp = Series(["foobar", NA]) + exp = Series(["foobar", np.nan]) tm.assert_series_equal(result, exp) result = values.str.replace(pat, "", n=1) - exp = Series(["foobarBAD", NA]) + exp = Series(["foobarBAD", np.nan]) tm.assert_series_equal(result, exp) # mixed mixed = Series( - ["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) rs = Series(mixed).str.replace(pat, "") - xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) + xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -1038,7 +1066,7 @@ def test_replace_compiled_regex(self): # case and flags provided to str.replace will have no effect # and will produce warnings - values = Series(["fooBAD__barBAD__bad", NA]) + values = Series(["fooBAD__barBAD__bad", np.nan]) pat = 
re.compile(r"BAD[_]*") with pytest.raises(ValueError, match="case and flags cannot be"): @@ -1051,21 +1079,21 @@ def test_replace_compiled_regex(self): result = values.str.replace(pat, "", case=True) # test with callable - values = Series(["fooBAD__barBAD", NA]) + values = Series(["fooBAD__barBAD", np.nan]) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") result = values.str.replace(pat, repl, n=2) - exp = Series(["foObaD__baRbaD", NA]) + exp = Series(["foObaD__baRbaD", np.nan]) tm.assert_series_equal(result, exp) def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) - values = Series(["f.o", "foo", NA]) - exp = Series(["bao", "bao", NA]) + values = Series(["f.o", "foo", np.nan]) + exp = Series(["bao", "bao", np.nan]) result = values.str.replace("f.", "ba") tm.assert_series_equal(result, exp) - exp = Series(["bao", "foo", NA]) + exp = Series(["bao", "foo", np.nan]) result = values.str.replace("f.", "ba", regex=False) tm.assert_series_equal(result, exp) @@ -1083,42 +1111,54 @@ def test_replace_literal(self): values.str.replace(compiled_pat, "", regex=False) def test_repeat(self): - values = Series(["a", "b", NA, "c", NA, "d"]) + values = Series(["a", "b", np.nan, "c", np.nan, "d"]) result = values.str.repeat(3) - exp = Series(["aaa", "bbb", NA, "ccc", NA, "ddd"]) + exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"]) tm.assert_series_equal(result, exp) result = values.str.repeat([1, 2, 3, 4, 5, 6]) - exp = Series(["a", "bb", NA, "cccc", NA, "dddddd"]) + exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]) + mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) rs = Series(mixed).str.repeat(3) - xp = Series(["aaa", NA, "bbb", NA, NA, "foofoofoo", NA, NA, NA]) + xp = Series( + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] + ) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_match(self): # New match behavior introduced in 0.13 - values = Series(["fooBAD__barBAD", NA, "foo"]) + values = Series(["fooBAD__barBAD", np.nan, "foo"]) result = values.str.match(".*(BAD[_]+).*(BAD)") - exp = Series([True, NA, False]) + exp = Series([True, np.nan, False]) tm.assert_series_equal(result, exp) - values = Series(["fooBAD__barBAD", NA, "foo"]) + values = Series(["fooBAD__barBAD", np.nan, "foo"]) result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, NA, False]) + exp = Series([True, np.nan, False]) tm.assert_series_equal(result, exp) # mixed mixed = Series( - ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + [ + "aBAD_BAD", + np.nan, + "BAD_b_BAD", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] ) rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") - xp = Series([True, NA, True, NA, NA, False, NA, NA, NA]) + xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) @@ -1131,12 +1171,12 @@ def test_match(self): assert_series_equal(exp, res) def test_extract_expand_None(self): - values = Series(["fooBAD__barBAD", NA, "foo"]) + values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or False"): values.str.extract(".*(BAD[_]+).*(BAD)", expand=None) def test_extract_expand_unspecified(self): - values = Series(["fooBAD__barBAD", NA, "foo"]) + values = 
Series(["fooBAD__barBAD", np.nan, "foo"]) result_unspecified = values.str.extract(".*(BAD[_]+).*") assert isinstance(result_unspecified, DataFrame) result_true = values.str.extract(".*(BAD[_]+).*", expand=True) @@ -1144,8 +1184,8 @@ def test_extract_expand_unspecified(self): def test_extract_expand_False(self): # Contains tests like those in test_match and some others. - values = Series(["fooBAD__barBAD", NA, "foo"]) - er = [NA, NA] # empty row + values = Series(["fooBAD__barBAD", np.nan, "foo"]) + er = [np.nan, np.nan] # empty row result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) exp = DataFrame([["BAD__", "BAD"], er, er]) @@ -1153,7 +1193,17 @@ def test_extract_expand_False(self): # mixed mixed = Series( - ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + [ + "aBAD_BAD", + np.nan, + "BAD_b_BAD", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] ) rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) @@ -1161,7 +1211,7 @@ def test_extract_expand_False(self): tm.assert_frame_equal(rs, exp) # unicode - values = Series(["fooBAD__barBAD", NA, "foo"]) + values = Series(["fooBAD__barBAD", np.nan, "foo"]) result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) exp = DataFrame([["BAD__", "BAD"], er, er]) @@ -1200,51 +1250,55 @@ def test_extract_expand_False(self): s = Series(["A1", "B2", "C3"]) # one group, no matches result = s.str.extract("(_)", expand=False) - exp = Series([NA, NA, NA], dtype=object) + exp = Series([np.nan, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, exp) # two groups, no matches result = s.str.extract("(_)(_)", expand=False) - exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) + exp = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object + ) tm.assert_frame_equal(result, exp) # one group, some matches result = s.str.extract("([AB])[123]", expand=False) - exp = Series(["A", "B", NA]) + exp = Series(["A", "B", np.nan]) tm.assert_series_equal(result, exp) # two groups, some matches result = s.str.extract("([AB])([123])", expand=False) - exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) + exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one named group result = s.str.extract("(?P[AB])", expand=False) - exp = Series(["A", "B", NA], name="letter") + exp = Series(["A", "B", np.nan], name="letter") tm.assert_series_equal(result, exp) # two named groups result = s.str.extract("(?P[AB])(?P[123])", expand=False) exp = DataFrame( - [["A", "1"], ["B", "2"], [NA, NA]], columns=["letter", "number"] + [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=["letter", "number"] ) tm.assert_frame_equal(result, exp) # mix named and unnamed groups result = s.str.extract("([AB])(?P[123])", expand=False) - exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]], columns=[0, "number"]) + exp = DataFrame( + [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=[0, "number"] + ) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group result = s.str.extract("([AB])(?:[123])", expand=False) - exp = Series(["A", "B", NA]) + exp = Series(["A", "B", np.nan]) tm.assert_series_equal(result, exp) # two normal groups, one non-capturing group result = Series(["A11", "B22", "C33"]).str.extract( "([AB])([123])(?:[123])", expand=False ) - exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) + exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one optional group followed by one 
normal group
@@ -1252,7 +1306,7 @@
             "(?P<letter>[AB])?(?P<number>[123])", expand=False
         )
         exp = DataFrame(
-            [["A", "1"], ["B", "2"], [NA, "3"]], columns=["letter", "number"]
+            [["A", "1"], ["B", "2"], [np.nan, "3"]], columns=["letter", "number"]
         )
         tm.assert_frame_equal(result, exp)

@@ -1261,7 +1315,7 @@
             "(?P<letter>[ABC])(?P<number>[123])?", expand=False
         )
         exp = DataFrame(
-            [["A", "1"], ["B", "2"], ["C", NA]], columns=["letter", "number"]
+            [["A", "1"], ["B", "2"], ["C", np.nan]], columns=["letter", "number"]
         )
         tm.assert_frame_equal(result, exp)

@@ -1272,13 +1326,13 @@ def check_index(index):
             index = index[: len(data)]
             s = Series(data, index=index)
             result = s.str.extract(r"(\d)", expand=False)
-            exp = Series(["1", "2", NA], index=index)
+            exp = Series(["1", "2", np.nan], index=index)
             tm.assert_series_equal(result, exp)

             result = Series(data, index=index).str.extract(
                 r"(?P<letter>\D)(?P<number>\d)?", expand=False
             )
-            e_list = [["A", "1"], ["B", "2"], ["C", NA]]
+            e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
             exp = DataFrame(e_list, columns=["letter", "number"], index=index)
             tm.assert_frame_equal(result, exp)

@@ -1302,8 +1356,8 @@ def check_index(index):

     def test_extract_expand_True(self):
         # Contains tests like those in test_match and some others.
-        values = Series(["fooBAD__barBAD", NA, "foo"])
-        er = [NA, NA]  # empty row
+        values = Series(["fooBAD__barBAD", np.nan, "foo"])
+        er = [np.nan, np.nan]  # empty row

         result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
         exp = DataFrame([["BAD__", "BAD"], er, er])
@@ -1311,7 +1365,17 @@

         # mixed
         mixed = Series(
-            ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
+            [
+                "aBAD_BAD",
+                np.nan,
+                "BAD_b_BAD",
+                True,
+                datetime.today(),
+                "foo",
+                None,
+                1,
+                2.0,
+            ]
         )

         rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=True)
@@ -1344,32 +1408,34 @@ def test_extract_series(self):
         s = Series(["A1", "B2", "C3"], name=series_name)
         # one group, no matches
         result = s.str.extract("(_)", expand=True)
-        exp = DataFrame([NA, NA, NA], dtype=object)
+        exp = DataFrame([np.nan, np.nan, np.nan], dtype=object)
         tm.assert_frame_equal(result, exp)

         # two groups, no matches
         result = s.str.extract("(_)(_)", expand=True)
-        exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
+        exp = DataFrame(
+            [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object
+        )
         tm.assert_frame_equal(result, exp)

         # one group, some matches
         result = s.str.extract("([AB])[123]", expand=True)
-        exp = DataFrame(["A", "B", NA])
+        exp = DataFrame(["A", "B", np.nan])
         tm.assert_frame_equal(result, exp)

         # two groups, some matches
         result = s.str.extract("([AB])([123])", expand=True)
-        exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]])
+        exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
         tm.assert_frame_equal(result, exp)

         # one named group
         result = s.str.extract("(?P<letter>[AB])", expand=True)
-        exp = DataFrame({"letter": ["A", "B", NA]})
+        exp = DataFrame({"letter": ["A", "B", np.nan]})
         tm.assert_frame_equal(result, exp)

         # two named groups
         result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
-        e_list = [["A", "1"], ["B", "2"], [NA, NA]]
+        e_list = [["A", "1"], ["B", "2"], [np.nan, np.nan]]
         exp = DataFrame(e_list, columns=["letter", "number"])
         tm.assert_frame_equal(result, exp)

@@ -1380,7 +1446,7 @@

         # one normal group, one non-capturing group
         result = s.str.extract("([AB])(?:[123])", expand=True)
-        exp = DataFrame(["A", "B", NA])
+        exp =
DataFrame(["A", "B", np.nan]) tm.assert_frame_equal(result, exp) def test_extract_optional_groups(self): @@ -1389,14 +1455,14 @@ def test_extract_optional_groups(self): result = Series(["A11", "B22", "C33"]).str.extract( "([AB])([123])(?:[123])", expand=True ) - exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) + exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group result = Series(["A1", "B2", "3"]).str.extract( "(?P[AB])?(?P[123])", expand=True ) - e_list = [["A", "1"], ["B", "2"], [NA, "3"]] + e_list = [["A", "1"], ["B", "2"], [np.nan, "3"]] exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) @@ -1404,7 +1470,7 @@ def test_extract_optional_groups(self): result = Series(["A1", "B2", "C"]).str.extract( "(?P[ABC])(?P[123])?", expand=True ) - e_list = [["A", "1"], ["B", "2"], ["C", NA]] + e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) @@ -1414,13 +1480,13 @@ def check_index(index): data = ["A1", "B2", "C"] index = index[: len(data)] result = Series(data, index=index).str.extract(r"(\d)", expand=True) - exp = DataFrame(["1", "2", NA], index=index) + exp = DataFrame(["1", "2", np.nan], index=index) tm.assert_frame_equal(result, exp) result = Series(data, index=index).str.extract( r"(?P\D)(?P\d)?", expand=True ) - e_list = [["A", "1"], ["B", "2"], ["C", NA]] + e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) @@ -1530,7 +1596,7 @@ def test_extractall(self): [(1, 0), (2, 0), (2, 1)], names=(None, "match") ) expected_df = DataFrame( - [("A", "1"), (NA, "3"), (NA, "2")], + [("A", "1"), (np.nan, "3"), (np.nan, "2")], expected_index, columns=["letter", "number"], ) @@ -1540,7 +1606,9 @@ def test_extractall(self): pattern = "([AB])?(?P[123])" computed_df = Series(subject_list).str.extractall(pattern) expected_df = DataFrame( - [("A", "1"), (NA, "3"), (NA, "2")], expected_index, columns=[0, "number"] + [("A", "1"), (np.nan, "3"), (np.nan, "2")], + expected_index, + columns=[0, "number"], ) tm.assert_frame_equal(computed_df, expected_df) @@ -1918,11 +1986,33 @@ def test_join(self): # mixed mixed = Series( - ["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] + [ + "a_b", + np.nan, + "asdf_cas_asdf", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] ) rs = Series(mixed).str.split("_").str.join("_") - xp = Series(["a_b", NA, "asdf_cas_asdf", NA, NA, "foo", NA, NA, NA]) + xp = Series( + [ + "a_b", + np.nan, + "asdf_cas_asdf", + np.nan, + np.nan, + "foo", + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -1931,34 +2021,66 @@ def test_len(self): values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) result = values.str.len() - exp = values.map(lambda x: len(x) if notna(x) else NA) + exp = values.map(lambda x: len(x) if notna(x) else np.nan) tm.assert_series_equal(result, exp) # mixed mixed = Series( - ["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] + [ + "a_b", + np.nan, + "asdf_cas_asdf", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] ) rs = Series(mixed).str.len() - xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA]) + xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def 
test_findall(self): - values = Series(["fooBAD__barBAD", NA, "foo", "BAD"]) + values = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"]) result = values.str.findall("BAD[_]*") - exp = Series([["BAD__", "BAD"], NA, [], ["BAD"]]) + exp = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) tm.assert_almost_equal(result, exp) # mixed mixed = Series( - ["fooBAD__barBAD", NA, "foo", True, datetime.today(), "BAD", None, 1, 2.0] + [ + "fooBAD__barBAD", + np.nan, + "foo", + True, + datetime.today(), + "BAD", + None, + 1, + 2.0, + ] ) rs = Series(mixed).str.findall("BAD[_]*") - xp = Series([["BAD__", "BAD"], NA, [], NA, NA, ["BAD"], NA, NA, NA]) + xp = Series( + [ + ["BAD__", "BAD"], + np.nan, + [], + np.nan, + np.nan, + ["BAD"], + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -2078,59 +2200,65 @@ def _check(result, expected): tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): - values = Series(["a", "b", NA, "c", NA, "eeeeee"]) + values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) result = values.str.pad(5, side="left") - exp = Series([" a", " b", NA, " c", NA, "eeeeee"]) + exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="right") - exp = Series(["a ", "b ", NA, "c ", NA, "eeeeee"]) + exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="both") - exp = Series([" a ", " b ", NA, " c ", NA, "eeeeee"]) + exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) + mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) rs = Series(mixed).str.pad(5, side="left") - xp = Series([" a", NA, " b", NA, NA, " ee", NA, NA, NA]) + xp = Series( + [" a", np.nan, " b", np.nan, np.nan, " ee", np.nan, np.nan, np.nan] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) - mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) + mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) rs = Series(mixed).str.pad(5, side="right") - xp = Series(["a ", NA, "b ", NA, NA, "ee ", NA, NA, NA]) + xp = Series( + ["a ", np.nan, "b ", np.nan, np.nan, "ee ", np.nan, np.nan, np.nan] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) - mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) + mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) rs = Series(mixed).str.pad(5, side="both") - xp = Series([" a ", NA, " b ", NA, NA, " ee ", NA, NA, NA]) + xp = Series( + [" a ", np.nan, " b ", np.nan, np.nan, " ee ", np.nan, np.nan, np.nan] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_pad_fillchar(self): - values = Series(["a", "b", NA, "c", NA, "eeeeee"]) + values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) result = values.str.pad(5, side="left", fillchar="X") - exp = Series(["XXXXa", "XXXXb", NA, "XXXXc", NA, "eeeeee"]) + exp = Series(["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="right", fillchar="X") - exp = Series(["aXXXX", "bXXXX", NA, "cXXXX", NA, "eeeeee"]) + exp = Series(["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="both", fillchar="X") - exp = 
Series(["XXaXX", "XXbXX", NA, "XXcXX", NA, "eeeeee"]) + exp = Series(["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) msg = "fillchar must be a character, not str" @@ -2171,35 +2299,76 @@ def _check(result, expected): tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): - values = Series(["a", "b", NA, "c", NA, "eeeeee"]) + values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) result = values.str.center(5) - exp = Series([" a ", " b ", NA, " c ", NA, "eeeeee"]) + exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.ljust(5) - exp = Series(["a ", "b ", NA, "c ", NA, "eeeeee"]) + exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.rjust(5) - exp = Series([" a", " b", NA, " c", NA, "eeeeee"]) + exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(["a", NA, "b", True, datetime.today(), "c", "eee", None, 1, 2.0]) + mixed = Series( + ["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0] + ) rs = Series(mixed).str.center(5) - xp = Series([" a ", NA, " b ", NA, NA, " c ", " eee ", NA, NA, NA]) + xp = Series( + [ + " a ", + np.nan, + " b ", + np.nan, + np.nan, + " c ", + " eee ", + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.ljust(5) - xp = Series(["a ", NA, "b ", NA, NA, "c ", "eee ", NA, NA, NA]) + xp = Series( + [ + "a ", + np.nan, + "b ", + np.nan, + np.nan, + "c ", + "eee ", + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rjust(5) - xp = Series([" a", NA, " b", NA, NA, " c", " eee", NA, NA, NA]) + xp = Series( + [ + " a", + np.nan, + " b", + np.nan, + np.nan, + " c", + " eee", + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -2268,14 +2437,14 @@ def test_zfill(self): tm.assert_series_equal(result, expected) def test_split(self): - values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.split("_") - exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", NA, "f__g__h"]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) result = values.str.split("__") tm.assert_series_equal(result, exp) @@ -2283,9 +2452,20 @@ def test_split(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1, 2.0]) + mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.split("_") - exp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA]) + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) @@ -2294,19 +2474,19 @@ def test_split(self): tm.assert_almost_equal(result, exp) # regex split - values = Series(["a,b_c", "c_d,e", NA, "f,g,h"]) + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) result = values.str.split("[,_]") - exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) + exp = Series([["a", "b", 
"c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) def test_rsplit(self): - values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.rsplit("_") - exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", NA, "f__g__h"]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) result = values.str.rsplit("__") tm.assert_series_equal(result, exp) @@ -2314,9 +2494,20 @@ def test_rsplit(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1, 2.0]) + mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.rsplit("_") - exp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA]) + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) @@ -2325,15 +2516,15 @@ def test_rsplit(self): tm.assert_almost_equal(result, exp) # regex split is not supported by rsplit - values = Series(["a,b_c", "c_d,e", NA, "f,g,h"]) + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) result = values.str.rsplit("[,_]") - exp = Series([["a,b_c"], ["c_d,e"], NA, ["f,g,h"]]) + exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) tm.assert_series_equal(result, exp) # setting max number of splits, make sure it's from reverse - values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.rsplit("_", n=1) - exp = Series([["a_b", "c"], ["c_d", "e"], NA, ["f_g", "h"]]) + exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) tm.assert_series_equal(result, exp) def test_split_blank_string(self): @@ -2408,9 +2599,9 @@ def test_split_to_dataframe(self): 0: ["some", "one"], 1: ["unequal", "of"], 2: ["splits", "these"], - 3: [NA, "things"], - 4: [NA, "is"], - 5: [NA, "not"], + 3: [np.nan, "things"], + 4: [np.nan, "is"], + 5: [np.nan, "not"], } ) tm.assert_frame_equal(result, exp) @@ -2451,7 +2642,7 @@ def test_split_to_multiindex_expand(self): result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( [ - ("some", "unequal", "splits", NA, NA, NA), + ("some", "unequal", "splits", np.nan, np.nan, np.nan), ("one", "of", "these", "things", "is", "not"), (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), (None, None, None, None, None, None), @@ -2516,9 +2707,9 @@ def test_rsplit_to_multiindex_expand(self): def test_split_nan_expand(self): # gh-18450 - s = Series(["foo,bar,baz", NA]) + s = Series(["foo,bar,baz", np.nan]) result = s.str.split(",", expand=True) - exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]]) + exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # check that these are actually np.nan and not None @@ -2553,67 +2744,79 @@ def test_split_with_name(self): def test_partition_series(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) result = values.str.partition("_", expand=False) exp = Series( - [("a", "_", "b_c"), ("c", "_", "d_e"), NA, ("f", "_", "g_h"), None] + [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, 
("f", "_", "g_h"), None] ) tm.assert_series_equal(result, exp) result = values.str.rpartition("_", expand=False) exp = Series( - [("a_b", "_", "c"), ("c_d", "_", "e"), NA, ("f_g", "_", "h"), None] + [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None] ) tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", NA, "f__g__h", None]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None]) result = values.str.partition("__", expand=False) exp = Series( - [("a", "__", "b__c"), ("c", "__", "d__e"), NA, ("f", "__", "g__h"), None] + [ + ("a", "__", "b__c"), + ("c", "__", "d__e"), + np.nan, + ("f", "__", "g__h"), + None, + ] ) tm.assert_series_equal(result, exp) result = values.str.rpartition("__", expand=False) exp = Series( - [("a__b", "__", "c"), ("c__d", "__", "e"), NA, ("f__g", "__", "h"), None] + [ + ("a__b", "__", "c"), + ("c__d", "__", "e"), + np.nan, + ("f__g", "__", "h"), + None, + ] ) tm.assert_series_equal(result, exp) # None - values = Series(["a b c", "c d e", NA, "f g h", None]) + values = Series(["a b c", "c d e", np.nan, "f g h", None]) result = values.str.partition(expand=False) exp = Series( - [("a", " ", "b c"), ("c", " ", "d e"), NA, ("f", " ", "g h"), None] + [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None] ) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) exp = Series( - [("a b", " ", "c"), ("c d", " ", "e"), NA, ("f g", " ", "h"), None] + [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None] ) tm.assert_series_equal(result, exp) # Not split - values = Series(["abc", "cde", NA, "fgh", None]) + values = Series(["abc", "cde", np.nan, "fgh", None]) result = values.str.partition("_", expand=False) - exp = Series([("abc", "", ""), ("cde", "", ""), NA, ("fgh", "", ""), None]) + exp = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition("_", expand=False) - exp = Series([("", "", "abc"), ("", "", "cde"), NA, ("", "", "fgh"), None]) + exp = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None]) tm.assert_series_equal(result, exp) # unicode - values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.partition("_", expand=False) - exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), NA, ("f", "_", "g_h")]) + exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")]) tm.assert_series_equal(result, exp) result = values.str.rpartition("_", expand=False) - exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), NA, ("f_g", "_", "h")]) + exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")]) tm.assert_series_equal(result, exp) # compare to standard lib @@ -2677,7 +2880,7 @@ def test_partition_index(self): def test_partition_to_dataframe(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) result = values.str.partition("_") exp = DataFrame( { @@ -2698,7 +2901,7 @@ def test_partition_to_dataframe(self): ) tm.assert_frame_equal(result, exp) - values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) result = values.str.partition("_", expand=True) exp = DataFrame( { @@ -2746,7 +2949,7 @@ def test_partition_with_name(self): def 
test_partition_deprecation(self): # GH 22676; depr kwarg "pat" in favor of "sep" - values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) # str.partition # using sep -> no warning @@ -2779,100 +2982,102 @@ def test_pipe_failures(self): @pytest.mark.parametrize( "start, stop, step, expected", [ - (2, 5, None, Series(["foo", "bar", NA, "baz"])), - (0, 3, -1, Series(["", "", NA, ""])), - (None, None, -1, Series(["owtoofaa", "owtrabaa", NA, "xuqzabaa"])), - (3, 10, 2, Series(["oto", "ato", NA, "aqx"])), - (3, 0, -1, Series(["ofa", "aba", NA, "aba"])), + (2, 5, None, Series(["foo", "bar", np.nan, "baz"])), + (0, 3, -1, Series(["", "", np.nan, ""])), + (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])), + (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])), + (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])), ], ) def test_slice(self, start, stop, step, expected): - values = Series(["aafootwo", "aabartwo", NA, "aabazqux"]) + values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"]) result = values.str.slice(start, stop, step) tm.assert_series_equal(result, expected) # mixed mixed = Series( - ["aafootwo", NA, "aabartwo", True, datetime.today(), None, 1, 2.0] + ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0] ) rs = Series(mixed).str.slice(2, 5) - xp = Series(["foo", NA, "bar", NA, NA, NA, NA, NA]) + xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.slice(2, 5, -1) - xp = Series(["oof", NA, "rab", NA, NA, NA, NA, NA]) + xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]) def test_slice_replace(self): - values = Series(["short", "a bit longer", "evenlongerthanthat", "", NA]) + values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan]) - exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", NA]) + exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan]) result = values.str.slice_replace(2, 3) tm.assert_series_equal(result, exp) - exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", NA]) + exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]) result = values.str.slice_replace(2, 3, "z") tm.assert_series_equal(result, exp) - exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", NA]) + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) result = values.str.slice_replace(2, 2, "z") tm.assert_series_equal(result, exp) - exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", NA]) + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) result = values.str.slice_replace(2, 1, "z") tm.assert_series_equal(result, exp) - exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", NA]) + exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]) result = values.str.slice_replace(-1, None, "z") tm.assert_series_equal(result, exp) - exp = Series(["zrt", "zer", "zat", "z", NA]) + exp = Series(["zrt", "zer", "zat", "z", np.nan]) result = values.str.slice_replace(None, -2, "z") tm.assert_series_equal(result, exp) - exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", NA]) + exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]) result = values.str.slice_replace(6, 8, "z") tm.assert_series_equal(result, exp) - exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", NA]) + exp = 
Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]) result = values.str.slice_replace(-10, 3, "z") tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip(self): - values = Series([" aa ", " bb \n", NA, "cc "]) + values = Series([" aa ", " bb \n", np.nan, "cc "]) result = values.str.strip() - exp = Series(["aa", "bb", NA, "cc"]) + exp = Series(["aa", "bb", np.nan, "cc"]) tm.assert_series_equal(result, exp) result = values.str.lstrip() - exp = Series(["aa ", "bb \n", NA, "cc "]) + exp = Series(["aa ", "bb \n", np.nan, "cc "]) tm.assert_series_equal(result, exp) result = values.str.rstrip() - exp = Series([" aa", " bb", NA, "cc"]) + exp = Series([" aa", " bb", np.nan, "cc"]) tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip_mixed(self): # mixed - mixed = Series([" aa ", NA, " bb \t\n", True, datetime.today(), None, 1, 2.0]) + mixed = Series( + [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0] + ) rs = Series(mixed).str.strip() - xp = Series(["aa", NA, "bb", NA, NA, NA, NA, NA]) + xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.lstrip() - xp = Series(["aa ", NA, "bb \t\n", NA, NA, NA, NA, NA]) + xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rstrip() - xp = Series([" aa", NA, " bb", NA, NA, NA, NA, NA]) + xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -2932,7 +3137,7 @@ def test_wrap(self): # test with pre and post whitespace (non-unicode), NaN, and non-ascii # Unicode values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"]) - xp = Series([" pre", NA, "\xac\u20ac\U00008000 ab\nadcafe"]) + xp = Series([" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"]) rs = values.str.wrap(6) assert_series_equal(rs, xp) @@ -2944,10 +3149,10 @@ def test_get(self): tm.assert_series_equal(result, expected) # mixed - mixed = Series(["a_b_c", NA, "c_d_e", True, datetime.today(), None, 1, 2.0]) + mixed = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) rs = Series(mixed).str.split("_").str.get(1) - xp = Series(["b", NA, "d", NA, NA, NA, NA, NA]) + xp = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -2991,7 +3196,7 @@ def test_get_complex_nested(self, to_type): def test_contains_moar(self): # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", NA, "CABA", "dog", "cat"]) + s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) result = s.str.contains("a") expected = Series( @@ -3045,11 +3250,11 @@ def test_contains_nan(self): def test_replace_moar(self): # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", NA, "CABA", "dog", "cat"]) + s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) result = s.str.replace("A", "YYY") expected = Series( - ["YYY", "B", "C", "YYYaba", "Baca", "", NA, "CYYYBYYY", "dog", "cat"] + ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"] ) assert_series_equal(result, expected) @@ -3062,7 +3267,7 @@ def test_replace_moar(self): "YYYYYYbYYY", "BYYYcYYY", "", - NA, + np.nan, "CYYYBYYY", "dog", "cYYYt", @@ -3079,7 +3284,7 @@ def test_replace_moar(self): "XX-XX ba", "XX-XX ca", "", - NA, + np.nan, "XX-XX BA", "XX-XX ", "XX-XX 
t", @@ -3089,7 +3294,17 @@ def test_replace_moar(self): def test_string_slice_get_syntax(self): s = Series( - ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", NA, "CYYYBYYY", "dog", "cYYYt"] + [ + "YYY", + "B", + "C", + "YYYYYYbYYY", + "BYYYcYYY", + np.nan, + "CYYYBYYY", + "dog", + "cYYYt", + ] ) result = s.str[0] @@ -3266,8 +3481,8 @@ def test_method_on_bytes(self): def test_casefold(self): # GH25405 - expected = Series(["ss", NA, "case", "ssd"]) - s = Series(["ß", NA, "case", "ßd"]) + expected = Series(["ss", np.nan, "case", "ssd"]) + s = Series(["ß", np.nan, "case", "ßd"]) result = s.str.casefold() tm.assert_series_equal(result, expected) From df2e0813e053cc5bc924b2292ea8918a6b27f0e2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 8 Oct 2019 14:24:17 -0700 Subject: [PATCH 011/119] Clean up Abstract and Naming Definitions for GroupBy (#28847) --- pandas/core/groupby/generic.py | 21 +++++++++++++-------- pandas/core/groupby/groupby.py | 11 +++++++---- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0bd6f746e4f3a..41a5195008f0c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -132,6 +132,9 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist + def _iterate_slices(self): + yield self._selection_name, self._selected_obj + @property def _selection_name(self): """ @@ -323,7 +326,7 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) - def _wrap_output(self, output, index, names=None): + def _wrap_series_output(self, output, index, names=None): """ common agg/transform wrapping logic """ output = output[self._selection_name] @@ -336,13 +339,15 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - result = self._wrap_output( + result = self._wrap_series_output( output=output, index=self.grouper.result_index, names=names ) return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): - return self._wrap_output(output=output, index=self.obj.index, names=names) + return self._wrap_series_output( + output=output, index=self.obj.index, names=names + ) def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: @@ -866,7 +871,7 @@ def aggregate(self, func=None, *args, **kwargs): if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: - result = self._aggregate_generic(func, *args, **kwargs) + result = self._aggregate_frame(func, *args, **kwargs) else: # try to treat as if we are passing a list @@ -875,7 +880,7 @@ def aggregate(self, func=None, *args, **kwargs): [func], _level=_level, _axis=self.axis ) except Exception: - result = self._aggregate_generic(func) + result = self._aggregate_frame(func) else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name @@ -999,7 +1004,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): return new_items, new_blocks - def _aggregate_generic(self, func, *args, **kwargs): + def _aggregate_frame(self, func, *args, **kwargs): if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") @@ -1022,7 +1027,7 @@ def _aggregate_generic(self, func, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) - return 
self._wrap_generic_output(result, obj) + return self._wrap_frame_output(result, obj) def _aggregate_item_by_item(self, func, *args, **kwargs): # only for axis==0 @@ -1506,7 +1511,7 @@ def _gotitem(self, key, ndim, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_generic_output(self, result, obj): + def _wrap_frame_output(self, result, obj): result_index = self.grouper.levels[0] if self.axis == 0: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 61a04431f99cb..4e0dd65042196 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -754,7 +754,7 @@ def _python_apply_general(self, f): ) def _iterate_slices(self): - yield self._selection_name, self._selected_obj + raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) @@ -879,6 +879,12 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): def _wrap_aggregated_output(self, output, names=None): raise AbstractMethodError(self) + def _wrap_transformed_output(self, output, names=None): + raise AbstractMethodError(self) + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + raise AbstractMethodError(self) + def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): output = {} for name, obj in self._iterate_slices(): @@ -929,9 +935,6 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(output) - def _wrap_applied_output(self, *args, **kwargs): - raise AbstractMethodError(self) - def _concat_objects(self, keys, values, not_indexed_same=False): from pandas.core.reshape.concat import concat From 43687c0c9b6c59c6f670e40c68a6ac7fe8a4a11e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20H=C3=A4hnke?= <11663234+DanBenHa@users.noreply.github.com> Date: Wed, 9 Oct 2019 09:37:49 +0200 Subject: [PATCH 012/119] DOC: Fix missing periods and non capitalized summary beginnings (#28858) --- pandas/_libs/tslibs/period.pyx | 6 +++--- pandas/_libs/tslibs/timedeltas.pyx | 4 ++-- pandas/core/arrays/interval.py | 2 +- pandas/io/formats/style.py | 2 +- pandas/plotting/_misc.py | 6 +++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 697e97e518b13..32dcc86faa7e8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1710,7 +1710,7 @@ cdef class _Period: def asfreq(self, freq, how='E'): """ Convert Period to desired frequency, either at the start or end of the - interval + interval. Parameters ---------- @@ -1777,7 +1777,7 @@ cdef class _Period: def to_timestamp(self, freq=None, how='start', tz=None): """ Return the Timestamp representation of the Period at the target - frequency at the specified end (how) of the Period + frequency at the specified end (how) of the Period. Parameters ---------- @@ -2380,7 +2380,7 @@ cdef class _Period: class Period(_Period): """ - Represents a period of time + Represents a period of time. Parameters ---------- diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f9cb35eb79ae3..3d267b0114695 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1344,7 +1344,7 @@ class Timedelta(_Timedelta): def floor(self, freq): """ - return a new Timedelta floored to this resolution. + Return a new Timedelta floored to this resolution. 
Parameters ---------- @@ -1355,7 +1355,7 @@ class Timedelta(_Timedelta): def ceil(self, freq): """ - return a new Timedelta ceiled to this resolution. + Return a new Timedelta ceiled to this resolution. Parameters ---------- diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 6dd0b116b3b0d..4039cc91fb554 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -975,7 +975,7 @@ def length(self): @property def mid(self): """ - Return the midpoint of each Interval in the IntervalArray as an Index + Return the midpoint of each Interval in the IntervalArray as an Index. """ try: return 0.5 * (self.left + self.right) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 9c4746f4d68e3..6bac3fe426f2d 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -849,7 +849,7 @@ def set_uuid(self, uuid): def set_caption(self, caption): """ - Set the caption on a Styler + Set the caption on a Styler. Parameters ---------- diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 74ce60c6116a9..426ca9632af29 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -8,7 +8,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): """ - Helper function to convert DataFrame and Series to matplotlib.table + Helper function to convert DataFrame and Series to matplotlib.table. Parameters ---------- @@ -32,7 +32,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): def register(explicit=True): """ - Register Pandas Formatters and Converters with matplotlib + Register Pandas Formatters and Converters with matplotlib. This function modifies the global ``matplotlib.units.registry`` dictionary. Pandas adds custom converters for @@ -54,7 +54,7 @@ def register(explicit=True): def deregister(): """ - Remove pandas' formatters and converters + Remove pandas' formatters and converters. Removes the custom converters added by :func:`register`. 
This attempts to set the state of the registry back to the state before From 93183bab135ec346450775f93070538d4d6f9c4f Mon Sep 17 00:00:00 2001 From: Bobae Kim <37873745+KimDoubleB@users.noreply.github.com> Date: Wed, 9 Oct 2019 23:50:45 +0900 Subject: [PATCH 013/119] fix unnecessary sort in pd.read_json and orient="index" (#28606) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/json/_json.py | 16 +++++++--------- pandas/tests/io/json/test_pandas.py | 7 +++---- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 751db2b88069d..fd1c1271a5e37 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -311,6 +311,7 @@ I/O - Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`) - Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`) - Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with `engine='fastparquet'` if the file did not already exist (:issue:`28326`) +- Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) Plotting diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 73f4985e201f1..6ce288890b6c7 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -12,7 +12,7 @@ from pandas.core.dtypes.common import ensure_str, is_period_dtype -from pandas import DataFrame, MultiIndex, Series, isna, to_datetime +from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime from pandas._typing import Scalar from pandas.core.reshape.concat import concat @@ -1112,15 +1112,13 @@ def _parse_no_numpy(self): self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = ( - DataFrame.from_dict( - loads(json, precise_float=self.precise_float), - dtype=None, - orient="index", - ) - .sort_index(axis="columns") - .sort_index(axis="index") + self.obj = DataFrame.from_dict( + loads(json, precise_float=self.precise_float), + dtype=None, + orient="index", ) + if compat.PY35: + self.obj = self.obj.sort_index(axis="columns").sort_index(axis="index") elif orient == "table": self.obj = parse_table_schema(json, precise_float=self.precise_float) else: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2195bf248f43a..8e28740c70bad 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -166,8 +166,7 @@ def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype): expected = self.frame.copy() - if not numpy and (orient == "index" or (PY35 and orient == "columns")): - # TODO: debug why sort is required + if not numpy and PY35 and orient in ("index", "columns"): expected = expected.sort_index() assert_json_roundtrip_equal(result, expected, orient) @@ -181,7 +180,7 @@ def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype): data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) expected = self.intframe.copy() - if not numpy and (orient == "index" or (PY35 and orient == "columns")): + if not numpy and PY35 and orient in ("index", "columns"): expected = expected.sort_index() if ( @@ -216,7 +215,7 @@ def test_roundtrip_str_axes(self, orient, 
convert_axes, numpy, dtype): ) expected = df.copy() - if not numpy and (orient == "index" or (PY35 and orient == "columns")): + if not numpy and PY35 and orient in ("index", "columns"): expected = expected.sort_index() if not dtype: From 71f8ab9fd1ef925660cf5aaf1aa71e6fb510eb88 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Oct 2019 18:25:28 -0700 Subject: [PATCH 014/119] CLN: dont catch Exception in groupby var (#28883) --- pandas/core/groupby/groupby.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e0dd65042196..d477b173b95f0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -41,6 +41,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna +from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical from pandas.core.base import ( @@ -721,6 +722,10 @@ def f(g): with np.errstate(all="ignore"): return func(g, *args, **kwargs) + elif hasattr(nanops, "nan" + func): + # TODO: should we wrap this in to e.g. _is_builtin_func? + f = getattr(nanops, "nan" + func) + else: raise ValueError( "func must be a callable if args or kwargs are supplied" @@ -1297,16 +1302,9 @@ def var(self, ddof=1, *args, **kwargs): """ nv.validate_groupby_func("var", args, kwargs) if ddof == 1: - try: - return self._cython_agg_general( - "var", - alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), - **kwargs - ) - except Exception: - f = lambda x: x.var(ddof=ddof, **kwargs) - with _group_selection_context(self): - return self._python_agg_general(f) + return self._cython_agg_general( + "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), **kwargs + ) else: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): From 548f83d4737ac07ac18ce5b3da86ba7c1b1000fa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Oct 2019 18:27:37 -0700 Subject: [PATCH 015/119] CLN: dont catch on groupby.mean (#28878) --- pandas/core/groupby/generic.py | 12 ++++++++++++ pandas/core/groupby/groupby.py | 13 +++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 41a5195008f0c..5200d33c6a1fb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -971,6 +971,18 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): if result is not no_result: # see if we can cast the block back to the original dtype result = maybe_downcast_numeric(result, block.dtype) + + if result.ndim == 1 and isinstance(result, np.ndarray): + # e.g. 
block.values was an IntegerArray + try: + # Cast back if feasible + result = type(block.values)._from_sequence( + result, dtype=block.values.dtype + ) + except ValueError: + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) + newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d477b173b95f0..59765ed508dd2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1217,16 +1217,9 @@ def mean(self, *args, **kwargs): Name: B, dtype: float64 """ nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"]) - try: - return self._cython_agg_general( - "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs - ) - except GroupByError: - raise - except Exception: - with _group_selection_context(self): - f = lambda x: x.mean(axis=self.axis, **kwargs) - return self._python_agg_general(f) + return self._cython_agg_general( + "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs + ) @Substitution(name="groupby") @Appender(_common_see_also) From 5cbd82bdd3a9d4900849808735a7e2de61bfdf6b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Oct 2019 18:30:54 -0700 Subject: [PATCH 016/119] CLN: assorted cleanups, remove unicode checks in cython (#28879) --- pandas/_libs/hashing.pyx | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- pandas/_libs/parsers.pyx | 2 +- pandas/_libs/tslibs/fields.pyx | 4 ++-- pandas/_libs/tslibs/parsing.pyx | 8 +++----- pandas/_libs/tslibs/period.pyx | 5 ++++- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/period.py | 8 ++++---- pandas/core/arrays/timedeltas.py | 22 +++++++++++----------- pandas/core/ops/array_ops.py | 2 +- pandas/tests/frame/test_operators.py | 3 ++- pandas/tests/frame/test_query_eval.py | 3 ++- 12 files changed, 35 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index c0aa661266d29..6b27b2204e75e 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -60,7 +60,7 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): val = arr[i] if isinstance(val, bytes): data = val - elif isinstance(val, unicode): + elif isinstance(val, str): data = val.encode(encoding) elif val is None or is_nan(val): # null, stringify and encode diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 17f1d011af01b..1cbdb0df6233c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -667,7 +667,7 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if isinstance(val, (str, unicode)): + if isinstance(val, str): v = get_c_string(val) else: v = get_c_string(self.na_string_sentinel) @@ -700,7 +700,7 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if isinstance(val, (str, unicode)): + if isinstance(val, str): v = get_c_string(val) else: v = get_c_string(self.na_string_sentinel) @@ -774,7 +774,7 @@ cdef class StringHashTable(HashTable): val = values[i] if (ignore_na - and (not isinstance(val, (str, unicode)) + and (not isinstance(val, str) or (use_na_value and val == na_value))): # if missing values do not count as unique values (i.e. 
if # ignore_na is True), we can skip the actual value, and diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index f5a42d7aef3ba..3f12ec4c15fc7 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2249,7 +2249,7 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col, def _maybe_encode(values): if values is None: return [] - return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values] + return [x.encode('utf-8') if isinstance(x, str) else x for x in values] def sanitize_objects(ndarray[object] values, set na_values, diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 2ed85595f7e3a..8f5c8d10776df 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -22,7 +22,7 @@ from pandas._libs.tslibs.np_datetime cimport ( from pandas._libs.tslibs.nattype cimport NPY_NAT -def get_time_micros(ndarray[int64_t] dtindex): +def get_time_micros(const int64_t[:] dtindex): """ Return the number of microseconds in the time component of a nanosecond timestamp. @@ -537,7 +537,7 @@ def get_date_field(const int64_t[:] dtindex, object field): elif field == 'is_leap_year': return isleapyear_arr(get_date_field(dtindex, 'Y')) - raise ValueError("Field %s not supported" % field) + raise ValueError("Field {field} not supported".format(field=field)) @cython.wraparound(False) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index ca70c8af45f2f..33665484311ba 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -252,9 +252,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): ------- datetime, datetime/dateutil.parser._result, str """ - if not isinstance(arg, (str, unicode)): - # Note: cython recognizes `unicode` in both py2/py3, optimizes - # this check into a C call. + if not isinstance(arg, str): return arg if getattr(freq, "_typ", None) == "dateoffset": @@ -370,7 +368,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, int year, quarter = -1, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 - assert isinstance(date_string, (str, unicode)) + assert isinstance(date_string, str) # len(date_string) == 0 # should be NaT??? @@ -517,7 +515,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, tzdata = tzinfos.get(res.tzname) if isinstance(tzdata, datetime.tzinfo): tzinfo = tzdata - elif isinstance(tzdata, (str, unicode)): + elif isinstance(tzdata, str): tzinfo = _dateutil_tzstr(tzdata) elif isinstance(tzdata, int): tzinfo = tzoffset(res.tzname, tzdata) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 32dcc86faa7e8..84a41b8757001 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2448,7 +2448,10 @@ class Period(_Period): converted = other.asfreq(freq) ordinal = converted.ordinal - elif is_null_datetimelike(value) or value in nat_strings: + elif is_null_datetimelike(value) or (isinstance(value, str) and + value in nat_strings): + # explicit str check is necessary to avoid raising incorrectly + # if we have a non-hashable value. 
ordinal = NPY_NAT elif isinstance(value, str) or util.is_integer_object(value): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bda5f8f4326f1..958650e3842fa 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1148,7 +1148,7 @@ def _addsub_offset_array(self, other, op): ) # For EA self.astype('O') returns a numpy array, not an Index - left = lib.values_from_object(self.astype("O")) + left = self.astype("O") res_values = op(left, np.array(other)) kwargs = {} diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f2d74794eadf5..43208d98abd3c 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -70,7 +70,7 @@ def _period_array_cmp(cls, op): nat_result = opname == "__ne__" def wrapper(self, other): - op = getattr(self.asi8, opname) + ordinal_op = getattr(self.asi8, opname) other = lib.item_from_zerodim(other) if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): @@ -82,11 +82,11 @@ def wrapper(self, other): if isinstance(other, Period): self._check_compatible_with(other) - result = op(other.ordinal) + result = ordinal_op(other.ordinal) elif isinstance(other, cls): self._check_compatible_with(other) - result = op(other.asi8) + result = ordinal_op(other.asi8) mask = self._isnan | other._isnan if mask.any(): @@ -98,7 +98,7 @@ def wrapper(self, other): result.fill(nat_result) else: other = Period(other, freq=self.freq) - result = op(other.ordinal) + result = ordinal_op(other.ordinal) if self._hasnans: result[self._isnan] = nat_result diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6c9462ff4fa4d..21e07b5101a64 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -553,7 +553,7 @@ def __mul__(self, other): # for that instead of ValueError raise ValueError("Cannot multiply with unequal lengths") - if is_object_dtype(other): + if is_object_dtype(other.dtype): # this multiplication will succeed only if all elements of other # are int or float scalars, so we will end up with # timedelta64[ns]-dtyped result @@ -601,11 +601,11 @@ def __truediv__(self, other): if len(other) != len(self): raise ValueError("Cannot divide vectors with unequal lengths") - elif is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other.dtype): # let numpy handle it return self._data / other - elif is_object_dtype(other): + elif is_object_dtype(other.dtype): # Note: we do not do type inference on the result, so either # an object array or numeric-dtyped (if numpy does inference) # will be returned. GH#23829 @@ -649,12 +649,12 @@ def __rtruediv__(self, other): if len(other) != len(self): raise ValueError("Cannot divide vectors with unequal lengths") - elif is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other.dtype): # let numpy handle it return other / self._data - elif is_object_dtype(other): - # Note: unlike in __truediv__, we do not _need_ to do type# + elif is_object_dtype(other.dtype): + # Note: unlike in __truediv__, we do not _need_ to do type # inference on the result. It does not raise, a numeric array # is returned. 
GH#23829 result = [other[n] / self[n] for n in range(len(self))] @@ -701,7 +701,7 @@ def __floordiv__(self, other): if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") - elif is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other.dtype): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate @@ -713,7 +713,7 @@ def __floordiv__(self, other): result[mask] = np.nan return result - elif is_object_dtype(other): + elif is_object_dtype(other.dtype): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) if lib.infer_dtype(result, skipna=False) == "timedelta": @@ -721,7 +721,7 @@ def __floordiv__(self, other): return type(self)(result) return result - elif is_integer_dtype(other) or is_float_dtype(other): + elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): result = self._data // other return type(self)(result) @@ -763,7 +763,7 @@ def __rfloordiv__(self, other): if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") - elif is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other.dtype): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate @@ -775,7 +775,7 @@ def __rfloordiv__(self, other): result[mask] = np.nan return result - elif is_object_dtype(other): + elif is_object_dtype(other.dtype): result = [other[n] // self[n] for n in range(len(self))] result = np.array(result) return result diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index a225eec93b27e..8c9a4b94446c0 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -161,7 +161,7 @@ def arithmetic_op( right: Any, op, str_rep: str, - eval_kwargs: Dict[str, str], + eval_kwargs: Dict[str, bool], ): """ Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... 
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 0e1cd42329169..73eddf91325ae 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -400,7 +400,7 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): added = float_frame + mixed_int_frame _check_mixed_float(added, dtype="float64") - def test_combineSeries( + def test_combine_series( self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame ): @@ -432,6 +432,7 @@ def test_combineSeries( added = mixed_float_frame + series.astype("float16") _check_mixed_float(added, dtype=dict(C=None)) + # FIXME: don't leave commented-out # these raise with numexpr.....as we are adding an int64 to an # uint64....weird vs int diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 82c197ac054f0..f5f6c9ad6b3da 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -122,7 +122,8 @@ def test_ops(self): result = getattr(df, rop)(m) assert_frame_equal(result, expected) - # GH7192 + # GH7192: Note we need a large number of rows to ensure this + # goes through the numexpr path df = DataFrame(dict(A=np.random.randn(25000))) df.iloc[0:5] = np.nan expected = 1 - np.isnan(df.iloc[0:25]) From fcea0dc7fedb2773c7aeac98571fd02064efa06d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Oct 2019 18:50:32 -0700 Subject: [PATCH 017/119] No catching needed for median (#28873) --- pandas/core/groupby/groupby.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 59765ed508dd2..cc297629a7004 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1234,23 +1234,11 @@ def median(self, **kwargs): Series or DataFrame Median of values within each group. """ - try: - return self._cython_agg_general( - "median", - alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), - **kwargs - ) - except GroupByError: - raise - except Exception: - - def f(x): - if isinstance(x, np.ndarray): - x = Series(x) - return x.median(axis=self.axis, **kwargs) - - with _group_selection_context(self): - return self._python_agg_general(f) + return self._cython_agg_general( + "median", + alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), + **kwargs + ) @Substitution(name="groupby") @Appender(_common_see_also) From 959a7604cd61782aa6964e00f0aa611496d8d002 Mon Sep 17 00:00:00 2001 From: Kyle Boone Date: Wed, 9 Oct 2019 19:05:00 -0700 Subject: [PATCH 018/119] read_hdf closes HDF5 stores that it didn't open. 
(#28700) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/pytables.py | 11 ++++++----- pandas/tests/io/pytables/test_store.py | 16 +++++++++++++++- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fd1c1271a5e37..cde2a4279cf27 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -311,6 +311,7 @@ I/O - Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`) - Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`) - Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with `engine='fastparquet'` if the file did not already exist (:issue:`28326`) +- Bug in :func:`read_hdf` closing stores that it didn't open when Exceptions are raised (:issue:`28699`) - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 55ccd838f8a16..0db5b1b4eecfa 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -396,11 +396,12 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except (ValueError, TypeError, KeyError): - # if there is an error, close the store - try: - store.close() - except AttributeError: - pass + if not isinstance(path_or_buf, HDFStore): + # if there is an error, close the store if we opened it. + try: + store.close() + except AttributeError: + pass raise diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 140ee5082f55d..956438f1afdf4 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1195,8 +1195,22 @@ def test_read_missing_key_close_store(self, setup_path): # read with KeyError before another write df.to_hdf(path, "k2") - def test_append_frame_column_oriented(self, setup_path): + def test_read_missing_key_opened_store(self, setup_path): + # GH 28699 + with ensure_clean_path(setup_path) as path: + df = pd.DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") + + store = pd.HDFStore(path, "r") + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + pd.read_hdf(store, "k2") + + # Test that the file is still open after a KeyError and that we can + # still read from it. 
+ pd.read_hdf(store, "k1") + + def test_append_frame_column_oriented(self, setup_path): with ensure_clean_store(setup_path) as store: # column oriented From 4f97ca45f11519bc999b869eff52e8615aa88600 Mon Sep 17 00:00:00 2001 From: shaido987 Date: Thu, 10 Oct 2019 17:20:23 +0800 Subject: [PATCH 019/119] DOC: Fix commpiler typo in contributing.rst (#28891) Corrects the spelling of compiler --- doc/source/development/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 10d702808606a..dc6fa3d100212 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -172,7 +172,7 @@ installed (or you wish to install a newer version) you can install a compiler yum groupinstall "Development Tools" For other Linux distributions, consult your favourite search engine for -commpiler installation instructions. +compiler installation instructions. Let us know if you have any difficulties by opening an issue or reaching out on `Gitter`_. From 974315493d2e1d6cf91623bc4bc2ef51cb813ea5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Oct 2019 05:50:17 -0700 Subject: [PATCH 020/119] TST: Fix maybe_promote floating non-boxed tests (#28880) --- pandas/core/dtypes/cast.py | 48 ++++++++++++++++++++++-- pandas/tests/dtypes/cast/test_promote.py | 15 +------- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5b13e13bb20ba..098f42b1a8c5c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -398,6 +398,22 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.float64) if not isna(fill_value): fill_value = dtype.type(fill_value) + + elif dtype.kind == "f": + if not np.can_cast(fill_value, dtype): + # e.g. dtype is float32, need float64 + dtype = np.min_scalar_type(fill_value) + + elif dtype.kind == "c": + if not np.can_cast(fill_value, dtype): + if np.can_cast(fill_value, np.dtype("c16")): + dtype = np.dtype(np.complex128) + else: + dtype = np.dtype(np.object_) + + if dtype.kind == "c" and not np.isnan(fill_value): + fill_value = dtype.type(fill_value) + elif is_bool(fill_value): if not issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -405,7 +421,7 @@ def maybe_promote(dtype, fill_value=np.nan): fill_value = np.bool_(fill_value) elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): - dtype = np.object_ + dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): # upcast to prevent overflow arr = np.asarray(fill_value) @@ -415,11 +431,37 @@ def maybe_promote(dtype, fill_value=np.nan): # check if we can cast if _check_lossless_cast(fill_value, dtype): fill_value = dtype.type(fill_value) + + if dtype.kind in ["c", "f"]: + # e.g. if dtype is complex128 and fill_value is 1, we + # want np.complex128(1) + fill_value = dtype.type(fill_value) + elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): - dtype = np.object_ + dtype = np.dtype(np.object_) elif issubclass(dtype.type, (np.integer, np.floating)): - dtype = np.complex128 + c8 = np.dtype(np.complex64) + info = np.finfo(dtype) if dtype.kind == "f" else np.iinfo(dtype) + if ( + np.can_cast(fill_value, c8) + and np.can_cast(info.min, c8) + and np.can_cast(info.max, c8) + ): + dtype = np.dtype(np.complex64) + else: + dtype = np.dtype(np.complex128) + + elif dtype.kind == "c": + mst = np.min_scalar_type(fill_value) + if mst > dtype and mst.kind == "c": + # e.g. 
mst is np.complex128 and dtype is np.complex64 + dtype = mst + + if dtype.kind == "c": + # make sure we have a np.complex and not python complex + fill_value = dtype.type(fill_value) + elif fill_value is None: if is_float_dtype(dtype) or is_complex_dtype(dtype): fill_value = np.nan diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index e4e5a22ea6ca0..e9041a27ab9be 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -408,25 +408,14 @@ def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, box): if box_dtype == object: pytest.xfail("falsely upcasts to object") - if boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype): + elif boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype): pytest.xfail("does not upcast to complex") - if (dtype, expected_dtype) in [ + elif boxed and (dtype, expected_dtype) in [ ("float32", "float64"), ("float32", "complex64"), ("complex64", "complex128"), ]: pytest.xfail("does not upcast correctly depending on value") - # this following xfails are "only" a consequence of the - now strictly - # enforced - principle that maybe_promote_with_scalar always casts - if not boxed and abs(fill_value) < 2: - pytest.xfail("wrong return type of fill_value") - if ( - not boxed - and dtype == "complex128" - and expected_dtype == "complex128" - and is_float_dtype(type(fill_value)) - ): - pytest.xfail("wrong return type of fill_value") # output is not a generic float, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] From 9a3e1ef690a3958f1c62f61f563e7f5af61973fd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Oct 2019 05:51:17 -0700 Subject: [PATCH 021/119] TST: Fix xfails for non-box maybe_promote on integer dtypes (#28864) --- pandas/core/dtypes/cast.py | 55 ++++++++++++++++++++++-- pandas/tests/dtypes/cast/test_promote.py | 32 +++++--------- 2 files changed, 63 insertions(+), 24 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 098f42b1a8c5c..1e353c97be754 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -424,9 +424,58 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): # upcast to prevent overflow - arr = np.asarray(fill_value) - if arr != arr.astype(dtype): - dtype = arr.dtype + mst = np.min_scalar_type(fill_value) + if mst > dtype: + # np.dtype ordering considers: + # int[n] < int[2*n] + # uint[n] < uint[2*n] + # u?int[n] < object_ + dtype = mst + + elif np.can_cast(fill_value, dtype): + pass + + elif dtype.kind == "u" and mst.kind == "i": + dtype = np.promote_types(dtype, mst) + if dtype.kind == "f": + # Case where we disagree with numpy + dtype = np.dtype(np.object_) + + elif dtype.kind == "i" and mst.kind == "u": + + if fill_value > np.iinfo(np.int64).max: + # object is the only way to represent fill_value and keep + # the range allowed by the given dtype + dtype = np.dtype(np.object_) + + elif mst.itemsize < dtype.itemsize: + pass + + elif dtype.itemsize == mst.itemsize: + # We never cast signed to unsigned because that loses + # parts of the original range, so find the smallest signed + # integer that can hold all of `mst`. 
+ ndt = { + np.int64: np.object_, + np.int32: np.int64, + np.int16: np.int32, + np.int8: np.int16, + }[dtype.type] + dtype = np.dtype(ndt) + + else: + # bump to signed integer dtype that holds all of `mst` range + # Note: we have to use itemsize because some (windows) + # builds don't satisfiy e.g. np.uint32 == np.uint32 + ndt = { + 4: np.int64, + 2: np.int32, + 1: np.int16, # TODO: Test for this case + }[mst.itemsize] + dtype = np.dtype(ndt) + + fill_value = dtype.type(fill_value) + elif issubclass(dtype.type, np.floating): # check if we can cast if _check_lossless_cast(fill_value, dtype): diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index e9041a27ab9be..f2fc1136fbd35 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -151,7 +151,17 @@ def _assert_match(result_fill_value, expected_fill_value): # GH#23982/25425 require the same type in addition to equality/NA-ness res_type = type(result_fill_value) ex_type = type(expected_fill_value) - assert res_type == ex_type + if res_type.__name__ == "uint64": + # No idea why, but these (sometimes) do not compare as equal + assert ex_type.__name__ == "uint64" + elif res_type.__name__ == "ulonglong": + # On some builds we get this instead of np.uint64 + # Note: cant check res_type.dtype.itemsize directly on numpy 1.18 + assert res_type(0).itemsize == 8 + assert ex_type == res_type or ex_type == np.uint64 + else: + # On some builds, type comparison fails, e.g. np.int32 != np.int32 + assert res_type == ex_type or res_type.__name__ == ex_type.__name__ match_value = result_fill_value == expected_fill_value @@ -275,26 +285,6 @@ def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, box): expected_dtype = np.dtype(expected_dtype) boxed, box_dtype = box # read from parametrized fixture - if not boxed: - if expected_dtype == object: - pytest.xfail("overflow error") - if expected_dtype == "int32": - pytest.xfail("always upcasts to platform int") - if dtype == "int8" and expected_dtype == "int16": - pytest.xfail("casts to int32 instead of int16") - if ( - issubclass(dtype.type, np.unsignedinteger) - and np.iinfo(dtype).max < fill_value <= np.iinfo("int64").max - ): - pytest.xfail("falsely casts to signed") - if (dtype, expected_dtype) in [ - ("uint8", "int16"), - ("uint32", "int64"), - ] and fill_value != np.iinfo("int32").min - 1: - pytest.xfail("casts to int32 instead of int8/int16") - # this following xfail is "only" a consequence of the - now strictly - # enforced - principle that maybe_promote_with_scalar always casts - pytest.xfail("wrong return type of fill_value") if boxed: if expected_dtype != object: pytest.xfail("falsely casts to object") From eca6079e3a23fad16d9d2365904ededf6764b8a2 Mon Sep 17 00:00:00 2001 From: Roshni Date: Thu, 10 Oct 2019 11:53:57 -0400 Subject: [PATCH 022/119] pandas.io.formats.style.Styler.set_table_attributes: Parameter attributes type updated to str instead of string (#28887) --- pandas/io/formats/style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6bac3fe426f2d..6b98eaca9dacc 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -780,7 +780,7 @@ def set_table_attributes(self, attributes): Parameters ---------- - attributes : string + attributes : str Returns ------- From 0c4404b5b85c085e5b6cd26be42e81e5a197fd8d Mon Sep 17 00:00:00 2001 From: Roshni Date: Thu, 10 Oct 2019 12:00:23 -0400 Subject: [PATCH 
023/119] DOC: Fixed PR06 errors in pandas.api.extensions.ExtensionArray (#28885) --- pandas/core/arrays/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0778b6726d104..7a16c3f6a35b6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -177,7 +177,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): dtype : dtype, optional Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. - copy : boolean, default False + copy : bool, default False If True, copy the underlying data. Returns @@ -200,7 +200,7 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): dtype : dtype, optional Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. - copy : boolean, default False + copy : bool, default False If True, copy the underlying data. Returns @@ -769,7 +769,7 @@ def take( Parameters ---------- - indices : sequence of integers + indices : sequence of int Indices to be taken. allow_fill : bool, default False How to handle negative values in `indices`. From de2e0864a1e333d1dc9385d788a375d1aaaae1a5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Oct 2019 15:53:02 -0700 Subject: [PATCH 024/119] REF: simplify maybe_promote (#28899) --- pandas/core/dtypes/cast.py | 49 +++----------------------------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1e353c97be754..1e62527f95bc7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -423,57 +423,14 @@ def maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): - # upcast to prevent overflow - mst = np.min_scalar_type(fill_value) - if mst > dtype: - # np.dtype ordering considers: - # int[n] < int[2*n] - # uint[n] < uint[2*n] - # u?int[n] < object_ - dtype = mst - - elif np.can_cast(fill_value, dtype): - pass - - elif dtype.kind == "u" and mst.kind == "i": + if not np.can_cast(fill_value, dtype): + # upcast to prevent overflow + mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) if dtype.kind == "f": # Case where we disagree with numpy dtype = np.dtype(np.object_) - elif dtype.kind == "i" and mst.kind == "u": - - if fill_value > np.iinfo(np.int64).max: - # object is the only way to represent fill_value and keep - # the range allowed by the given dtype - dtype = np.dtype(np.object_) - - elif mst.itemsize < dtype.itemsize: - pass - - elif dtype.itemsize == mst.itemsize: - # We never cast signed to unsigned because that loses - # parts of the original range, so find the smallest signed - # integer that can hold all of `mst`. - ndt = { - np.int64: np.object_, - np.int32: np.int64, - np.int16: np.int32, - np.int8: np.int16, - }[dtype.type] - dtype = np.dtype(ndt) - - else: - # bump to signed integer dtype that holds all of `mst` range - # Note: we have to use itemsize because some (windows) - # builds don't satisfiy e.g. 
np.uint32 == np.uint32 - ndt = { - 4: np.int64, - 2: np.int32, - 1: np.int16, # TODO: Test for this case - }[mst.itemsize] - dtype = np.dtype(ndt) - fill_value = dtype.type(fill_value) elif issubclass(dtype.type, np.floating): From d8f9be7e396032acbd11de94dad78c459f78e4ba Mon Sep 17 00:00:00 2001 From: Bobae Kim <37873745+KimDoubleB@users.noreply.github.com> Date: Fri, 11 Oct 2019 12:53:57 +0900 Subject: [PATCH 025/119] Fix typo (#28915) --- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/timedeltas.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2da74012de968..b49bb856a2e2b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -638,7 +638,7 @@ def levels(self): @property def _values(self): - # We override here, since our parent uses _data, which we dont' use. + # We override here, since our parent uses _data, which we don't use. return self.values @property diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 2ecb66bc8f1e4..c6dce77c4d078 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -40,9 +40,9 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # Most attrs are dispatched via datetimelike_{ops,methods} - # Some are "raw" methods, the result is not not re-boxed in an Index + # Some are "raw" methods, the result is not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, - # which we we dont' want to expose in the .dt accessor. + # which we don't want to expose in the .dt accessor. _delegate_class = TimedeltaArray _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] _delegated_methods = TimedeltaArray._datetimelike_methods + [ From 230a61a2e597babb9d37ea8ae982d8da0246c3dd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 11 Oct 2019 12:56:57 +0100 Subject: [PATCH 026/119] TYPING: lockdown test modules failing mypy (#28914) --- setup.cfg | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 43dbac15f5cfe..9c841b76761f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -130,5 +130,116 @@ skip = pandas/__init__.py,pandas/core/api.py ignore_missing_imports=True no_implicit_optional=True -[mypy-pandas.conftest,pandas.tests.*] +[mypy-pandas.conftest] +ignore_errors=True + +[mypy-pandas.tests.api.test_api] +ignore_errors=True + +[mypy-pandas.tests.arithmetic.test_datetime64] +ignore_errors=True + +[mypy-pandas.tests.arrays.test_array] +ignore_errors=True + +[mypy-pandas.tests.arrays.test_datetimelike] +ignore_errors=True + +[mypy-pandas.tests.arrays.test_period] +ignore_errors=True + +[mypy-pandas.tests.computation.test_eval] +ignore_errors=True + +[mypy-pandas.tests.dtypes.test_common] +ignore_errors=True + +[mypy-pandas.tests.dtypes.test_inference] +ignore_errors=True + +[mypy-pandas.tests.extension.decimal.test_decimal] +ignore_errors=True + +[mypy-pandas.tests.extension.json.array] +ignore_errors=True + +[mypy-pandas.tests.extension.json.test_json] +ignore_errors=True + +[mypy-pandas.tests.extension.test_numpy] +ignore_errors=True + +[mypy-pandas.tests.extension.test_sparse] +ignore_errors=True + +[mypy-pandas.tests.frame.test_constructors] +ignore_errors=True + +[mypy-pandas.tests.frame.test_convert_to] +ignore_errors=True + +[mypy-pandas.tests.indexes.datetimes.test_datetimelike] +ignore_errors=True + 
+[mypy-pandas.tests.indexes.interval.test_base] +ignore_errors=True + +[mypy-pandas.tests.indexes.interval.test_interval_tree] +ignore_errors=True + +[mypy-pandas.tests.indexes.period.test_period] +ignore_errors=True + +[mypy-pandas.tests.indexes.test_base] +ignore_errors=True + +[mypy-pandas.tests.indexes.test_category] +ignore_errors=True + +[mypy-pandas.tests.indexes.test_numeric] +ignore_errors=True + +[mypy-pandas.tests.indexes.test_range] +ignore_errors=True + +[mypy-pandas.tests.indexes.timedeltas.test_timedelta] +ignore_errors=True + +[mypy-pandas.tests.indexing.test_coercion] +ignore_errors=True + +[mypy-pandas.tests.indexing.test_loc] +ignore_errors=True + +[mypy-pandas.tests.io.json.test_ujson] +ignore_errors=True + +[mypy-pandas.tests.io.parser.conftest] +ignore_errors=True + +[mypy-pandas.tests.io.test_sql] +ignore_errors=True + +[mypy-pandas.tests.plotting.test_backend] +ignore_errors=True + +[mypy-pandas.tests.series.test_constructors] +ignore_errors=True + +[mypy-pandas.tests.series.test_operators] +ignore_errors=True + +[mypy-pandas.tests.test_algos] +ignore_errors=True + +[mypy-pandas.tests.test_base] +ignore_errors=True + +[mypy-pandas.tests.tseries.offsets.test_offsets] +ignore_errors=True + +[mypy-pandas.tests.tseries.offsets.test_offsets_properties] +ignore_errors=True + +[mypy-pandas.tests.tseries.offsets.test_yqm_offsets] ignore_errors=True From fab2c90985dfbd68a4bbc5a0c7ccc7616e0b5f8f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Oct 2019 04:57:37 -0700 Subject: [PATCH 027/119] CLN: simplify maybe_promote in float and complex cases (#28913) --- pandas/core/dtypes/cast.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1e62527f95bc7..328c7566d8e8d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -405,11 +405,8 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.min_scalar_type(fill_value) elif dtype.kind == "c": - if not np.can_cast(fill_value, dtype): - if np.can_cast(fill_value, np.dtype("c16")): - dtype = np.dtype(np.complex128) - else: - dtype = np.dtype(np.object_) + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) if dtype.kind == "c" and not np.isnan(fill_value): fill_value = dtype.type(fill_value) @@ -447,16 +444,8 @@ def maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, (np.integer, np.floating)): - c8 = np.dtype(np.complex64) - info = np.finfo(dtype) if dtype.kind == "f" else np.iinfo(dtype) - if ( - np.can_cast(fill_value, c8) - and np.can_cast(info.min, c8) - and np.can_cast(info.max, c8) - ): - dtype = np.dtype(np.complex64) - else: - dtype = np.dtype(np.complex128) + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) From e1e51df5ef82ee61305d8d994755ee4ed79413b5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 11 Oct 2019 12:58:23 +0100 Subject: [PATCH 028/119] TYPING: errors reported by mypy 0.730 (#28910) --- pandas/compat/pickle_compat.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index b3c7b8a7c8b9f..3a36713ccdbda 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -68,7 +68,11 @@ def load_reduce(self): class _LoadSparseSeries: # To load a SparseSeries as a Series[Sparse] 
- def __new__(cls) -> "Series": + + # https://github.com/python/mypy/issues/1020 + # error: Incompatible return type for "__new__" (returns "Series", but must return + # a subtype of "_LoadSparseSeries") + def __new__(cls) -> "Series": # type: ignore from pandas import Series warnings.warn( @@ -82,7 +86,11 @@ def __new__(cls) -> "Series": class _LoadSparseFrame: # To load a SparseDataFrame as a DataFrame[Sparse] - def __new__(cls) -> "DataFrame": + + # https://github.com/python/mypy/issues/1020 + # error: Incompatible return type for "__new__" (returns "DataFrame", but must + # return a subtype of "_LoadSparseFrame") + def __new__(cls) -> "DataFrame": # type: ignore from pandas import DataFrame warnings.warn( From a4c92cadd58462a70877af699a3173e0fb9ac915 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 11 Oct 2019 13:00:14 +0100 Subject: [PATCH 029/119] clean tests/indexing/common.py (#28904) --- pandas/tests/indexing/common.py | 79 ++++++++++++---------------- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/indexing/test_scalar.py | 8 +-- 3 files changed, 38 insertions(+), 51 deletions(-) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 78764e6763e95..812d84261eb46 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -1,5 +1,4 @@ """ common utilities """ - import itertools from warnings import catch_warnings, filterwarnings @@ -29,7 +28,7 @@ def _axify(obj, key, axis): class Base: """ indexing comprehensive base class """ - _objs = {"series", "frame"} + _kinds = {"series", "frame"} _typs = { "ints", "uints", @@ -101,13 +100,12 @@ def setup_method(self, method): self.series_empty = Series() # form agglomerates - for o in self._objs: - + for kind in self._kinds: d = dict() - for t in self._typs: - d[t] = getattr(self, "{o}_{t}".format(o=o, t=t), None) + for typ in self._typs: + d[typ] = getattr(self, "{kind}_{typ}".format(kind=kind, typ=typ)) - setattr(self, o, d) + setattr(self, kind, d) def generate_indices(self, f, values=False): """ generate the indices @@ -117,7 +115,7 @@ def generate_indices(self, f, values=False): axes = f.axes if values: - axes = (list(range(len(a))) for a in axes) + axes = (list(range(len(ax))) for ax in axes) return itertools.product(*axes) @@ -186,34 +184,34 @@ def check_result( method2, key2, typs=None, - objs=None, + kinds=None, axes=None, fails=None, ): - def _eq(t, o, a, obj, k1, k2): + def _eq(typ, kind, axis, obj, key1, key2): """ compare equal for these 2 keys """ - - if a is not None and a > obj.ndim - 1: + if axis > obj.ndim - 1: return def _print(result, error=None): - if error is not None: - error = str(error) - v = ( + err = str(error) if error is not None else "" + msg = ( "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s," "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" - % (name, result, t, o, method1, method2, a, error or "") + % (name, result, typ, kind, method1, method2, axis, err) ) if _verbose: - pprint_thing(v) + pprint_thing(msg) try: - rs = getattr(obj, method1).__getitem__(_axify(obj, k1, a)) + rs = getattr(obj, method1).__getitem__(_axify(obj, key1, axis)) with catch_warnings(record=True): filterwarnings("ignore", "\\n.ix", FutureWarning) try: - xp = self.get_result(obj, method2, k2, a) + xp = self.get_result( + obj=obj, method=method2, key=key2, axis=axis + ) except (KeyError, IndexError): # TODO: why is this allowed? 
result = "no comp" @@ -228,8 +226,8 @@ def _print(result, error=None): else: tm.assert_equal(rs, xp) result = "ok" - except AssertionError as e: - detail = str(e) + except AssertionError as exc: + detail = str(exc) result = "fail" # reverse the checks @@ -258,36 +256,25 @@ def _print(result, error=None): if typs is None: typs = self._typs - if objs is None: - objs = self._objs + if kinds is None: + kinds = self._kinds - if axes is not None: - if not isinstance(axes, (tuple, list)): - axes = [axes] - else: - axes = list(axes) - else: + if axes is None: axes = [0, 1] + elif not isinstance(axes, (tuple, list)): + assert isinstance(axes, int) + axes = [axes] # check - for o in objs: - if o not in self._objs: + for kind in kinds: + if kind not in self._kinds: continue - d = getattr(self, o) - for a in axes: - for t in typs: - if t not in self._typs: + d = getattr(self, kind) + for ax in axes: + for typ in typs: + if typ not in self._typs: continue - obj = d[t] - if obj is None: - continue - - def _call(obj=obj): - obj = obj.copy() - - k2 = key2 - _eq(t, o, a, obj, key1, k2) - - _call() + obj = d[typ] + _eq(typ=typ, kind=kind, axis=ax, obj=obj, key1=key1, key2=key2) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c3ba5c0545b8b..31120c2c023cc 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -284,7 +284,7 @@ def test_iloc_getitem_dups(self): [0, 1, 1, 3], "ix", {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, - objs=["series", "frame"], + kinds=["series", "frame"], typs=["ints", "uints"], ) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 0b8f3af760f1d..532b77d6519c1 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -19,9 +19,9 @@ def _check(f, func, values=False): expected = self.get_value(f, i, values) tm.assert_almost_equal(result, expected) - for o in self._objs: + for kind in self._kinds: - d = getattr(self, o) + d = getattr(self, kind) # iat for f in [d["ints"], d["uints"]]: @@ -47,9 +47,9 @@ def _check(f, func, values=False): expected = self.get_value(f, i, values) tm.assert_almost_equal(expected, 1) - for t in self._objs: + for kind in self._kinds: - d = getattr(self, t) + d = getattr(self, kind) # iat for f in [d["ints"], d["uints"]]: From 3954fa7b3c10729eb7fd5a13a92bf03e11e49b17 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Oct 2019 05:01:30 -0700 Subject: [PATCH 030/119] REF: use fused types for groupby_helper (#28886) --- pandas/_libs/groupby_helper.pxi.in | 250 ++++++++++++++++++----------- 1 file changed, 156 insertions(+), 94 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 000689f634545..6b434b6470581 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -12,39 +12,27 @@ _int64_max = np.iinfo(np.int64).max # group_nth, group_last, group_rank # ---------------------------------------------------------------------- -{{py: - -# name, c_type, nan_val -dtypes = [('float64', 'float64_t', 'NAN'), - ('float32', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'NPY_NAT'), - ('object', 'object', 'NAN')] - -def get_dispatch(dtypes): - - for name, c_type, nan_val in dtypes: - - yield name, c_type, nan_val -}} - - -{{for name, c_type, nan_val in get_dispatch(dtypes)}} +ctypedef fused rank_t: + float64_t + float32_t + int64_t + object @cython.wraparound(False) @cython.boundscheck(False) -def group_last_{{name}}({{c_type}}[:, :] out, - 
int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val - ndarray[{{c_type}}, ndim=2] resx + rank_t val + ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -53,19 +41,15 @@ def group_last_{{name}}({{c_type}}[:, :] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - {{if name == 'object'}} - resx = np.empty((out).shape, dtype=object) - {{else}} - resx = np.empty_like(out) - {{endif}} + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) N, K = (values).shape - {{if name == "object"}} - if True: # make templating happy - {{else}} - with nogil: - {{endif}} + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -76,36 +60,77 @@ def group_last_{{name}}({{c_type}}[:, :] out, val = values[i, j] # not nan - if ( - {{if not name.startswith("int")}} - val == val and - {{endif}} - val != {{nan_val}}): - nobs[lab, j] += 1 - resx[lab, j] = val + if rank_t is int64_t: + # need a special notna check + if val != NPY_NAT: + nobs[lab, j] += 1 + resx[lab, j] = val + else: + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + if rank_t is int64_t: + out[i, j] = NPY_NAT + else: + out[i, j] = NAN else: out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if rank_t is int64_t: + # need a special notna check + if val != NPY_NAT: + nobs[lab, j] += 1 + resx[lab, j] = val + else: + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] @cython.wraparound(False) @cython.boundscheck(False) -def group_nth_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val - ndarray[{{c_type}}, ndim=2] resx + rank_t val + ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -114,19 +139,15 @@ def group_nth_{{name}}({{c_type}}[:, :] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - {{if name=='object'}} - resx = np.empty((out).shape, dtype=object) - {{else}} - resx = np.empty_like(out) - {{endif}} + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) N, K = (values).shape - {{if name == "object"}} - if True: # make 
templating happy - {{else}} - with nogil: - {{endif}} + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -137,11 +158,7 @@ def group_nth_{{name}}({{c_type}}[:, :] out, val = values[i, j] # not nan - if ( - {{if not name.startswith("int")}} - val == val and - {{endif}} - val != {{nan_val}}): + if val == val: nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -149,28 +166,65 @@ def group_nth_{{name}}({{c_type}}[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + out[i, j] = NAN else: out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if rank_t is int64_t: + # need a special notna check + if val != NPY_NAT: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + else: + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + -{{if name != 'object'}} +group_nth_float64 = group_nth["float64_t"] +group_nth_float32 = group_nth["float32_t"] +group_nth_int64 = group_nth["int64_t"] +group_nth_object = group_nth["object"] @cython.boundscheck(False) @cython.wraparound(False) -def group_rank_{{name}}(float64_t[:, :] out, - {{c_type}}[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): +def group_rank(float64_t[:, :] out, + rank_t[:, :] values, + const int64_t[:] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): """ Provides the rank of values within each group. 
Parameters ---------- out : array of float64_t values which this method will write its results to - values : array of {{c_type}} values to be ranked + values : array of rank_t values to be ranked labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` is_datetimelike : bool, default False @@ -203,10 +257,13 @@ def group_rank_{{name}}(float64_t[:, :] out, Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 ndarray[int64_t] _as ndarray[float64_t, ndim=2] grp_sizes - ndarray[{{c_type}}] masked_vals + ndarray[rank_t] masked_vals ndarray[uint8_t] mask bint keep_na - {{c_type}} nan_fill_val + rank_t nan_fill_val + + if rank_t is object: + raise NotImplementedError("Cant do nogil") tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -217,25 +274,23 @@ def group_rank_{{name}}(float64_t[:, :] out, # with mask, without obfuscating location of missing data # in values array masked_vals = np.array(values[:, 0], copy=True) - {{if name == 'int64'}} - mask = (masked_vals == {{nan_val}}).astype(np.uint8) - {{else}} - mask = np.isnan(masked_vals).astype(np.uint8) - {{endif}} + if rank_t is int64_t: + mask = (masked_vals == NPY_NAT).astype(np.uint8) + else: + mask = np.isnan(masked_vals).astype(np.uint8) if ascending ^ (na_option == 'top'): - {{if name == 'int64'}} - nan_fill_val = np.iinfo(np.int64).max - {{else}} - nan_fill_val = np.inf - {{endif}} + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + else: + nan_fill_val = np.inf order = (masked_vals, mask, labels) else: - {{if name == 'int64'}} - nan_fill_val = np.iinfo(np.int64).min - {{else}} - nan_fill_val = -np.inf - {{endif}} + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).min + else: + nan_fill_val = -np.inf + order = (masked_vals, ~mask, labels) np.putmask(masked_vals, mask, nan_fill_val) @@ -337,8 +392,13 @@ def group_rank_{{name}}(float64_t[:, :] out, out[i, 0] = NAN elif grp_sizes[i, 0] != 0: out[i, 0] = out[i, 0] / grp_sizes[i, 0] -{{endif}} -{{endfor}} + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +# Note: we do not have a group_rank_object because that would require a +# not-nogil implementation, see GH#19560 # ---------------------------------------------------------------------- @@ -484,7 +544,8 @@ def group_cummin(groupby_t[:, :] out, const int64_t[:] labels, int ngroups, bint is_datetimelike): - """Cumulative minimum of columns of `values`, in row groups `labels`. + """ + Cumulative minimum of columns of `values`, in row groups `labels`. Parameters ---------- @@ -548,9 +609,10 @@ def group_cummin(groupby_t[:, :] out, def group_cummax(groupby_t[:, :] out, groupby_t[:, :] values, const int64_t[:] labels, - int ngroups, + int ngroups, bint is_datetimelike): - """Cumulative maximum of columns of `values`, in row groups `labels`. + """ + Cumulative maximum of columns of `values`, in row groups `labels`. 
Parameters ---------- From 6241b9d3b3b8fd688cf32e45539719f1b9ec25c1 Mon Sep 17 00:00:00 2001 From: Jeroen Kant <45035434+jjlkant@users.noreply.github.com> Date: Fri, 11 Oct 2019 14:09:33 +0200 Subject: [PATCH 031/119] BUG: Allow all int types for merge (GH28870) (#28875) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/reshape/merge.py | 3 +-- pandas/tests/reshape/merge/test_merge_asof.py | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cde2a4279cf27..da75e2c49ae10 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -345,6 +345,7 @@ Reshaping - Bug :func:`merge_asof` could not use :class:`datetime.timedelta` for ``tolerance`` kwarg (:issue:`28098`) - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) +- Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 910c7ea561929..7bfc8153da568 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,7 +28,6 @@ is_dtype_equal, is_extension_array_dtype, is_float_dtype, - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, @@ -1641,7 +1640,7 @@ def _get_merge_keys(self): if self.tolerance < Timedelta(0): raise MergeError("tolerance must be positive") - elif is_int64_dtype(lt): + elif is_integer_dtype(lt): if not is_integer(self.tolerance): raise MergeError(msg) if self.tolerance < 0: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index caf2539a9e150..2e9ae80323159 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1287,3 +1287,19 @@ def test_timedelta_tolerance_nearest(self): ) assert_frame_equal(result, expected) + + def test_int_type_tolerance(self, any_int_dtype): + # GH #28870 + + left = pd.DataFrame({"a": [0, 10, 20], "left_val": [1, 2, 3]}) + right = pd.DataFrame({"a": [5, 15, 25], "right_val": [1, 2, 3]}) + left["a"] = left["a"].astype(any_int_dtype) + right["a"] = right["a"].astype(any_int_dtype) + + expected = pd.DataFrame( + {"a": [0, 10, 20], "left_val": [1, 2, 3], "right_val": [np.nan, 1.0, 2.0]} + ) + expected["a"] = expected["a"].astype(any_int_dtype) + + result = pd.merge_asof(left, right, on="a", tolerance=10) + assert_frame_equal(result, expected) From 0474c7fd6f70952d3ff56f6de9bbf8f934131f74 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Oct 2019 05:24:27 -0700 Subject: [PATCH 032/119] TST: Fix 36 maybe_promote xfails wanting np.bytes_ instead of np.object_ (#28861) --- pandas/tests/dtypes/cast/test_promote.py | 32 ++++++------------------ 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index f2fc1136fbd35..1989ab3f93e89 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -485,25 +485,13 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if issubclass(fill_dtype.type, np.bytes_): - if not boxed or box_dtype == object: 
- pytest.xfail("falsely upcasts to object") - # takes the opinion that bool dtype has no missing value marker - else: - pytest.xfail("wrong missing value marker") - else: - if boxed and box_dtype is None: - pytest.xfail("does not upcast to object") - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] - # filling bytes with anything but bytes casts to object - expected_dtype = ( - dtype if issubclass(fill_dtype.type, np.bytes_) else np.dtype(object) - ) + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) exp_val_for_scalar = fill_value - exp_val_for_array = None if issubclass(fill_dtype.type, np.bytes_) else np.nan + exp_val_for_array = np.nan _check_promote( dtype, @@ -521,13 +509,7 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box) fill_dtype = np.dtype(bytes_dtype) boxed, box_dtype = box # read from parametrized fixture - if issubclass(dtype.type, np.bytes_): - if not boxed or box_dtype == object: - pytest.xfail("falsely upcasts to object") - # takes the opinion that bool dtype has no missing value marker - else: - pytest.xfail("wrong missing value marker") - else: + if not issubclass(dtype.type, np.bytes_): if ( boxed and (box_dtype == "bytes" or box_dtype is None) @@ -541,11 +523,11 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box) # special case for box_dtype (cannot use fixture in parametrization) box_dtype = fill_dtype if box_dtype == "bytes" else box_dtype - # filling bytes with anything but bytes casts to object - expected_dtype = dtype if issubclass(dtype.type, np.bytes_) else np.dtype(object) + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) # output is not a generic bytes, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - exp_val_for_array = None if issubclass(dtype.type, np.bytes_) else np.nan + exp_val_for_array = np.nan _check_promote( dtype, From 3fba92ec50275a6b0b87f282bf29872fd71dd422 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 11 Oct 2019 13:41:19 +0100 Subject: [PATCH 033/119] TYPING: fix type annotation for pandas.io.formats.format._binify (#28908) --- pandas/io/formats/format.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 15f21814b072d..ad62c56a337b6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -868,6 +868,8 @@ def _join_multiline(self, *args) -> str: np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 for col in strcols ] + + assert lwidth is not None col_bins = _binify(col_widths, lwidth) nbins = len(col_bins) @@ -1890,7 +1892,7 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non set_option("display.column_space", max(12, accuracy + 9)) -def _binify(cols: List[np.int32], line_width: Union[np.int32, int]) -> List[int]: +def _binify(cols: List[int], line_width: int) -> List[int]: adjoin_width = 1 bins = [] curr_width = 0 From 894eac64d764ff3cc417342d8106ba24d78d5d91 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 11 Oct 2019 17:13:01 +0200 Subject: [PATCH 034/119] TST: add test coverage for maybe_promote (#23982) --- pandas/tests/dtypes/cast/test_promote.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 1989ab3f93e89..5c61574eddb50 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -504,6 +504,16 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) ) +# override parametrization of box to add special case for bytes +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with auto-dtype (fixed len) + (True, "bytes"), # fill_value wrapped in array with generic bytes-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value directly + ], +) def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(bytes_dtype) From 981299c1464124c34822b60dc1c3176494e08219 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 11 Oct 2019 09:14:06 -0600 Subject: [PATCH 035/119] TST: Use fixtures instead of setup_method for index tests (#28865) --- pandas/tests/indexes/common.py | 720 +++++++++--------- pandas/tests/indexes/conftest.py | 43 +- pandas/tests/indexes/datetimelike.py | 22 +- .../indexes/datetimes/test_datetimelike.py | 13 +- pandas/tests/indexes/interval/test_base.py | 7 +- pandas/tests/indexes/period/test_period.py | 15 +- pandas/tests/indexes/test_base.py | 504 ++++++------ pandas/tests/indexes/test_category.py | 19 +- pandas/tests/indexes/test_numeric.py | 305 ++++---- pandas/tests/indexes/test_range.py | 259 +++---- pandas/tests/indexes/test_setops.py | 9 +- .../indexes/timedeltas/test_timedelta.py | 6 +- 12 files changed, 959 insertions(+), 963 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 793992d311502..b657d8d16df81 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -33,10 +33,6 @@ class Base: _holder = None _compat_props = ["shape", "ndim", "size", "nbytes"] - def setup_indices(self): - for name, idx in self.indices.items(): - setattr(self, name, idx) - def test_pickle_compat_construction(self): # need an object to create with msg = ( @@ -205,24 +201,23 @@ def test_reindex_base(self): with pytest.raises(ValueError, match="Invalid fill method"): idx.get_indexer(idx, method="invalid") - def test_get_indexer_consistency(self): + def test_get_indexer_consistency(self, indices): # See GH 16819 - for name, index in self.indices.items(): - if isinstance(index, IntervalIndex): - continue - - if index.is_unique or isinstance(index, CategoricalIndex): - indexer = index.get_indexer(index[0:2]) - assert isinstance(indexer, np.ndarray) - assert indexer.dtype == np.intp - else: - e = "Reindexing only valid with uniquely valued Index objects" - with pytest.raises(InvalidIndexError, match=e): - index.get_indexer(index[0:2]) + if isinstance(indices, IntervalIndex): + return - indexer, _ = index.get_indexer_non_unique(index[0:2]) + if indices.is_unique or isinstance(indices, CategoricalIndex): + indexer = indices.get_indexer(indices[0:2]) assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp + else: + e = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=e): + indices.get_indexer(indices[0:2]) + + indexer, _ = indices.get_indexer_non_unique(indices[0:2]) + assert isinstance(indexer, np.ndarray) + assert indexer.dtype == np.intp def test_ndarray_compat_properties(self): idx = self.create_index() @@ -258,146 +253,138 @@ def 
test_repr_max_seq_item_setting(self): repr(idx) assert "..." not in str(idx) - def test_copy_name(self): + def test_copy_name(self, indices): # gh-12309: Check that the "name" argument # passed at initialization is honored. + if isinstance(indices, MultiIndex): + return - for name, index in self.indices.items(): - if isinstance(index, MultiIndex): - continue - - first = index.__class__(index, copy=True, name="mario") - second = first.__class__(first, copy=False) + first = indices.__class__(indices, copy=True, name="mario") + second = first.__class__(first, copy=False) - # Even though "copy=False", we want a new object. - assert first is not second + # Even though "copy=False", we want a new object. + assert first is not second - # Not using tm.assert_index_equal() since names differ. - assert index.equals(first) + # Not using tm.assert_index_equal() since names differ. + assert indices.equals(first) - assert first.name == "mario" - assert second.name == "mario" + assert first.name == "mario" + assert second.name == "mario" - s1 = Series(2, index=first) - s2 = Series(3, index=second[:-1]) + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) - if not isinstance(index, CategoricalIndex): - # See gh-13365 - s3 = s1 * s2 - assert s3.index.name == "mario" + if not isinstance(indices, CategoricalIndex): + # See gh-13365 + s3 = s1 * s2 + assert s3.index.name == "mario" - def test_ensure_copied_data(self): + def test_ensure_copied_data(self, indices): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 - for name, index in self.indices.items(): - init_kwargs = {} - if isinstance(index, PeriodIndex): - # Needs "freq" specification: - init_kwargs["freq"] = index.freq - elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)): - # RangeIndex cannot be initialized from data - # MultiIndex and CategoricalIndex are tested separately - continue - - index_type = index.__class__ - result = index_type(index.values, copy=True, **init_kwargs) - tm.assert_index_equal(index, result) + init_kwargs = {} + if isinstance(indices, PeriodIndex): + # Needs "freq" specification: + init_kwargs["freq"] = indices.freq + elif isinstance(indices, (RangeIndex, MultiIndex, CategoricalIndex)): + # RangeIndex cannot be initialized from data + # MultiIndex and CategoricalIndex are tested separately + return + + index_type = indices.__class__ + result = index_type(indices.values, copy=True, **init_kwargs) + tm.assert_index_equal(indices, result) + tm.assert_numpy_array_equal( + indices._ndarray_values, result._ndarray_values, check_same="copy" + ) + + if isinstance(indices, PeriodIndex): + # .values an object array of Period, thus copied + result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) tm.assert_numpy_array_equal( - index._ndarray_values, result._ndarray_values, check_same="copy" + indices._ndarray_values, result._ndarray_values, check_same="same" + ) + elif isinstance(indices, IntervalIndex): + # checked in test_interval.py + pass + else: + result = index_type(indices.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal( + indices.values, result.values, check_same="same" + ) + tm.assert_numpy_array_equal( + indices._ndarray_values, result._ndarray_values, check_same="same" ) - if isinstance(index, PeriodIndex): - # .values an object array of Period, thus copied - result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - index._ndarray_values, result._ndarray_values, check_same="same" - ) - elif isinstance(index, 
IntervalIndex): - # checked in test_interval.py - pass - else: - result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - index.values, result.values, check_same="same" - ) - tm.assert_numpy_array_equal( - index._ndarray_values, result._ndarray_values, check_same="same" - ) - - def test_memory_usage(self): - for name, index in self.indices.items(): - result = index.memory_usage() - if len(index): - index.get_loc(index[0]) - result2 = index.memory_usage() - result3 = index.memory_usage(deep=True) - - # RangeIndex, IntervalIndex - # don't have engines - if not isinstance(index, (RangeIndex, IntervalIndex)): - assert result2 > result - - if index.inferred_type == "object": - assert result3 > result2 - - else: - - # we report 0 for no-length - assert result == 0 - - def test_argsort(self): - for k, ind in self.indices.items(): - - # separately tested - if k in ["catIndex"]: - continue - - result = ind.argsort() - expected = np.array(ind).argsort() - tm.assert_numpy_array_equal(result, expected, check_dtype=False) - - def test_numpy_argsort(self): - for k, ind in self.indices.items(): - result = np.argsort(ind) - expected = ind.argsort() - tm.assert_numpy_array_equal(result, expected) - - # these are the only two types that perform - # pandas compatibility input validation - the - # rest already perform separate (or no) such - # validation via their 'values' attribute as - # defined in pandas.core.indexes/base.py - they - # cannot be changed at the moment due to - # backwards compatibility concerns - if isinstance(type(ind), (CategoricalIndex, RangeIndex)): - msg = "the 'axis' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argsort(ind, axis=1) - - msg = "the 'kind' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argsort(ind, kind="mergesort") - - msg = "the 'order' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argsort(ind, order=("a", "b")) - - def test_take(self): + def test_memory_usage(self, indices): + indices._engine.clear_mapping() + result = indices.memory_usage() + if indices.empty: + # we report 0 for no-length + assert result == 0 + return + + # non-zero length + indices.get_loc(indices[0]) + result2 = indices.memory_usage() + result3 = indices.memory_usage(deep=True) + + # RangeIndex, IntervalIndex + # don't have engines + if not isinstance(indices, (RangeIndex, IntervalIndex)): + assert result2 > result + + if indices.inferred_type == "object": + assert result3 > result2 + + def test_argsort(self, request, indices): + # separately tested + if isinstance(indices, CategoricalIndex): + return + + result = indices.argsort() + expected = np.array(indices).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_numpy_argsort(self, indices): + result = np.argsort(indices) + expected = indices.argsort() + tm.assert_numpy_array_equal(result, expected) + + # these are the only two types that perform + # pandas compatibility input validation - the + # rest already perform separate (or no) such + # validation via their 'values' attribute as + # defined in pandas.core.indexes/base.py - they + # cannot be changed at the moment due to + # backwards compatibility concerns + if isinstance(type(indices), (CategoricalIndex, RangeIndex)): + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(indices, axis=1) + + msg = "the 'kind' parameter is not supported" + with pytest.raises(ValueError, 
match=msg): + np.argsort(indices, kind="mergesort") + + msg = "the 'order' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(indices, order=("a", "b")) + + def test_take(self, indices): indexer = [4, 3, 0, 2] - for k, ind in self.indices.items(): - - # separate - if k in ["boolIndex", "tuples", "empty"]: - continue + if len(indices) < 5: + # not enough elements; ignore + return - result = ind.take(indexer) - expected = ind[indexer] - assert result.equals(expected) + result = indices.take(indexer) + expected = indices[indexer] + assert result.equals(expected) - if not isinstance(ind, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - # GH 10791 - with pytest.raises(AttributeError): - ind.freq + if not isinstance(indices, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # GH 10791 + with pytest.raises(AttributeError): + indices.freq def test_take_invalid_kwargs(self): idx = self.create_index() @@ -454,173 +441,152 @@ def test_where(self, klass): @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) - def test_set_ops_error_cases(self, case, method): - for name, idx in self.indices.items(): - # non-iterable input + def test_set_ops_error_cases(self, case, method, indices): + # non-iterable input + msg = "Input must be Index or array-like" + with pytest.raises(TypeError, match=msg): + getattr(indices, method)(case) - msg = "Input must be Index or array-like" - with pytest.raises(TypeError, match=msg): - getattr(idx, method)(case) + def test_intersection_base(self, indices): + if isinstance(indices, CategoricalIndex): + return - def test_intersection_base(self): - for name, idx in self.indices.items(): - first = idx[:5] - second = idx[:3] - intersect = first.intersection(second) + first = indices[:5] + second = indices[:3] + intersect = first.intersection(second) + assert tm.equalContents(intersect, second) - if isinstance(idx, CategoricalIndex): - pass - else: - assert tm.equalContents(intersect, second) - - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - if isinstance(idx, CategoricalIndex): - pass - else: - result = first.intersection(case) - assert tm.equalContents(result, second) - - if isinstance(idx, MultiIndex): - msg = "other must be a MultiIndex or a list of tuples" - with pytest.raises(TypeError, match=msg): - first.intersection([1, 2, 3]) - - def test_union_base(self): - for name, idx in self.indices.items(): - first = idx[3:] - second = idx[:5] - everything = idx - union = first.union(second) - assert tm.equalContents(union, everything) - - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - if isinstance(idx, CategoricalIndex): - pass - else: - result = first.union(case) - assert tm.equalContents(result, everything) - - if isinstance(idx, MultiIndex): - msg = "other must be a MultiIndex or a list of tuples" - with pytest.raises(TypeError, match=msg): - first.union([1, 2, 3]) + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + assert tm.equalContents(result, second) - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_base(self, sort): - for name, idx in self.indices.items(): - first = idx[2:] - second = idx[:4] - answer = idx[4:] - result = first.difference(second, sort) - - if isinstance(idx, CategoricalIndex): - pass - else: - assert tm.equalContents(result, answer) + if 
isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.intersection([1, 2, 3]) + + def test_union_base(self, indices): + first = indices[3:] + second = indices[:5] + everything = indices + union = first.union(second) + assert tm.equalContents(union, everything) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if not isinstance(indices, CategoricalIndex): + result = first.union(case) + assert tm.equalContents(result, everything) + + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.union([1, 2, 3]) - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - if isinstance(idx, CategoricalIndex): - pass - elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): - assert result.__class__ == answer.__class__ - tm.assert_numpy_array_equal( - result.sort_values().asi8, answer.sort_values().asi8 - ) - else: - result = first.difference(case, sort) - assert tm.equalContents(result, answer) - - if isinstance(idx, MultiIndex): - msg = "other must be a MultiIndex or a list of tuples" - with pytest.raises(TypeError, match=msg): - first.difference([1, 2, 3], sort) - - def test_symmetric_difference(self): - for name, idx in self.indices.items(): - first = idx[1:] - second = idx[:-1] - if isinstance(idx, CategoricalIndex): - pass + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_base(self, sort, indices): + if isinstance(indices, CategoricalIndex): + return + + first = indices[2:] + second = indices[:4] + answer = indices[4:] + result = first.difference(second, sort) + assert tm.equalContents(result, answer) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(indices, (DatetimeIndex, TimedeltaIndex)): + assert result.__class__ == answer.__class__ + tm.assert_numpy_array_equal( + result.sort_values().asi8, answer.sort_values().asi8 + ) else: - answer = idx[[0, -1]] - result = first.symmetric_difference(second) + result = first.difference(case, sort) assert tm.equalContents(result, answer) - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - if isinstance(idx, CategoricalIndex): - pass - else: - result = first.symmetric_difference(case) - assert tm.equalContents(result, answer) - - if isinstance(idx, MultiIndex): - msg = "other must be a MultiIndex or a list of tuples" - with pytest.raises(TypeError, match=msg): - first.symmetric_difference([1, 2, 3]) - - def test_insert_base(self): - - for name, idx in self.indices.items(): - result = idx[1:4] - - if not len(idx): - continue + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.difference([1, 2, 3], sort) + + def test_symmetric_difference(self, indices): + if isinstance(indices, CategoricalIndex): + return + + first = indices[1:] + second = indices[:-1] + answer = indices[[0, -1]] + result = first.symmetric_difference(second) + assert tm.equalContents(result, answer) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.symmetric_difference(case) + assert tm.equalContents(result, answer) + + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list 
of tuples" + with pytest.raises(TypeError, match=msg): + first.symmetric_difference([1, 2, 3]) - # test 0th element - assert idx[0:4].equals(result.insert(0, idx[0])) + def test_insert_base(self, indices): + result = indices[1:4] - def test_delete_base(self): + if not len(indices): + return - for name, idx in self.indices.items(): + # test 0th element + assert indices[0:4].equals(result.insert(0, indices[0])) - if not len(idx): - continue + def test_delete_base(self, indices): + if not len(indices): + return - if isinstance(idx, RangeIndex): - # tested in class - continue + if isinstance(indices, RangeIndex): + # tested in class + return - expected = idx[1:] - result = idx.delete(0) - assert result.equals(expected) - assert result.name == expected.name + expected = indices[1:] + result = indices.delete(0) + assert result.equals(expected) + assert result.name == expected.name - expected = idx[:-1] - result = idx.delete(-1) - assert result.equals(expected) - assert result.name == expected.name + expected = indices[:-1] + result = indices.delete(-1) + assert result.equals(expected) + assert result.name == expected.name - with pytest.raises((IndexError, ValueError)): - # either depending on numpy version - idx.delete(len(idx)) + with pytest.raises((IndexError, ValueError)): + # either depending on numpy version + indices.delete(len(indices)) - def test_equals(self): + def test_equals(self, indices): + if isinstance(indices, IntervalIndex): + # IntervalIndex tested separately + return - for name, idx in self.indices.items(): - assert idx.equals(idx) - assert idx.equals(idx.copy()) - assert idx.equals(idx.astype(object)) + assert indices.equals(indices) + assert indices.equals(indices.copy()) + assert indices.equals(indices.astype(object)) - assert not idx.equals(list(idx)) - assert not idx.equals(np.array(idx)) + assert not indices.equals(list(indices)) + assert not indices.equals(np.array(indices)) - # Cannot pass in non-int64 dtype to RangeIndex - if not isinstance(idx, RangeIndex): - same_values = Index(idx, dtype=object) - assert idx.equals(same_values) - assert same_values.equals(idx) + # Cannot pass in non-int64 dtype to RangeIndex + if not isinstance(indices, RangeIndex): + same_values = Index(indices, dtype=object) + assert indices.equals(same_values) + assert same_values.equals(indices) - if idx.nlevels == 1: - # do not test MultiIndex - assert not idx.equals(pd.Series(idx)) + if indices.nlevels == 1: + # do not test MultiIndex + assert not indices.equals(Series(indices)) def test_equals_op(self): # GH9947, GH10637 @@ -686,107 +652,99 @@ def test_equals_op(self): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) - def test_hasnans_isnans(self): + def test_hasnans_isnans(self, indices): # GH 11343, added tests for hasnans / isnans + if isinstance(indices, MultiIndex): + return + + # cases in indices doesn't include NaN + idx = indices.copy(deep=True) + expected = np.array([False] * len(idx), dtype=bool) + tm.assert_numpy_array_equal(idx._isnan, expected) + assert idx.hasnans is False + + idx = indices.copy(deep=True) + values = np.asarray(idx.values) + + if len(indices) == 0: + return + elif isinstance(indices, DatetimeIndexOpsMixin): + values[1] = iNaT + elif isinstance(indices, (Int64Index, UInt64Index)): + return + else: + values[1] = np.nan - for name, index in self.indices.items(): - if isinstance(index, MultiIndex): - pass - else: - idx = index.copy() - - # cases in indices doesn't include NaN - expected = 
np.array([False] * len(idx), dtype=bool) - tm.assert_numpy_array_equal(idx._isnan, expected) - assert idx.hasnans is False - - idx = index.copy() - values = np.asarray(idx.values) - - if len(index) == 0: - continue - elif isinstance(index, DatetimeIndexOpsMixin): - values[1] = iNaT - elif isinstance(index, (Int64Index, UInt64Index)): - continue - else: - values[1] = np.nan - - if isinstance(index, PeriodIndex): - idx = index.__class__(values, freq=index.freq) - else: - idx = index.__class__(values) - - expected = np.array([False] * len(idx), dtype=bool) - expected[1] = True - tm.assert_numpy_array_equal(idx._isnan, expected) - assert idx.hasnans is True - - def test_fillna(self): + if isinstance(indices, PeriodIndex): + idx = indices.__class__(values, freq=indices.freq) + else: + idx = indices.__class__(values) + + expected = np.array([False] * len(idx), dtype=bool) + expected[1] = True + tm.assert_numpy_array_equal(idx._isnan, expected) + assert idx.hasnans is True + + def test_fillna(self, indices): # GH 11343 - for name, index in self.indices.items(): - if len(index) == 0: - pass - elif isinstance(index, MultiIndex): - idx = index.copy() - msg = "isna is not defined for MultiIndex" - with pytest.raises(NotImplementedError, match=msg): - idx.fillna(idx[0]) + if len(indices) == 0: + pass + elif isinstance(indices, MultiIndex): + idx = indices.copy(deep=True) + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.fillna(idx[0]) + else: + idx = indices.copy(deep=True) + result = idx.fillna(idx[0]) + tm.assert_index_equal(result, idx) + assert result is not idx + + msg = "'value' must be a scalar, passed: " + with pytest.raises(TypeError, match=msg): + idx.fillna([idx[0]]) + + idx = indices.copy(deep=True) + values = np.asarray(idx.values) + + if isinstance(indices, DatetimeIndexOpsMixin): + values[1] = iNaT + elif isinstance(indices, (Int64Index, UInt64Index)): + return else: - idx = index.copy() - result = idx.fillna(idx[0]) - tm.assert_index_equal(result, idx) - assert result is not idx - - msg = "'value' must be a scalar, passed: " - with pytest.raises(TypeError, match=msg): - idx.fillna([idx[0]]) - - idx = index.copy() - values = np.asarray(idx.values) - - if isinstance(index, DatetimeIndexOpsMixin): - values[1] = iNaT - elif isinstance(index, (Int64Index, UInt64Index)): - continue - else: - values[1] = np.nan - - if isinstance(index, PeriodIndex): - idx = index.__class__(values, freq=index.freq) - else: - idx = index.__class__(values) - - expected = np.array([False] * len(idx), dtype=bool) - expected[1] = True - tm.assert_numpy_array_equal(idx._isnan, expected) - assert idx.hasnans is True - - def test_nulls(self): - # this is really a smoke test for the methods - # as these are adequately tested for function elsewhere + values[1] = np.nan - for name, index in self.indices.items(): - if len(index) == 0: - tm.assert_numpy_array_equal(index.isna(), np.array([], dtype=bool)) - elif isinstance(index, MultiIndex): - idx = index.copy() - msg = "isna is not defined for MultiIndex" - with pytest.raises(NotImplementedError, match=msg): - idx.isna() + if isinstance(indices, PeriodIndex): + idx = indices.__class__(values, freq=indices.freq) else: + idx = indices.__class__(values) - if not index.hasnans: - tm.assert_numpy_array_equal( - index.isna(), np.zeros(len(index), dtype=bool) - ) - tm.assert_numpy_array_equal( - index.notna(), np.ones(len(index), dtype=bool) - ) - else: - result = isna(index) - tm.assert_numpy_array_equal(index.isna(), 
result) - tm.assert_numpy_array_equal(index.notna(), ~result) + expected = np.array([False] * len(idx), dtype=bool) + expected[1] = True + tm.assert_numpy_array_equal(idx._isnan, expected) + assert idx.hasnans is True + + def test_nulls(self, indices): + # this is really a smoke test for the methods + # as these are adequately tested for function elsewhere + if len(indices) == 0: + tm.assert_numpy_array_equal(indices.isna(), np.array([], dtype=bool)) + elif isinstance(indices, MultiIndex): + idx = indices.copy() + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.isna() + elif not indices.hasnans: + tm.assert_numpy_array_equal( + indices.isna(), np.zeros(len(indices), dtype=bool) + ) + tm.assert_numpy_array_equal( + indices.notna(), np.ones(len(indices), dtype=bool) + ) + else: + result = isna(indices) + tm.assert_numpy_array_equal(indices.isna(), result) + tm.assert_numpy_array_equal(indices.notna(), ~result) def test_empty(self): # GH 15270 diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 12c5fb8339549..2a9a8bf8d824f 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,28 +5,29 @@ from pandas.core.indexes.api import Index, MultiIndex import pandas.util.testing as tm -indices_list = [ - tm.makeUnicodeIndex(100), - tm.makeStringIndex(100), - tm.makeDateIndex(100), - tm.makePeriodIndex(100), - tm.makeTimedeltaIndex(100), - tm.makeIntIndex(100), - tm.makeUIntIndex(100), - tm.makeRangeIndex(100), - tm.makeFloatIndex(100), - Index([True, False]), - tm.makeCategoricalIndex(100), - tm.makeIntervalIndex(100), - Index([]), - MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), - Index([0, 0, 1, 1, 2, 2]), -] - - -@pytest.fixture(params=indices_list, ids=lambda x: type(x).__name__) +indices_dict = { + "unicode": tm.makeUnicodeIndex(100), + "string": tm.makeStringIndex(100), + "datetime": tm.makeDateIndex(100), + "period": tm.makePeriodIndex(100), + "timedelta": tm.makeTimedeltaIndex(100), + "int": tm.makeIntIndex(100), + "uint": tm.makeUIntIndex(100), + "range": tm.makeRangeIndex(100), + "float": tm.makeFloatIndex(100), + "bool": Index([True, False]), + "categorical": tm.makeCategoricalIndex(100), + "interval": tm.makeIntervalIndex(100), + "empty": Index([]), + "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + "repeats": Index([0, 0, 1, 1, 2, 2]), +} + + +@pytest.fixture(params=indices_dict.keys()) def indices(request): - return request.param + # copy to avoid mutation, e.g. 
setting .name + return indices_dict[request.param].copy() @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 7523b250ea291..f7cded9f44918 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -58,13 +58,14 @@ def test_view(self): tm.assert_index_equal(result, i_view) def test_map_callable(self): - expected = self.index + self.index.freq - result = self.index.map(lambda x: x + x.freq) + index = self.create_index() + expected = index + index.freq + result = index.map(lambda x: x + x.freq) tm.assert_index_equal(result, expected) # map to NaT - result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) - expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + result = index.map(lambda x: pd.NaT if x == index[0] else x) + expected = pd.Index([pd.NaT] + index[1:].tolist()) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -75,23 +76,24 @@ def test_map_callable(self): ], ) def test_map_dictlike(self, mapper): - expected = self.index + self.index.freq + index = self.create_index() + expected = index + index.freq # don't compare the freqs if isinstance(expected, pd.DatetimeIndex): expected.freq = None - result = self.index.map(mapper(expected, self.index)) + result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) - expected = pd.Index([pd.NaT] + self.index[1:].tolist()) - result = self.index.map(mapper(expected, self.index)) + expected = pd.Index([pd.NaT] + index[1:].tolist()) + result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) # empty map; these map to np.nan because we cannot know # to re-infer things - expected = pd.Index([np.nan] * len(self.index)) - result = self.index.map(mapper([], [])) + expected = pd.Index([np.nan] * len(index)) + result = index.map(mapper([], [])) tm.assert_index_equal(result, expected) def test_asobject_deprecated(self): diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 0f1d7927ee3b4..8fa87f55f404b 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -1,4 +1,5 @@ """ generic tests from the Datetimelike class """ +import pytest from pandas import DatetimeIndex, date_range from pandas.util import testing as tm @@ -9,12 +10,12 @@ class TestDatetimeIndex(DatetimeLike): _holder = DatetimeIndex - def setup_method(self, method): - self.indices = dict( - index=tm.makeDateIndex(10), - index_dec=date_range("20130110", periods=10, freq="-1D"), - ) - self.setup_indices() + @pytest.fixture( + params=[tm.makeDateIndex(10), date_range("20130110", periods=10, freq="-1D")], + ids=["index_inc", "index_dec"], + ) + def indices(self, request): + return request.param def create_index(self): return date_range("20130101", periods=5) diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index b2cb29dafac09..339bdaf79c690 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -14,10 +14,9 @@ class TestBase(Base): _holder = IntervalIndex - def setup_method(self, method): - self.index = IntervalIndex.from_arrays([0, 1], [1, 2]) - self.index_with_nan = IntervalIndex.from_tuples([(0, 1), np.nan, (1, 2)]) - self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) + @pytest.fixture + def indices(self): + return 
tm.makeIntervalIndex(10) def create_index(self, closed="right"): return IntervalIndex.from_breaks(range(11), closed=closed) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ee37be7ab4c14..1a2c58bdfce37 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -25,12 +25,15 @@ class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex - def setup_method(self, method): - self.indices = dict( - index=tm.makePeriodIndex(10), - index_dec=period_range("20130101", periods=10, freq="D")[::-1], - ) - self.setup_indices() + @pytest.fixture( + params=[ + tm.makePeriodIndex(10), + period_range("20130101", periods=10, freq="D")[::-1], + ], + ids=["index_inc", "index_dec"], + ) + def indices(self, request): + return request.param def create_index(self): return period_range("20130101", periods=5, freq="D") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 82d5ddd1ac358..0dc6d24202c34 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -41,6 +41,7 @@ from pandas.core.indexes.api import Index, MultiIndex from pandas.core.sorting import safe_sort from pandas.tests.indexes.common import Base +from pandas.tests.indexes.conftest import indices_dict import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -48,73 +49,57 @@ class TestIndex(Base): _holder = Index - def setup_method(self, method): - self.indices = dict( - unicodeIndex=tm.makeUnicodeIndex(100), - strIndex=tm.makeStringIndex(100), - dateIndex=tm.makeDateIndex(100), - periodIndex=tm.makePeriodIndex(100), - tdIndex=tm.makeTimedeltaIndex(100), - intIndex=tm.makeIntIndex(100), - uintIndex=tm.makeUIntIndex(100), - rangeIndex=tm.makeRangeIndex(100), - floatIndex=tm.makeFloatIndex(100), - boolIndex=Index([True, False]), - catIndex=tm.makeCategoricalIndex(100), - empty=Index([]), - tuples=MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), - repeats=Index([0, 0, 1, 1, 2, 2]), - ) - self.setup_indices() + @pytest.fixture + def index(self, request): + """ + Fixture for selectively parametrizing indices_dict via indirect parametrization + (parametrize over indices_dict keys with indirect=True). Defaults to string + index if no keys are provided. + """ + key = getattr(request, "param", "string") + + # copy to avoid mutation, e.g. 
setting .name + return indices_dict[key].copy() def create_index(self): return Index(list("abcde")) - def generate_index_types(self, skip_index_keys=[]): - """ - Return a generator of the various index types, leaving - out the ones with a key in skip_index_keys - """ - for key, index in self.indices.items(): - if key not in skip_index_keys: - yield key, index - def test_can_hold_identifiers(self): index = self.create_index() key = index[0] assert index._can_hold_identifiers_and_holds_name(key) is True - def test_new_axis(self): - new_index = self.dateIndex[None, :] + @pytest.mark.parametrize("index", ["datetime"], indirect=True) + def test_new_axis(self, index): + new_index = index[None, :] assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) - def test_copy_and_deepcopy(self): - new_copy2 = self.intIndex.copy(dtype=int) + @pytest.mark.parametrize("index", ["int", "uint", "float"], indirect=True) + def test_copy_and_deepcopy(self, index): + new_copy2 = index.copy(dtype=int) assert new_copy2.dtype.kind == "i" - @pytest.mark.parametrize("attr", ["strIndex", "dateIndex"]) - def test_constructor_regular(self, attr): - # regular instance creation - index = getattr(self, attr) - tm.assert_contains_all(index, index) + def test_constructor_regular(self, indices): + tm.assert_contains_all(indices, indices) - def test_constructor_casting(self): + def test_constructor_casting(self, index): # casting - arr = np.array(self.strIndex) - index = Index(arr) - tm.assert_contains_all(arr, index) - tm.assert_index_equal(self.strIndex, index) + arr = np.array(index) + new_index = Index(arr) + tm.assert_contains_all(arr, new_index) + tm.assert_index_equal(index, new_index) - def test_constructor_copy(self): + def test_constructor_copy(self, index): # copy - arr = np.array(self.strIndex) - index = Index(arr, copy=True, name="name") - assert isinstance(index, Index) - assert index.name == "name" - tm.assert_numpy_array_equal(arr, index.values) + # index = self.create_index() + arr = np.array(index) + new_index = Index(arr, copy=True, name="name") + assert isinstance(new_index, Index) + assert new_index.name == "name" + tm.assert_numpy_array_equal(arr, new_index.values) arr[0] = "SOMEBIGLONGSTRING" - assert index[0] != "SOMEBIGLONGSTRING" + assert new_index[0] != "SOMEBIGLONGSTRING" # what to do here? # arr = np.array(5.) 
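# --- Illustrative aside (not part of this patch) ----------------------------
# The hunks in this file lean on pytest's indirect parametrization: a string
# passed to @pytest.mark.parametrize is routed into the fixture as
# request.param, and the fixture falls back to a default key when a test does
# not parametrize it. A self-contained sketch of the pattern (the sample dict
# and names here are illustrative, not pandas code):
import pytest

samples = {"short": [1], "long": [1, 2, 3]}

@pytest.fixture
def data(request):
    key = getattr(request, "param", "short")  # default when not parametrized
    return list(samples[key])  # copy so tests cannot mutate shared state

def test_uses_default(data):
    assert data == [1]

@pytest.mark.parametrize("data", ["long"], indirect=True)
def test_uses_long(data):
    assert data == [1, 2, 3]
# -----------------------------------------------------------------------------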
@@ -570,37 +555,50 @@ def test_constructor_cast(self): with pytest.raises(ValueError, match=msg): Index(["a", "b", "c"], dtype=float) - def test_view_with_args(self): - restricted = ["unicodeIndex", "strIndex", "catIndex", "boolIndex", "empty"] - for i in list(set(self.indices.keys()) - set(restricted)): - ind = self.indices[i] - ind.view("i8") + @pytest.mark.parametrize( + "index", + [ + "datetime", + "float", + "int", + "period", + "range", + "repeats", + "timedelta", + "tuples", + "uint", + ], + indirect=True, + ) + def test_view_with_args(self, index): + index.view("i8") @pytest.mark.parametrize( - "index_type", + "index", [ - "unicodeIndex", - "strIndex", - pytest.param("catIndex", marks=pytest.mark.xfail(reason="gh-25464")), - "boolIndex", + "unicode", + "string", + pytest.param("categorical", marks=pytest.mark.xfail(reason="gh-25464")), + "bool", "empty", ], + indirect=True, ) - def test_view_with_args_object_array_raises(self, index_type): - ind = self.indices[index_type] + def test_view_with_args_object_array_raises(self, index): msg = "Cannot change data-type for object array" with pytest.raises(TypeError, match=msg): - ind.view("i8") + index.view("i8") - def test_astype(self): - casted = self.intIndex.astype("i8") + @pytest.mark.parametrize("index", ["int", "range"], indirect=True) + def test_astype(self, index): + casted = index.astype("i8") # it works! casted.get_loc(5) # pass on name - self.intIndex.name = "foobar" - casted = self.intIndex.astype("i8") + index.name = "foobar" + casted = index.astype("i8") assert casted.name == "foobar" def test_equals_object(self): @@ -700,16 +698,17 @@ def test_is_(self): ind2 = Index(arr, copy=False) assert not ind1.is_(ind2) - def test_asof(self): - d = self.dateIndex[0] - assert self.dateIndex.asof(d) == d - assert isna(self.dateIndex.asof(d - timedelta(1))) + @pytest.mark.parametrize("index", ["datetime"], indirect=True) + def test_asof(self, index): + d = index[0] + assert index.asof(d) == d + assert isna(index.asof(d - timedelta(1))) - d = self.dateIndex[-1] - assert self.dateIndex.asof(d + timedelta(1)) == d + d = index[-1] + assert index.asof(d + timedelta(1)) == d - d = self.dateIndex[0].to_pydatetime() - assert isinstance(self.dateIndex.asof(d), Timestamp) + d = index[0].to_pydatetime() + assert isinstance(index.asof(d), Timestamp) def test_asof_datetime_partial(self): index = pd.date_range("2010-01-01", periods=2, freq="m") @@ -731,40 +730,39 @@ def test_nanosecond_index_access(self): expected_ts = np_datetime64_compat("2013-01-01 00:00:00.000000050+0000", "ns") assert first_value == x[Timestamp(expected_ts)] - def test_booleanindex(self): - boolIndex = np.repeat(True, len(self.strIndex)).astype(bool) - boolIndex[5:30:2] = False + def test_booleanindex(self, index): + bool_index = np.repeat(True, len(index)).astype(bool) + bool_index[5:30:2] = False - subIndex = self.strIndex[boolIndex] + sub_index = index[bool_index] - for i, val in enumerate(subIndex): - assert subIndex.get_loc(val) == i + for i, val in enumerate(sub_index): + assert sub_index.get_loc(val) == i - subIndex = self.strIndex[list(boolIndex)] - for i, val in enumerate(subIndex): - assert subIndex.get_loc(val) == i + sub_index = index[list(bool_index)] + for i, val in enumerate(sub_index): + assert sub_index.get_loc(val) == i def test_fancy(self): - sl = self.strIndex[[1, 2, 3]] + index = self.create_index() + sl = index[[1, 2, 3]] for i in sl: assert i == sl[sl.get_loc(i)] - @pytest.mark.parametrize("attr", ["strIndex", "intIndex", "floatIndex"]) + 
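# --- Illustrative aside (not part of this patch) ----------------------------
# Several hunks further down parametrize multiple arguments at once but route
# only the fixture through indirect, e.g.
# parametrize("index, expected", [...], indirect=["index"]). A sketch of that
# partial form (fixture and test names here are illustrative):
import pytest

@pytest.fixture
def word(request):
    return request.param * 2  # receives the string from the parametrize list

@pytest.mark.parametrize("word, expected", [("ab", 4), ("xyz", 6)], indirect=["word"])
def test_partial_indirect(word, expected):
    # "word" went through the fixture; "expected" arrived as a plain parameter
    assert len(word) == expected
# -----------------------------------------------------------------------------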
@pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) - def test_empty_fancy(self, attr, dtype): + def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) - index = getattr(self, attr) empty_index = index.__class__([]) assert index[[]].identical(empty_index) assert index[empty_arr].identical(empty_index) - @pytest.mark.parametrize("attr", ["strIndex", "intIndex", "floatIndex"]) - def test_empty_fancy_raises(self, attr): + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_empty_fancy_raises(self, index): # pd.DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. empty_farr = np.array([], dtype=np.float_) - index = getattr(self, attr) empty_index = index.__class__([]) assert index[[]].identical(empty_index) @@ -774,9 +772,9 @@ def test_empty_fancy_raises(self, attr): index[empty_farr] @pytest.mark.parametrize("sort", [None, False]) - def test_intersection(self, sort): - first = self.strIndex[:20] - second = self.strIndex[:10] + def test_intersection(self, index, sort): + first = index[:20] + second = index[:10] intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) @@ -812,10 +810,10 @@ def test_intersection_name_preservation(self, index2, keeps_name, sort): ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation2( - self, first_name, second_name, expected_name, sort + self, index, first_name, second_name, expected_name, sort ): - first = self.strIndex[5:20] - second = self.strIndex[:10] + first = index[5:20] + second = index[:10] first.name = first_name second.name = second_name intersect = first.intersection(second, sort=sort) @@ -900,11 +898,10 @@ def test_chained_union(self, sort): tm.assert_index_equal(union, expected) @pytest.mark.parametrize("sort", [None, False]) - def test_union(self, sort): - # TODO: Replace with fixturesult - first = self.strIndex[5:20] - second = self.strIndex[:10] - everything = self.strIndex[:20] + def test_union(self, index, sort): + first = index[5:20] + second = index[:10] + everything = index[:20] union = first.union(second, sort=sort) if sort is None: @@ -965,12 +962,11 @@ def test_union_sort_other_incomparable_true(self): @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("sort", [None, False]) - def test_union_from_iterables(self, klass, sort): + def test_union_from_iterables(self, index, klass, sort): # GH 10149 - # TODO: Replace with fixturesult - first = self.strIndex[5:20] - second = self.strIndex[:10] - everything = self.strIndex[:20] + first = index[5:20] + second = index[:10] + everything = index[:20] case = klass(second.values) result = first.union(case, sort=sort) @@ -979,9 +975,8 @@ def test_union_from_iterables(self, klass, sort): assert tm.equalContents(result, everything) @pytest.mark.parametrize("sort", [None, False]) - def test_union_identity(self, sort): - # TODO: replace with fixturesult - first = self.strIndex[5:20] + def test_union_identity(self, index, sort): + first = index[5:20] union = first.union(first, sort=sort) # i.e. 
identity is not preserved when sort is True @@ -1021,19 +1016,21 @@ def test_union_name_preservation( @pytest.mark.parametrize("sort", [None, False]) def test_union_dt_as_obj(self, sort): # TODO: Replace with fixturesult - firstCat = self.strIndex.union(self.dateIndex) - secondCat = self.strIndex.union(self.strIndex) + index = self.create_index() + date_index = pd.date_range("2019-01-01", periods=10) + first_cat = index.union(date_index) + second_cat = index.union(index) - if self.dateIndex.dtype == np.object_: - appended = np.append(self.strIndex, self.dateIndex) + if date_index.dtype == np.object_: + appended = np.append(index, date_index) else: - appended = np.append(self.strIndex, self.dateIndex.astype("O")) + appended = np.append(index, date_index.astype("O")) - assert tm.equalContents(firstCat, appended) - assert tm.equalContents(secondCat, self.strIndex) - tm.assert_contains_all(self.strIndex, firstCat) - tm.assert_contains_all(self.strIndex, secondCat) - tm.assert_contains_all(self.dateIndex, firstCat) + assert tm.equalContents(first_cat, appended) + assert tm.equalContents(second_cat, index) + tm.assert_contains_all(index, first_cat) + tm.assert_contains_all(index, second_cat) + tm.assert_contains_all(date_index, first_cat) @pytest.mark.parametrize( "method", ["union", "intersection", "difference", "symmetric_difference"] @@ -1045,11 +1042,9 @@ def test_setops_disallow_true(self, method): with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) - def test_map_identity_mapping(self): + def test_map_identity_mapping(self, indices): # GH 12766 - # TODO: replace with fixture - for name, cur_index in self.indices.items(): - tm.assert_index_equal(cur_index, cur_index.map(lambda x: x)) + tm.assert_index_equal(indices, indices.map(lambda x: x)) def test_map_with_tuples(self): # GH 12766 @@ -1096,31 +1091,37 @@ def test_map_tseries_indices_accsr_return_index(self): lambda values, index: pd.Series(values, index), ], ) - def test_map_dictlike(self, mapper): + def test_map_dictlike_simple(self, mapper): # GH 12756 expected = Index(["foo", "bar", "baz"]) index = tm.makeIntIndex(3) result = index.map(mapper(expected.values, index)) tm.assert_index_equal(result, expected) - # TODO: replace with fixture - for name in self.indices.keys(): - if name == "catIndex": - # Tested in test_categorical - continue - elif name == "repeats": - # Cannot map duplicated index - continue - - index = self.indices[name] - expected = Index(np.arange(len(index), 0, -1)) - + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index), + ], + ) + def test_map_dictlike(self, indices, mapper): + # GH 12756 + if isinstance(indices, CategoricalIndex): + # Tested in test_categorical + return + elif not indices.is_unique: + # Cannot map duplicated index + return + + if indices.empty: # to match proper result coercion for uints - if name == "empty": - expected = Index([]) + expected = Index([]) + else: + expected = Index(np.arange(len(indices), 0, -1)) - result = index.map(mapper(expected, index)) - tm.assert_index_equal(result, expected) + result = indices.map(mapper(expected, indices)) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize( "mapper", @@ -1169,11 +1170,10 @@ def test_append_empty_preserve_name(self, name, expected): @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) @pytest.mark.parametrize("sort", [None, False]) - def 
test_difference_name_preservation(self, second_name, expected, sort): - # TODO: replace with fixturesult - first = self.strIndex[5:20] - second = self.strIndex[:10] - answer = self.strIndex[10:20] + def test_difference_name_preservation(self, index, second_name, expected, sort): + first = index[5:20] + second = index[:10] + answer = index[10:20] first.name = "name" second.name = second_name @@ -1187,8 +1187,8 @@ def test_difference_name_preservation(self, second_name, expected, sort): assert result.name == expected @pytest.mark.parametrize("sort", [None, False]) - def test_difference_empty_arg(self, sort): - first = self.strIndex[5:20] + def test_difference_empty_arg(self, index, sort): + first = index[5:20] first.name == "name" result = first.difference([], sort) @@ -1196,8 +1196,8 @@ def test_difference_empty_arg(self, sort): assert result.name == first.name @pytest.mark.parametrize("sort", [None, False]) - def test_difference_identity(self, sort): - first = self.strIndex[5:20] + def test_difference_identity(self, index, sort): + first = index[5:20] first.name == "name" result = first.difference(first, sort) @@ -1205,12 +1205,12 @@ def test_difference_identity(self, sort): assert result.name == first.name @pytest.mark.parametrize("sort", [None, False]) - def test_difference_sort(self, sort): - first = self.strIndex[5:20] - second = self.strIndex[:10] + def test_difference_sort(self, index, sort): + first = index[5:20] + second = index[:10] result = first.difference(second, sort) - expected = self.strIndex[10:20] + expected = index[10:20] if sort is None: expected = expected.sort_values() @@ -1267,7 +1267,7 @@ def test_difference_incomparable_true(self, opname): @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_mi(self, sort): - index1 = MultiIndex.from_tuples(self.tuples) + index1 = MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)]) result = index1.symmetric_difference(index2, sort=sort) expected = MultiIndex.from_tuples([("bar", 2), ("baz", 3), ("bar", 3)]) @@ -1308,73 +1308,78 @@ def test_symmetric_difference_non_index(self, sort): assert result.name == "new_name" @pytest.mark.parametrize("sort", [None, False]) - def test_difference_type(self, sort): + def test_difference_type(self, indices, sort): # GH 20040 # If taking difference of a set and itself, it # needs to preserve the type of the index - skip_index_keys = ["repeats"] - for key, index in self.generate_index_types(skip_index_keys): - result = index.difference(index, sort=sort) - expected = index.drop(index) - tm.assert_index_equal(result, expected) + if not indices.is_unique: + return + result = indices.difference(indices, sort=sort) + expected = indices.drop(indices) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("sort", [None, False]) - def test_intersection_difference(self, sort): + def test_intersection_difference(self, indices, sort): # GH 20040 # Test that the intersection of an index with an # empty index produces the same index as the difference # of an index with itself. 
Test for all types - skip_index_keys = ["repeats"] - for key, index in self.generate_index_types(skip_index_keys): - inter = index.intersection(index.drop(index)) - diff = index.difference(index, sort=sort) - tm.assert_index_equal(inter, diff) + if not indices.is_unique: + return + inter = indices.intersection(indices.drop(indices)) + diff = indices.difference(indices, sort=sort) + tm.assert_index_equal(inter, diff) @pytest.mark.parametrize( - "attr,expected", + "index, expected", [ - ("strIndex", False), - ("boolIndex", False), - ("catIndex", False), - ("intIndex", True), - ("dateIndex", False), - ("floatIndex", True), + ("string", False), + ("bool", False), + ("categorical", False), + ("int", True), + ("datetime", False), + ("float", True), ], + indirect=["index"], ) - def test_is_numeric(self, attr, expected): - assert getattr(self, attr).is_numeric() == expected + def test_is_numeric(self, index, expected): + assert index.is_numeric() is expected @pytest.mark.parametrize( - "attr,expected", + "index, expected", [ - ("strIndex", True), - ("boolIndex", True), - ("catIndex", False), - ("intIndex", False), - ("dateIndex", False), - ("floatIndex", False), + ("string", True), + ("bool", True), + ("categorical", False), + ("int", False), + ("datetime", False), + ("float", False), ], + indirect=["index"], ) - def test_is_object(self, attr, expected): - assert getattr(self, attr).is_object() == expected + def test_is_object(self, index, expected): + assert index.is_object() is expected @pytest.mark.parametrize( - "attr,expected", + "index, expected", [ - ("strIndex", False), - ("boolIndex", False), - ("catIndex", False), - ("intIndex", False), - ("dateIndex", True), - ("floatIndex", False), + ("string", False), + ("bool", False), + ("categorical", False), + ("int", False), + ("datetime", True), + ("float", False), ], + indirect=["index"], ) - def test_is_all_dates(self, attr, expected): - assert getattr(self, attr).is_all_dates == expected + def test_is_all_dates(self, index, expected): + assert index.is_all_dates is expected + + def test_summary(self, indices): + self._check_method_works(Index._summary, indices) - def test_summary(self): - self._check_method_works(Index._summary) - # GH3869 + def test_summary_bug(self): + # GH3869` ind = Index(["{other}%s", "~:{range}:0"], name="A") result = ind._summary() # shouldn't be formatted accidentally. 
@@ -1388,9 +1393,10 @@ def test_summary_deprecated(self): with tm.assert_produces_warning(FutureWarning): ind.summary() - def test_format(self): - self._check_method_works(Index.format) + def test_format(self, indices): + self._check_method_works(Index.format, indices) + def test_format_bug(self): # GH 14626 # windows has different precision on datetime.datetime.now (it doesn't # include us since the default for Timestamp shows these but Index @@ -1402,7 +1408,7 @@ def test_format(self): expected = [str(index[0])] assert formatted == expected - self.strIndex[:0].format() + Index([]).format() @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]]) def test_format_missing(self, vals, nulls_fixture): @@ -1419,8 +1425,7 @@ def test_format_missing(self, vals, nulls_fixture): def test_format_with_name_time_info(self): # bug I fixed 12/20/2011 - inc = timedelta(hours=4) - dates = Index([dt + inc for dt in self.dateIndex], name="something") + dates = date_range("2011-01-01 04:00:00", periods=10, name="something") formatted = dates.format(name=True) assert formatted[0] == "something" @@ -1438,15 +1443,8 @@ def test_logical_compat(self, op): index = self.create_index() assert getattr(index, op)() == getattr(index.values, op)() - def _check_method_works(self, method): - # TODO: make this a dedicated test with parametrized methods - method(self.empty) - method(self.dateIndex) - method(self.unicodeIndex) - method(self.strIndex) - method(self.intIndex) - method(self.tuples) - method(self.catIndex) + def _check_method_works(self, method, index): + method(index) def test_get_indexer(self): index1 = Index([1, 2, 3, 4, 5]) @@ -1766,38 +1764,37 @@ def test_slice_locs_negative_step(self, in_slice, expected): expected = pd.Index(list(expected)) tm.assert_index_equal(result, expected) - def test_drop_by_str_label(self): - # TODO: Parametrize these after replacing self.strIndex with fixture - n = len(self.strIndex) - drop = self.strIndex[list(range(5, 10))] - dropped = self.strIndex.drop(drop) + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_drop_by_str_label(self, index): + n = len(index) + drop = index[list(range(5, 10))] + dropped = index.drop(drop) - expected = self.strIndex[list(range(5)) + list(range(10, n))] + expected = index[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = self.strIndex.drop(self.strIndex[0]) - expected = self.strIndex[1:] + dropped = index.drop(index[0]) + expected = index[1:] tm.assert_index_equal(dropped, expected) + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @pytest.mark.parametrize("keys", [["foo", "bar"], ["1", "bar"]]) - def test_drop_by_str_label_raises_missing_keys(self, keys): + def test_drop_by_str_label_raises_missing_keys(self, index, keys): with pytest.raises(KeyError, match=""): - self.strIndex.drop(keys) + index.drop(keys) - def test_drop_by_str_label_errors_ignore(self): - # TODO: Parametrize these after replacing self.strIndex with fixture - - # errors='ignore' - n = len(self.strIndex) - drop = self.strIndex[list(range(5, 10))] + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_drop_by_str_label_errors_ignore(self, index): + n = len(index) + drop = index[list(range(5, 10))] mixed = drop.tolist() + ["foo"] - dropped = self.strIndex.drop(mixed, errors="ignore") + dropped = index.drop(mixed, errors="ignore") - expected = self.strIndex[list(range(5)) + list(range(10, n))] + expected = 
index[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = self.strIndex.drop(["foo", "bar"], errors="ignore") - expected = self.strIndex[list(range(n))] + dropped = index.drop(["foo", "bar"], errors="ignore") + expected = index[list(range(n))] tm.assert_index_equal(dropped, expected) def test_drop_by_numeric_label_loc(self): @@ -1916,12 +1913,15 @@ def test_set_value_deprecated(self): idx.set_value(arr, idx[1], 80) assert arr[1] == 80 - def test_get_value(self): + @pytest.mark.parametrize( + "index", ["string", "int", "datetime", "timedelta"], indirect=True + ) + def test_get_value(self, index): # TODO: Remove function? GH 19728 values = np.random.randn(100) - date = self.dateIndex[67] + value = index[67] - assert_almost_equal(self.dateIndex.get_value(values, date), values[67]) + assert_almost_equal(index.get_value(values, value), values[67]) @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) @pytest.mark.parametrize( @@ -2040,8 +2040,8 @@ def test_boolean_cmp(self, values): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("name,level", [(None, 0), ("a", "a")]) - def test_get_level_values(self, name, level): - expected = self.strIndex.copy() + def test_get_level_values(self, index, name, level): + expected = index.copy() if name: expected.name = name @@ -2052,14 +2052,12 @@ def test_slice_keep_name(self): index = Index(["a", "b"], name="asdf") assert index.name == index[1:].name - # instance attributes of the form self.Index - @pytest.mark.parametrize("index_kind", ["unicode", "str", "date", "int", "float"]) - def test_join_self(self, join_type, index_kind): - - res = getattr(self, "{0}Index".format(index_kind)) - - joined = res.join(res, how=join_type) - assert res is joined + @pytest.mark.parametrize( + "index", ["unicode", "string", "datetime", "int", "float"], indirect=True + ) + def test_join_self(self, index, join_type): + joined = index.join(index, how=join_type) + assert index is joined @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): @@ -2424,10 +2422,11 @@ def test_tab_complete_warning(self, ip): with provisionalcompleter("ignore"): list(ip.Completer.completions("idx.", 4)) - def test_deprecated_contains(self): - for index in self.indices.values(): - with tm.assert_produces_warning(FutureWarning): - index.contains(1) + def test_deprecated_contains(self, indices): + # deprecated for all types except IntervalIndex + warning = FutureWarning if not isinstance(indices, pd.IntervalIndex) else None + with tm.assert_produces_warning(warning): + indices.contains(1) class TestMixedIntIndex(Base): @@ -2437,12 +2436,12 @@ class TestMixedIntIndex(Base): _holder = Index - def setup_method(self, method): - self.indices = dict(mixedIndex=Index([0, "a", 1, "b", 2, "c"])) - self.setup_indices() + @pytest.fixture(params=[[0, "a", 1, "b", 2, "c"]], ids=["mixedIndex"]) + def indices(self, request): + return Index(request.param) def create_index(self): - return self.mixedIndex + return Index([0, "a", 1, "b", 2, "c"]) def test_argsort(self): index = self.create_index() @@ -2766,13 +2765,12 @@ def test_ensure_index_mixed_closed_intervals(self): ], ) def test_generated_op_names(opname, indices): - index = indices - if isinstance(index, ABCIndex) and opname == "rsub": + if isinstance(indices, ABCIndex) and opname == "rsub": # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. 
see GH#19723 return opname = "__{name}__".format(name=opname) - method = getattr(index, opname) + method = getattr(indices, opname) assert method.__name__ == opname diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 67bf9bd20e716..4326c3f8188fc 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -19,9 +19,9 @@ class TestCategoricalIndex(Base): _holder = CategoricalIndex - def setup_method(self, method): - self.indices = dict(catIndex=tm.makeCategoricalIndex(100)) - self.setup_indices() + @pytest.fixture + def indices(self, request): + return tm.makeCategoricalIndex(100) def create_index(self, categories=None, ordered=False): if categories is None: @@ -780,7 +780,7 @@ def test_identical(self): assert ci1.identical(ci1.copy()) assert not ci1.identical(ci2) - def test_ensure_copied_data(self): + def test_ensure_copied_data(self, indices): # gh-12309: Check the "copy" argument of each # Index.__new__ is honored. # @@ -788,13 +788,12 @@ def test_ensure_copied_data(self): # self.value is not an ndarray. _base = lambda ar: ar if ar.base is None else ar.base - for index in self.indices.values(): - result = CategoricalIndex(index.values, copy=True) - tm.assert_index_equal(index, result) - assert _base(index.values) is not _base(result.values) + result = CategoricalIndex(indices.values, copy=True) + tm.assert_index_equal(indices, result) + assert _base(indices.values) is not _base(result.values) - result = CategoricalIndex(index.values, copy=False) - assert _base(index.values) is _base(result.values) + result = CategoricalIndex(indices.values, copy=False) + assert _base(indices.values) is _base(result.values) def test_equals_categorical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 8bc9783694492..e424b3601a4b2 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta import re import numpy as np @@ -87,32 +87,42 @@ def test_where(self, klass): result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_insert(self): + def test_insert(self, nulls_fixture): # GH 18295 (test missing) - expected = Float64Index([0, np.nan, 1, 2, 3, 4]) - for na in (np.nan, pd.NaT, None): - result = self.create_index().insert(1, na) - tm.assert_index_equal(result, expected) + index = self.create_index() + expected = Float64Index([index[0], np.nan] + list(index[1:])) + result = index.insert(1, nulls_fixture) + tm.assert_index_equal(result, expected) class TestFloat64Index(Numeric): _holder = Float64Index - def setup_method(self, method): - self.indices = dict( - mixed=Float64Index([1.5, 2, 3, 4, 5]), - float=Float64Index(np.arange(5) * 2.5), - mixed_dec=Float64Index([5, 4, 3, 2, 1.5]), - float_dec=Float64Index(np.arange(4, -1, -1) * 2.5), - ) - self.setup_indices() + @pytest.fixture( + params=[ + [1.5, 2, 3, 4, 5], + [0.0, 2.5, 5.0, 7.5, 10.0], + [5, 4, 3, 2, 1.5], + [10.0, 7.5, 5.0, 2.5, 0.0], + ], + ids=["mixed", "float", "mixed_dec", "float_dec"], + ) + def indices(self, request): + return Float64Index(request.param) + + @pytest.fixture + def mixed_index(self): + return Float64Index([1.5, 2, 3, 4, 5]) + + @pytest.fixture + def float_index(self): + return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) def create_index(self): return Float64Index(np.arange(5, 
dtype="float64")) - def test_repr_roundtrip(self): - for ind in (self.mixed, self.float): - tm.assert_index_equal(eval(repr(ind)), ind) + def test_repr_roundtrip(self, indices): + tm.assert_index_equal(eval(repr(indices)), indices) def check_is_index(self, i): assert isinstance(i, Index) @@ -176,30 +186,32 @@ def test_constructor_invalid(self): with pytest.raises(TypeError, match=msg): Float64Index([Timestamp("20130101")]) - def test_constructor_coerce(self): + def test_constructor_coerce(self, mixed_index, float_index): - self.check_coerce(self.mixed, Index([1.5, 2, 3, 4, 5])) - self.check_coerce(self.float, Index(np.arange(5) * 2.5)) - self.check_coerce(self.float, Index(np.array(np.arange(5) * 2.5, dtype=object))) + self.check_coerce(mixed_index, Index([1.5, 2, 3, 4, 5])) + self.check_coerce(float_index, Index(np.arange(5) * 2.5)) + self.check_coerce( + float_index, Index(np.array(np.arange(5) * 2.5, dtype=object)) + ) - def test_constructor_explicit(self): + def test_constructor_explicit(self, mixed_index, float_index): # these don't auto convert self.check_coerce( - self.float, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False + float_index, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False ) self.check_coerce( - self.mixed, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False + mixed_index, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False ) - def test_astype(self): + def test_astype(self, mixed_index, float_index): - result = self.float.astype(object) - assert result.equals(self.float) - assert self.float.equals(result) + result = float_index.astype(object) + assert result.equals(float_index) + assert float_index.equals(result) self.check_is_index(result) - i = self.mixed.copy() + i = mixed_index.copy() i.name = "foo" result = i.astype(object) assert result.equals(i) @@ -451,11 +463,12 @@ def test_view(self): tm.assert_index_equal(i, self._holder(i_view, name="Foo")) def test_is_monotonic(self): - assert self.index.is_monotonic is True - assert self.index.is_monotonic_increasing is True - assert self.index._is_strictly_monotonic_increasing is True - assert self.index.is_monotonic_decreasing is False - assert self.index._is_strictly_monotonic_decreasing is False + index = self._holder([1, 2, 3, 4]) + assert index.is_monotonic is True + assert index.is_monotonic_increasing is True + assert index._is_strictly_monotonic_increasing is True + assert index.is_monotonic_decreasing is False + assert index._is_strictly_monotonic_decreasing is False index = self._holder([4, 3, 2, 1]) assert index.is_monotonic is False @@ -490,23 +503,22 @@ def test_logical_compat(self): assert idx.any() == idx.values.any() def test_identical(self): - i = Index(self.index.copy()) - assert i.identical(self.index) + index = self.create_index() + i = Index(index.copy()) + assert i.identical(index) same_values_different_type = Index(i, dtype=object) assert not i.identical(same_values_different_type) - i = self.index.copy(dtype=object) + i = index.copy(dtype=object) i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(i) - assert not i.identical(self.index) + assert not i.identical(index) assert Index(same_values, name="foo", dtype=object).identical(i) - assert not self.index.copy(dtype=object).identical( - self.index.copy(dtype=self._dtype) - ) + assert not index.copy(dtype=object).identical(index.copy(dtype=self._dtype)) def test_join_non_unique(self): left = Index([4, 4, 3, 3]) @@ -522,23 +534,21 @@ def test_join_non_unique(self): 
         exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp)
         tm.assert_numpy_array_equal(ridx, exp_ridx)

-    @pytest.mark.parametrize("kind", ["outer", "inner", "left", "right"])
-    def test_join_self(self, kind):
-        joined = self.index.join(self.index, how=kind)
-        assert self.index is joined
+    def test_join_self(self, join_type):
+        index = self.create_index()
+        joined = index.join(index, how=join_type)
+        assert index is joined

     def test_union_noncomparable(self):
-        from datetime import datetime, timedelta
-
         # corner case, non-Int64Index
-        now = datetime.now()
-        other = Index([now + timedelta(i) for i in range(4)], dtype=object)
-        result = self.index.union(other)
-        expected = Index(np.concatenate((self.index, other)))
+        index = self.create_index()
+        other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object)
+        result = index.union(other)
+        expected = Index(np.concatenate((index, other)))
         tm.assert_index_equal(result, expected)

-        result = other.union(self.index)
-        expected = Index(np.concatenate((other, self.index)))
+        result = other.union(index)
+        expected = Index(np.concatenate((other, index)))
         tm.assert_index_equal(result, expected)

     def test_cant_or_shouldnt_cast(self):
@@ -557,10 +567,12 @@ def test_cant_or_shouldnt_cast(self):
             self._holder(data)

     def test_view_index(self):
-        self.index.view(Index)
+        index = self.create_index()
+        index.view(Index)

     def test_prevent_casting(self):
-        result = self.index.astype("O")
+        index = self.create_index()
+        result = index.astype("O")
         assert result.dtype == np.object_

     def test_take_preserve_name(self):
@@ -604,15 +616,15 @@ class TestInt64Index(NumericInt):
     _dtype = "int64"
     _holder = Int64Index

-    def setup_method(self, method):
-        self.indices = dict(
-            index=Int64Index(np.arange(0, 20, 2)),
-            index_dec=Int64Index(np.arange(19, -1, -1)),
-        )
-        self.setup_indices()
+    @pytest.fixture(
+        params=[range(0, 20, 2), range(19, -1, -1)], ids=["index_inc", "index_dec"]
+    )
+    def indices(self, request):
+        return Int64Index(request.param)

     def create_index(self):
-        return Int64Index(np.arange(5, dtype="int64"))
+        # use the same values as the "index_inc" param of the indices fixture
+        return Int64Index(range(0, 20, 2))

     def test_constructor(self):
         # pass list, coerce fine
@@ -633,9 +645,9 @@ def test_constructor(self):
             Int64Index(5)

         # copy
-        arr = self.index.values
+        arr = index.values
         new_index = Int64Index(arr, copy=True)
-        tm.assert_index_equal(new_index, self.index)
+        tm.assert_index_equal(new_index, index)

         val = arr[0] + 3000

         # this should not change index
@@ -691,39 +703,42 @@ def test_coerce_list(self):
         assert isinstance(arr, Index)

     def test_get_indexer(self):
+        index = self.create_index()
         target = Int64Index(np.arange(10))
-        indexer = self.index.get_indexer(target)
+        indexer = index.get_indexer(target)
         expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected)

         target = Int64Index(np.arange(10))
-        indexer = self.index.get_indexer(target, method="pad")
+        indexer = index.get_indexer(target, method="pad")
         expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected)

         target = Int64Index(np.arange(10))
-        indexer = self.index.get_indexer(target, method="backfill")
+        indexer = index.get_indexer(target, method="backfill")
         expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected)

     def test_intersection(self):
+        index = self.create_index()
         other = Index([1, 2, 3, 4, 5])
-        result = self.index.intersection(other)
-
expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) + result = index.intersection(other) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) - result = other.intersection(self.index) + result = other.intersection(index) expected = Index( - np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + np.sort(np.asarray(np.intersect1d(index.values, other.values))) ) tm.assert_index_equal(result, expected) def test_join_inner(self): + index = self.create_index() other = Int64Index([7, 12, 25, 1, 2, 5]) other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) + res, lidx, ridx = index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -741,9 +756,9 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how="inner", return_indexers=True) + res, lidx, ridx = index.join(other_mono, how="inner", return_indexers=True) - res2 = self.index.intersection(other_mono) + res2 = index.intersection(other_mono) tm.assert_index_equal(res, res2) elidx = np.array([1, 6], dtype=np.intp) @@ -754,12 +769,13 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_left(self): + index = self.create_index() other = Int64Index([7, 12, 25, 1, 2, 5]) other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) - eres = self.index + res, lidx, ridx = index.join(other, how="left", return_indexers=True) + eres = index eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp) assert isinstance(res, Int64Index) @@ -768,7 +784,7 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) + res, lidx, ridx = index.join(other_mono, how="left", return_indexers=True) eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) @@ -787,11 +803,12 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_right(self): + index = self.create_index() other = Int64Index([7, 12, 25, 1, 2, 5]) other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) + res, lidx, ridx = index.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) @@ -801,7 +818,7 @@ def test_join_right(self): assert ridx is None # monotonic - res, lidx, ridx = self.index.join(other_mono, how="right", return_indexers=True) + res, lidx, ridx = index.join(other_mono, how="right", return_indexers=True) eres = other_mono elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) assert isinstance(other, Int64Index) @@ -821,40 +838,42 @@ def test_join_right(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_non_int_index(self): + index = self.create_index() other = Index([3, 6, 7, 8, 10], dtype=object) - outer = self.index.join(other, how="outer") - outer2 = other.join(self.index, how="outer") + outer = index.join(other, how="outer") + outer2 = other.join(index, how="outer") expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) 
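# both join orders agree, and the result is the sorted union of the values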
tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how="inner") - inner2 = other.join(self.index, how="inner") + inner = index.join(other, how="inner") + inner2 = other.join(index, how="inner") expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how="left") - tm.assert_index_equal(left, self.index.astype(object)) + left = index.join(other, how="left") + tm.assert_index_equal(left, index.astype(object)) - left2 = other.join(self.index, how="left") + left2 = other.join(index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how="right") + right = index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how="right") - tm.assert_index_equal(right2, self.index.astype(object)) + right2 = other.join(index, how="right") + tm.assert_index_equal(right2, index.astype(object)) def test_join_outer(self): + index = self.create_index() other = Int64Index([7, 12, 25, 1, 2, 5]) other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) - noidx_res = self.index.join(other, how="outer") + res, lidx, ridx = index.join(other, how="outer", return_indexers=True) + noidx_res = index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) @@ -869,8 +888,8 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how="outer", return_indexers=True) - noidx_res = self.index.join(other_mono, how="outer") + res, lidx, ridx = index.join(other_mono, how="outer", return_indexers=True) + noidx_res = index.join(other_mono, how="outer") tm.assert_index_equal(res, noidx_res) elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) @@ -888,14 +907,24 @@ class TestUInt64Index(NumericInt): _dtype = "uint64" _holder = UInt64Index - def setup_method(self, method): - vals = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] - self.indices = dict( - index=UInt64Index(vals), index_dec=UInt64Index(reversed(vals)) - ) - self.setup_indices() + @pytest.fixture( + params=[ + [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25], + [2 ** 63 + 25, 2 ** 63 + 20, 2 ** 63 + 15, 2 ** 63 + 10, 2 ** 63], + ], + ids=["index_inc", "index_dec"], + ) + def indices(self, request): + return UInt64Index(request.param) + + @pytest.fixture + def index_large(self): + # large values used in TestUInt64Index where no compat needed with Int64/Float64 + large = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] + return UInt64Index(large) def create_index(self): + # compat with shared Int64/Float64 tests; use index_large for UInt64 only tests return UInt64Index(np.arange(5, dtype="uint64")) def test_constructor(self): @@ -915,42 +944,42 @@ def test_constructor(self): res = Index(np.array([-1, 2 ** 63], dtype=object)) tm.assert_index_equal(res, idx) - def test_get_indexer(self): + def test_get_indexer(self, index_large): target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) - indexer = self.index.get_indexer(target) + indexer = index_large.get_indexer(target) expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) - indexer = 
self.index.get_indexer(target, method="pad") + indexer = index_large.get_indexer(target, method="pad") expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) - indexer = self.index.get_indexer(target, method="backfill") + indexer = index_large.get_indexer(target, method="backfill") expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - def test_intersection(self): + def test_intersection(self, index_large): other = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20]) - result = self.index.intersection(other) - expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) + result = index_large.intersection(other) + expected = Index(np.sort(np.intersect1d(index_large.values, other.values))) tm.assert_index_equal(result, expected) - result = other.intersection(self.index) + result = other.intersection(index_large) expected = Index( - np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + np.sort(np.asarray(np.intersect1d(index_large.values, other.values))) ) tm.assert_index_equal(result, expected) - def test_join_inner(self): + def test_join_inner(self, index_large): other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) other_mono = UInt64Index( 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") ) # not monotonic - res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) + res, lidx, ridx = index_large.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -968,9 +997,11 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how="inner", return_indexers=True) + res, lidx, ridx = index_large.join( + other_mono, how="inner", return_indexers=True + ) - res2 = self.index.intersection(other_mono) + res2 = index_large.intersection(other_mono) tm.assert_index_equal(res, res2) elidx = np.array([1, 4], dtype=np.intp) @@ -981,15 +1012,15 @@ def test_join_inner(self): tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) - def test_join_left(self): + def test_join_left(self, index_large): other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) other_mono = UInt64Index( 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") ) # not monotonic - res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) - eres = self.index + res, lidx, ridx = index_large.join(other, how="left", return_indexers=True) + eres = index_large eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp) assert isinstance(res, UInt64Index) @@ -998,7 +1029,7 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) + res, lidx, ridx = index_large.join(other_mono, how="left", return_indexers=True) eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp) assert isinstance(res, UInt64Index) @@ -1020,14 +1051,14 @@ def test_join_left(self): tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) - def test_join_right(self): + def test_join_right(self, index_large): other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) other_mono = UInt64Index( 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], 
dtype="uint64") ) # not monotonic - res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) + res, lidx, ridx = index_large.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp) @@ -1037,7 +1068,9 @@ def test_join_right(self): assert ridx is None # monotonic - res, lidx, ridx = self.index.join(other_mono, how="right", return_indexers=True) + res, lidx, ridx = index_large.join( + other_mono, how="right", return_indexers=True + ) eres = other_mono elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp) @@ -1060,38 +1093,38 @@ def test_join_right(self): tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) - def test_join_non_int_index(self): + def test_join_non_int_index(self, index_large): other = Index( 2 ** 63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object ) - outer = self.index.join(other, how="outer") - outer2 = other.join(self.index, how="outer") + outer = index_large.join(other, how="outer") + outer2 = other.join(index_large, how="outer") expected = Index( 2 ** 63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64") ) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how="inner") - inner2 = other.join(self.index, how="inner") + inner = index_large.join(other, how="inner") + inner2 = other.join(index_large, how="inner") expected = Index(2 ** 63 + np.array([10, 20], dtype="uint64")) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how="left") - tm.assert_index_equal(left, self.index.astype(object)) + left = index_large.join(other, how="left") + tm.assert_index_equal(left, index_large.astype(object)) - left2 = other.join(self.index, how="left") + left2 = other.join(index_large, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how="right") + right = index_large.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how="right") - tm.assert_index_equal(right2, self.index.astype(object)) + right2 = other.join(index_large, how="right") + tm.assert_index_equal(right2, index_large.astype(object)) - def test_join_outer(self): + def test_join_outer(self, index_large): other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) other_mono = UInt64Index( 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") @@ -1099,8 +1132,8 @@ def test_join_outer(self): # not monotonic # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) - noidx_res = self.index.join(other, how="outer") + res, lidx, ridx = index_large.join(other, how="outer", return_indexers=True) + noidx_res = index_large.join(other, how="outer") tm.assert_index_equal(res, noidx_res) eres = UInt64Index( @@ -1115,8 +1148,10 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how="outer", return_indexers=True) - noidx_res = self.index.join(other_mono, how="outer") + res, lidx, ridx = index_large.join( + other_mono, how="outer", return_indexers=True + ) + noidx_res = index_large.join(other_mono, how="outer") tm.assert_index_equal(res, noidx_res) elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 627c5cc56e010..fa64e1bacb2e5 100644 --- a/pandas/tests/indexes/test_range.py +++ 
b/pandas/tests/indexes/test_range.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta import numpy as np import pytest @@ -22,15 +22,18 @@ class TestRangeIndex(Numeric): _holder = RangeIndex _compat_props = ["shape", "ndim", "size"] - def setup_method(self, method): - self.indices = dict( - index=RangeIndex(0, 20, 2, name="foo"), - index_dec=RangeIndex(18, -1, -2, name="bar"), - ) - self.setup_indices() + @pytest.fixture( + params=[ + RangeIndex(start=0, stop=20, step=2, name="foo"), + RangeIndex(start=18, stop=-1, step=-2, name="bar"), + ], + ids=["index_inc", "index_dec"], + ) + def indices(self, request): + return request.param def create_index(self): - return RangeIndex(5) + return RangeIndex(start=0, stop=20, step=2) def test_can_hold_identifiers(self): idx = self.create_index() @@ -38,8 +41,9 @@ def test_can_hold_identifiers(self): assert idx._can_hold_identifiers_and_holds_name(key) is False def test_too_many_names(self): + index = self.create_index() with pytest.raises(ValueError, match="^Length"): - self.index.names = ["roger", "harold"] + index.names = ["roger", "harold"] @pytest.mark.parametrize("name", [None, "foo"]) @pytest.mark.parametrize( @@ -267,7 +271,8 @@ def test_view(self): tm.assert_index_equal(i, i_view) def test_dtype(self): - assert self.index.dtype == np.int64 + index = self.create_index() + assert index.dtype == np.int64 def test_cached_data(self): # GH 26565, GH26617 @@ -326,11 +331,12 @@ def test_cached_data(self): assert isinstance(idx._cached_data, np.ndarray) def test_is_monotonic(self): - assert self.index.is_monotonic is True - assert self.index.is_monotonic_increasing is True - assert self.index.is_monotonic_decreasing is False - assert self.index._is_strictly_monotonic_increasing is True - assert self.index._is_strictly_monotonic_decreasing is False + index = RangeIndex(0, 20, 2) + assert index.is_monotonic is True + assert index.is_monotonic_increasing is True + assert index.is_monotonic_decreasing is False + assert index._is_strictly_monotonic_increasing is True + assert index._is_strictly_monotonic_decreasing is False index = RangeIndex(4, 0, -1) assert index.is_monotonic is False @@ -376,43 +382,45 @@ def test_logical_compat(self): assert idx.any() == idx.values.any() def test_identical(self): - i = Index(self.index.copy()) - assert i.identical(self.index) + index = self.create_index() + i = Index(index.copy()) + assert i.identical(index) # we don't allow object dtype for RangeIndex - if isinstance(self.index, RangeIndex): + if isinstance(index, RangeIndex): return same_values_different_type = Index(i, dtype=object) assert not i.identical(same_values_different_type) - i = self.index.copy(dtype=object) + i = index.copy(dtype=object) i = i.rename("foo") same_values = Index(i, dtype=object) - assert same_values.identical(self.index.copy(dtype=object)) + assert same_values.identical(index.copy(dtype=object)) - assert not i.identical(self.index) + assert not i.identical(index) assert Index(same_values, name="foo", dtype=object).identical(i) - assert not self.index.copy(dtype=object).identical( - self.index.copy(dtype="int64") - ) + assert not index.copy(dtype=object).identical(index.copy(dtype="int64")) def test_get_indexer(self): + index = self.create_index() target = RangeIndex(10) - indexer = self.index.get_indexer(target) + indexer = index.get_indexer(target) expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_pad(self): + 
index = self.create_index() target = RangeIndex(10) - indexer = self.index.get_indexer(target, method="pad") + indexer = index.get_indexer(target, method="pad") expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): + index = self.create_index() target = RangeIndex(10) - indexer = self.index.get_indexer(target, method="backfill") + indexer = index.get_indexer(target, method="backfill") expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) @@ -434,10 +442,11 @@ def test_get_indexer_decreasing(self, stop): def test_join_outer(self): # join with Int64Index + index = self.create_index() other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) - noidx_res = self.index.join(other, how="outer") + res, lidx, ridx = index.join(other, how="outer", return_indexers=True) + noidx_res = index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) eres = Int64Index( @@ -461,8 +470,8 @@ def test_join_outer(self): # join with RangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) - noidx_res = self.index.join(other, how="outer") + res, lidx, ridx = index.join(other, how="outer", return_indexers=True) + noidx_res = index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) assert isinstance(res, Int64Index) @@ -473,9 +482,10 @@ def test_join_outer(self): def test_join_inner(self): # Join with non-RangeIndex + index = self.create_index() other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) + res, lidx, ridx = index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -495,7 +505,7 @@ def test_join_inner(self): # Join two RangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) + res, lidx, ridx = index.join(other, how="inner", return_indexers=True) assert isinstance(res, RangeIndex) tm.assert_index_equal(res, eres) @@ -504,10 +514,11 @@ def test_join_inner(self): def test_join_left(self): # Join with Int64Index + index = self.create_index() other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) - eres = self.index + res, lidx, ridx = index.join(other, how="left", return_indexers=True) + eres = index eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.intp) assert isinstance(res, RangeIndex) @@ -518,7 +529,7 @@ def test_join_left(self): # Join withRangeIndex other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) + res, lidx, ridx = index.join(other, how="left", return_indexers=True) assert isinstance(res, RangeIndex) tm.assert_index_equal(res, eres) @@ -527,9 +538,10 @@ def test_join_left(self): def test_join_right(self): # Join with Int64Index + index = self.create_index() other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) + res, lidx, ridx = index.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], dtype=np.intp) @@ -541,7 +553,7 @@ def test_join_right(self): # Join withRangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = 
self.index.join(other, how="right", return_indexers=True) + res, lidx, ridx = index.join(other, how="right", return_indexers=True) eres = other assert isinstance(other, RangeIndex) @@ -550,36 +562,38 @@ def test_join_right(self): assert ridx is None def test_join_non_int_index(self): + index = self.create_index() other = Index([3, 6, 7, 8, 10], dtype=object) - outer = self.index.join(other, how="outer") - outer2 = other.join(self.index, how="outer") + outer = index.join(other, how="outer") + outer2 = other.join(index, how="outer") expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how="inner") - inner2 = other.join(self.index, how="inner") + inner = index.join(other, how="inner") + inner2 = other.join(index, how="inner") expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how="left") - tm.assert_index_equal(left, self.index.astype(object)) + left = index.join(other, how="left") + tm.assert_index_equal(left, index.astype(object)) - left2 = other.join(self.index, how="left") + left2 = other.join(index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how="right") + right = index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how="right") - tm.assert_index_equal(right2, self.index.astype(object)) + right2 = other.join(index, how="right") + tm.assert_index_equal(right2, index.astype(object)) def test_join_non_unique(self): + index = self.create_index() other = Index([4, 4, 3, 3]) - res, lidx, ridx = self.index.join(other, return_indexers=True) + res, lidx, ridx = index.join(other, return_indexers=True) eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp) @@ -589,40 +603,40 @@ def test_join_non_unique(self): tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) - def test_join_self(self): - kinds = "outer", "inner", "left", "right" - for kind in kinds: - joined = self.index.join(self.index, how=kind) - assert self.index is joined + def test_join_self(self, join_type): + index = self.create_index() + joined = index.join(index, how=join_type) + assert index is joined @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): # intersect with Int64Index + index = self.create_index() other = Index(np.arange(1, 6)) - result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) - result = other.intersection(self.index, sort=sort) + result = other.intersection(index, sort=sort) expected = Index( - np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + np.sort(np.asarray(np.intersect1d(index.values, other.values))) ) tm.assert_index_equal(result, expected) # intersect with increasing RangeIndex other = RangeIndex(1, 6) - result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) # intersect with decreasing RangeIndex other = RangeIndex(5, 
0, -1) - result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) # reversed (GH 17296) - result = other.intersection(self.index, sort=sort) + result = other.intersection(index, sort=sort) tm.assert_index_equal(result, expected) # GH 17296: intersect two decreasing RangeIndexes @@ -667,17 +681,15 @@ def test_intersection(self, sort): @pytest.mark.parametrize("sort", [False, None]) def test_union_noncomparable(self, sort): - from datetime import datetime, timedelta - # corner case, non-Int64Index - now = datetime.now() - other = Index([now + timedelta(i) for i in range(4)], dtype=object) - result = self.index.union(other, sort=sort) - expected = Index(np.concatenate((self.index, other))) + index = self.create_index() + other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object) + result = index.union(other, sort=sort) + expected = Index(np.concatenate((index, other))) tm.assert_index_equal(result, expected) - result = other.union(self.index, sort=sort) - expected = Index(np.concatenate((other, self.index))) + result = other.union(index, sort=sort) + expected = Index(np.concatenate((other, index))) tm.assert_index_equal(result, expected) @pytest.fixture( @@ -785,11 +797,13 @@ def test_cant_or_shouldnt_cast(self): with pytest.raises(TypeError): RangeIndex("0", "1", "2") - def test_view_Index(self): - self.index.view(Index) + def test_view_index(self): + index = self.create_index() + index.view(Index) def test_prevent_casting(self): - result = self.index.astype("O") + index = self.create_index() + result = index.astype("O") assert result.dtype == np.object_ def test_take_preserve_name(self): @@ -828,7 +842,8 @@ def test_print_unicode_columns(self): repr(df.columns) # should not raise UnicodeDecodeError def test_repr_roundtrip(self): - tm.assert_index_equal(eval(repr(self.index)), self.index) + index = self.create_index() + tm.assert_index_equal(eval(repr(index)), index) def test_slice_keep_name(self): idx = RangeIndex(1, 2, name="asdf") @@ -859,20 +874,17 @@ def test_explicit_conversions(self): result = a - fidx tm.assert_index_equal(result, expected) - def test_has_duplicates(self): - for ind in self.indices: - if not len(ind): - continue - idx = self.indices[ind] - assert idx.is_unique - assert not idx.has_duplicates + def test_has_duplicates(self, indices): + assert indices.is_unique + assert not indices.has_duplicates def test_extended_gcd(self): - result = self.index._extended_gcd(6, 10) + index = self.create_index() + result = index._extended_gcd(6, 10) assert result[0] == result[1] * 6 + result[2] * 10 assert 2 == result[0] - result = self.index._extended_gcd(10, 6) + result = index._extended_gcd(10, 6) assert 2 == result[1] * 10 + result[2] * 6 assert 2 == result[0] @@ -917,80 +929,71 @@ def test_pickle_compat_construction(self): pass def test_slice_specialised(self): + index = self.create_index() + index.name = "foo" # scalar indexing - res = self.index[1] + res = index[1] expected = 2 assert res == expected - res = self.index[-1] + res = index[-1] expected = 18 assert res == expected # slicing # slice value completion - index = self.index[:] - expected = self.index - tm.assert_index_equal(index, expected) + index_slice = index[:] + expected = index + tm.assert_index_equal(index_slice, expected) # positive slice values - index = 
self.index[7:10:2] + index_slice = index[7:10:2] expected = Index(np.array([14, 18]), name="foo") - tm.assert_index_equal(index, expected) + tm.assert_index_equal(index_slice, expected) # negative slice values - index = self.index[-1:-5:-2] + index_slice = index[-1:-5:-2] expected = Index(np.array([18, 14]), name="foo") - tm.assert_index_equal(index, expected) + tm.assert_index_equal(index_slice, expected) # stop overshoot - index = self.index[2:100:4] + index_slice = index[2:100:4] expected = Index(np.array([4, 12]), name="foo") - tm.assert_index_equal(index, expected) + tm.assert_index_equal(index_slice, expected) # reverse - index = self.index[::-1] - expected = Index(self.index.values[::-1], name="foo") - tm.assert_index_equal(index, expected) + index_slice = index[::-1] + expected = Index(index.values[::-1], name="foo") + tm.assert_index_equal(index_slice, expected) - index = self.index[-8::-1] + index_slice = index[-8::-1] expected = Index(np.array([4, 2, 0]), name="foo") - tm.assert_index_equal(index, expected) + tm.assert_index_equal(index_slice, expected) - index = self.index[-40::-1] + index_slice = index[-40::-1] expected = Index(np.array([], dtype=np.int64), name="foo") - tm.assert_index_equal(index, expected) - - index = self.index[40::-1] - expected = Index(self.index.values[40::-1], name="foo") - tm.assert_index_equal(index, expected) - - index = self.index[10::-1] - expected = Index(self.index.values[::-1], name="foo") - tm.assert_index_equal(index, expected) - - def test_len_specialised(self): - - # make sure that our len is the same as - # np.arange calc - - for step in np.arange(1, 6, 1): + tm.assert_index_equal(index_slice, expected) - arr = np.arange(0, 5, step) - i = RangeIndex(0, 5, step) - assert len(i) == len(arr) + index_slice = index[40::-1] + expected = Index(index.values[40::-1], name="foo") + tm.assert_index_equal(index_slice, expected) - i = RangeIndex(5, 0, step) - assert len(i) == 0 + index_slice = index[10::-1] + expected = Index(index.values[::-1], name="foo") + tm.assert_index_equal(index_slice, expected) - for step in np.arange(-6, -1, 1): + @pytest.mark.parametrize("step", set(range(-5, 6)) - {0}) + def test_len_specialised(self, step): + # make sure that our len is the same as np.arange calc + start, stop = (0, 5) if step > 0 else (5, 0) - arr = np.arange(5, 0, step) - i = RangeIndex(5, 0, step) - assert len(i) == len(arr) + arr = np.arange(start, stop, step) + index = RangeIndex(start, stop, step) + assert len(index) == len(arr) - i = RangeIndex(0, 5, step) - assert len(i) == 0 + index = RangeIndex(stop, start, step) + assert len(index) == 0 @pytest.fixture( params=[ diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index b3850f7a4e09e..d5b23653e8a72 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -13,7 +13,7 @@ import pandas as pd from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index from pandas.api.types import pandas_dtype -from pandas.tests.indexes.conftest import indices_list +from pandas.tests.indexes.conftest import indices_dict import pandas.util.testing as tm COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict( @@ -26,15 +26,12 @@ ) -@pytest.fixture( - params=list(it.combinations(indices_list, 2)), - ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__, -) +@pytest.fixture(params=it.combinations(indices_dict, 2), ids="-".join) def index_pair(request): """ Create all combinations of 2 index types. 
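Parametrizing over the *names* in ``indices_dict`` rather than the Index
    objects themselves keeps the generated test ids short and readable via
    ``ids="-".join``, e.g. ``string-int``. Roughly (keys illustrative):

        >>> import itertools as it
        >>> list(it.combinations(["string", "int", "float"], 2))
        [('string', 'int'), ('string', 'float'), ('int', 'float')]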
""" - return request.param + return indices_dict[request.param[0]], indices_dict[request.param[1]] def test_union_same_types(indices): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index e790a913fcac2..2ef86ddf8c8bf 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -30,9 +30,9 @@ class TestTimedeltaIndex(DatetimeLike): _holder = TimedeltaIndex - def setup_method(self, method): - self.indices = dict(index=tm.makeTimedeltaIndex(10)) - self.setup_indices() + @pytest.fixture + def indices(self): + return tm.makeTimedeltaIndex(10) def create_index(self): return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) From 7ea5c967e9262fb6b8ab4de3b7706f7ac59efb65 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Oct 2019 08:50:17 -0700 Subject: [PATCH 036/119] CLN: Exception (#28853) --- pandas/core/generic.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a135f567fe6f4..f77d543193e74 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4555,10 +4555,7 @@ def reindex(self, *args, **kwargs): # check if we are a multi reindex if self._needs_reindex_multi(axes, method, level): - try: - return self._reindex_multi(axes, copy, fill_value) - except Exception: - pass + return self._reindex_multi(axes, copy, fill_value) # perform the reindex on the axes return self._reindex_axes( @@ -9065,7 +9062,6 @@ def _where( # try to not change dtype at first (if try_quick) if try_quick: - new_other = com.values_from_object(self) new_other = new_other.copy() new_other[icond] = other From a31e16004ab875629c109b85dcf18640c8a6511a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Oct 2019 11:15:34 -0700 Subject: [PATCH 037/119] CLN: assorted cleanups, mostly post-black fixups (#28857) --- asv_bench/benchmarks/ctors.py | 2 +- asv_bench/benchmarks/eval.py | 2 +- asv_bench/benchmarks/io/hdf.py | 4 ++-- doc/source/conf.py | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/_libs/reduction.pyx | 6 +++--- pandas/_version.py | 2 +- pandas/core/apply.py | 8 +++++--- pandas/io/sas/sas7bdat.py | 2 +- pandas/plotting/_core.py | 4 ++-- pandas/plotting/_matplotlib/boxplot.py | 4 +--- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/hist.py | 8 +++----- pandas/plotting/_matplotlib/style.py | 2 +- pandas/plotting/_matplotlib/tools.py | 3 +-- pandas/tests/io/test_html.py | 4 ++-- pandas/tests/window/test_window.py | 2 +- pandas/tseries/offsets.py | 8 +++----- pandas/util/_decorators.py | 4 ++-- pandas/util/_exceptions.py | 10 +++++----- pandas/util/_print_versions.py | 2 +- pandas/util/_test_decorators.py | 2 +- pandas/util/_validators.py | 4 ++-- pandas/util/testing.py | 18 +++++++++--------- scripts/find_commits_touching_func.py | 6 +++--- scripts/tests/test_validate_docstrings.py | 2 +- scripts/validate_docstrings.py | 2 +- setup.py | 6 ++---- 28 files changed, 59 insertions(+), 66 deletions(-) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index ec3dd7a48a89f..a9e45cad22d27 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -67,7 +67,7 @@ class SeriesConstructors: def setup(self, data_fmt, with_index, dtype): if data_fmt in (gen_of_str, gen_of_tuples) and with_index: raise NotImplementedError( - "Series constructors do not support " "using generators with indexes" + "Series constructors do not support 
using generators with indexes"
             )
         N = 10 ** 4
         if dtype == "float":
diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
index 06a181875aaa8..cbab9fdc9c0ba 100644
--- a/asv_bench/benchmarks/eval.py
+++ b/asv_bench/benchmarks/eval.py
@@ -27,7 +27,7 @@ def time_add(self, engine, threads):

     def time_and(self, engine, threads):
         pd.eval(
-            "(self.df > 0) & (self.df2 > 0) & " "(self.df3 > 0) & (self.df4 > 0)",
+            "(self.df > 0) & (self.df2 > 0) & (self.df3 > 0) & (self.df4 > 0)",
             engine=engine,
         )

diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py
index 8ec04a2087f1b..b78dc63d17130 100644
--- a/asv_bench/benchmarks/io/hdf.py
+++ b/asv_bench/benchmarks/io/hdf.py
@@ -88,11 +88,11 @@ def time_write_store_table_dc(self):

     def time_query_store_table_wide(self):
         self.store.select(
-            "table_wide", where="index > self.start_wide and " "index < self.stop_wide"
+            "table_wide", where="index > self.start_wide and index < self.stop_wide"
         )

     def time_query_store_table(self):
-        self.store.select("table", where="index > self.start and " "index < self.stop")
+        self.store.select("table", where="index > self.start and index < self.stop")

     def time_store_repr(self):
         repr(self.store)
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 5e2a2db20b53c..34faf183db1c2 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -628,11 +628,11 @@ def linkcode_resolve(domain, info):
     fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__))

     if "+" in pandas.__version__:
-        return "http://github.com/pandas-dev/pandas/blob/master/pandas/" "{}{}".format(
+        return "http://github.com/pandas-dev/pandas/blob/master/pandas/{}{}".format(
             fn, linespec
         )
     else:
-        return "http://github.com/pandas-dev/pandas/blob/" "v{}/pandas/{}{}".format(
+        return "http://github.com/pandas-dev/pandas/blob/v{}/pandas/{}{}".format(
             pandas.__version__, fn, linespec
         )

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index da75e2c49ae10..ea52736cb11a7 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -256,7 +256,7 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.quantile` with zero-column :class:`DataFrame` incorrectly raising (:issue:`23925`)
-- :class:`DataFrame` inequality comparisons with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
+- :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth:`DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
 - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`)
 -

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index a7d6d19bbc80d..34eb9412451c5 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -170,9 +170,9 @@ cdef class Reducer:
                     PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
                     chunk.data = chunk.data + self.increment
                     PyArray_ITER_NEXT(it)
-            except Exception, e:
-                if hasattr(e, 'args'):
-                    e.args = e.args + (i,)
+            except Exception as err:
+                if hasattr(err, 'args'):
+                    err.args = err.args + (i,)
                 raise
             finally:
                 # so we don't free the wrong memory
diff --git a/pandas/_version.py b/pandas/_version.py
index 4f5bdf59a99d5..0cdedf3da3ea7 100644
--- a/pandas/_version.py
+++ b/pandas/_version.py
@@ 
-249,7 +249,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - fmt = "tag '{full_tag}' doesn't start with prefix " "'{tag_prefix}'" + fmt = "tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) if verbose: print(msg) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 714423de34222..605d179e7c652 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -341,13 +341,15 @@ def apply_series_generator(self): for i, v in enumerate(series_gen): results[i] = self.f(v) keys.append(v.name) - except Exception as e: - if hasattr(e, "args"): + except Exception as err: + if hasattr(err, "args"): # make sure i is defined if i is not None: k = res_index[i] - e.args = e.args + ("occurred at index %s" % pprint_thing(k),) + err.args = err.args + ( + "occurred at index %s" % pprint_thing(k), + ) raise self.results = results diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 7cc9dc11a8ccc..eb57d703cd4d5 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -672,7 +672,7 @@ def _read_next_page(self): return True elif len(self._cached_page) != self._page_length: self.close() - msg = "failed to read complete page from file " "(read {:d} of {:d} bytes)" + msg = "failed to read complete page from file (read {:d} of {:d} bytes)" raise ValueError(msg.format(len(self._cached_page), self._page_length)) self._read_page_header() diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 966a18e11a620..d7b0839ec62ea 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -685,7 +685,7 @@ def _get_call_args(backend_name, data, args, kwargs): else: raise TypeError( ( - "Called plot accessor for type {}, expected " "Series or DataFrame" + "Called plot accessor for type {}, expected Series or DataFrame" ).format(type(data).__name__) ) @@ -740,7 +740,7 @@ def __call__(self, *args, **kwargs): return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) else: raise ValueError( - ("plot kind {} can only be used for " "data frames").format(kind) + ("plot kind {} can only be used for data frames").format(kind) ) elif kind in self._series_kinds: if isinstance(data, ABCDataFrame): diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 99035013092cc..eed328131da92 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -331,9 +331,7 @@ def plot_group(keys, values, ax): if return_type is None: return_type = "axes" if layout is not None: - raise ValueError( - "The 'layout' keyword is not supported when " "'by' is None" - ) + raise ValueError("The 'layout' keyword is not supported when 'by' is None") if ax is None: rc = {"figure.figsize": figsize} if figsize is not None else {} diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 82c5ba7f0317d..a729951b3d7db 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -230,7 +230,7 @@ def _validate_color_args(self): "color" in self.kwds or "colors" in self.kwds ) and self.colormap is not None: warnings.warn( - "'color' and 'colormap' cannot be used " "simultaneously. Using 'color'" + "'color' and 'colormap' cannot be used simultaneously. 
Using 'color'" ) if "color" in self.kwds and self.style is not None: diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 5213e09f14067..f95ff2578d882 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -184,7 +184,7 @@ def _grouped_plot( if figsize == "default": # allowed to specify mpl default with 'default' warnings.warn( - "figsize='default' is deprecated. Specify figure " "size by tuple instead", + "figsize='default' is deprecated. Specify figure size by tuple instead", FutureWarning, stacklevel=5, ) @@ -298,9 +298,7 @@ def hist_series( if by is None: if kwds.get("layout", None) is not None: - raise ValueError( - "The 'layout' keyword is not supported when " "'by' is None" - ) + raise ValueError("The 'layout' keyword is not supported when 'by' is None") # hack until the plotting interface is a bit more unified fig = kwds.pop( "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize) @@ -394,7 +392,7 @@ def hist_frame( naxes = len(data.columns) if naxes == 0: - raise ValueError("hist method requires numerical columns, " "nothing to plot.") + raise ValueError("hist method requires numerical columns, nothing to plot.") fig, axes = _subplots( naxes=naxes, diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index e1bba5856e271..927b9cf4e392a 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -25,7 +25,7 @@ def _get_standard_colors( elif color is not None: if colormap is not None: warnings.warn( - "'color' and 'colormap' cannot be used " "simultaneously. Using 'color'" + "'color' and 'colormap' cannot be used simultaneously. Using 'color'" ) colors = list(color) if is_list_like(color) else color else: diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index eddc9b4cd21bd..caa0167c06389 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -188,8 +188,7 @@ def _subplots( ax = _flatten(ax) if layout is not None: warnings.warn( - "When passing multiple axes, layout keyword is " "ignored", - UserWarning, + "When passing multiple axes, layout keyword is ignored", UserWarning ) if sharex or sharey: warnings.warn( diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 1045b72f0aa6e..f35707de189dc 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1233,8 +1233,8 @@ class ErrorThread(threading.Thread): def run(self): try: super().run() - except Exception as e: - self.err = e + except Exception as err: + self.err = err else: self.err = None diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index 5692404205012..f42c507e51511 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -65,7 +65,7 @@ def test_agg_function_support(self, arg): df = pd.DataFrame({"A": np.arange(5)}) roll = df.rolling(2, win_type="triang") - msg = "'{arg}' is not a valid function for " "'Window' object".format(arg=arg) + msg = "'{arg}' is not a valid function for 'Window' object".format(arg=arg) with pytest.raises(AttributeError, match=msg): roll.agg(arg) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 81d8869dd7ba0..84b00d7f4907f 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -658,9 +658,7 @@ def __init__(self, start="09:00", end="17:00", offset=timedelta(0)): # Validation of input if len(start) != 
len(end): - raise ValueError( - "number of starting time and ending time " "must be the same" - ) + raise ValueError("number of starting time and ending time must be the same") num_openings = len(start) # sort starting and ending time by starting time @@ -2242,7 +2240,7 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): variation = "last" else: raise ValueError( - "Unable to parse varion_code: " "{code}".format(code=varion_code) + "Unable to parse varion_code: {code}".format(code=varion_code) ) startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code] @@ -2557,7 +2555,7 @@ def __init__(self, n=1, normalize=False): BaseOffset.__init__(self, n, normalize) if normalize: raise ValueError( - "Tick offset with `normalize=True` are not " "allowed." + "Tick offset with `normalize=True` are not allowed." ) # GH#21427 __gt__ = _tick_comp(operator.gt) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 8a25e511b5fc4..ebc015c820c14 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -171,7 +171,7 @@ def deprecate_kwarg( if mapping is not None and not hasattr(mapping, "get") and not callable(mapping): raise TypeError( - "mapping from old to new argument values " "must be dict or callable!" + "mapping from old to new argument values must be dict or callable!" ) def _deprecate_kwarg(func: F) -> F: @@ -214,7 +214,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: warnings.warn(msg, FutureWarning, stacklevel=stacklevel) if kwargs.get(new_arg_name) is not None: msg = ( - "Can only specify '{old_name}' or '{new_name}', " "not both" + "Can only specify '{old_name}' or '{new_name}', not both" ).format(old_name=old_arg_name, new_name=new_arg_name) raise TypeError(msg) else: diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 953c8a43a21b8..4f2cbd4314b8e 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -6,11 +6,11 @@ def rewrite_exception(old_name, new_name): """Rewrite the message of an exception.""" try: yield - except Exception as e: - msg = e.args[0] + except Exception as err: + msg = err.args[0] msg = msg.replace(old_name, new_name) args = (msg,) - if len(e.args) > 1: - args = args + e.args[1:] - e.args = args + if len(err.args) > 1: + args = args + err.args[1:] + err.args = args raise diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 21d09c06940ca..25795859d8018 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -139,7 +139,7 @@ def main(): "--json", metavar="FILE", nargs=1, - help="Save output as JSON into file, pass in " "'-' to output to stdout", + help="Save output as JSON into file, pass in '-' to output to stdout", ) (options, args) = parser.parse_args() diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index c9fd426f68b48..b516c3d78a11e 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -185,7 +185,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: ) skip_if_not_us_locale = pytest.mark.skipif( _skip_if_not_us_locale(), - reason="Specific locale is set " "{lang}".format(lang=locale.getlocale()[0]), + reason="Specific locale is set {lang}".format(lang=locale.getlocale()[0]), ) skip_if_no_scipy = pytest.mark.skipif( _skip_if_no_scipy(), reason="Missing SciPy requirement" diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index f5a472596f58f..0eaf46d563163 100644 --- a/pandas/util/_validators.py +++ 
b/pandas/util/_validators.py @@ -289,7 +289,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): # First fill with explicit values provided by the user... if arg_name in kwargs: if args: - msg = "{} got multiple values for argument " "'{}'".format( + msg = "{} got multiple values for argument '{}'".format( method_name, arg_name ) raise TypeError(msg) @@ -318,7 +318,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): elif len(args) == 2: if "axis" in kwargs: # Unambiguously wrong - msg = "Cannot specify both 'axis' and any of 'index' " "or 'columns'" + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" raise TypeError(msg) msg = ( diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a34fdee227afc..c8b41a87baa9d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1175,7 +1175,7 @@ def assert_series_equal( # vs Timestamp) but will compare equal if not Index(left.values).equals(Index(right.values)): msg = ( - "[datetimelike_compat=True] {left} is not equal to " "{right}." + "[datetimelike_compat=True] {left} is not equal to {right}." ).format(left=left.values, right=right.values) raise AssertionError(msg) else: @@ -2363,26 +2363,26 @@ def wrapper(*args, **kwargs): skip() try: return t(*args, **kwargs) - except Exception as e: - errno = getattr(e, "errno", None) + except Exception as err: + errno = getattr(err, "errno", None) if not errno and hasattr(errno, "reason"): - errno = getattr(e.reason, "errno", None) + errno = getattr(err.reason, "errno", None) if errno in skip_errnos: skip( "Skipping test due to known errno" - " and error {error}".format(error=e) + " and error {error}".format(error=err) ) - e_str = str(e) + e_str = str(err) if any(m.lower() in e_str.lower() for m in _skip_on_messages): skip( "Skipping test because exception " - "message is known and error {error}".format(error=e) + "message is known and error {error}".format(error=err) ) - if not isinstance(e, error_classes): + if not isinstance(err, error_classes): raise if raise_on_error or can_connect(url, error_classes): @@ -2390,7 +2390,7 @@ def wrapper(*args, **kwargs): else: skip( "Skipping test due to lack of connectivity" - " and error {error}".format(error=e) + " and error {error}".format(error=err) ) return wrapper diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 95a892b822cff..5e1a169dbfc3f 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -46,14 +46,14 @@ "--dir-masks", metavar="d_re(,d_re)*", default=[], - help="comma separated list of regexes to match base " "path against", + help="comma separated list of regexes to match base path against", ) argparser.add_argument( "-p", "--path-masks", metavar="p_re(,p_re)*", default=[], - help="comma separated list of regexes to match full " "file path against", + help="comma separated list of regexes to match full file path against", ) argparser.add_argument( "-y", @@ -195,7 +195,7 @@ def sorter(i): return hits[i].path, d print( - ("\nThese commits touched the %s method in these files " "on these dates:\n") + ("\nThese commits touched the %s method in these files on these dates:\n") % args.funcname ) for i in sorted(range(len(hits)), key=sorter): diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 85e5bf239cbfa..f1b1d9d8678bb 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -1029,7 
+1029,7 @@ def test_bad_generic_functions(self, capsys, func): ( "BadReturns", "no_capitalization", - ("Return value description should start with a capital " "letter",), + ("Return value description should start with a capital letter",), ), ( "BadReturns", diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 401eaf8ff5ed5..d363e7108fff3 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -91,7 +91,7 @@ "whitespace only", "GL06": 'Found unknown section "{section}". Allowed sections are: ' "{allowed_sections}", - "GL07": "Sections are in the wrong order. Correct order is: " "{correct_sections}", + "GL07": "Sections are in the wrong order. Correct order is: {correct_sections}", "GL08": "The object does not have a docstring", "GL09": "Deprecation warning should precede extended summary", "GL10": "reST directives {directives} must be followed by two colons", diff --git a/setup.py b/setup.py index 7040147c2b741..04aedcb101e25 100755 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ def is_platform_mac(): except ImportError: import tempita except ImportError: - raise ImportError("Building pandas requires Tempita: " "pip install Tempita") + raise ImportError("Building pandas requires Tempita: pip install Tempita") _pxi_dep_template = { @@ -142,9 +142,7 @@ def build_extensions(self): _build_ext.build_extensions(self) -DESCRIPTION = ( - "Powerful data structures for data analysis, time series, " "and statistics" -) +DESCRIPTION = "Powerful data structures for data analysis, time series, and statistics" LONG_DESCRIPTION = """ **pandas** is a Python package providing fast, flexible, and expressive data structures designed to make working with structured (tabular, multidimensional, From 0748c91d89e5a8ea1289f055a634770a8dbfa63a Mon Sep 17 00:00:00 2001 From: Tola A <33249563+tolaa001@users.noreply.github.com> Date: Fri, 11 Oct 2019 21:08:23 +0100 Subject: [PATCH 038/119] DOC: PR06 docstring fixes (#28807) --- pandas/core/arrays/categorical.py | 4 ++-- pandas/core/arrays/datetimelike.py | 6 +++--- pandas/core/arrays/integer.py | 8 ++++---- pandas/core/arrays/period.py | 4 ++-- pandas/core/dtypes/cast.py | 4 ++-- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/concat.py | 4 ++-- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/missing.py | 2 +- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bab1127e6e539..6f56d0be1adc5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -636,7 +636,7 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): Parameters ---------- - codes : array-like, integers + codes : array-like of int An integer array, where each integer points to a category in categories or dtype.categories, or else is -1 for NaN. categories : index-like, optional @@ -647,7 +647,7 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): Whether or not this categorical is treated as an ordered categorical. If not given here or in `dtype`, the resulting categorical will be unordered. - dtype : CategoricalDtype or the string "category", optional + dtype : CategoricalDtype or "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. 
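
For readers following this docstring patch, here is how the API being
documented behaves in practice (an illustrative sketch, not part of the
diff; the exact repr shown is from pandas of this era and may differ in
later versions):

    >>> import pandas as pd
    >>> # `codes` index into `categories`; -1 marks a missing value (NaN)
    >>> pd.Categorical.from_codes(codes=[0, 1, 0, -1], categories=["low", "high"])
    [low, high, low, NaN]
    Categories (2, object): [low, high]
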
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 958650e3842fa..c682f3884603c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -670,7 +670,7 @@ def value_counts(self, dropna=False): Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't include counts of NaT values. Returns @@ -728,7 +728,7 @@ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): ---------- result : a ndarray fill_value : object, default iNaT - convert : string/dtype or None + convert : str, dtype or None Returns ------- @@ -1168,7 +1168,7 @@ def _time_shift(self, periods, freq=None): ---------- periods : int Number of periods to shift by. - freq : pandas.DateOffset, pandas.Timedelta, or string + freq : pandas.DateOffset, pandas.Timedelta, or str Frequency increment to shift by. """ if freq is not None and freq != self.freq: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7b03bf35faf25..630c3e50f2c09 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -95,7 +95,7 @@ def integer_array(values, dtype=None, copy=False): values : 1D list-like dtype : dtype, optional dtype to coerce - copy : boolean, default False + copy : bool, default False Returns ------- @@ -140,8 +140,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False): ---------- values : 1D list-like dtype : integer dtype - mask : boolean 1D array, optional - copy : boolean, default False + mask : bool 1D array, optional + copy : bool, default False if True, copy the input Returns @@ -542,7 +542,7 @@ def value_counts(self, dropna=True): Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't include counts of NaN. Returns diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 43208d98abd3c..a21d9e67e49e5 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -444,7 +444,7 @@ def to_timestamp(self, freq=None, how="start"): Parameters ---------- - freq : string or DateOffset, optional + freq : str or DateOffset, optional Target frequency. The default is 'D' for week or longer, 'S' otherwise how : {'s', 'e', 'start', 'end'} @@ -515,7 +515,7 @@ def _time_shift(self, periods, freq=None): ---------- periods : int Number of periods to shift by. - freq : pandas.DateOffset, pandas.Timedelta, or string + freq : pandas.DateOffset, pandas.Timedelta, or str Frequency increment to shift by. """ if freq is not None: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 328c7566d8e8d..5a5b87069e81a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1006,7 +1006,7 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): Parameters ---------- value : np.array / Series / Index / list-like - convert_dates : boolean, default False + convert_dates : bool, default False if True try really hard to convert dates (such as datetime.date), other leave inferred dtype 'date' alone @@ -1439,7 +1439,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): The array to cast. dtype : str, np.dtype The integer dtype to cast the array to. - copy: boolean, default False + copy: bool, default False Whether to make a copy of the array before returning. 
Returns diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 41677af7b1721..3f4ebc88c1c8a 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -152,7 +152,7 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: ---------- arr : array-like The array whose data type we want to enforce. - copy: boolean + copy: bool Whether to copy the original array or reuse it in place, if possible. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 1094ab22238e9..bd1ed0bb7d318 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -192,10 +192,10 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' - sort_categories : boolean, default False + sort_categories : bool, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. - ignore_order : boolean, default False + ignore_order : bool, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index fcdb89dd8a334..ae6f2ed289248 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -85,7 +85,7 @@ def find( """ Parameters ---------- - dtype : Type[ExtensionDtype] or string + dtype : Type[ExtensionDtype] or str Returns ------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 56bfbefdbf248..322011eb8e263 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -521,7 +521,7 @@ def na_value_for_dtype(dtype, compat=True): Parameters ---------- dtype : string / dtype - compat : boolean, default True + compat : bool, default True Returns ------- From 54b1151a5b8be5a59323faa1047685e90aad023e Mon Sep 17 00:00:00 2001 From: Nathan Abel Date: Fri, 11 Oct 2019 17:45:13 -0400 Subject: [PATCH 039/119] TST: Allow for multiple variables on the same line in docstring validation (#28811) --- pandas/core/series.py | 12 +++--- scripts/tests/test_validate_docstrings.py | 48 +++++++++++++++++++++++ scripts/validate_docstrings.py | 9 +++-- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 97e8a2dbac7f5..19d201917f3c8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2076,12 +2076,12 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): Parameters ---------- - skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, the result - will be NA. axis : int, default 0 For compatibility with DataFrame.idxmin. Redundant for application on Series. + skipna : bool, default True + Exclude NA/null values. If the entire Series is NA, the result + will be NA. *args, **kwargs Additional keywords have no effect but might be accepted for compatibility with NumPy. @@ -2146,12 +2146,12 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): Parameters ---------- - skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, the result - will be NA. axis : int, default 0 For compatibility with DataFrame.idxmax. Redundant for application on Series. + skipna : bool, default True + Exclude NA/null values. If the entire Series is NA, the result + will be NA. 
*args, **kwargs Additional keywords have no effect but might be accepted for compatibility with NumPy. diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index f1b1d9d8678bb..b1b5be6d4faeb 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -39,6 +39,21 @@ def plot(self, kind, color="blue", **kwargs): """ pass + def swap(self, arr, i, j, *args, **kwargs): + """ + Swap two indicies on an array. + + Parameters + ---------- + arr : list + The list having indexes swapped. + i, j : int + The indexes being swapped. + *args, **kwargs + Extraneous parameters are being permitted. + """ + pass + def sample(self): """ Generate and return a random number. @@ -256,6 +271,21 @@ def say_hello(): else: return None + def multiple_variables_on_one_line(self, matrix, a, b, i, j): + """ + Swap two values in a matrix. + + Parameters + ---------- + matrix : list of list + A double list that represents a matrix. + a, b : int + The indicies of the first value. + i, j : int + The indicies of the second value. + """ + pass + class BadGenericDocStrings: """Everything here has a bad docstring @@ -634,6 +664,17 @@ def list_incorrect_parameter_type(self, kind): """ pass + def bad_parameter_spacing(self, a, b): + """ + The parameters on the same line have an extra space between them. + + Parameters + ---------- + a, b : int + Foo bar baz. + """ + pass + class BadReturns: def return_not_documented(self): @@ -827,6 +868,7 @@ def test_good_class(self, capsys): "func", [ "plot", + "swap", "sample", "random_letters", "sample_values", @@ -837,6 +879,7 @@ def test_good_class(self, capsys): "good_imports", "no_returns", "empty_returns", + "multiple_variables_on_one_line", ], ) def test_good_functions(self, capsys, func): @@ -1002,6 +1045,11 @@ def test_bad_generic_functions(self, capsys, func): "list_incorrect_parameter_type", ('Parameter "kind" type should use "str" instead of "string"',), ), + ( + "BadParameters", + "bad_parameter_spacing", + ("Parameters {b} not documented", "Unknown parameters { b}"), + ), pytest.param( "BadParameters", "blank_lines", diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index d363e7108fff3..50b02c0fcbaf5 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -422,10 +422,11 @@ def needs_summary(self): @property def doc_parameters(self): - return collections.OrderedDict( - (name, (type_, "".join(desc))) - for name, type_, desc in self.doc["Parameters"] - ) + parameters = collections.OrderedDict() + for names, type_, desc in self.doc["Parameters"]: + for name in names.split(", "): + parameters[name] = (type_, "".join(desc)) + return parameters @property def signature_parameters(self): From d6b058deb9df12dbcd91bbacd199c382c1307671 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Oct 2019 10:00:18 -0700 Subject: [PATCH 040/119] CLN: try/except cleanups (#28939) --- pandas/core/apply.py | 8 ++------ pandas/core/base.py | 23 ++++++++++++++--------- pandas/core/groupby/generic.py | 6 ++---- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 605d179e7c652..91f3e878c3807 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -396,15 +396,11 @@ def wrap_results_for_axis(self): result = self.obj._constructor(data=results) if not isinstance(results[0], ABCSeries): - try: + if len(result.index) == len(self.res_columns): result.index = self.res_columns - except ValueError: 
-                pass
 
-        try:
+        if len(result.columns) == len(self.res_index):
             result.columns = self.res_index
-        except ValueError:
-            pass
 
         return result
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 4d5b20c56df5a..2d798dd15ad24 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -16,6 +16,7 @@
 from pandas.util._decorators import Appender, Substitution, cache_readonly
 from pandas.util._validators import validate_bool_kwarg
 
+from pandas.core.dtypes.cast import is_nested_object
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
     is_datetime64_ns_dtype,
@@ -566,25 +567,27 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):
         # degenerate case
         if obj.ndim == 1:
             for a in arg:
+                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                 try:
-                    colg = self._gotitem(obj.name, ndim=1, subset=obj)
-                    results.append(colg.aggregate(a))
+                    new_res = colg.aggregate(a)
 
-                    # make sure we find a good name
-                    name = com.get_callable_name(a) or a
-                    keys.append(name)
                 except (TypeError, DataError):
                     pass
                 except SpecificationError:
                     raise
+                else:
+                    results.append(new_res)
+
+                    # make sure we find a good name
+                    name = com.get_callable_name(a) or a
+                    keys.append(name)
 
         # multiples
         else:
             for index, col in enumerate(obj):
+                colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
                 try:
-                    colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
-                    results.append(colg.aggregate(arg))
-                    keys.append(col)
+                    new_res = colg.aggregate(arg)
                 except (TypeError, DataError):
                     pass
                 except ValueError:
@@ -592,6 +595,9 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):
                     continue
                 except SpecificationError:
                     raise
+                else:
+                    results.append(new_res)
+                    keys.append(col)
 
         # if we are empty
         if not len(results):
@@ -604,7 +610,6 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):
 
             # we are concatting non-NDFrame objects,
             # e.g. a list of scalars
-            from pandas.core.dtypes.cast import is_nested_object
             from pandas import Series
 
             result = Series(results, index=keys, name=self.name)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 5200d33c6a1fb..7be11696b7d45 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -505,9 +505,7 @@ def true_and_notna(x, *args, **kwargs):
             indices = [
                 self._get_index(name) for name, group in self if true_and_notna(group)
             ]
-        except ValueError:
-            raise TypeError("the filter must return a boolean result")
-        except TypeError:
+        except (ValueError, TypeError):
             raise TypeError("the filter must return a boolean result")
 
         filtered = self._apply_filter(indices, dropna)
@@ -1052,8 +1050,8 @@ def _aggregate_item_by_item(self, func, *args, **kwargs):
             data = obj[item]
             colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
 
+            cast = self._transform_should_cast(func)
             try:
-                cast = self._transform_should_cast(func)
                 result[item] = colg.aggregate(func, *args, **kwargs)
 
                 if cast:

From c387a28527d84542660b6b7bcff2c558a0a770d4 Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar
Date: Sat, 12 Oct 2019 13:08:43 -0400
Subject: [PATCH 041/119] BUG: Avoid undefined behaviour when converting from
 float to timedelta (#28918)

Summation of timedelta series with NaTs in them results in undefined
behaviour because the final wrapping step of the summation ends up
converting the NaNs in the sum through a direct cast to int64. This cast
is undefined for NaN and just happens to work on x86_64 because of the
way cvttsd2si works. On AArch64, the corresponding fcvtzs sets the result
to 0 on undefined input.
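
The failure mode can be sketched outside of pandas (illustrative only, not
part of this patch; the "broken" value below is compiler- and
platform-dependent precisely because the behaviour is undefined):

    import numpy as np

    nan_total = np.array([np.nan])  # e.g. the wrapped sum of an all-NaT series

    # Undefined behaviour: a direct float -> int64 cast of NaN. x86_64's
    # cvttsd2si happens to yield INT64_MIN (pandas' iNaT sentinel), while
    # AArch64's fcvtzs yields 0, i.e. a zero timedelta instead of NaT.
    broken = nan_total.astype("i8").view("m8[ns]")

    # Well-defined (together with the pending numpy fix): casting to m8[ns]
    # directly lets numpy map NaN to NaT explicitly.
    fixed = nan_total.astype("m8[ns]")  # array(['NaT'], dtype='timedelta64[ns]')
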
This fix trivially sets the conversion target to m8 instead of i8 so that
numpy correctly casts from NaN to NaT. Note that the corresponding fix on
the numpy side is still pending in numpy/numpy#14669.

An existing test (test_sum_nanops_timedelta in frame/test_analytics.py)
exercises this bug and has been verified to pass with this patch together
with the numpy fix.
---
 pandas/core/nanops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index eb442e8bf3486..09b80d1b3a9ac 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -360,7 +360,7 @@ def _wrap_results(result, dtype, fill_value=None):
             result = tslibs.Timedelta(result, unit="ns")
 
         else:
-            result = result.astype("i8").view(dtype)
+            result = result.astype("m8[ns]").view(dtype)
 
     return result

From fad037e9d50e4d23cf57a90d67c575d90822fedd Mon Sep 17 00:00:00 2001
From: Jesse Pardue <32435177+S3rbane@users.noreply.github.com>
Date: Sat, 12 Oct 2019 13:09:56 -0400
Subject: [PATCH 042/119] Added note to 'contributing.rst' file, telling users
 to append GH Issue… (#28907)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/source/development/contributing.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index dc6fa3d100212..949b6bd475319 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -949,6 +949,9 @@ the expected correct result::
 
     assert_frame_equal(pivoted, expected)
 
+Please remember to add the GitHub issue number as a comment to a new test.
+E.g. "# brief comment, see GH#28907"
+
 Transitioning to ``pytest``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
From 4ade26b4c4c877a203813cc3410c20f3c820c23b Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 12 Oct 2019 10:14:21 -0700
Subject: [PATCH 043/119] REF: maybe_promote refactor/cleanup (#28897)

---
 pandas/core/dtypes/cast.py | 73 +++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 40 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 5a5b87069e81a..dd001e78c07de 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -393,32 +393,29 @@ def maybe_promote(dtype, fill_value=np.nan):
 
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
-            dtype = np.object_
+            dtype = np.dtype(np.object_)
+
         elif issubclass(dtype.type, np.integer):
             dtype = np.dtype(np.float64)
-            if not isna(fill_value):
-                fill_value = dtype.type(fill_value)
 
         elif dtype.kind == "f":
-            if not np.can_cast(fill_value, dtype):
-                # e.g. dtype is float32, need float64
-                dtype = np.min_scalar_type(fill_value)
+            mst = np.min_scalar_type(fill_value)
+            if mst > dtype:
+                # e.g. 
mst is np.float64 and dtype is np.float32 + dtype = mst elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) - if dtype.kind == "c" and not np.isnan(fill_value): - fill_value = dtype.type(fill_value) - elif is_bool(fill_value): if not issubclass(dtype.type, np.bool_): - dtype = np.object_ - else: - fill_value = np.bool_(fill_value) + dtype = np.dtype(np.object_) + elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) + elif issubclass(dtype.type, np.integer): if not np.can_cast(fill_value, dtype): # upcast to prevent overflow @@ -428,35 +425,20 @@ def maybe_promote(dtype, fill_value=np.nan): # Case where we disagree with numpy dtype = np.dtype(np.object_) - fill_value = dtype.type(fill_value) - - elif issubclass(dtype.type, np.floating): - # check if we can cast - if _check_lossless_cast(fill_value, dtype): - fill_value = dtype.type(fill_value) - - if dtype.kind in ["c", "f"]: - # e.g. if dtype is complex128 and fill_value is 1, we - # want np.complex128(1) - fill_value = dtype.type(fill_value) - elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) + elif issubclass(dtype.type, (np.integer, np.floating)): mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) - if mst > dtype and mst.kind == "c": + if mst > dtype: # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst - if dtype.kind == "c": - # make sure we have a np.complex and not python complex - fill_value = dtype.type(fill_value) - elif fill_value is None: if is_float_dtype(dtype) or is_complex_dtype(dtype): fill_value = np.nan @@ -466,37 +448,48 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetime_or_timedelta_dtype(dtype): fill_value = dtype.type("NaT", "ns") else: - dtype = np.object_ + dtype = np.dtype(np.object_) fill_value = np.nan else: - dtype = np.object_ + dtype = np.dtype(np.object_) # in case we have a string that looked like a number if is_extension_array_dtype(dtype): pass elif issubclass(np.dtype(dtype).type, (bytes, str)): - dtype = np.object_ + dtype = np.dtype(np.object_) + fill_value = _ensure_dtype_type(fill_value, dtype) return dtype, fill_value -def _check_lossless_cast(value, dtype: np.dtype) -> bool: +def _ensure_dtype_type(value, dtype): """ - Check if we can cast the given value to the given dtype _losslesly_. + Ensure that the given value is an instance of the given dtype. + + e.g. if out dtype is np.complex64, we should have an instance of that + as opposed to a python complex object. Parameters ---------- value : object - dtype : np.dtype + dtype : np.dtype or ExtensionDtype Returns ------- - bool + object """ - casted = dtype.type(value) - if casted == value: - return True - return False + + # Start with exceptions in which we do _not_ cast to numpy types + if is_extension_array_dtype(dtype): + return value + elif dtype == np.object_: + return value + elif isna(value): + # e.g. 
keep np.nan rather than try to cast to np.float32(np.nan) + return value + + return dtype.type(value) def infer_dtype_from(val, pandas_dtype=False): From 1e928862b51f4792378434d010950f761400d02b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 12 Oct 2019 18:30:02 +0100 Subject: [PATCH 044/119] API: Add various deprecated attributes to ._deprecated (#28805) --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/core/arrays/categorical.py | 6 +++++- pandas/core/arrays/sparse/array.py | 1 + pandas/core/base.py | 1 + pandas/core/generic.py | 14 +++++++++++++- pandas/core/indexes/base.py | 6 ++++-- pandas/core/indexes/multi.py | 4 ++++ pandas/core/series.py | 19 ++++++++++++++++--- pandas/tests/series/test_api.py | 3 --- 9 files changed, 46 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ea52736cb11a7..25cf41126e599 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -151,6 +151,8 @@ Other API changes - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) +- In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). + To see which attributes are excluded, see an object's ``_deprecations`` attribute, for example ``pd.DataFrame._deprecations`` (:issue:`28805`). - The returned dtype of ::func:`pd.unique` now matches the input dtype. (:issue:`27874`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6f56d0be1adc5..5e974f0b69e59 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -331,7 +331,7 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = frozenset(["labels", "tolist"]) + _deprecations = PandasObject._deprecations | frozenset(["tolist", "get_values"]) _typ = "categorical" def __init__( @@ -2522,6 +2522,10 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): >>> s.cat.as_unordered() """ + _deprecations = PandasObject._deprecations | frozenset( + ["categorical", "index", "name"] + ) + def __init__(self, data): self._validate(data) self._parent = data.values diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5acc922734529..e1691de234335 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -263,6 +263,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): _pandas_ftype = "sparse" _subtyp = "sparse_array" # register ABCSparseArray + _deprecations = PandasObject._deprecations | frozenset(["get_values"]) def __init__( self, diff --git a/pandas/core/base.py b/pandas/core/base.py index 2d798dd15ad24..56ffd3db6e942 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -653,6 +653,7 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 + _deprecations = frozenset(["item"]) def transpose(self, *args, **kwargs): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f77d543193e74..fa269b4ebeab1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -170,7 +170,19 @@ class NDFrame(PandasObject, SelectionMixin): 
_internal_names_set = set(_internal_names) # type: Set[str] _accessors = set() # type: Set[str] _deprecations = frozenset( - ["as_blocks", "blocks", "is_copy", "ftypes", "ix"] + [ + "as_blocks", + "as_matrix", + "blocks", + "clip_lower", + "clip_upper", + "get_dtype_counts", + "get_ftype_counts", + "get_values", + "is_copy", + "ftypes", + "ix", + ] ) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 86692ed602651..c9c02ad9e496a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -205,8 +205,10 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = DirNamesMixin._deprecations | frozenset( - ["tolist", "dtype_str", "get_values", "set_value"] + _deprecations = ( + IndexOpsMixin._deprecations + | DirNamesMixin._deprecations + | frozenset(["tolist", "contains", "dtype_str", "get_values", "set_value"]) ) # To hand over control to subclasses diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b49bb856a2e2b..2007da541bb2e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -229,6 +229,10 @@ class MultiIndex(Index): of the mentioned helper methods. """ + _deprecations = Index._deprecations | frozenset( + ["labels", "set_labels", "to_hierarchical"] + ) + # initialize to zero-length tuples to make everything work _typ = "multiindex" _names = FrozenList() diff --git a/pandas/core/series.py b/pandas/core/series.py index 19d201917f3c8..ff8149cc2e922 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -175,11 +175,24 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _metadata = ["name"] _accessors = {"dt", "cat", "str", "sparse"} - # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = ( - generic.NDFrame._deprecations + base.IndexOpsMixin._deprecations + | generic.NDFrame._deprecations | DirNamesMixin._deprecations - | frozenset(["asobject", "reshape", "valid", "tolist", "ftype", "real", "imag"]) + | frozenset( + [ + "tolist", # tolist is not deprecated, just suppressed in the __dir__ + "asobject", + "compress", + "valid", + "ftype", + "real", + "imag", + "put", + "ptp", + "nonzero", + ] + ) ) # Override cache_readonly bc Series is mutable diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 762f4a37d17cc..998f8b6f7d8a4 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -247,9 +247,6 @@ def test_tab_completion(self): def test_tab_completion_with_categorical(self): # test the tab completion display ok_for_cat = [ - "name", - "index", - "categorical", "categories", "codes", "ordered", From 56b2fd87ba6926261fc4bef1cfef05e024d515f4 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 12 Oct 2019 18:38:19 +0100 Subject: [PATCH 045/119] TST: Test pivot_table() with categorical data (#28803) --- pandas/tests/reshape/test_pivot.py | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 582084e3bfb5a..a8386d21ba27f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1656,6 +1656,51 @@ def test_categorical_margins_category(self, observed): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) + def test_pivot_with_categorical(self, observed, 
ordered_fixture): + # gh-21370 + idx = [np.nan, "low", "high", "low", np.nan] + col = [np.nan, "A", "B", np.nan, "A"] + df = pd.DataFrame( + { + "In": pd.Categorical( + idx, categories=["low", "high"], ordered=ordered_fixture + ), + "Col": pd.Categorical( + col, categories=["A", "B"], ordered=ordered_fixture + ), + "Val": range(1, 6), + } + ) + # case with index/columns/value + result = df.pivot_table( + index="In", columns="Col", values="Val", observed=observed + ) + + expected_cols = pd.CategoricalIndex( + ["A", "B"], ordered=ordered_fixture, name="Col" + ) + + expected = pd.DataFrame( + data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols + ) + expected.index = Index( + pd.Categorical( + ["low", "high"], categories=["low", "high"], ordered=ordered_fixture + ), + name="In", + ) + + tm.assert_frame_equal(result, expected) + + # case with columns/value + result = df.pivot_table(columns="Col", values="Val", observed=observed) + + expected = pd.DataFrame( + data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"]) + ) + + tm.assert_frame_equal(result, expected) + def test_categorical_aggfunc(self, observed): # GH 9534 df = pd.DataFrame( From 25059ee93419e6b311b6140d289d20a15dd734d9 Mon Sep 17 00:00:00 2001 From: timcera Date: Sat, 12 Oct 2019 13:44:50 -0400 Subject: [PATCH 046/119] BUG: Need 'windows-1252' encoding for locale names. (#27368) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/_config/localization.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 25cf41126e599..1112e42489342 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -241,6 +241,7 @@ Datetimelike - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) - Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) - Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) +- Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) Timedelta ^^^^^^^^^ diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 9f750d8447c6a..ba60b1e003004 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -145,7 +145,15 @@ def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_gette raw_locales = raw_locales.split(b"\n") out_locales = [] for x in raw_locales: - out_locales.append(str(x, encoding=options.display.encoding)) + try: + out_locales.append(str(x, encoding=options.display.encoding)) + except UnicodeError: + # 'locale -a' is used to populated 'raw_locales' and on + # Redhat 7 Linux (and maybe others) prints locale names + # using windows-1252 encoding. Bug only triggered by + # a few special characters and when there is an + # extensive list of installed locales. 
+ out_locales.append(str(x, encoding="windows-1252")) except TypeError: pass From f7d162b1890758e22c6bc3de3a8b0f6e09e3317e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Oct 2019 20:03:57 +0200 Subject: [PATCH 047/119] DOC: disable nbsphinx including requirejs (#28940) --- doc/source/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/conf.py b/doc/source/conf.py index 34faf183db1c2..86f78d9c0f0ae 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -120,6 +120,9 @@ plot_pre_code = """import numpy as np import pandas as pd""" +# nbsphinx do not use requirejs (breaks bootstrap) +nbsphinx_requirejs_path = "" + # Add any paths that contain templates here, relative to this directory. templates_path = ["../_templates"] From 625c550f67861c086f480adaa3fdbf21373cc63b Mon Sep 17 00:00:00 2001 From: lukasbk Date: Sun, 13 Oct 2019 17:58:57 +0200 Subject: [PATCH 048/119] CLN: fix mypy errors in pandas/tests/extension/test_numpy.py #28926 (#28947) --- pandas/tests/extension/base/ops.py | 9 +++++---- setup.cfg | 6 ------ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index e35464964f432..e968962caf0b7 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -1,4 +1,5 @@ import operator +from typing import Optional, Type import pytest @@ -61,10 +62,10 @@ class BaseArithmeticOpsTests(BaseOpsUtil): * divmod_exc = TypeError """ - series_scalar_exc = TypeError - frame_scalar_exc = TypeError - series_array_exc = TypeError - divmod_exc = TypeError + series_scalar_exc = TypeError # type: Optional[Type[TypeError]] + frame_scalar_exc = TypeError # type: Optional[Type[TypeError]] + series_array_exc = TypeError # type: Optional[Type[TypeError]] + divmod_exc = TypeError # type: Optional[Type[TypeError]] def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar diff --git a/setup.cfg b/setup.cfg index 9c841b76761f5..9af7215b1dc56 100644 --- a/setup.cfg +++ b/setup.cfg @@ -166,12 +166,6 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True -[mypy-pandas.tests.extension.test_numpy] -ignore_errors=True - -[mypy-pandas.tests.extension.test_sparse] -ignore_errors=True - [mypy-pandas.tests.frame.test_constructors] ignore_errors=True From 0eee324c2684693edae2b0a5ebad3c1619128db4 Mon Sep 17 00:00:00 2001 From: Rohit Sanjay Date: Sun, 13 Oct 2019 22:14:42 +0530 Subject: [PATCH 049/119] TST: add test_series_any_timedelta for GH17667 (#28942) --- pandas/tests/test_multilevel.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index b9a33d130a99c..e641d6f842d87 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1065,6 +1065,23 @@ def test_frame_any_all_group(self): ex = DataFrame({"data": [False, False]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) + def test_series_any_timedelta(self): + # GH 17667 + df = DataFrame( + { + "a": Series([0, 0]), + "t": Series([pd.to_timedelta(0, "s"), pd.to_timedelta(1, "ms")]), + } + ) + + result = df.any(axis=0) + expected = Series(data=[False, True], index=["a", "t"]) + tm.assert_series_equal(result, expected) + + result = df.any(axis=1) + expected = Series(data=[False, True]) + tm.assert_series_equal(result, expected) + def test_std_var_pass_ddof(self): index = MultiIndex.from_arrays( [np.arange(5).repeat(10), 
np.tile(np.arange(10), 5)] From 2931f0222c6bffe3d7479f1823bbd5c508dbca6a Mon Sep 17 00:00:00 2001 From: Tola A <33249563+tolaa001@users.noreply.github.com> Date: Sun, 13 Oct 2019 20:07:56 +0100 Subject: [PATCH 050/119] PR06 doc string fixes (#28946) --- pandas/core/computation/eval.py | 4 ++-- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/generic.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/interval.py | 4 ++-- pandas/core/indexes/multi.py | 4 ++-- pandas/core/reshape/tile.py | 4 ++-- pandas/core/tools/numeric.py | 2 +- pandas/io/json/_json.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/pytables.py | 9 +++++---- pandas/io/sql.py | 2 +- pandas/io/stata.py | 16 ++++++++-------- pandas/tseries/frequencies.py | 2 +- 17 files changed, 32 insertions(+), 31 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 8614230c4811f..63344af63470f 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -198,14 +198,14 @@ def eval( `__, only Python `expressions `__. - parser : string, default 'pandas', {'pandas', 'python'} + parser : {'pandas', 'python'}, default 'pandas' The parser to use to construct the syntax tree from the expression. The default of ``'pandas'`` parses code slightly different than standard Python. Alternatively, you can parse an expression using the ``'python'`` parser to retain strict Python semantics. See the :ref:`enhancing performance ` documentation for more details. - engine : string or None, default 'numexpr', {'python', 'numexpr'} + engine : {'python', 'numexpr'}, default 'numexpr' The engine used to evaluate the expression. Supported engines are diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5200ad0ba0d23..79e941f262931 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6238,7 +6238,7 @@ def unstack(self, level=-1, fill_value=None): ---------- level : int, str, or list of these, default -1 (last level) Level(s) of index to unstack, can pass level name. - fill_value : int, string or dict + fill_value : int, str or dict Replace NaN with this value if the unstack produces missing values. Returns diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fa269b4ebeab1..da8db23fb538b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2353,7 +2353,7 @@ def to_json( .. versionadded:: 0.23.0 - indent : integer, optional + indent : int, optional Length of whitespace used to indent each record. .. versionadded:: 1.0.0 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7be11696b7d45..068d5e5275f0d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1644,7 +1644,7 @@ def nunique(self, dropna=True): Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't include NaN in the counts. Returns diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c9c02ad9e496a..7dee3a17f8f9e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4531,7 +4531,7 @@ def shift(self, periods=1, freq=None): periods : int, default 1 Number of periods (or increments) to shift by, can be positive or negative. - freq : pandas.DateOffset, pandas.Timedelta or string, optional + freq : pandas.DateOffset, pandas.Timedelta or str, optional Frequency increment to shift by. 
If None, the index is shifted by its own `freq` attribute. Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ed3a4a7953df3..b538c4df00e19 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -77,7 +77,7 @@ class CategoricalIndex(Index, accessor.PandasDelegate): Whether or not this categorical is treated as an ordered categorical. If not given here or in `dtype`, the resulting categorical will be unordered. - dtype : CategoricalDtype or the string "category", optional + dtype : CategoricalDtype or "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 0b20df38e7d42..6a2f49cd1470e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1328,7 +1328,7 @@ def indexer_at_time(self, time, asof=False): Parameters ---------- - time : datetime.time or string + time : datetime.time or str datetime.time or string in appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 2cc15f7650ac1..a2d48b5100a2e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1412,11 +1412,11 @@ def interval_range( Right bound for generating intervals periods : int, default None Number of periods to generate - freq : numeric, string, or DateOffset, default None + freq : numeric, str, or DateOffset, default None The length of each interval. Must be consistent with the type of start and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 for numeric and 'D' for datetime-like. - name : string, default None + name : str, default None Name of the resulting IntervalIndex closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2007da541bb2e..596eaf0c55dbd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1650,7 +1650,7 @@ def to_frame(self, index=True, name=None): Parameters ---------- - index : boolean, default True + index : bool, default True Set the index of the returned DataFrame as the original MultiIndex. name : list / sequence of strings, optional @@ -2334,7 +2334,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): level : list-like, int or str, default 0 If a string is given, must be a name of the level If list-like must be names or ints of levels. - ascending : boolean, default True + ascending : bool, default True False to sort in descending order Can also be a list to specify a directed ordering sort_remaining : sort by the remaining levels after level diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index be5d75224e77d..6942a5797a7f0 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -285,10 +285,10 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): Parameters ---------- x : 1d ndarray or Series - q : integer or array of quantiles + q : int or list-like of int Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] 
for quartiles - labels : array or boolean, default None + labels : array or bool, default None Used as labels for the resulting bins. Must be of the same length as the resulting bins. If False, return only integer indicators of the bins. diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index fa33d11bda7eb..05696ffd4605d 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -39,7 +39,7 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input - downcast : {'integer', 'signed', 'unsigned', 'float'} , default None + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 6ce288890b6c7..c71677fa3b570 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -384,7 +384,7 @@ def read_json( By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. - orient : string, + orient : str Indication of expected JSON string format. Compatible JSON strings can be produced by ``to_json()`` with a corresponding orient value. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3678e32943b2e..c82486532530f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -257,7 +257,7 @@ arguments. dayfirst : bool, default False DD/MM format dates, international and European format. -cache_dates : boolean, default True +cache_dates : bool, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0db5b1b4eecfa..c87cad5472bd9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1025,8 +1025,8 @@ def append( Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data - append : boolean, default True, append the input data to the - existing + append : bool, default True + Append the input data to the existing. data_columns : list of columns, or True, default None List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes @@ -1037,8 +1037,9 @@ def append( chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - dropna : boolean, default False, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + dropna : bool, default False + Do not write an ALL nan row to the store settable + by the option 'io.hdf.dropna_table'. Notes ----- diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b0683fb8b0dfb..822b3288c82d9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -287,7 +287,7 @@ def read_sql_query( If a DBAPI2 object, only sqlite3 is supported. index_col : string or list of strings, optional, default: None Column(s) to set as index(MultiIndex). 
- coerce_float : boolean, default True + coerce_float : bool, default True Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point. Useful for SQL result sets. params : list, tuple or dict, optional, default: None diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0b674b556b2ee..679b74caba79e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -53,31 +53,31 @@ ) _statafile_processing_params1 = """\ -convert_dates : boolean, defaults to True +convert_dates : bool, default True Convert date variables to DataFrame time values. -convert_categoricals : boolean, defaults to True +convert_categoricals : bool, default True Read value labels and convert columns to Categorical/Factor variables.""" _encoding_params = """\ -encoding : string, None or encoding +encoding : str, None or encoding Encoding used to parse the files. None defaults to latin-1.""" _statafile_processing_params2 = """\ -index_col : string, optional, default: None +index_col : str, optional Column to set as index. -convert_missing : boolean, defaults to False +convert_missing : bool, default False Flag indicating whether to convert missing values to their Stata representations. If False, missing values are replaced with nan. If True, columns containing missing values are returned with object data types and missing values are represented by StataMissingValue objects. -preserve_dtypes : boolean, defaults to True +preserve_dtypes : bool, default True Preserve Stata datatypes. If False, numeric data are upcast to pandas default types for foreign data (float64 or int64). columns : list or None Columns to retain. Columns will be returned in the given order. None returns all columns. -order_categoricals : boolean, defaults to True +order_categoricals : bool, default True Flag indicating whether converted categorical data are ordered.""" _chunksize_params = """\ @@ -86,7 +86,7 @@ given number of lines.""" _iterator_params = """\ -iterator : boolean, default False +iterator : bool, default False Return StataReader object.""" _read_stata_doc = """ diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 4491e6ad9ac7e..0dcd8aeb4df9b 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -223,7 +223,7 @@ def infer_freq(index, warn=True): ---------- index : DatetimeIndex or TimedeltaIndex if passed a Series will use the values of the series (NOT THE INDEX) - warn : boolean, default True + warn : bool, default True Returns ------- From 04d7931e478e6e9b2d5642b7c1ed7b262cb872bc Mon Sep 17 00:00:00 2001 From: yogendrasoni Date: Mon, 14 Oct 2019 00:53:36 +0530 Subject: [PATCH 051/119] fix #28926 pandas\api\test_api.py mypy errors (#28935) * fix #28926 pandas\api\test_api.py mypy errors * fix #28926 pandas\api\test_api.py mypy errors * changed to type comment to support v 3.5 * removed section mypy-pandas.tests.api.test_api * fix annotation and revert delete * fix import sorting * fix import sorting --- pandas/tests/api/test_api.py | 12 +++++++----- setup.cfg | 3 --- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 6c50159663574..0af8ed0ebf8d5 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,3 +1,5 @@ +from typing import List + import pandas as pd from pandas import api, compat from pandas.util import testing as tm @@ -41,7 +43,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules 
= [] + deprecated_modules = [] # type: List[str] # misc misc = ["IndexSlice", "NaT"] @@ -92,10 +94,10 @@ class TestPDApi(Base): classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) # these are already deprecated; awaiting removal - deprecated_classes = [] + deprecated_classes = [] # type: List[str] # these should be deprecated in the future - deprecated_classes_in_future = [] + deprecated_classes_in_future = [] # type: List[str] # external modules exposed in pandas namespace modules = ["np", "datetime"] @@ -171,10 +173,10 @@ class TestPDApi(Base): funcs_to = ["to_datetime", "to_msgpack", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future - deprecated_funcs_in_future = [] + deprecated_funcs_in_future = [] # type: List[str] # these are already deprecated; awaiting removal - deprecated_funcs = [] + deprecated_funcs = [] # type: List[str] # private modules in pandas namespace private_modules = [ diff --git a/setup.cfg b/setup.cfg index 9af7215b1dc56..149af6c283d05 100644 --- a/setup.cfg +++ b/setup.cfg @@ -133,9 +133,6 @@ no_implicit_optional=True [mypy-pandas.conftest] ignore_errors=True -[mypy-pandas.tests.api.test_api] -ignore_errors=True - [mypy-pandas.tests.arithmetic.test_datetime64] ignore_errors=True From 06a6b496a4608bdcc54c8e0ad85197437257d9dc Mon Sep 17 00:00:00 2001 From: Oluokun Adedayo Date: Sun, 13 Oct 2019 23:34:35 +0100 Subject: [PATCH 052/119] Eliminated _WriterBase class, removed unused fixtures from methods in pandas/io/excel/test_writers.py (#28753) --- pandas/tests/io/excel/test_writers.py | 417 +++++++++++++------------- 1 file changed, 205 insertions(+), 212 deletions(-) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 9feec424389e7..793f11c62f9f5 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -24,6 +24,32 @@ ) +@pytest.fixture +def path(ext): + """ + Fixture to open file for use in each test case. + """ + with ensure_clean(ext) as file_path: + yield file_path + + +@pytest.fixture +def set_engine(engine, ext): + """ + Fixture to set engine for use in each test case. + + Rather than requiring `engine=...` to be provided explicitly as an + argument in each test, this fixture sets a global option to dictate + which engine should be used to write Excel files. After executing + the test it rolls back said change to the global option. + """ + option_name = "io.excel.{ext}.writer".format(ext=ext.strip(".")) + prev_engine = get_option(option_name) + set_option(option_name, engine) + yield + set_option(option_name, prev_engine) # Roll back option change + + @td.skip_if_no("xlrd") @pytest.mark.parametrize("ext", [".xls", ".xlsx", ".xlsm"]) class TestRoundTrip: @@ -233,34 +259,6 @@ def test_read_excel_parse_dates(self, ext): tm.assert_frame_equal(df, res) -class _WriterBase: - @pytest.fixture(autouse=True) - def set_engine_and_path(self, engine, ext): - """Fixture to set engine and open file for use in each test case - - Rather than requiring `engine=...` to be provided explicitly as an - argument in each test, this fixture sets a global option to dictate - which engine should be used to write Excel files. After executing - the test it rolls back said change to the global option. 
- - It also uses a context manager to open a temporary excel file for - the function to write to, accessible via `self.path` - - Notes - ----- - This fixture will run as part of each test method defined in the - class and any subclasses, on account of the `autouse=True` - argument - """ - option_name = "io.excel.{ext}.writer".format(ext=ext.strip(".")) - prev_engine = get_option(option_name) - set_option(option_name, engine) - with ensure_clean(ext) as path: - self.path = path - yield - set_option(option_name, prev_engine) # Roll back option change - - @td.skip_if_no("xlrd") @pytest.mark.parametrize( "engine,ext", @@ -271,10 +269,9 @@ class and any subclasses, on account of the `autouse=True` pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), ], ) -class TestExcelWriter(_WriterBase): - # Base class for test cases to run with different Excel writers. - - def test_excel_sheet_size(self, engine, ext): +@pytest.mark.usefixtures("set_engine") +class TestExcelWriter: + def test_excel_sheet_size(self, path): # GH 26080 breaking_row_count = 2 ** 20 + 1 @@ -287,18 +284,18 @@ def test_excel_sheet_size(self, engine, ext): msg = "sheet is too large" with pytest.raises(ValueError, match=msg): - row_df.to_excel(self.path) + row_df.to_excel(path) with pytest.raises(ValueError, match=msg): - col_df.to_excel(self.path) + col_df.to_excel(path) - def test_excel_sheet_by_name_raise(self, engine, ext): + def test_excel_sheet_by_name_raise(self, path): import xlrd gt = DataFrame(np.random.randn(10, 2)) - gt.to_excel(self.path) + gt.to_excel(path) - xl = ExcelFile(self.path) + xl = ExcelFile(path) df = pd.read_excel(xl, 0, index_col=0) tm.assert_frame_equal(gt, df) @@ -306,162 +303,162 @@ def test_excel_sheet_by_name_raise(self, engine, ext): with pytest.raises(xlrd.XLRDError): pd.read_excel(xl, "0") - def test_excel_writer_context_manager(self, frame, engine, ext): - with ExcelWriter(self.path) as writer: + def test_excel_writer_context_manager(self, frame, path): + with ExcelWriter(path) as writer: frame.to_excel(writer, "Data1") frame2 = frame.copy() frame2.columns = frame.columns[::-1] frame2.to_excel(writer, "Data2") - with ExcelFile(self.path) as reader: + with ExcelFile(path) as reader: found_df = pd.read_excel(reader, "Data1", index_col=0) found_df2 = pd.read_excel(reader, "Data2", index_col=0) tm.assert_frame_equal(found_df, frame) tm.assert_frame_equal(found_df2, frame2) - def test_roundtrip(self, engine, ext, frame): + def test_roundtrip(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan - frame.to_excel(self.path, "test1") - frame.to_excel(self.path, "test1", columns=["A", "B"]) - frame.to_excel(self.path, "test1", header=False) - frame.to_excel(self.path, "test1", index=False) + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) # test roundtrip - frame.to_excel(self.path, "test1") - recons = pd.read_excel(self.path, "test1", index_col=0) + frame.to_excel(path, "test1") + recons = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, "test1", index=False) - recons = pd.read_excel(self.path, "test1", index_col=None) + frame.to_excel(path, "test1", index=False) + recons = pd.read_excel(path, "test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, "test1", na_rep="NA") - recons = pd.read_excel(self.path, "test1", index_col=0, 
na_values=["NA"]) + frame.to_excel(path, "test1", na_rep="NA") + recons = pd.read_excel(path, "test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 - frame.to_excel(self.path, "test1", na_rep="88") - recons = pd.read_excel(self.path, "test1", index_col=0, na_values=["88"]) + frame.to_excel(path, "test1", na_rep="88") + recons = pd.read_excel(path, "test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, "test1", na_rep="88") - recons = pd.read_excel(self.path, "test1", index_col=0, na_values=[88, 88.0]) + frame.to_excel(path, "test1", na_rep="88") + recons = pd.read_excel(path, "test1", index_col=0, na_values=[88, 88.0]) tm.assert_frame_equal(frame, recons) # GH 6573 - frame.to_excel(self.path, "Sheet1") - recons = pd.read_excel(self.path, index_col=0) + frame.to_excel(path, "Sheet1") + recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, "0") - recons = pd.read_excel(self.path, index_col=0) + frame.to_excel(path, "0") + recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) # GH 8825 Pandas Series should provide to_excel method s = frame["A"] - s.to_excel(self.path) - recons = pd.read_excel(self.path, index_col=0) + s.to_excel(path) + recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) - def test_mixed(self, engine, ext, frame): + def test_mixed(self, frame, path): mixed_frame = frame.copy() mixed_frame["foo"] = "bar" - mixed_frame.to_excel(self.path, "test1") - reader = ExcelFile(self.path) + mixed_frame.to_excel(path, "test1") + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, tsframe, engine, ext): + def test_ts_frame(self, tsframe, path): df = tsframe - df.to_excel(self.path, "test1") - reader = ExcelFile(self.path) + df.to_excel(path, "test1") + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) - def test_basics_with_nan(self, engine, ext, frame): + def test_basics_with_nan(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan - frame.to_excel(self.path, "test1") - frame.to_excel(self.path, "test1", columns=["A", "B"]) - frame.to_excel(self.path, "test1", header=False) - frame.to_excel(self.path, "test1", index=False) + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) - def test_int_types(self, engine, ext, np_type): + def test_int_types(self, np_type, path): # Test np.int values read come back as int # (rather than float which is Excel's format). df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) - df.to_excel(self.path, "test1") + df.to_excel(path, "test1") - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) int_frame = df.astype(np.int64) tm.assert_frame_equal(int_frame, recons) - recons2 = pd.read_excel(self.path, "test1", index_col=0) + recons2 = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) # Test with convert_float=False comes back as float. 
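        # Illustrative aside (not from the original test): read_excel defaults
        # to convert_float=True, which casts integral floats such as 1.0 back
        # to the int 1; passing convert_float=False keeps every numeric cell
        # as a float, which is why the expected frame below is all-float.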
float_frame = df.astype(float) - recons = pd.read_excel(self.path, "test1", convert_float=False, index_col=0) + recons = pd.read_excel(path, "test1", convert_float=False, index_col=0) tm.assert_frame_equal( recons, float_frame, check_index_type=False, check_column_type=False ) @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) - def test_float_types(self, engine, ext, np_type): + def test_float_types(self, np_type, path): # Test np.float values read come back as float. df = DataFrame(np.random.random_sample(10), dtype=np_type) - df.to_excel(self.path, "test1") + df.to_excel(path, "test1") - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) tm.assert_frame_equal(df, recons, check_dtype=False) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) - def test_bool_types(self, engine, ext, np_type): + def test_bool_types(self, np_type, path): # Test np.bool values read come back as float. df = DataFrame([1, 0, True, False], dtype=np_type) - df.to_excel(self.path, "test1") + df.to_excel(path, "test1") - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) tm.assert_frame_equal(df, recons) - def test_inf_roundtrip(self, engine, ext): + def test_inf_roundtrip(self, path): df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - df.to_excel(self.path, "test1") + df.to_excel(path, "test1") - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) - def test_sheets(self, engine, ext, frame, tsframe): + def test_sheets(self, frame, tsframe, path): frame = frame.copy() frame["A"][:5] = np.nan - frame.to_excel(self.path, "test1") - frame.to_excel(self.path, "test1", columns=["A", "B"]) - frame.to_excel(self.path, "test1", header=False) - frame.to_excel(self.path, "test1", index=False) + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) # Test writing to separate sheets - writer = ExcelWriter(self.path) + writer = ExcelWriter(path) frame.to_excel(writer, "test1") tsframe.to_excel(writer, "test2") writer.save() - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, "test2", index_col=0) @@ -470,62 +467,62 @@ def test_sheets(self, engine, ext, frame, tsframe): assert "test1" == reader.sheet_names[0] assert "test2" == reader.sheet_names[1] - def test_colaliases(self, engine, ext, frame): + def test_colaliases(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan - frame.to_excel(self.path, "test1") - frame.to_excel(self.path, "test1", columns=["A", "B"]) - frame.to_excel(self.path, "test1", header=False) - frame.to_excel(self.path, "test1", index=False) + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) - frame.to_excel(self.path, "test1", header=col_aliases) - reader = ExcelFile(self.path) + frame.to_excel(path, "test1", header=col_aliases) + reader = ExcelFile(path) rs = pd.read_excel(reader, "test1", index_col=0) xp = frame.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) - def 
test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): + def test_roundtrip_indexlabels(self, merge_cells, frame, path): frame = frame.copy() frame["A"][:5] = np.nan - frame.to_excel(self.path, "test1") - frame.to_excel(self.path, "test1", columns=["A", "B"]) - frame.to_excel(self.path, "test1", header=False) - frame.to_excel(self.path, "test1", index=False) + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) # test index_label df = DataFrame(np.random.randn(10, 2)) >= 0 - df.to_excel(self.path, "test1", index_label=["test"], merge_cells=merge_cells) - reader = ExcelFile(self.path) + df.to_excel(path, "test1", index_label=["test"], merge_cells=merge_cells) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) df.index.names = ["test"] assert df.index.names == recons.index.names df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel( - self.path, + path, "test1", index_label=["test", "dummy", "dummy2"], merge_cells=merge_cells, ) - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) df.index.names = ["test"] assert df.index.names == recons.index.names df = DataFrame(np.random.randn(10, 2)) >= 0 - df.to_excel(self.path, "test1", index_label="test", merge_cells=merge_cells) - reader = ExcelFile(self.path) + df.to_excel(path, "test1", index_label="test", merge_cells=merge_cells) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) df.index.names = ["test"] tm.assert_frame_equal(df, recons.astype(bool)) frame.to_excel( - self.path, + path, "test1", columns=["A", "B", "C", "D"], index=False, @@ -535,35 +532,35 @@ def test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): df = frame.copy() df = df.set_index(["A", "B"]) - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(df, recons, check_less_precise=True) - def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): + def test_excel_roundtrip_indexname(self, merge_cells, path): df = DataFrame(np.random.randn(10, 4)) df.index.name = "foo" - df.to_excel(self.path, merge_cells=merge_cells) + df.to_excel(path, merge_cells=merge_cells) - xf = ExcelFile(self.path) + xf = ExcelFile(path) result = pd.read_excel(xf, xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) assert result.index.name == "foo" - def test_excel_roundtrip_datetime(self, merge_cells, tsframe, engine, ext): + def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): # datetime.date, not sure what to test here exactly tsf = tsframe.copy() tsf.index = [x.date() for x in tsframe.index] - tsf.to_excel(self.path, "test1", merge_cells=merge_cells) + tsf.to_excel(path, "test1", merge_cells=merge_cells) - reader = ExcelFile(self.path) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(tsframe, recons) - def test_excel_date_datetime_format(self, engine, ext): + def test_excel_date_datetime_format(self, engine, ext, path): # see gh-4133 # # Excel output format strings @@ -585,7 +582,7 @@ def test_excel_date_datetime_format(self, engine, ext): ) with ensure_clean(ext) as filename2: - writer1 = ExcelWriter(self.path) + writer1 = ExcelWriter(path) writer2 = ExcelWriter( filename2, date_format="DD.MM.YYYY", @@ -598,7 +595,7 @@ 
def test_excel_date_datetime_format(self, engine, ext): writer1.close() writer2.close() - reader1 = ExcelFile(self.path) + reader1 = ExcelFile(path) reader2 = ExcelFile(filename2) rs1 = pd.read_excel(reader1, "test1", index_col=0) @@ -610,7 +607,7 @@ def test_excel_date_datetime_format(self, engine, ext): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self, engine, ext): + def test_to_excel_interval_no_labels(self, path): # see gh-19242 # # Test writing Interval without labels. @@ -620,13 +617,13 @@ def test_to_excel_interval_no_labels(self, engine, ext): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype(str) - df.to_excel(self.path, "test1") - reader = ExcelFile(self.path) + df.to_excel(path, "test1") + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_interval_labels(self, engine, ext): + def test_to_excel_interval_labels(self, path): # see gh-19242 # # Test writing Interval with labels. @@ -638,13 +635,13 @@ def test_to_excel_interval_labels(self, engine, ext): df["new"] = intervals expected["new"] = pd.Series(list(intervals)) - df.to_excel(self.path, "test1") - reader = ExcelFile(self.path) + df.to_excel(path, "test1") + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_timedelta(self, engine, ext): + def test_to_excel_timedelta(self, path): # see gh-19242, gh-9155 # # Test writing timedelta to xls. @@ -658,50 +655,50 @@ def test_to_excel_timedelta(self, engine, ext): lambda x: timedelta(seconds=x).total_seconds() / float(86400) ) - df.to_excel(self.path, "test1") - reader = ExcelFile(self.path) + df.to_excel(path, "test1") + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, engine, ext, tsframe): + def test_to_excel_periodindex(self, tsframe, path): xp = tsframe.resample("M", kind="period").mean() - xp.to_excel(self.path, "sht1") + xp.to_excel(path, "sht1") - reader = ExcelFile(self.path) + reader = ExcelFile(path) rs = pd.read_excel(reader, "sht1", index_col=0) tm.assert_frame_equal(xp, rs.to_period("M")) - def test_to_excel_multiindex(self, merge_cells, engine, ext, frame): + def test_to_excel_multiindex(self, merge_cells, frame, path): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - frame.to_excel(self.path, "test1", header=False) - frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", columns=["A", "B"]) # round trip - frame.to_excel(self.path, "test1", merge_cells=merge_cells) - reader = ExcelFile(self.path) + frame.to_excel(path, "test1", merge_cells=merge_cells) + reader = ExcelFile(path) df = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 - def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): + def test_to_excel_multiindex_nan_label(self, merge_cells, path): df = pd.DataFrame( {"A": [None, 2, 3], "B": [10, 20, 30], "C": np.random.sample(3)} ) df = df.set_index(["A", "B"]) - df.to_excel(self.path, merge_cells=merge_cells) - df1 = pd.read_excel(self.path, index_col=[0, 1]) + df.to_excel(path, merge_cells=merge_cells) + df1 = pd.read_excel(path, 
index_col=[0, 1]) tm.assert_frame_equal(df, df1) # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, engine, ext, frame): + def test_to_excel_multiindex_cols(self, merge_cells, frame, path): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index @@ -713,28 +710,28 @@ def test_to_excel_multiindex_cols(self, merge_cells, engine, ext, frame): header = 0 # round trip - frame.to_excel(self.path, "test1", merge_cells=merge_cells) - reader = ExcelFile(self.path) + frame.to_excel(path, "test1", merge_cells=merge_cells) + reader = ExcelFile(path) df = pd.read_excel(reader, "test1", header=header, index_col=[0, 1]) if not merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates(self, merge_cells, engine, ext, tsframe): + def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): # try multiindex with dates new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.index.names = ["time", "foo"] - tsframe.to_excel(self.path, "test1", merge_cells=merge_cells) - reader = ExcelFile(self.path) + tsframe.to_excel(path, "test1", merge_cells=merge_cells) + reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) assert recons.index.names == ("time", "foo") - def test_to_excel_multiindex_no_write_index(self, engine, ext): + def test_to_excel_multiindex_no_write_index(self, path): # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. @@ -746,24 +743,24 @@ def test_to_excel_multiindex_no_write_index(self, engine, ext): frame2.index = multi_index # Write out to Excel without the index. - frame2.to_excel(self.path, "test1", index=False) + frame2.to_excel(path, "test1", index=False) # Read it back in. - reader = ExcelFile(self.path) + reader = ExcelFile(path) frame3 = pd.read_excel(reader, "test1") # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) - def test_to_excel_float_format(self, engine, ext): + def test_to_excel_float_format(self, path): df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) - df.to_excel(self.path, "test1", float_format="%.2f") + df.to_excel(path, "test1", float_format="%.2f") - reader = ExcelFile(self.path) + reader = ExcelFile(path) result = pd.read_excel(reader, "test1", index_col=0) expected = DataFrame( @@ -773,7 +770,7 @@ def test_to_excel_float_format(self, engine, ext): ) tm.assert_frame_equal(result, expected) - def test_to_excel_output_encoding(self, engine, ext): + def test_to_excel_output_encoding(self, ext): # Avoid mixed inferred_type. df = DataFrame( [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], @@ -786,7 +783,7 @@ def test_to_excel_output_encoding(self, engine, ext): result = pd.read_excel(filename, "TestSheet", encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) - def test_to_excel_unicode_filename(self, engine, ext): + def test_to_excel_unicode_filename(self, ext, path): with ensure_clean("\u0192u." 
+ ext) as filename: try: f = open(filename, "wb") @@ -916,14 +913,12 @@ def test_to_excel_unicode_filename(self, engine, ext): @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) def test_excel_010_hemstring( - self, merge_cells, engine, ext, c_idx_nlevels, r_idx_nlevels, use_headers + self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path ): def roundtrip(data, header=True, parser_hdr=0, index=True): - data.to_excel( - self.path, header=header, merge_cells=merge_cells, index=index - ) + data.to_excel(path, header=header, merge_cells=merge_cells, index=index) - xf = ExcelFile(self.path) + xf = ExcelFile(path) return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) # Basic test. @@ -965,128 +960,128 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): for c in range(len(res.columns)): assert res.iloc[r, c] is not np.nan - def test_duplicated_columns(self, engine, ext): + def test_duplicated_columns(self, path): # see gh-5235 df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) - df.to_excel(self.path, "test1") + df.to_excel(path, "test1") expected = DataFrame( [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] ) # By default, we mangle. - result = pd.read_excel(self.path, "test1", index_col=0) + result = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(result, expected) # Explicitly, we pass in the parameter. - result = pd.read_excel(self.path, "test1", index_col=0, mangle_dupe_cols=True) + result = pd.read_excel(path, "test1", index_col=0, mangle_dupe_cols=True) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) - df.to_excel(self.path, "test1") + df.to_excel(path, "test1") - result = pd.read_excel(self.path, "test1", index_col=0) + result = pd.read_excel(path, "test1", index_col=0) expected = DataFrame( [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] ) tm.assert_frame_equal(result, expected) # see gh-10982 - df.to_excel(self.path, "test1", index=False, header=False) - result = pd.read_excel(self.path, "test1", header=None) + df.to_excel(path, "test1", index=False, header=False) + result = pd.read_excel(path, "test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) msg = "Setting mangle_dupe_cols=False is not supported yet" with pytest.raises(ValueError, match=msg): - pd.read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) + pd.read_excel(path, "test1", header=None, mangle_dupe_cols=False) - def test_swapped_columns(self, engine, ext): + def test_swapped_columns(self, path): # Test for issue #5427. 
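        # Illustrative aside (not from the original test): columns=["B", "A"]
        # writes column B to the sheet ahead of column A, swapping their
        # positions on disk; reading back by header label must still pair
        # each label with its own values, which the assertions below verify.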
write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - write_frame.to_excel(self.path, "test1", columns=["B", "A"]) + write_frame.to_excel(path, "test1", columns=["B", "A"]) - read_frame = pd.read_excel(self.path, "test1", header=0) + read_frame = pd.read_excel(path, "test1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) tm.assert_series_equal(write_frame["B"], read_frame["B"]) - def test_invalid_columns(self, engine, ext): + def test_invalid_columns(self, path): # see gh-10982 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - write_frame.to_excel(self.path, "test1", columns=["B", "C"]) + write_frame.to_excel(path, "test1", columns=["B", "C"]) expected = write_frame.reindex(columns=["B", "C"]) - read_frame = pd.read_excel(self.path, "test1", index_col=0) + read_frame = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(expected, read_frame) with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): - write_frame.to_excel(self.path, "test1", columns=["C", "D"]) + write_frame.to_excel(path, "test1", columns=["C", "D"]) - def test_comment_arg(self, engine, ext): + def test_comment_arg(self, path): # see gh-18735 # # Test the comment argument functionality to pd.read_excel. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(self.path, "test_c") + df.to_excel(path, "test_c") # Read file without comment arg. - result1 = pd.read_excel(self.path, "test_c", index_col=0) + result1 = pd.read_excel(path, "test_c", index_col=0) result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None - result2 = pd.read_excel(self.path, "test_c", comment="#", index_col=0) + result2 = pd.read_excel(path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result1, result2) - def test_comment_default(self, engine, ext): + def test_comment_default(self, path): # Re issue #18735 # Test the comment argument default to pd.read_excel # Create file to read in df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(self.path, "test_c") + df.to_excel(path, "test_c") # Read file with default and explicit comment=None - result1 = pd.read_excel(self.path, "test_c") - result2 = pd.read_excel(self.path, "test_c", comment=None) + result1 = pd.read_excel(path, "test_c") + result2 = pd.read_excel(path, "test_c", comment=None) tm.assert_frame_equal(result1, result2) - def test_comment_used(self, engine, ext): + def test_comment_used(self, path): # see gh-18735 # # Test the comment argument is working as expected when used. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(self.path, "test_c") + df.to_excel(path, "test_c") # Test read_frame_comment against manually produced expected output. 
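        # Illustrative aside (not from the original test): with comment="#",
        # cell text is truncated from the first "#" onwards, so cells holding
        # "#one" and "#two" are read back as missing values -- hence the None
        # entries in the expected frame constructed below.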
expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) - result = pd.read_excel(self.path, "test_c", comment="#", index_col=0) + result = pd.read_excel(path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) - def test_comment_empty_line(self, engine, ext): + def test_comment_empty_line(self, path): # Re issue #18735 # Test that pd.read_excel ignores commented lines at the end of file df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) - df.to_excel(self.path, index=False) + df.to_excel(path, index=False) # Test that all-comment lines at EoF are ignored expected = DataFrame({"a": [1], "b": [2]}) - result = pd.read_excel(self.path, comment="#") + result = pd.read_excel(path, comment="#") tm.assert_frame_equal(result, expected) - def test_datetimes(self, engine, ext): + def test_datetimes(self, path): # Test writing and reading datetimes. For issue #9139. (xref #9185) datetimes = [ @@ -1104,12 +1099,12 @@ def test_datetimes(self, engine, ext): ] write_frame = DataFrame({"A": datetimes}) - write_frame.to_excel(self.path, "Sheet1") - read_frame = pd.read_excel(self.path, "Sheet1", header=0) + write_frame.to_excel(path, "Sheet1") + read_frame = pd.read_excel(path, "Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) - def test_bytes_io(self, engine, ext): + def test_bytes_io(self, engine): # see gh-7074 bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) @@ -1123,7 +1118,7 @@ def test_bytes_io(self, engine, ext): reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) - def test_write_lists_dict(self, engine, ext): + def test_write_lists_dict(self, path): # see gh-8188. df = DataFrame( { @@ -1132,8 +1127,8 @@ def test_write_lists_dict(self, engine, ext): "str": ["apple", "banana", "cherry"], } ) - df.to_excel(self.path, "Sheet1") - read = pd.read_excel(self.path, "Sheet1", header=0, index_col=0) + df.to_excel(path, "Sheet1") + read = pd.read_excel(path, "Sheet1", header=0, index_col=0) expected = df.copy() expected.mixed = expected.mixed.apply(str) @@ -1141,23 +1136,23 @@ def test_write_lists_dict(self, engine, ext): tm.assert_frame_equal(read, expected) - def test_true_and_false_value_options(self, engine, ext): + def test_true_and_false_value_options(self, path): # see gh-13347 df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) expected = df.replace({"foo": True, "bar": False}) - df.to_excel(self.path) + df.to_excel(path) read_frame = pd.read_excel( - self.path, true_values=["foo"], false_values=["bar"], index_col=0 + path, true_values=["foo"], false_values=["bar"], index_col=0 ) tm.assert_frame_equal(read_frame, expected) - def test_freeze_panes(self, engine, ext): + def test_freeze_panes(self, path): # see gh-15160 expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) - expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1)) + expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) - result = pd.read_excel(self.path, index_col=0) + result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) def test_path_path_lib(self, engine, ext): @@ -1176,7 +1171,7 @@ def test_path_local_path(self, engine, ext): result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) - def test_merged_cell_custom_objects(self, engine, merge_cells, ext): + def test_merged_cell_custom_objects(self, merge_cells, path): # see GH-27006 mi = MultiIndex.from_tuples( [ @@ -1185,10 +1180,8 @@ def 
test_merged_cell_custom_objects(self, engine, merge_cells, ext): ] ) expected = DataFrame(np.ones((2, 2)), columns=mi) - expected.to_excel(self.path) - result = pd.read_excel( - self.path, header=[0, 1], index_col=0, convert_float=False - ) + expected.to_excel(path) + result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], @@ -1199,18 +1192,18 @@ def test_merged_cell_custom_objects(self, engine, merge_cells, ext): tm.assert_frame_equal(expected, result) @pytest.mark.parametrize("dtype", [None, object]) - def test_raise_when_saving_timezones(self, engine, ext, dtype, tz_aware_fixture): + def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): # GH 27008, GH 7056 tz = tz_aware_fixture data = pd.Timestamp("2019", tz=tz) df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): - df.to_excel(self.path) + df.to_excel(path) data = data.to_pydatetime() df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): - df.to_excel(self.path) + df.to_excel(path) class TestExcelWriterEngineTests: From 851ca1ab2cecaf4e91a8553f449c4f73faad325e Mon Sep 17 00:00:00 2001 From: Luke Date: Sun, 13 Oct 2019 18:28:48 -0600 Subject: [PATCH 053/119] Fix mypy errors for pandas\tests\* #28926 (test_algos.py) (#28960) --- pandas/tests/test_algos.py | 2 +- setup.cfg | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a5706d8baa614..6df2c8faf7aee 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -767,7 +767,7 @@ def test_same_object_is_in(self): # with similar behavior, then we at least should # fall back to usual python's behavior: "a in [a] == True" class LikeNan: - def __eq__(self): + def __eq__(self, other): return False def __hash__(self): diff --git a/setup.cfg b/setup.cfg index 149af6c283d05..64494bf84363e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -220,9 +220,6 @@ ignore_errors=True [mypy-pandas.tests.series.test_operators] ignore_errors=True -[mypy-pandas.tests.test_algos] -ignore_errors=True - [mypy-pandas.tests.test_base] ignore_errors=True From 18a9e4c8ab253e83ba43767d890576186be13332 Mon Sep 17 00:00:00 2001 From: Jack Bicknell Date: Mon, 14 Oct 2019 03:38:59 +0100 Subject: [PATCH 054/119] DOC: Fixed PR08 and PR09 docstring errors in pandas.Series (#28845) --- pandas/core/accessor.py | 12 ++-- pandas/core/arrays/categorical.py | 26 +++---- pandas/core/arrays/datetimelike.py | 4 +- pandas/core/arrays/datetimes.py | 9 +-- pandas/core/base.py | 2 +- pandas/core/generic.py | 112 +++++++++++++++-------------- pandas/core/indexes/datetimes.py | 8 +-- pandas/core/indexes/timedeltas.py | 14 ++-- pandas/core/series.py | 58 ++++++++------- pandas/core/strings.py | 60 +++++++++------- pandas/plotting/_core.py | 60 ++++++++-------- pandas/tseries/offsets.py | 10 +-- 12 files changed, 197 insertions(+), 178 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 2d4ded9e2e6ba..bce6c352ce480 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -71,7 +71,7 @@ def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): accessors : string list of accessors to add typ : 'property' or 'method' overwrite : boolean, default False - overwrite the method/property in the target class if 
it exists.
+            Overwrite the method/property in the target class if it exists.
         """
 
     def _create_delegator_property(name):
@@ -118,12 +118,12 @@ def delegate_names(delegate, accessors, typ, overwrite=False):
     Parameters
     ----------
     delegate : object
-        the class to get methods/properties & doc-strings
+        The class to get methods/properties & doc-strings.
     accessors : Sequence[str]
-        List of accessor to add
+        List of accessors to add.
     typ : {'property', 'method'}
     overwrite : boolean, default False
-       overwrite the method/property in the target class if it exists
+       Overwrite the method/property in the target class if it exists.
 
     Returns
     -------
@@ -157,11 +157,11 @@ class CachedAccessor:
     Parameters
     ----------
     name : str
-        The namespace this will be accessed under, e.g. ``df.foo``
+        The namespace this will be accessed under, e.g. ``df.foo``.
     accessor : cls
         The class with the extension methods. The class' __init__ method should
         expect one of a ``Series``, ``DataFrame`` or ``Index`` as
-        the single argument ``data``
+        the single argument ``data``.
     """
 
     def __init__(self, name, accessor):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 5e974f0b69e59..ea19808b19fc9 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -912,24 +912,26 @@ def rename_categories(self, new_categories, inplace=False):
     ----------
     new_categories : list-like, dict-like or callable
 
-       * list-like: all items must be unique and the number of items in
-         the new categories must match the existing number of categories.
-
-       * dict-like: specifies a mapping from
-         old categories to new. Categories not contained in the mapping
-         are passed through and extra categories in the mapping are
-         ignored.
-
-         .. versionadded:: 0.21.0
-
-       * callable : a callable that is called on all items in the old
-         categories and whose return values comprise the new categories.
-
-         .. versionadded:: 0.23.0
+        New categories which will replace old categories.
+
+        * list-like: all items must be unique and the number of items in
+          the new categories must match the existing number of categories.
+
+        * dict-like: specifies a mapping from
+          old categories to new. Categories not contained in the mapping
+          are passed through and extra categories in the mapping are
+          ignored.
+
+          .. versionadded:: 0.21.0
+
+        * callable : a callable that is called on all items in the old
+          categories and whose return values comprise the new categories.
+
+          .. versionadded:: 0.23.0
 
     inplace : bool, default False
-        Whether or not to rename the categories inplace or return a copy of
-        this categorical with renamed categories.
+        Whether or not to rename the categories inplace or return a copy of
+        this categorical with renamed categories.
 
     Returns
     -------
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index c682f3884603c..84a4cbbc0a447 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -207,7 +207,7 @@ class TimelikeOps:
           ambiguous times)
         - 'NaT' will return NaT where there are ambiguous times
         - 'raise' will raise an AmbiguousTimeError if there are ambiguous
-          times
+          times.
 
         .. versionadded:: 0.24.0
 
@@ -223,7 +223,7 @@ class TimelikeOps:
         - 'NaT' will return NaT where there are nonexistent times
         - timedelta objects will shift nonexistent times by the timedelta
         - 'raise' will raise an NonExistentTimeError if there are
-          nonexistent times
+          nonexistent times.
 
         ..
versionadded:: 0.24.0 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 0335058a69c63..788cd2a3ce5b7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -993,7 +993,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): ambiguous times) - 'NaT' will return NaT where there are ambiguous times - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times + times. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ default 'raise' @@ -1007,11 +1007,12 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + nonexistent times. .. versionadded:: 0.24.0 errors : {'raise', 'coerce'}, default None + The method to handle errors: - 'raise' will raise a NonExistentTimeError if a timestamp is not valid in the specified time zone (e.g. due to a transition from @@ -1871,7 +1872,7 @@ def sequence_to_dt64ns( dayfirst : bool, default False yearfirst : bool, default False ambiguous : str, bool, or arraylike, default 'raise' - See pandas._libs.tslibs.conversion.tz_localize_to_utc + See pandas._libs.tslibs.conversion.tz_localize_to_utc. int_as_wall_time : bool, default False Whether to treat ints as wall time in specified timezone, or as nanosecond-precision UNIX epoch (wall time in UTC). @@ -2015,7 +2016,7 @@ def objects_to_datetime64ns( dayfirst : bool yearfirst : bool utc : bool, default False - Whether to convert timezone-aware timestamps to UTC + Whether to convert timezone-aware timestamps to UTC. errors : {'raise', 'ignore', 'coerce'} allow_object : bool Whether to return an object-dtype ndarray instead of raising if the diff --git a/pandas/core/base.py b/pandas/core/base.py index 56ffd3db6e942..e4e14a950c96b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -906,7 +906,7 @@ def to_numpy(self, dtype=None, copy=False): Parameters ---------- dtype : str or numpy.dtype, optional - The dtype to pass to :meth:`numpy.asarray` + The dtype to pass to :meth:`numpy.asarray`. copy : bool, default False Whether to ensure that the returned value is a not a view on another array. Note that ``copy=False`` does not *ensure* that diff --git a/pandas/core/generic.py b/pandas/core/generic.py index da8db23fb538b..e97772a418982 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1687,9 +1687,9 @@ def _check_label_or_level_ambiguity(self, key, axis=0): Parameters ---------- key: str or object - label or level name + Label or level name. axis: int, default 0 - Axis that levels are associated with (0 for index, 1 for columns) + Axis that levels are associated with (0 for index, 1 for columns). Raises ------ @@ -2288,31 +2288,30 @@ def to_json( orient : str Indication of expected JSON string format. - * Series + * Series: + + - default is 'index' + - allowed values are: {'split','records','index','table'}. - - default is 'index' - - allowed values are: {'split','records','index','table'} + * DataFrame: - * DataFrame + - default is 'columns' + - allowed values are: {'split', 'records', 'index', 'columns', + 'values', 'table'}. 
- - default is 'columns' - - allowed values are: - {'split','records','index','columns','values','table'} + * The format of the JSON string: - * The format of the JSON string + - 'split' : dict like {'index' -> [index], 'columns' -> [columns], + 'data' -> [values]} + - 'records' : list like [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + - 'columns' : dict like {column -> {index -> value}} + - 'values' : just the values array + - 'table' : dict like {'schema': {schema}, 'data': {data}} - - 'split' : dict like {'index' -> [index], - 'columns' -> [columns], 'data' -> [values]} - - 'records' : list like - [{column -> value}, ... , {column -> value}] - - 'index' : dict like {index -> {column -> value}} - - 'columns' : dict like {column -> {index -> value}} - - 'values' : just the values array - - 'table' : dict like {'schema': {schema}, 'data': {data}} - describing the data, and the data component is - like ``orient='records'``. + Describing the data, where data component is like ``orient='records'``. - .. versionchanged:: 0.20.0 + .. versionchanged:: 0.20.0 date_format : {None, 'epoch', 'iso'} Type of date conversion. 'epoch' = epoch milliseconds, @@ -2574,7 +2573,7 @@ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): ---------- path : str, buffer-like, or None Destination for the serialized object. - If None, return generated bytes + If None, return generated bytes. append : bool, default False Whether to append to an existing msgpack. compress : str, default None @@ -2765,8 +2764,8 @@ def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL) values are 0, 1, 2, 3, 4. A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. - .. [1] https://docs.python.org/3/library/pickle.html - .. versionadded:: 0.21.0 + .. [1] https://docs.python.org/3/library/pickle.html. + .. versionadded:: 0.21.0. See Also -------- @@ -3864,7 +3863,7 @@ def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None * pad / ffill: propagate last valid observation forward to next valid * backfill / bfill: use next valid observation to fill gap - * nearest: use nearest valid observations to fill gap + * nearest: use nearest valid observations to fill gap. copy : bool, default True Return a new object, even if the passed indexes are the same. @@ -4338,7 +4337,7 @@ def reindex(self, *args, **kwargs): %(optional_labels)s %(axes)s : array-like, optional New labels / index to conform to, should be specified using - keywords. Preferably an Index object to avoid duplicating data + keywords. Preferably an Index object to avoid duplicating data. %(optional_axis)s method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} Method to use for filling holes in reindexed DataFrame. @@ -4346,10 +4345,10 @@ def reindex(self, *args, **kwargs): monotonically increasing/decreasing index. * None (default): don't fill gaps - * pad / ffill: propagate last valid observation forward to next - valid - * backfill / bfill: use next valid observation to fill gap - * nearest: use nearest valid observations to fill gap + * pad / ffill: Propagate last valid observation forward to next + valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. copy : bool, default True Return a new object, even if the passed indexes are the same. 
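For reference, the ``orient`` values documented in the hunk above can be
illustrated as follows (a sketch assuming default settings, showing three of
the six orients):

>>> df = pd.DataFrame([["a", "b"], ["c", "d"]],
...                   index=["row 1", "row 2"], columns=["col 1", "col 2"])
>>> df.to_json(orient="split")
'{"columns":["col 1","col 2"],"index":["row 1","row 2"],"data":[["a","b"],["c","d"]]}'
>>> df.to_json(orient="records")
'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
>>> df.to_json(orient="index")
'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'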
@@ -7938,11 +7937,11 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): * 'pad' / 'ffill': propagate last valid observation forward to next valid - * 'backfill' / 'bfill': use NEXT valid observation to fill + * 'backfill' / 'bfill': use NEXT valid observation to fill. how : {'start', 'end'}, default end - For PeriodIndex only, see PeriodIndex.asfreq + For PeriodIndex only (see PeriodIndex.asfreq). normalize : bool, default False - Whether to reset output index to midnight + Whether to reset output index to midnight. fill_value : scalar, optional Value to use for missing values, applied during upsampling (note this does not fill NaNs that already were present). @@ -8614,14 +8613,14 @@ def rank( axis : {0 or 'index', 1 or 'columns'}, default 0 Index to direct ranking. method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value - (i.e. ties): + How to rank the group of records that have the same value (i.e. ties): * average: average rank of the group * min: lowest rank in the group * max: highest rank in the group * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups + * dense: like 'min', but rank always increases by 1 between groups. + numeric_only : bool, optional For DataFrame objects, rank only numeric columns if set to True. na_option : {'keep', 'top', 'bottom'}, default 'keep' @@ -8629,7 +8628,8 @@ def rank( * keep: assign NaN rank to NaN values * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending + * bottom: assign highest rank to NaN values if ascending. + ascending : bool, default True Whether or not the elements should be ranked in ascending order. pct : bool, default False @@ -8728,20 +8728,22 @@ def ranker(data): other : DataFrame or Series join : {'outer', 'inner', 'left', 'right'}, default 'outer' axis : allowed axis of the other object, default None - Align on index (0), columns (1), or both (None) + Align on index (0), columns (1), or both (None). level : int or level name, default None Broadcast across a level, matching Index values on the - passed MultiIndex level + passed MultiIndex level. copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. fill_value : scalar, default np.NaN Value to use for missing values. Defaults to NaN, but can be any - "compatible" value + "compatible" value. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + Method to use for filling holes in reindexed Series: + + - pad / ffill: propagate last valid observation forward to next valid. + - backfill / bfill: use NEXT valid observation to fill gap. + limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is @@ -8750,10 +8752,10 @@ def ranker(data): maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. fill_axis : %(axes_single_arg)s, default 0 - Filling axis, method and limit + Filling axis, method and limit. 
broadcast_axis : %(axes_single_arg)s, default None Broadcast values along this axis, if aligning two objects of - different dimensions + different dimensions. Returns ------- @@ -9409,7 +9411,7 @@ def slice_shift(self, periods=1, axis=0): Parameters ---------- periods : int - Number of periods to move, can be positive or negative + Number of periods to move, can be positive or negative. Returns ------- @@ -9443,12 +9445,12 @@ def tshift(self, periods=1, freq=None, axis=0): Parameters ---------- periods : int - Number of periods to move, can be positive or negative + Number of periods to move, can be positive or negative. freq : DateOffset, timedelta, or str, default None Increment to use from the tseries module - or time rule expressed as a string (e.g. 'EOM') + or time rule expressed as a string (e.g. 'EOM'). axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 - Corresponds to the axis that contains the Index + Corresponds to the axis that contains the Index. Returns ------- @@ -9719,9 +9721,9 @@ def tz_localize( axis : the axis to localize level : int, str, default None If axis ia a MultiIndex, localize a specific level. Otherwise - must be None + must be None. copy : bool, default True - Also make a copy of the underlying data + Also make a copy of the underlying data. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. For example in Central European Time (UTC+01), when going from @@ -9737,7 +9739,7 @@ def tz_localize( ambiguous times) - 'NaT' will return NaT where there are ambiguous times - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times + times. nonexistent : str, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. Valid values are: @@ -9749,7 +9751,7 @@ def tz_localize( - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + nonexistent times. .. versionadded:: 0.24.0 @@ -10919,10 +10921,10 @@ def _doc_parms(cls): axis : %(axis_descr)s skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result - will be NA + will be NA. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s + particular level, collapsing into a %(name1)s. ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6a2f49cd1470e..477525d7ab272 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -149,13 +149,13 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. name : object - Name to be stored in the index + Name to be stored in the index. dayfirst : bool, default False - If True, parse dates in `data` with the day first order + If True, parse dates in `data` with the day first order. 
yearfirst : bool, default False - If True parse dates in `data` with the year first order + If True parse dates in `data` with the year first order. Attributes ---------- diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c6dce77c4d078..755992c881fe5 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -619,7 +619,7 @@ def insert(self, loc, item): ---------- loc : int item : object - if not either a Python datetime or a numpy integer-like, returned + If not either a Python datetime or a numpy integer-like, returned Index dtype will be object rather than datetime. Returns @@ -722,18 +722,18 @@ def timedelta_range( Parameters ---------- start : str or timedelta-like, default None - Left bound for generating timedeltas + Left bound for generating timedeltas. end : str or timedelta-like, default None - Right bound for generating timedeltas + Right bound for generating timedeltas. periods : int, default None - Number of periods to generate + Number of periods to generate. freq : str or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H' + Frequency strings can have multiples, e.g. '5H'. name : str, default None - Name of the resulting TimedeltaIndex + Name of the resulting TimedeltaIndex. closed : str, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index ff8149cc2e922..539a09f7046ac 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -338,11 +338,11 @@ def _init_dict(self, data, index=None, dtype=None): Parameters ---------- data : dict or dict-like - Data used to populate the new Series + Data used to populate the new Series. index : Index or index-like, default None - index for the new Series: if None, use dict keys + Index for the new Series: if None, use dict keys. dtype : dtype, default None - dtype for the new Series: if None, infer from data + The dtype for the new Series: if None, infer from data. Returns ------- @@ -1337,9 +1337,9 @@ def _set_value(self, label, value, takeable: bool = False): Parameters ---------- label : object - Partial indexing with MultiIndex not allowed + Partial indexing with MultiIndex not allowed. value : object - Scalar value + Scalar value. takeable : interpret the index as indexers, default False Returns @@ -1794,7 +1794,7 @@ def _set_name(self, name, inplace=False): ---------- name : str inplace : bool - whether to modify `self` directly or return a copy + Whether to modify `self` directly or return a copy. """ inplace = validate_bool_kwarg(inplace, "inplace") ser = self if inplace else self.copy() @@ -1937,9 +1937,12 @@ def drop_duplicates(self, keep="first", inplace=False): Parameters ---------- keep : {'first', 'last', ``False``}, default 'first' + Method to handle dropping duplicates: + - 'first' : Drop duplicates except for the first occurrence. - 'last' : Drop duplicates except for the last occurrence. - ``False`` : Drop all duplicates. + inplace : bool, default ``False`` If ``True``, performs operation inplace and returns None. @@ -2015,6 +2018,8 @@ def duplicated(self, keep="first"): Parameters ---------- keep : {'first', 'last', False}, default 'first' + Method to handle dropping duplicates: + - 'first' : Mark duplicates as ``True`` except for the first occurrence. 
- 'last' : Mark duplicates as ``True`` except for the last @@ -2257,10 +2262,9 @@ def round(self, decimals=0, *args, **kwargs): Parameters ---------- - decimals : int - Number of decimal places to round to (default: 0). - If decimals is negative, it specifies the number of - positions to the left of the decimal point. + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. Returns ------- @@ -2294,7 +2298,7 @@ def quantile(self, q=0.5, interpolation="linear"): Parameters ---------- q : float or array-like, default 0.5 (50% quantile) - 0 <= q <= 1, the quantile(s) to compute. + The quantile(s) to compute, which can lie in range: 0 <= q <= 1. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -2356,15 +2360,17 @@ def corr(self, other, method="pearson", min_periods=None): other : Series Series with which to compute the correlation. method : {'pearson', 'kendall', 'spearman'} or callable - * pearson : standard correlation coefficient - * kendall : Kendall Tau correlation coefficient - * spearman : Spearman rank correlation - * callable: callable with input two 1d ndarrays - and returning a float. Note that the returned matrix from corr - will have 1 along the diagonals and will be symmetric - regardless of the callable's behavior - .. versionadded:: 0.24.0 + Method used to compute correlation: + - pearson : Standard correlation coefficient + - kendall : Kendall Tau correlation coefficient + - spearman : Spearman rank correlation + - callable: Callable with input two 1d ndarrays and returning a float. + + .. versionadded:: 0.24.0 + Note that the returned matrix from corr will have 1 along the + diagonals and will be symmetric regardless of the callable's + behavior. min_periods : int, optional Minimum number of observations needed to have a valid result. @@ -2725,10 +2731,10 @@ def _binop(self, other, func, level=None, fill_value=None): func : binary operator fill_value : float or object Value to substitute for NA/null values. If both Series are NA in a - location, the result will be NA regardless of the passed fill value + location, the result will be NA regardless of the passed fill value. level : int or level name, default None Broadcast across a level, matching Index values on the - passed MultiIndex level + passed MultiIndex level. Returns ------- @@ -3308,7 +3314,7 @@ def argsort(self, axis=0, kind="quicksort", order=None): Has no effect but is accepted for compatibility with numpy. kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See np.sort for more - information. 'mergesort' is the only stable algorithm + information. 'mergesort' is the only stable algorithm. order : None Has no effect but is accepted for compatibility with numpy. @@ -3562,7 +3568,7 @@ def reorder_levels(self, order): Parameters ---------- order : list of int representing new level order - (reference level by number or key) + Reference level by number or key. Returns ------- @@ -3763,9 +3769,9 @@ def _gotitem(self, key, ndim, subset=None): ---------- key : string / list of selections ndim : 1,2 - requested ndim of result + Requested ndim of result. subset : object, default None - subset to act on + Subset to act on. 
""" return self @@ -4089,7 +4095,7 @@ def rename(self, index=None, **kwargs): Parameters ---------- index : scalar, hashable sequence, dict-like or function, optional - dict-like or functions are transformations to apply to + Functions or dict-like are transformations to apply to the index. Scalar or hashable sequence-like will alter the ``Series.name`` attribute. diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 888d2ae6f9473..2f2e7234999f2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -57,12 +57,12 @@ def cat_core(list_of_columns: List, sep: str): List of arrays to be concatenated with sep; these arrays may not contain NaNs! sep : string - The separator string for concatenating the columns + The separator string for concatenating the columns. Returns ------- nd.array - The concatenation of list_of_columns with sep + The concatenation of list_of_columns with sep. """ if sep == "": # no need to interleave sep if it is empty @@ -85,12 +85,12 @@ def cat_safe(list_of_columns: List, sep: str): List of arrays to be concatenated with sep; these arrays may not contain NaNs! sep : string - The separator string for concatenating the columns + The separator string for concatenating the columns. Returns ------- nd.array - The concatenation of list_of_columns with sep + The concatenation of list_of_columns with sep. """ try: result = cat_core(list_of_columns, sep) @@ -506,13 +506,18 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): n : int, default -1 (all) Number of replacements to make from start. case : bool, default None + Determines if replace is case sensitive: + - If True, case sensitive (the default if `pat` is a string) - Set to False for case insensitive - - Cannot be set if `pat` is a compiled regex + - Cannot be set if `pat` is a compiled regex. + flags : int, default 0 (no flags) - - re module flags, e.g. re.IGNORECASE - - Cannot be set if `pat` is a compiled regex + Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled + regex. regex : bool, default True + Determines if assumes the passed-in pattern is a regular expression: + - If True, assumes the passed-in pattern is a regular expression. - If False, treats the pattern as a literal string - Cannot be set to False if `pat` is a compiled regex or `repl` is @@ -713,7 +718,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE. + Regex module flags, e.g. re.IGNORECASE. na : default NaN Fill value for missing values. @@ -1681,7 +1686,7 @@ def str_translate(arr, table): Parameters ---------- table : dict - table is a mapping of Unicode ordinals to Unicode ordinals, strings, or + Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or None. Unmapped characters are left untouched. Characters mapped to None are deleted. :meth:`str.maketrans` is a helper function for making translation tables. @@ -2134,11 +2139,12 @@ def _get_series_list(self, others): Parameters ---------- others : Series, DataFrame, np.ndarray, list-like or list-like of - objects that are either Series, Index or np.ndarray (1-dim) + Objects that are either Series, Index or np.ndarray (1-dim). Returns ------- - list : others transformed into list of Series + list of Series + Others transformed into list of Series. """ from pandas import Series, DataFrame @@ -2556,7 +2562,7 @@ def rsplit(self, pat=None, n=-1, expand=False): String to split on. 
pat : str, default whitespace .. deprecated:: 0.24.0 - Use ``sep`` instead + Use ``sep`` instead. expand : bool, default True If True, return DataFrame/MultiIndex expanding dimensionality. If False, return Series/Index. @@ -2712,13 +2718,13 @@ def pad(self, width, side="left", fillchar=" "): ---------- width : int Minimum width of resulting string; additional characters will be filled - with ``fillchar`` + with ``fillchar``. fillchar : str - Additional character for filling, default is whitespace + Additional character for filling, default is whitespace. Returns ------- - filled : Series/Index of objects + filled : Series/Index of objects. """ @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) @@ -2754,7 +2760,7 @@ def zfill(self, width): Returns ------- - Series/Index of objects + Series/Index of objects. See Also -------- @@ -2842,7 +2848,7 @@ def encode(self, encoding, errors="strict"): Returns ------- - Series/Index of objects + Series or Index of object See Also -------- @@ -2967,15 +2973,15 @@ def extractall(self, pat, flags=0): Parameters ---------- sub : str - Substring being searched + Substring being searched. start : int - Left edge index + Left edge index. end : int - Right edge index + Right edge index. Returns ------- - found : Series/Index of integer values + Series or Index of int. See Also -------- @@ -3018,7 +3024,7 @@ def normalize(self, form): Parameters ---------- form : {'NFC', 'NFKC', 'NFD', 'NFKD'} - Unicode form + Unicode form. Returns ------- @@ -3041,15 +3047,15 @@ def normalize(self, form): Parameters ---------- sub : str - Substring being searched + Substring being searched. start : int - Left edge index + Left edge index. end : int - Right edge index + Right edge index. Returns ------- - found : Series/Index of objects + Series or Index of object See Also -------- @@ -3147,7 +3153,7 @@ def rindex(self, sub, start=0, end=None): Returns ------- - Series/Index of objects + Series or Index of object See Also -------- diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index d7b0839ec62ea..eaf5b336bb8f6 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -36,28 +36,28 @@ def hist_series( Parameters ---------- by : object, optional - If passed, then used to form histograms for separate groups + If passed, then used to form histograms for separate groups. ax : matplotlib axis object - If not passed, uses gca() + If not passed, uses gca(). grid : bool, default True - Whether to show axis grid lines + Whether to show axis grid lines. xlabelsize : int, default None - If specified changes the x-axis label size + If specified changes the x-axis label size. xrot : float, default None - rotation of x axis labels + Rotation of x axis labels. ylabelsize : int, default None - If specified changes the y-axis label size + If specified changes the y-axis label size. yrot : float, default None - rotation of y axis labels + Rotation of y axis labels. figsize : tuple, default None - figure size in inches by default + Figure size in inches by default. bins : int or sequence, default 10 Number of histogram bins to be used. If an integer is given, bins + 1 bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. **kwargs - To be passed to the actual plotting function + To be passed to the actual plotting function. 
Returns ------- @@ -441,27 +441,28 @@ def boxplot_frame_groupby( grouped : Grouped DataFrame subplots : bool * ``False`` - no subplots will be used - * ``True`` - create a subplot for each group + * ``True`` - create a subplot for each group. + column : column name or list of names, or vector - Can be any valid input to groupby + Can be any valid input to groupby. fontsize : int or str rot : label rotation angle grid : Setting this to True will show the grid ax : Matplotlib axis object, default None figsize : A tuple (width, height) in inches layout : tuple (optional) - (rows, columns) for the layout of the plot + The layout of the plot: (rows, columns). sharex : bool, default False - Whether x-axes will be shared among subplots + Whether x-axes will be shared among subplots. .. versionadded:: 0.23.1 sharey : bool, default True - Whether y-axes will be shared among subplots + Whether y-axes will be shared among subplots. .. versionadded:: 0.23.1 **kwargs All other plotting keyword arguments to be passed to - matplotlib's boxplot function + matplotlib's boxplot function. Returns ------- @@ -507,7 +508,7 @@ class PlotAccessor(PandasObject): Parameters ---------- data : Series or DataFrame - The object for which the method is called + The object for which the method is called. x : label or position, default None Only used if data is a DataFrame. y : label, position or list of label, positions, default None @@ -526,30 +527,31 @@ class PlotAccessor(PandasObject): - 'area' : area plot - 'pie' : pie plot - 'scatter' : scatter plot - - 'hexbin' : hexbin plot + - 'hexbin' : hexbin plot. + figsize : a tuple (width, height) in inches use_index : bool, default True - Use index as ticks for x axis + Use index as ticks for x axis. title : str or list Title to use for the plot. If a string is passed, print the string at the top of the figure. If a list is passed and `subplots` is True, print each item in the list above the corresponding subplot. grid : bool, default None (matlab style default) - Axis grid lines - legend : False/True/'reverse' - Place legend on axis subplots + Axis grid lines. + legend : bool or {'reverse'} + Place legend on axis subplots. style : list or dict - The matplotlib line style per column + The matplotlib line style per column. logx : bool or 'sym', default False - Use log scaling or symlog scaling on x axis + Use log scaling or symlog scaling on x axis. .. versionchanged:: 0.25.0 logy : bool or 'sym' default False - Use log scaling or symlog scaling on y axis + Use log scaling or symlog scaling on y axis. .. versionchanged:: 0.25.0 loglog : bool or 'sym', default False - Use log scaling or symlog scaling on both x and y axes + Use log scaling or symlog scaling on both x and y axes. .. versionchanged:: 0.25.0 xticks : sequence @@ -560,7 +562,7 @@ class PlotAccessor(PandasObject): ylim : 2-tuple/list rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal - plots) + plots). fontsize : int, default None Font size for xticks and yticks. colormap : str or matplotlib colormap object, default None @@ -568,11 +570,11 @@ class PlotAccessor(PandasObject): name from matplotlib. colorbar : bool, optional If True, plot colorbar (only relevant for 'scatter' and 'hexbin' - plots) + plots). position : float Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 - (center) + (center). 
table : bool, Series or DataFrame, default False If True, draw a table using the data in the DataFrame and the data will be transposed to meet matplotlib's default layout. @@ -585,7 +587,7 @@ class PlotAccessor(PandasObject): Equivalent to yerr. mark_right : bool, default True When using a secondary_y axis, automatically mark the column - labels with "(right)" in the legend + labels with "(right)" in the legend. include_bool : bool, default is False If True, boolean values can be plotted. **kwargs diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 84b00d7f4907f..c0f6777fdb62b 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -229,7 +229,7 @@ def __add__(date): - minute - second - microsecond - - nanosecond + - nanosecond. See Also -------- @@ -1682,7 +1682,7 @@ class WeekOfMonth(_WeekOfMonthMixin, DateOffset): - 3 is Thursday - 4 is Friday - 5 is Saturday - - 6 is Sunday + - 6 is Sunday. """ _prefix = "WOM" @@ -1758,7 +1758,7 @@ class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): - 3 is Thursday - 4 is Friday - 5 is Saturday - - 6 is Sunday + - 6 is Sunday. """ _prefix = "LWOM" @@ -2078,7 +2078,7 @@ class FY5253(DateOffset): - 3 is Thursday - 4 is Friday - 5 is Saturday - - 6 is Sunday + - 6 is Sunday. startingMonth : int {1, 2, ... 12}, default 1 The month in which the fiscal year ends. @@ -2296,7 +2296,7 @@ class FY5253Quarter(DateOffset): - 3 is Thursday - 4 is Friday - 5 is Saturday - - 6 is Sunday + - 6 is Sunday. startingMonth : int {1, 2, ..., 12}, default 1 The month in which fiscal years end. From 2f80feb55dc89a70bc8812904f28988f50f336f2 Mon Sep 17 00:00:00 2001 From: Nathan Abel Date: Mon, 14 Oct 2019 12:05:36 -0400 Subject: [PATCH 055/119] BUG: Partial fix for docstring validation for parameters (#28765) --- scripts/tests/test_validate_docstrings.py | 19 ++++++++++++++++++ scripts/validate_docstrings.py | 24 ++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index b1b5be6d4faeb..1506acc95edf9 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -1,3 +1,4 @@ +import functools import io import random import string @@ -68,6 +69,23 @@ def sample(self): """ return random.random() + @functools.lru_cache(None) + def decorated_sample(self, max): + """ + Generate and return a random integer between 0 and max. + + Parameters + ---------- + max : int + The maximum value of the random number. + + Returns + ------- + int + Random number generated. + """ + return random.randint(0, max) + def random_letters(self): """ Generate and return a sequence of random letters. 
@@ -870,6 +888,7 @@ def test_good_class(self, capsys): "plot", "swap", "sample", + "decorated_sample", "random_letters", "sample_values", "head", diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 50b02c0fcbaf5..1d0f4b583bd0c 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -430,6 +430,17 @@ def doc_parameters(self): @property def signature_parameters(self): + def add_stars(param_name: str, info: inspect.Parameter): + """ + Add stars to *args and **kwargs parameters + """ + if info.kind == inspect.Parameter.VAR_POSITIONAL: + return f"*{param_name}" + elif info.kind == inspect.Parameter.VAR_KEYWORD: + return f"**{param_name}" + else: + return param_name + if inspect.isclass(self.obj): if hasattr(self.obj, "_accessors") and ( self.name.split(".")[-1] in self.obj._accessors @@ -437,17 +448,16 @@ def signature_parameters(self): # accessor classes have a signature but don't want to show this return tuple() try: - sig = inspect.getfullargspec(self.obj) + sig = inspect.signature(self.obj) except (TypeError, ValueError): # Some objects, mainly in C extensions do not support introspection # of the signature return tuple() - params = sig.args - if sig.varargs: - params.append("*" + sig.varargs) - if sig.varkw: - params.append("**" + sig.varkw) - params = tuple(params) + + params = tuple( + add_stars(parameter, sig.parameters[parameter]) + for parameter in sig.parameters + ) if params and params[0] in ("self", "cls"): return params[1:] return params From 5b0bf235ee4050d6385932b5a63ed692b6d268fc Mon Sep 17 00:00:00 2001 From: Hugh Kelley <38143549+HughKelley@users.noreply.github.com> Date: Mon, 14 Oct 2019 17:48:15 -0400 Subject: [PATCH 056/119] DOC: fix formatting in the ExtensionArray docstrings (#28686) --- pandas/core/arrays/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7a16c3f6a35b6..53755695c97e3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -474,7 +474,7 @@ def fillna(self, value=None, method=None, limit=None): method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + backfill / bfill: use NEXT valid observation to fill gap. limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is @@ -485,7 +485,8 @@ def fillna(self, value=None, method=None, limit=None): Returns ------- - filled : ExtensionArray with NA/NaN filled + ExtensionArray + With NA/NaN filled. """ value, method = validate_fillna_kwargs(value, method) @@ -539,13 +540,14 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArra fill_value : object, optional The scalar value to use for newly introduced missing values. - The default is ``self.dtype.na_value`` + The default is ``self.dtype.na_value``. .. versionadded:: 0.24.0 Returns ------- - shifted : ExtensionArray + ExtensionArray + Shifted. Notes ----- @@ -869,11 +871,12 @@ def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: Parameters ---------- dtype : str, np.dtype, or ExtensionDtype, optional - Default None + Default None. Returns ------- ExtensionArray + A view of the :class:`ExtensionArray`. 
""" # NB: # - This must return a *new* object referencing the same data, not self. From 8c5941cd57e260f4c3552769e79d8d6dbd6283d9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 14 Oct 2019 19:33:47 -0400 Subject: [PATCH 057/119] BUG: use EA.astype in ExtensionBlock.to_native_types (#28841) --- doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/core/internals/blocks.py | 19 +++- pandas/tests/extension/list/__init__.py | 3 + pandas/tests/extension/list/array.py | 133 +++++++++++++++++++++++ pandas/tests/extension/list/test_list.py | 30 +++++ 5 files changed, 185 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/extension/list/__init__.py create mode 100644 pandas/tests/extension/list/array.py create mode 100644 pandas/tests/extension/list/test_list.py diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 9789c9fce3541..fcb6fc8f347bd 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -64,7 +64,7 @@ I/O - Fix regression in notebook display where tags not used for :attr:`DataFrame.index` (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) -- +- Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`). - Plotting diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b76cb5cbec626..1495be1f26df5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -687,7 +687,6 @@ def _try_coerce_args(self, other): def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ - values = self.get_values() if slicer is not None: @@ -1783,6 +1782,23 @@ def get_values(self, dtype=None): def to_dense(self): return np.asarray(self.values) + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): + """override to use ExtensionArray astype for the conversion""" + values = self.values + if slicer is not None: + values = values[slicer] + mask = isna(values) + + try: + values = values.astype(str) + values[mask] = na_rep + except Exception: + # eg SparseArray does not support setitem, needs to be converted to ndarray + return super().to_native_types(slicer, na_rep, quoting, **kwargs) + + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ Take values according to indexer and return them as a block. @@ -2265,6 +2281,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_extension = True _can_hold_element = DatetimeBlock._can_hold_element + to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") @property diff --git a/pandas/tests/extension/list/__init__.py b/pandas/tests/extension/list/__init__.py new file mode 100644 index 0000000000000..108f1937d07d3 --- /dev/null +++ b/pandas/tests/extension/list/__init__.py @@ -0,0 +1,3 @@ +from .array import ListArray, ListDtype, make_data + +__all__ = ["ListArray", "ListDtype", "make_data"] diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py new file mode 100644 index 0000000000000..0ca9fadb68829 --- /dev/null +++ b/pandas/tests/extension/list/array.py @@ -0,0 +1,133 @@ +""" +Test extension array for storing nested data in a pandas container. 
+
+The ListArray stores an ndarray of lists.
+"""
+import numbers
+import random
+import string
+
+import numpy as np
+
+from pandas.core.dtypes.base import ExtensionDtype
+
+import pandas as pd
+from pandas.core.arrays import ExtensionArray
+
+
+class ListDtype(ExtensionDtype):
+    type = list
+    name = "list"
+    na_value = np.nan
+
+    @classmethod
+    def construct_array_type(cls):
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ListArray
+
+    @classmethod
+    def construct_from_string(cls, string):
+        if string == cls.name:
+            return cls()
+        else:
+            raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string))
+
+
+class ListArray(ExtensionArray):
+    dtype = ListDtype()
+    __array_priority__ = 1000
+
+    def __init__(self, values, dtype=None, copy=False):
+        if not isinstance(values, np.ndarray):
+            raise TypeError("Need to pass a numpy array as values")
+        for val in values:
+            if not isinstance(val, self.dtype.type) and not pd.isna(val):
+                raise TypeError("All values must be of type " + str(self.dtype.type))
+        self.data = values
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        data = np.empty(len(scalars), dtype=object)
+        data[:] = scalars
+        return cls(data)
+
+    def __getitem__(self, item):
+        if isinstance(item, numbers.Integral):
+            return self.data[item]
+        else:
+            # slice, list-like, mask
+            return type(self)(self.data[item])
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def isna(self):
+        return np.array(
+            [not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool
+        )
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        # re-implement here, since NumPy has trouble setting
+        # sized objects like UserDicts into scalar slots of
+        # an ndarray.
+        indexer = np.asarray(indexer)
+        msg = (
+            "Index is out of bounds or cannot do a "
+            "non-empty take from an empty array."
+        )
+
+        if allow_fill:
+            if fill_value is None:
+                fill_value = self.dtype.na_value
+            # bounds check
+            if (indexer < -1).any():
+                raise ValueError
+            try:
+                output = [
+                    self.data[loc] if loc != -1 else fill_value for loc in indexer
+                ]
+            except IndexError:
+                raise IndexError(msg)
+        else:
+            try:
+                output = [self.data[loc] for loc in indexer]
+            except IndexError:
+                raise IndexError(msg)
+
+        return self._from_sequence(output)
+
+    def copy(self):
+        return type(self)(self.data[:])
+
+    def astype(self, dtype, copy=True):
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            if copy:
+                return self.copy()
+            return self
+        elif pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype(
+            dtype
+        ):
+            # numpy has problems with astype(str) for nested elements
+            return np.array([str(x) for x in self.data], dtype=dtype)
+        return np.array(self.data, dtype=dtype, copy=copy)
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        data = np.concatenate([x.data for x in to_concat])
+        return cls(data)
+
+
+def make_data():
+    # TODO: Use a regular dict. 
See _NDFrameIndexer._setitem_with_indexer + data = np.empty(100, dtype=object) + data[:] = [ + [random.choice(string.ascii_letters) for _ in range(random.randint(0, 10))] + for _ in range(100) + ] + return data diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py new file mode 100644 index 0000000000000..c5c4417155562 --- /dev/null +++ b/pandas/tests/extension/list/test_list.py @@ -0,0 +1,30 @@ +import pytest + +import pandas as pd + +from .array import ListArray, ListDtype, make_data + + +@pytest.fixture +def dtype(): + return ListDtype() + + +@pytest.fixture +def data(): + """Length-100 ListArray for semantics test.""" + data = make_data() + + while len(data[0]) == len(data[1]): + data = make_data() + + return ListArray(data) + + +def test_to_csv(data): + # https://github.com/pandas-dev/pandas/issues/28840 + # array with list-likes fail when doing astype(str) on the numpy array + # which was done in to_native_types + df = pd.DataFrame({"a": data}) + res = df.to_csv() + assert str(data[0]) in res From 97c65671c5995d069616d56efa22a2fc899d9358 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2019 06:31:51 -0500 Subject: [PATCH 058/119] Document 3.8 compatibility (#28982) --- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v0.25.2.rst | 4 ++++ setup.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index fc99b458fa0af..7d1150c2f65fa 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.5.3 and above, 3.6, and 3.7. +Officially Python 3.5.3 and above, 3.6, 3.7, and 3.8. Installing pandas ----------------- diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index fcb6fc8f347bd..73a5ac5f840be 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -6,6 +6,10 @@ What's new in 0.25.2 (October XX, 2019) These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog including other versions of pandas. +.. note:: + + Pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). + .. 
_whatsnew_0252.bug_fixes: Bug fixes diff --git a/setup.py b/setup.py index 04aedcb101e25..c35a0e75ecb80 100755 --- a/setup.py +++ b/setup.py @@ -228,6 +228,7 @@ def build_extensions(self): "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Programming Language :: Cython", "Topic :: Scientific/Engineering", ] From 80412cc309464aac46188b0979826661dcdc83ed Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 15 Oct 2019 04:38:52 -0700 Subject: [PATCH 059/119] CI: 3.8 build (#28730) --- .travis.yml | 13 +++++++++++-- ci/build38.sh | 25 +++++++++++++++++++++++++ ci/setup_env.sh | 4 ++++ 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 ci/build38.sh diff --git a/.travis.yml b/.travis.yml index 79fecc41bec0d..b9fa06304d387 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,12 @@ matrix: - python: 3.5 include: + - dist: bionic + # 18.04 + python: 3.8-dev + env: + - JOB="3.8-dev" PATTERN="(not slow and not network)" + - dist: trusty env: - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)" @@ -71,6 +77,7 @@ before_install: # This overrides travis and tells it to look nowhere. - export BOTO_CONFIG=/dev/null + install: - echo "install start" - ci/prep_cython_cache.sh @@ -78,17 +85,19 @@ install: - ci/submit_cython_cache.sh - echo "install done" + before_script: # display server (for clipboard functionality) needs to be started here, # does not work if done in install:setup_env.sh (GH-26103) - export DISPLAY=":99.0" - echo "sh -e /etc/init.d/xvfb start" - - sh -e /etc/init.d/xvfb start + - if [ "$JOB" != "3.8-dev" ]; then sh -e /etc/init.d/xvfb start; fi - sleep 3 script: - echo "script start" - - source activate pandas-dev + - echo "$JOB" + - if [ "$JOB" != "3.8-dev" ]; then source activate pandas-dev; fi - ci/run_tests.sh after_script: diff --git a/ci/build38.sh b/ci/build38.sh new file mode 100644 index 0000000000000..5c798c17301e0 --- /dev/null +++ b/ci/build38.sh @@ -0,0 +1,25 @@ +#!/bin/bash -e +# Special build for python3.8 until numpy puts its own wheels up + +sudo apt-get install build-essential gcc xvfb +pip install --no-deps -U pip wheel setuptools +pip install python-dateutil pytz cython pytest pytest-xdist hypothesis + +# Possible alternative for getting numpy: +# pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy +git clone https://github.com/numpy/numpy +cd numpy +python setup.py build_ext --inplace +python setup.py install +cd .. +rm -rf numpy + +python setup.py build_ext -inplace +python -m pip install --no-build-isolation -e . + +python -c "import sys; print(sys.version_info)" +python -c "import pandas as pd" +python -c "import hypothesis" + +# TODO: Is there anything else in setup_env that we really want to do? 
+# ci/setup_env.sh diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 382491a947488..be8c3645691fe 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,5 +1,9 @@ #!/bin/bash -e +if [ "$JOB" == "3.8-dev" ]; then + /bin/bash ci/build38.sh + exit 0 +fi # edit the locale file if needed if [ -n "$LOCALE_OVERRIDE" ]; then From c65cfb6ac0fa7a350738edfeea968eee685f6cce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 15 Oct 2019 05:08:41 -0700 Subject: [PATCH 060/119] remove unnecessary get_value_at calls (#28977) --- pandas/_libs/index.pyx | 30 ++++++++++++++++++++---------- pandas/_libs/lib.pyx | 16 +++++++++++----- pandas/_libs/reduction.pyx | 2 +- pandas/core/sorting.py | 4 ++-- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 979dad6db0838..22f7104debf10 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -41,11 +41,13 @@ cdef inline bint is_definitely_invalid_key(object val): cpdef get_value_at(ndarray arr, object loc, object tz=None): + obj = util.get_value_at(arr, loc) + if arr.descr.type_num == NPY_DATETIME: - return Timestamp(util.get_value_at(arr, loc), tz=tz) + return Timestamp(obj, tz=tz) elif arr.descr.type_num == NPY_TIMEDELTA: - return Timedelta(util.get_value_at(arr, loc)) - return util.get_value_at(arr, loc) + return Timedelta(obj) + return obj # Don't populate hash tables in monotonic indexes larger than this @@ -102,6 +104,9 @@ cdef class IndexEngine: arr[loc] = value cpdef get_loc(self, object val): + cdef: + Py_ssize_t loc + if is_definitely_invalid_key(val): raise TypeError("'{val}' is an invalid key".format(val=val)) @@ -114,7 +119,7 @@ cdef class IndexEngine: loc = _bin_search(values, val) # .searchsorted(val, side='left') if loc >= len(values): raise KeyError(val) - if util.get_value_at(values, loc) != val: + if values[loc] != val: raise KeyError(val) return loc @@ -352,22 +357,22 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval - if hi == 0 or (hi > 0 and val > util.get_value_at(values, hi)): + if hi == 0 or (hi > 0 and val > values[hi]): return len(values) while lo < hi: mid = (lo + hi) // 2 - pval = util.get_value_at(values, mid) + pval = values[mid] if val < pval: hi = mid elif val > pval: lo = mid + 1 else: - while mid > 0 and val == util.get_value_at(values, mid - 1): + while mid > 0 and val == values[mid - 1]: mid -= 1 return mid - if val <= util.get_value_at(values, mid): + if val <= values[mid]: return mid else: return mid + 1 @@ -387,13 +392,16 @@ cdef class DatetimeEngine(Int64Engine): return 'M8[ns]' def __contains__(self, object val): + cdef: + int64_t loc + if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: return self._get_loc_duplicates(val) values = self._get_index_values() conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') - return util.get_value_at(values, loc) == conv + return values[loc] == conv self._ensure_mapping_populated() return maybe_datetimelike_to_i8(val) in self.mapping @@ -405,6 +413,8 @@ cdef class DatetimeEngine(Int64Engine): return algos.is_monotonic(values, timelike=True) cpdef get_loc(self, object val): + cdef: + int64_t loc if is_definitely_invalid_key(val): raise TypeError @@ -422,7 +432,7 @@ cdef class DatetimeEngine(Int64Engine): self._date_check_type(val) raise KeyError(val) - if loc == len(values) or util.get_value_at(values, loc) != conv: + if loc == len(values) or values[loc] != conv: 
            raise KeyError(val)

         return loc
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 1c2f80b832201..a3a50644e58f3 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -782,8 +782,16 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups):
     return starts, ends


-def indices_fast(object index, const int64_t[:] labels, list keys,
+def indices_fast(ndarray index, const int64_t[:] labels, list keys,
                  list sorted_labels):
+    """
+    Parameters
+    ----------
+    index : ndarray
+    labels : ndarray[int64]
+    keys : list
+    sorted_labels : list[ndarray[int64]]
+    """
     cdef:
         Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
         dict result = {}
@@ -803,8 +811,7 @@ def indices_fast(object index, const int64_t[:] labels, list keys,
         if lab != -1:
             tup = PyTuple_New(k)
             for j in range(k):
-                val = util.get_value_at(keys[j],
-                                        sorted_labels[j][i - 1])
+                val = keys[j][sorted_labels[j][i - 1]]
                 PyTuple_SET_ITEM(tup, j, val)
                 Py_INCREF(val)

@@ -814,8 +821,7 @@ def indices_fast(object index, const int64_t[:] labels, list keys,

         tup = PyTuple_New(k)
         for j in range(k):
-            val = util.get_value_at(keys[j],
-                                    sorted_labels[j][n - 1])
+            val = keys[j][sorted_labels[j][n - 1]]
             PyTuple_SET_ITEM(tup, j, val)
             Py_INCREF(val)
         result[tup] = index[start:]
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 34eb9412451c5..0eac0e94f0beb 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -121,7 +121,7 @@ cdef class Reducer:

         for i in range(self.nresults):

             if has_ndarray_labels:
-                name = util.get_value_at(labels, i)
+                name = labels[i]
             elif has_labels:
                 # labels is an ExtensionArray
                 name = labels[i]
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index e6edad656d430..94810369785d3 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -303,8 +303,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels):


 def get_indexer_dict(label_list, keys):
-    """ return a diction of {labels} -> {indexers} """
-    shape = list(map(len, keys))
+    """ return a dict of {labels} -> {indexers} """
+    shape = [len(x) for x in keys]

     group_index = get_group_index(label_list, shape, sort=True, xnull=True)
     ngroups = (
From 3eca50528ceabe56b3181b9e37fa918e96db0981 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 15 Oct 2019 05:09:05 -0700
Subject: [PATCH 061/119] add types to rank_1d_, rank_2d (#28978)

---
 pandas/_libs/algos_rank_helper.pxi.in | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
index 5dac94394c7ed..1ba1667b687be 100644
--- a/pandas/_libs/algos_rank_helper.pxi.in
+++ b/pandas/_libs/algos_rank_helper.pxi.in
@@ -24,7 +24,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),

 @cython.wraparound(False)
 @cython.boundscheck(False)
-def rank_1d_{{dtype}}(object in_arr, ties_method='average',
+def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
                       ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
@@ -189,7 +189,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
     return ranks


-def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
+def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
                       ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
@@ -226,12 +226,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',

     keep_na = na_option == 'keep'

-    in_arr = np.asarray(in_arr)
-
     if axis == 0:
-        values = in_arr.T.copy()
+        values = np.asarray(in_arr).T.copy()
     else:
-        values = in_arr.copy()
+        values = np.asarray(in_arr).copy()

     {{if dtype == 'object'}}
     if values.dtype != np.object_:
From 3b343680eae6d677bb37900171b02fa195a5f42f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 15 Oct 2019 05:18:36 -0700
Subject: [PATCH 062/119] CLN: Exception in _libs (#28967)

---
 pandas/_libs/intervaltree.pxi.in  | 4 ++--
 pandas/_libs/lib.pyx              | 5 +++--
 pandas/_libs/tslibs/parsing.pyx   | 2 +-
 pandas/_libs/tslibs/timezones.pyx | 7 ++-----
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in
index ac713a928973f..08bfaf21db9fb 100644
--- a/pandas/_libs/intervaltree.pxi.in
+++ b/pandas/_libs/intervaltree.pxi.in
@@ -158,7 +158,7 @@ cdef class IntervalTree(IntervalMixin):

         # TODO: write get_indexer_intervals
         cdef:
-            size_t old_len
+            Py_ssize_t old_len
             Py_ssize_t i
             Int64Vector result

@@ -179,7 +179,7 @@ cdef class IntervalTree(IntervalMixin):
         the given array of scalar targets. Non-unique positions are repeated.
         """
         cdef:
-            size_t old_len
+            Py_ssize_t old_len
             Py_ssize_t i
             Int64Vector result, missing
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index a3a50644e58f3..c4cb2556334ed 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2072,7 +2072,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
                     floats[i] = float(val)
                     complexes[i] = complex(val)
                     seen.float_ = 1
-                except Exception:
+                except (ValueError, TypeError):
                     seen.object_ = 1
                     break
             else:
@@ -2352,7 +2352,8 @@ def to_object_array_tuples(rows: object):
             row = rows[i]
             for j in range(len(row)):
                 result[i, j] = row[j]
-    except Exception:
+    except TypeError:
+        # e.g. "Expected tuple, got list"
         # upcast any subclasses to tuple
         for i in range(n):
             row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 33665484311ba..bf0a0ae5a3fe9 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -581,7 +581,7 @@ def try_parse_dates(object[:] values, parser=None,
             else:
                 result[i] = parse_date(values[i])
         except Exception:
-            # failed
+            # Since parser is user-defined, we can't guess what it might raise
             return values
     else:
         parse_date = parser
diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
index cbfbc14c35b35..bc1fdfae99de9 100644
--- a/pandas/_libs/tslibs/timezones.pyx
+++ b/pandas/_libs/tslibs/timezones.pyx
@@ -226,11 +226,8 @@ cdef object get_dst_info(object tz):
     if treat_tz_as_pytz(tz):
         trans = np.array(tz._utc_transition_times, dtype='M8[ns]')
         trans = trans.view('i8')
-        try:
-            if tz._utc_transition_times[0].year == 1:
-                trans[0] = NPY_NAT + 1
-        except Exception:
-            pass
+        if tz._utc_transition_times[0].year == 1:
+            trans[0] = NPY_NAT + 1
         deltas = unbox_utcoffsets(tz._transition_info)
         typ = 'pytz'

From 125739fe041a3b80bcc81f5dbdf2da12d3021465 Mon Sep 17 00:00:00 2001
From: Aaditya Panikath
Date: Tue, 15 Oct 2019 17:50:15 +0530
Subject: [PATCH 063/119] CLN: pandas-dev#28926 Fix
 pandas/tests/tseries/offsets/test_offsets_properties (#28972)

---
 pandas/tests/tseries/offsets/test_offsets_properties.py | 4 ++--
 setup.cfg                                               | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py
index 880ff1f137520..a05de78e299f7 100644
--- 
a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -36,8 +36,8 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore") - min_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) - max_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) + min_dt = pd.Timestamp(1900, 1, 1).to_pydatetime() + max_dt = pd.Timestamp(1900, 1, 1).to_pydatetime() gen_date_range = st.builds( pd.date_range, diff --git a/setup.cfg b/setup.cfg index 64494bf84363e..775999bc21b97 100644 --- a/setup.cfg +++ b/setup.cfg @@ -226,8 +226,5 @@ ignore_errors=True [mypy-pandas.tests.tseries.offsets.test_offsets] ignore_errors=True -[mypy-pandas.tests.tseries.offsets.test_offsets_properties] -ignore_errors=True - [mypy-pandas.tests.tseries.offsets.test_yqm_offsets] ignore_errors=True From 6dc53bb4e903d6e21dc9a0f0a2e97809b053a095 Mon Sep 17 00:00:00 2001 From: Aaditya Panikath Date: Tue, 15 Oct 2019 18:00:27 +0530 Subject: [PATCH 064/119] CLN: pandas-dev#28926 Fix mypy errors in pandas/tests/io/parser/conftest.py (#28973) --- pandas/tests/io/parser/conftest.py | 9 +++++---- setup.cfg | 3 --- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 2c347a096006a..183ad500b15f3 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,4 +1,5 @@ import os +from typing import List, Optional import pytest @@ -6,9 +7,9 @@ class BaseParser: - engine = None + engine = None # type: Optional[str] low_memory = True - float_precision_choices = [] + float_precision_choices = [] # type: List[Optional[str]] def update_kwargs(self, kwargs): kwargs = kwargs.copy() @@ -59,11 +60,11 @@ def csv1(csv_dir_path): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = _c_parsers_only + _py_parsers_only +_all_parsers = [*_c_parsers_only, *_py_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = _c_parser_ids + _py_parser_ids +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) diff --git a/setup.cfg b/setup.cfg index 775999bc21b97..55d25abde585c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -205,9 +205,6 @@ ignore_errors=True [mypy-pandas.tests.io.json.test_ujson] ignore_errors=True -[mypy-pandas.tests.io.parser.conftest] -ignore_errors=True - [mypy-pandas.tests.io.test_sql] ignore_errors=True From 19c8fad56aa3fc8f683e884f125f042143966b47 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2019 08:51:07 -0500 Subject: [PATCH 065/119] DOC: 0.25.2 whatsnew cleanup (#29000) --- doc/source/whatsnew/v0.25.2.rst | 73 ++------------------------------- 1 file changed, 3 insertions(+), 70 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 73a5ac5f840be..a99751f9bab9f 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_0252: -What's new in 0.25.2 (October XX, 2019) +What's new in 0.25.2 (October 15, 2019) --------------------------------------- These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog @@ -15,91 +15,24 @@ including other versions of pandas. 
Bug fixes ~~~~~~~~~ -Categorical -^^^^^^^^^^^ - -- - -Datetimelike -^^^^^^^^^^^^ - -- -- -- - -Timezones -^^^^^^^^^ - -- - -Numeric -^^^^^^^ - -- -- -- -- - -Conversion -^^^^^^^^^^ - -- - -Interval -^^^^^^^^ - -- - Indexing ^^^^^^^^ -- Fix regression in :meth:`DataFrame.reindex` not following ``limit`` argument (:issue:`28631`). +- Fix regression in :meth:`DataFrame.reindex` not following the ``limit`` argument (:issue:`28631`). - Fix regression in :meth:`RangeIndex.get_indexer` for decreasing :class:`RangeIndex` where target values may be improperly identified as missing/present (:issue:`28678`) -- -- - -Missing -^^^^^^^ - -- I/O ^^^ -- Fix regression in notebook display where tags not used for :attr:`DataFrame.index` (:issue:`28204`). +- Fix regression in notebook display where ```` tags were missing for :attr:`DataFrame.index` values (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) - Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`). -- - -Plotting -^^^^^^^^ - -- -- -- Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). - Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) -- -- -- - -Reshaping -^^^^^^^^^ - -- -- -- -- -- - -Sparse -^^^^^^ - -- Other ^^^^^ From 8d3fec931dffcc662fb72c59091d153f3500b314 Mon Sep 17 00:00:00 2001 From: Andreas Buhr Date: Tue, 15 Oct 2019 17:21:58 +0200 Subject: [PATCH 066/119] DOC: fix is_scalar documentation (#28998) (#29004) --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c4cb2556334ed..b13246a4a969c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -137,8 +137,8 @@ def is_scalar(val: object) -> bool: Examples -------- - >>> dt = pd.datetime.datetime(2018, 10, 3) - >>> pd.is_scalar(dt) + >>> dt = datetime.datetime(2018, 10, 3) + >>> pd.api.types.is_scalar(dt) True >>> pd.api.types.is_scalar([2, 3]) From 74bbbb084dc530080da93b96a41114d0598f928b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2019 16:37:40 -0500 Subject: [PATCH 067/119] Setuptools CI fixup (#29010) * CI: Fix miniconda upgrade issues --- ci/setup_env.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/setup_env.sh b/ci/setup_env.sh index be8c3645691fe..794130355fd74 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -55,6 +55,7 @@ echo echo "update conda" conda config --set ssl_verify false conda config --set quiet true --set always_yes true --set changeps1 false +conda install pip # create conda to create a historical artifact for pip & setuptools conda update -n base conda echo "conda info -a" From de67bb72ecf777b2f35fed83cf539b0d4e800a38 Mon Sep 17 00:00:00 2001 From: Aaditya Panikath Date: Wed, 16 Oct 2019 09:17:07 +0530 Subject: [PATCH 068/119] CLN: Fix mypy errors in pandas/tests/io/test_sql.py Reverted and added type changes (#29006) --- pandas/tests/io/test_sql.py | 4 ++-- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7491cef17ebfc..183a47c6039ec 
100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -583,7 +583,7 @@ class _TestSQLApi(PandasSQLTest):
     """

     flavor = "sqlite"
-    mode = None
+    mode = None  # type: str

     def setup_connect(self):
         self.conn = self.connect()
@@ -1234,7 +1234,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest):

     """

-    flavor = None
+    flavor = None  # type: str

     @pytest.fixture(autouse=True, scope="class")
     def setup_class(cls):
diff --git a/setup.cfg b/setup.cfg
index 55d25abde585c..462e79dae1039 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -205,9 +205,6 @@ ignore_errors=True
 [mypy-pandas.tests.io.json.test_ujson]
 ignore_errors=True

-[mypy-pandas.tests.io.test_sql]
-ignore_errors=True
-
 [mypy-pandas.tests.plotting.test_backend]
 ignore_errors=True

From d52850f954a30a30cb9ecb27a603fe35a7161367 Mon Sep 17 00:00:00 2001
From: nrebena <49879400+nrebena@users.noreply.github.com>
Date: Wed, 16 Oct 2019 14:11:16 +0200
Subject: [PATCH 069/119] DOC: Reference level name as Term of HDFStore.select
 query (#28791) (#28793)

---
 doc/source/user_guide/io.rst   | 34 +++++++++++++++++++++++++++++++++-
 doc/source/whatsnew/v1.0.0.rst |  1 +
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index ee097c1f4d5e8..6b23c814843e1 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3811,6 +3811,8 @@ storing/selecting from homogeneous index ``DataFrames``.
    # the levels are automatically included as data columns
    store.select('df_mi', 'foo=bar')

+.. note::
+   The ``index`` keyword is reserved and cannot be used as a level name.

 .. _io.hdf5-query:

@@ -3829,6 +3831,7 @@ A query is specified using the ``Term`` class under the hood, as a boolean expre

 * ``index`` and ``columns`` are supported indexers of ``DataFrames``.
 * if ``data_columns`` are specified, these can be used as additional indexers.
+* level name in a MultiIndex, with default name ``level_0``, ``level_1``, … if not provided.

 Valid comparison operators are:

@@ -3947,7 +3950,7 @@ space. These are in terms of the total number of rows in a table.

 .. _io.hdf5-timedelta:

-Using timedelta64[ns]
+Query timedelta64[ns]
 +++++++++++++++++++++

 You can store and query using the ``timedelta64[ns]`` type. Terms can be
@@ -3966,6 +3969,35 @@ specified in the format: ``()``, where float may be signed (and fra

     store.append('dftd', dftd, data_columns=True)
     store.select('dftd', "C<'-3.5D'")

+Query MultiIndex
+++++++++++++++++
+
+Selecting from a ``MultiIndex`` can be achieved by using the name of the level.
+
+.. ipython:: python
+
+   df_mi.index.names
+   store.select('df_mi', "foo=baz and bar=two")
+
+If the ``MultiIndex`` level names are ``None``, the levels are automatically made available via
+the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to select from.
+
+.. 
ipython:: python + + index = pd.MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + ) + df_mi_2 = pd.DataFrame(np.random.randn(10, 3), + index=index, columns=["A", "B", "C"]) + df_mi_2 + + store.append("df_mi_2", df_mi_2) + + # the levels are automatically included as data columns with keyword level_n + store.select("df_mi_2", "level_0=foo and level_1=two") + + Indexing ++++++++ diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 1112e42489342..9ce3b5be9624f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -162,6 +162,7 @@ Documentation Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added new section on :ref:`scale` (:issue:`28315`). +- Added sub-section Query MultiIndex in IO tools user guide (:issue:`28791`) .. _whatsnew_1000.deprecations: From 0a108f0760fca053fab611fd02822554ed909638 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 05:21:56 -0700 Subject: [PATCH 070/119] CLN: move small bits outside of try/excepts (#28962) --- pandas/_libs/algos_take_helper.pxi.in | 1 - pandas/core/base.py | 8 +++----- pandas/core/groupby/generic.py | 13 +++++++++---- pandas/core/groupby/groupby.py | 12 +++--------- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 3a3adc71875ed..f10061a417c03 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -276,7 +276,6 @@ cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): Py_ssize_t i, j, N, K ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx ndarray[take_t, ndim=2] result - object val N, K = (values).shape diff --git a/pandas/core/base.py b/pandas/core/base.py index e4e14a950c96b..c461a1509ec78 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -267,7 +267,7 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - def _try_aggregate_string_function(self, arg, *args, **kwargs): + def _try_aggregate_string_function(self, arg: str, *args, **kwargs): """ if arg is a string, then try to operate on it: - try to find a function (or attribute) on ourselves @@ -292,12 +292,10 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): f = getattr(np, arg, None) if f is not None: - try: + if hasattr(self, "__array__"): + # in particular exclude Window return f(self, *args, **kwargs) - except (AttributeError, TypeError): - pass - raise AttributeError( "'{arg}' is not a valid function for " "'{cls}' object".format(arg=arg, cls=type(self).__name__) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 068d5e5275f0d..76a3893d3af2a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -952,6 +952,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): if alt is None: # we cannot perform the operation # in an alternate way, exclude the block + assert how == "ohlc" deleted_items.append(locs) continue @@ -1025,17 +1026,20 @@ def _aggregate_frame(self, func, *args, **kwargs): if axis != obj._info_axis_number: try: for name, data in self: - result[name] = self._try_cast(func(data, *args, **kwargs), data) + fres = func(data, *args, **kwargs) + result[name] = self._try_cast(fres, data) except Exception: return self._aggregate_item_by_item(func, *args, **kwargs) else: for name in self.indices: + data = self.get_group(name, obj=obj) try: - data = 
self.get_group(name, obj=obj) - result[name] = self._try_cast(func(data, *args, **kwargs), data) + fres = func(data, *args, **kwargs) except Exception: wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) + else: + result[name] = self._try_cast(fres, data) return self._wrap_frame_output(result, obj) @@ -1410,9 +1414,10 @@ def _transform_item_by_item(self, obj, wrapper): for i, col in enumerate(obj): try: output[col] = self[col].transform(wrapper) - inds.append(i) except Exception: pass + else: + inds.append(i) if len(output) == 0: raise TypeError("Transform function invalid for data types") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cc297629a7004..8461b4381e2ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -598,14 +598,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) def _make_wrapper(self, name): - if name not in self._apply_whitelist: - is_callable = callable(getattr(self._selected_obj, name, None)) - kind = " callable " if is_callable else " " - msg = ( - "Cannot access{0}attribute {1!r} of {2!r} objects, try " - "using the 'apply' method".format(kind, name, type(self).__name__) - ) - raise AttributeError(msg) + assert name in self._apply_whitelist self._set_group_selection() @@ -919,9 +912,10 @@ def _python_agg_general(self, func, *args, **kwargs): for name, obj in self._iterate_slices(): try: result, counts = self.grouper.agg_series(obj, f) - output[name] = self._try_cast(result, obj, numeric_only=True) except TypeError: continue + else: + output[name] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) From c903e5e9ad6541364ff686395266e10a73b565bf Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 16 Oct 2019 05:24:46 -0700 Subject: [PATCH 071/119] CLN: Consistent and Annotated Return Type of _iterate_slices (#28958) --- pandas/core/frame.py | 15 +++++++++++-- pandas/core/groupby/generic.py | 40 ++++++++++++++++++++-------------- pandas/core/groupby/groupby.py | 4 ++-- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79e941f262931..c82d8a25fedba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,7 +14,18 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, List, Optional, Sequence, Set, Tuple, Type, Union +from typing import ( + FrozenSet, + Hashable, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, +) import warnings import numpy as np @@ -861,7 +872,7 @@ def style(self): """ @Appender(_shared_docs["items"]) - def items(self): + def items(self) -> Iterable[Tuple[Hashable, Series]]: if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 76a3893d3af2a..aa817ec451aa5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -11,7 +11,17 @@ from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, FrozenSet, Sequence, Type, Union +from typing import ( + Any, + Callable, + FrozenSet, + Hashable, + Iterable, + Sequence, + Tuple, + Type, + Union, +) import warnings import numpy as np @@ -132,7 +142,7 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist - def _iterate_slices(self): + def 
_iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: yield self._selection_name, self._selected_obj @property @@ -898,22 +908,20 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _iterate_slices(self): - if self.axis == 0: - # kludge - if self._selection is None: - slice_axis = self.obj.columns - else: - slice_axis = self._selection_list - slicer = lambda x: self.obj[x] + def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: + obj = self._selected_obj + if self.axis == 1: + obj = obj.T + + if isinstance(obj, Series) and obj.name not in self.exclusions: + # Occurs when doing DataFrameGroupBy(...)["X"] + yield obj.name, obj else: - slice_axis = self.obj.index - slicer = self.obj.xs + for label, values in obj.items(): + if label in self.exclusions: + continue - for val in slice_axis: - if val in self.exclusions: - continue - yield val, slicer(val) + yield label, values def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): new_items, new_blocks = self._cython_agg_blocks( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8461b4381e2ea..6379e27e55d2e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,7 +14,7 @@ class providing the base-class of operations. import inspect import re import types -from typing import FrozenSet, List, Optional, Tuple, Type, Union +from typing import FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union import numpy as np @@ -751,7 +751,7 @@ def _python_apply_general(self, f): keys, values, not_indexed_same=mutated or self.mutated ) - def _iterate_slices(self): + def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): From b632ca07b4ff65d45c751b02c045051a93a8c537 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 05:25:41 -0700 Subject: [PATCH 072/119] restore xfail (#29013) --- pandas/tests/groupby/test_categorical.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2831c07cb21d3..5391cb5ce821f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas.compat import PY37, is_platform_windows +from pandas.compat import PY37 import pandas as pd from pandas import ( @@ -209,10 +209,9 @@ def test_level_get_group(observed): assert_frame_equal(result, expected) -# GH#21636 previously flaky on py37 -@pytest.mark.xfail( - is_platform_windows() and PY37, reason="Flaky, GH-27902", strict=False -) +# GH#21636 flaky on py37; may be related to older numpy, see discussion +# https://github.com/MacPython/pandas-wheels/pull/64 +@pytest.mark.xfail(PY37, reason="Flaky, GH-27902", strict=False) @pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 @@ -229,6 +228,9 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) + # GH#21636 tracking down the xfail, in some builds np.mean(df.loc[[0]]) + # is coming back as Series([0., 1., 0.], index=["missing", "dense", "values"]) + # when we expect Series(0., index=["values"]) result = grouped.apply(lambda x: np.mean(x)) assert_frame_equal(result, expected) From 79a5f7c61f4ef8ca43d0002b1fc21809d876d1e5 Mon Sep 17 00:00:00 2001 
From: Jinyang Zhou
Date: Wed, 16 Oct 2019 20:29:24 +0800
Subject: [PATCH 073/119] ENH: add encoding argument to to_html (#28692)

---
 doc/source/whatsnew/v1.0.0.rst          |  1 +
 pandas/core/frame.py                    | 11 ++++++++++-
 pandas/io/formats/format.py             |  5 ++++-
 pandas/tests/io/formats/test_to_html.py |  8 ++++++++
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 9ce3b5be9624f..5b5098f7d2426 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -109,6 +109,7 @@ Other enhancements
   (:issue:`28368`)
 - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`)
 - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`)
+- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ASCII text (:issue:`28663`)

 Build Changes
 ^^^^^^^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c82d8a25fedba..64755b2390eaf 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2218,6 +2218,7 @@ def to_html(
         border=None,
         table_id=None,
         render_links=False,
+        encoding=None,
     ):
         """
         Render a DataFrame as an HTML table.
@@ -2233,6 +2234,10 @@ def to_html(
         border : int
             A ``border=border`` attribute is included in the opening
             `<table>` tag. Default ``pd.options.display.html.border``.
+        encoding : str, default "utf-8"
+            Set character encoding.
+
+            .. versionadded:: 1.0
         table_id : str, optional
             A css id is included in the opening `<table>` tag if specified.
@@ -2274,7 +2279,11 @@ def to_html(
         )
         # TODO: a generic formatter would be in DataFrameFormatter
         return formatter.to_html(
-            buf=buf, classes=classes, notebook=notebook, border=border
+            buf=buf,
+            classes=classes,
+            notebook=notebook,
+            border=border,
+            encoding=encoding,
         )

     # ----------------------------------------------------------------------
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index ad62c56a337b6..b8c40e3f62221 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -942,6 +942,7 @@ def _format_col(self, i: int) -> List[str]:
     def to_html(
         self,
         buf: Optional[FilePathOrBuffer[str]] = None,
+        encoding: Optional[str] = None,
         classes: Optional[Union[str, List, Tuple]] = None,
         notebook: bool = False,
         border: Optional[int] = None,
@@ -963,7 +964,9 @@ def to_html(
         from pandas.io.formats.html import HTMLFormatter, NotebookFormatter

         Klass = NotebookFormatter if notebook else HTMLFormatter
-        return Klass(self, classes=classes, border=border).get_result(buf=buf)
+        return Klass(self, classes=classes, border=border).get_result(
+            buf=buf, encoding=encoding
+        )

     def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]:
         from pandas.core.index import _sparsify
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py
index ef19319e208d9..6c4a226b7ebd2 100644
--- a/pandas/tests/io/formats/test_to_html.py
+++ b/pandas/tests/io/formats/test_to_html.py
@@ -99,6 +99,14 @@ def test_to_html_unicode(df, expected, datapath):
     assert result == expected


+def test_to_html_encoding(float_frame, tmp_path):
+    # GH 28663
+    path = tmp_path / "test.html"
+    float_frame.to_html(path, encoding="gbk")
+    with open(str(path), "r", encoding="gbk") as f:
+        assert float_frame.to_html() == f.read()
+
+
 def test_to_html_decimal(datapath):
     # GH 12031
     df = DataFrame({"A": [6.0, 3.1, 2.2]})
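A quick usage sketch for the new keyword (the frame contents and file name below are made up for illustration; the test above exercises the same round trip)::

    import pandas as pd

    df = pd.DataFrame({"名称": ["北京"], "value": [1.5]})
    # Write the HTML representation with an explicit, non-default encoding;
    # leaving encoding unset keeps the "utf-8" default noted in the docstring.
    df.to_html("table_gbk.html", encoding="gbk")

From a0d01b8035757371dd4631800fa9df3f72925e45 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Oct 2019 05:42:47 -0700
Subject: [PATCH 074/119] add uint64 support for some libgroupby funcs (#28931)

---
 pandas/_libs/groupby_helper.pxi.in    | 62 ++++++++++++++++++++++++++-
 pandas/core/groupby/groupby.py        |  8 ++++
 pandas/tests/groupby/test_function.py |  2 +-
 3 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 6b434b6470581..f052feea0bbf3 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -16,6 +16,7 @@ ctypedef fused rank_t:
     float64_t
     float32_t
     int64_t
+    uint64_t
    object


@@ -34,6 +35,7 @@ def group_last(rank_t[:, :] out,
         rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
+        bint runtime_error = False

     assert min_count == -1, "'min_count' only used in add and prod"

@@ -106,11 +108,20 @@ def group_last(rank_t[:, :] out,
             if nobs[i, j] == 0:
                 if rank_t is int64_t:
                     out[i, j] = NPY_NAT
+                elif rank_t is uint64_t:
+                    runtime_error = True
+                    break
                 else:
                     out[i, j] = NAN
             else:
                 out[i, j] = resx[i, j]

+    if runtime_error:
+        # We cannot raise directly above because that is within a nogil
+        # block.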
+ raise RuntimeError("empty group with uint64_t") + + group_last_float64 = group_last["float64_t"] group_last_float32 = group_last["float32_t"] group_last_int64 = group_last["int64_t"] @@ -132,6 +143,7 @@ def group_nth(rank_t[:, :] out, rank_t val ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -199,11 +211,19 @@ def group_nth(rank_t[:, :] out, if nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break else: out[i, j] = NAN else: out[i, j] = resx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + group_nth_float64 = group_nth["float64_t"] group_nth_float32 = group_nth["float32_t"] @@ -282,12 +302,16 @@ def group_rank(float64_t[:, :] out, if ascending ^ (na_option == 'top'): if rank_t is int64_t: nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max else: nan_fill_val = np.inf order = (masked_vals, mask, labels) else: if rank_t is int64_t: nan_fill_val = np.iinfo(np.int64).min + elif rank_t is uint64_t: + nan_fill_val = 0 else: nan_fill_val = -np.inf @@ -397,6 +421,7 @@ def group_rank(float64_t[:, :] out, group_rank_float64 = group_rank["float64_t"] group_rank_float32 = group_rank["float32_t"] group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] # Note: we do not have a group_rank_object because that would require a # not-nogil implementation, see GH#19560 @@ -410,6 +435,7 @@ ctypedef fused groupby_t: float64_t float32_t int64_t + uint64_t @cython.wraparound(False) @@ -426,6 +452,7 @@ def group_max(groupby_t[:, :] out, Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -439,6 +466,11 @@ def group_max(groupby_t[:, :] out, # Note: evaluated at compile-time maxx[:] = -_int64_max nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 else: maxx[:] = -np.inf nan_val = NAN @@ -462,7 +494,7 @@ def group_max(groupby_t[:, :] out, if val > maxx[lab, j]: maxx[lab, j] = val else: - if val == val and val != nan_val: + if val == val: nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -470,10 +502,18 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break out[i, j] = nan_val else: out[i, j] = maxx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + @cython.wraparound(False) @cython.boundscheck(False) @@ -489,6 +529,7 @@ def group_min(groupby_t[:, :] out, Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -501,6 +542,11 @@ def group_min(groupby_t[:, :] out, if groupby_t is int64_t: minx[:] = _int64_max nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. 
We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max else: minx[:] = np.inf nan_val = NAN @@ -524,7 +570,7 @@ def group_min(groupby_t[:, :] out, if val < minx[lab, j]: minx[lab, j] = val else: - if val == val and val != nan_val: + if val == val: nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -532,10 +578,18 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break out[i, j] = nan_val else: out[i, j] = minx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + @cython.boundscheck(False) @cython.wraparound(False) @@ -575,6 +629,8 @@ def group_cummin(groupby_t[:, :] out, accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) if groupby_t is int64_t: accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max else: accum[:] = np.inf @@ -642,6 +698,8 @@ def group_cummax(groupby_t[:, :] out, accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) if groupby_t is int64_t: accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 else: accum[:] = -np.inf diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6379e27e55d2e..92ea733cc3447 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1355,7 +1355,15 @@ def f(self, **kwargs): return self._cython_agg_general(alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) + except DataError: + pass except Exception: + # TODO: the remaining test cases that get here are from: + # - AttributeError from _cython_agg_blocks bug passing + # DataFrame to make_block; see GH#28275 + # - TypeError in _cython_operation calling ensure_float64 + # on object array containing complex numbers; + # see test_groupby_complex, test_max_nan_bug pass # apply a non-cython aggregation diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index afb22a732691c..571e710ba8928 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -378,7 +378,7 @@ def test_median_empty_bins(observed): @pytest.mark.parametrize( - "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"] + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] ) @pytest.mark.parametrize( "method,data", From 86e187f4d06835c98c15674dd3024e449c8f8c8d Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 16 Oct 2019 13:52:54 +0100 Subject: [PATCH 075/119] BUG: CategoricalIndex allowed reindexing duplicate sources (#28257) --- doc/source/user_guide/advanced.rst | 36 ++++++++---- doc/source/whatsnew/v1.0.0.rst | 4 ++ pandas/_libs/index.pyx | 20 +++++-- pandas/core/indexes/base.py | 8 ++- pandas/core/indexes/category.py | 8 --- pandas/tests/indexes/test_category.py | 20 ++++--- pandas/tests/indexing/test_categorical.py | 71 +++++++++++------------ pandas/tests/series/test_operators.py | 35 ++++++----- pandas/tests/test_base.py | 6 ++ pandas/util/testing.py | 4 +- 10 files changed, 126 insertions(+), 86 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 62a9b6396404a..4949dd580414f 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -783,27 +783,41 @@ values **not** in the categories, similarly to how you can reindex 
**any** panda .. ipython:: python - df2.reindex(['a', 'e']) - df2.reindex(['a', 'e']).index - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))) - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index + df3 = pd.DataFrame({'A': np.arange(3), + 'B': pd.Series(list('abc')).astype('category')}) + df3 = df3.set_index('B') + df3 + +.. ipython:: python + + df3.reindex(['a', 'e']) + df3.reindex(['a', 'e']).index + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index .. warning:: Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories or a ``TypeError`` will be raised. - .. code-block:: ipython + .. ipython:: python - In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) + df4 = pd.DataFrame({'A': np.arange(2), + 'B': list('ba')}) + df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) + df4 = df4.set_index('B') + df4.index - In [11]: df3 = df3.set_index('B') + df5 = pd.DataFrame({'A': np.arange(2), + 'B': list('bc')}) + df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) + df5 = df5.set_index('B') + df5.index - In [11]: df3.index - Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category') + .. code-block:: ipython - In [12]: pd.concat([df2, df3]) - TypeError: categories must match existing categories when appending + In [1]: pd.concat([df4, df5]) + TypeError: categories must match existing categories when appending .. _indexing.rangeindex: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b5098f7d2426..7c86ad0f029ed 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -223,6 +223,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) +- :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) - Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`) - Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) @@ -292,6 +293,9 @@ Indexing - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) +- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) +- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) +- Missing ^^^^^^^ diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 22f7104debf10..144d555258c50 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -286,7 +286,7 @@ cdef class IndexEngine: cdef: ndarray values, x ndarray[int64_t] result, missing - set stargets + set 
stargets, remaining_stargets dict d = {} object val int count = 0, count_missing = 0 @@ -309,12 +309,20 @@ cdef class IndexEngine: if stargets and len(stargets) < 5 and self.is_monotonic_increasing: # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget + remaining_stargets = set() for starget in stargets: - start = values.searchsorted(starget, side='left') - end = values.searchsorted(starget, side='right') - if start != end: - d[starget] = list(range(start, end)) - else: + try: + start = values.searchsorted(starget, side='left') + end = values.searchsorted(starget, side='right') + except TypeError: # e.g. if we tried to search for string in int array + remaining_stargets.add(starget) + else: + if start != end: + d[starget] = list(range(start, end)) + + stargets = remaining_stargets + + if stargets: # otherwise, map by iterating through all items in the index for i in range(n): val = values[i] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7dee3a17f8f9e..464cd49f135ae 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2493,8 +2493,12 @@ def _union(self, other, sort): value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + # find indexes of things in "other" that are not in "self" + if self.is_unique: + indexer = self.get_indexer(other) + indexer = (indexer == -1).nonzero()[0] + else: + indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) if len(indexer) > 0: other_diff = algos.take_nd(rvals, indexer, allow_fill=False) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b538c4df00e19..e5a8edb56e413 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -552,10 +552,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - def _can_reindex(self, indexer): - """ always allow reindexing """ - pass - @Substitution(klass="CategoricalIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): @@ -585,7 +581,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): Indices of output values in original index """ - if method is not None: raise NotImplementedError( "argument method is not implemented for CategoricalIndex.reindex" @@ -605,9 +600,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): indexer = None missing = [] else: - if not target.is_unique: - raise ValueError("cannot reindex with a non-unique indexer") - indexer, missing = self.get_indexer_non_unique(np.array(target)) if len(self.codes) and indexer is not None: diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 4326c3f8188fc..8ed7f1a890c39 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -599,15 +599,19 @@ def test_reindex_dtype(self): tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_target(self): - # See GH23963 - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex(["a", "a", "c"]) + # See GH25459 + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) + exp 
= Index(["a", "c", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex( - CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"]) - ) + res, indexer = cat.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) def test_reindex_empty_index(self): # See GH16770 diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c365c985eb4b6..005a9a24dc597 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -561,26 +561,30 @@ def test_read_only_source(self): assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) def test_reindexing(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": Series(list("abc")).astype(CDT(list("cabe"))), + } + ).set_index("B") # reindexing # convert to a regular index - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["d"]) + result = df.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) @@ -588,65 +592,58 @@ def test_reindexing(self): # then return a Categorical cats = list("cabe") - result = self.df2.reindex(Categorical(["a", "d"], categories=cats)) + result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a"], categories=cats)) + result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = 
df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex( - Categorical(["a", "d"], categories=cats, ordered=True) - ) + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( - { - "A": [0, 1, 5, np.nan], - "B": Series(list("aaad")).astype(CDT(cats, ordered=True)), - } + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"])) + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))} + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed - msg = "cannot reindex with a non-unique indexer" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - self.df2.reindex(["a", "a"]) + self.df2.reindex(["a", "b"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): - self.df2.reindex(["a"], method="ffill") + df.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): - self.df2.reindex(["a"], level=1) + df.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): - self.df2.reindex(["a"], limit=2) + df.reindex(["a"], limit=2) def test_loc_slice(self): # slicing diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 467f2c177850a..6bfcc02ca633a 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -7,7 +7,6 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna from pandas.core import ops -from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( @@ -282,13 +281,27 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + def test_reversed_xor_with_index_returns_index(self): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = Index.symmetric_difference(idx1, ser) + result = idx1 ^ ser + assert_index_equal(result, expected) + + expected = Index.symmetric_difference(idx2, ser) + result = idx2 ^ ser + assert_index_equal(result, expected) + @pytest.mark.parametrize( "op", [ pytest.param( ops.rand_, marks=pytest.mark.xfail( - reason="GH#22092 Index implementation returns Index", + reason="GH#22092 Index __and__ returns Index intersection", raises=AssertionError, strict=True, ), @@ -296,30 +309,26 @@ def test_logical_ops_with_index(self, op): pytest.param( ops.ror_, marks=pytest.mark.xfail( - reason="Index.get_indexer with non unique index", - raises=InvalidIndexError, + 
reason="GH#22092 Index __or__ returns Index union", + raises=AssertionError, strict=True, ), ), - ops.rxor, ], ) - def test_reversed_logical_ops_with_index(self, op): + def test_reversed_logical_op_with_index_returns_series(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) idx1 = Index([True, False, True, False]) idx2 = Index([1, 0, 1, 0]) - # symmetric_difference is only for rxor, but other 2 should fail - expected = idx1.symmetric_difference(ser) - + expected = pd.Series(op(idx1.values, ser.values)) result = op(ser, idx1) - assert_index_equal(result, expected) - - expected = idx2.symmetric_difference(ser) + assert_series_equal(result, expected) + expected = pd.Series(op(idx2.values, ser.values)) result = op(ser, idx2) - assert_index_equal(result, expected) + assert_series_equal(result, expected) @pytest.mark.parametrize( "op, expected", diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 483122a0eeaba..1f19f58e80f26 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1009,6 +1009,12 @@ def test_bool_indexing(self, indexer_klass, indexer): s = pd.Series(idx) tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) + def test_get_indexer_non_unique_dtype_mismatch(self): + # GH 25459 + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) + class TestTranspose(Ops): errmsg = "the 'axes' parameter is not supported" diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c8b41a87baa9d..4cf2776f5aa7c 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1600,7 +1600,9 @@ def makeUnicodeIndex(k=10, name=None): def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x, k), name=name, **kwargs) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs + ) def makeIntervalIndex(k=10, name=None, **kwargs): From 46e89b07d3c249b3e42b326db1c15deb1a2dd00e Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 16 Oct 2019 15:40:58 +0100 Subject: [PATCH 076/119] Separate MultiIndex names from levels (#27242) --- doc/source/whatsnew/v1.0.0.rst | 33 ++++++++++++++++- pandas/core/frame.py | 3 +- pandas/core/indexes/multi.py | 15 ++++---- pandas/core/reshape/reshape.py | 17 ++++----- pandas/io/json/_table_schema.py | 6 ++-- pandas/tests/frame/test_alter_axes.py | 2 +- pandas/tests/indexes/multi/test_astype.py | 2 +- .../tests/indexes/multi/test_constructor.py | 8 +++-- pandas/tests/indexes/multi/test_names.py | 28 +++++++-------- pandas/tests/indexes/multi/test_reindex.py | 10 +++--- pandas/tests/indexes/multi/test_reshape.py | 5 +-- pandas/tests/reshape/test_concat.py | 12 +++---- pandas/tests/reshape/test_reshape.py | 5 ++- pandas/tests/test_multilevel.py | 35 +++++++++++-------- 14 files changed, 110 insertions(+), 71 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7c86ad0f029ed..7692651db840e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -124,7 +124,37 @@ source, you should no longer need to install Cython into your build environment Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation 
+.. _whatsnew_1000.api_breaking.MultiIndex._names:
+
+``MultiIndex.levels`` do not hold level names any longer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- A :class:`MultiIndex` previously stored the level names as attributes of each of its
+  :attr:`MultiIndex.levels`. From pandas 1.0, the names are only accessed through
+  :attr:`MultiIndex.names` (which was also possible previously). This is done in order to
+  make :attr:`MultiIndex.levels` more similar to :attr:`CategoricalIndex.categories` (:issue:`27242`).
+
+*pandas 0.25.x*
+
+.. code-block:: ipython
+
+   In [1]: mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])
+
+   In [2]: mi
+   Out[2]:
+   MultiIndex([(1, 'a'),
+               (1, 'b'),
+               (2, 'a'),
+               (2, 'b')],
+              names=['x', 'y'])
+
+   In [3]: mi.levels[0].name
+   Out[3]: 'x'
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])
+   mi.levels[0].name
+
 - :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`)

 *pandas 0.25.x*

@@ -150,6 +180,7 @@ Backwards incompatible API changes

 Other API changes
 ^^^^^^^^^^^^^^^^^
+- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`)
 - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`)
 - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`)
 - In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``).
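A short sketch of the contrast described in the whatsnew entry above (same construction as its example)::

    import pandas as pd

    mi = pd.MultiIndex.from_product([[1, 2], ["a", "b"]], names=["x", "y"])
    print(list(mi.names))     # ['x', 'y'] -- names live on the MultiIndex itself
    print(mi.levels[0].name)  # None -- and are no longer mirrored on each level

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 64755b2390eaf..a3c839f6b13a1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7792,7 +7792,8 @@ def _count_level(self, level, axis=0, numeric_only=False):
         if isinstance(level, str):
             level = count_axis._get_level_number(level)

-        level_index = count_axis.levels[level]
+        level_name = count_axis._names[level]
+        level_index = count_axis.levels[level]._shallow_copy(name=level_name)
         level_codes = ensure_int64(count_axis.codes[level])
         counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 596eaf0c55dbd..b0a1ed0650f7c 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -274,6 +274,7 @@ def __new__(

         result._set_levels(levels, copy=copy, validate=False)
         result._set_codes(codes, copy=copy, validate=False)
+        result._names = [None] * len(levels)
         if names is not None:
             # handles name validation
             result._set_names(names)
@@ -1216,7 +1217,7 @@ def __len__(self):
         return len(self.codes[0])

     def _get_names(self):
-        return FrozenList(level.name for level in self.levels)
+        return FrozenList(self._names)

     def _set_names(self, names, level=None, validate=True):
         """
@@ -1262,7 +1263,7 @@ def _set_names(self, names, level=None, validate=True):
             level = [self._get_level_number(l) for l in level]

         # set the name
-        for l, name in zip(level, names):
+        for lev, name in zip(level, names):
             if name is not None:
                 # GH 20527
                 # All items in 'names' need to be hashable:
@@ -1272,7 +1273,7 @@ def _set_names(self, names, level=None, validate=True):
                         self.__class__.__name__
                     )
                 )
-                self.levels[l].rename(name, inplace=True)
+                self._names[lev] = name

     names = property(
         fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n"""
@@ -1582,13 +1583,13 @@ def 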
_get_level_values(self, level, unique=False): values : ndarray """ - values = self.levels[level] + lev = self.levels[level] level_codes = self.codes[level] + name = self._names[level] if unique: level_codes = algos.unique(level_codes) - filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value) - values = values._shallow_copy(filled) - return values + filled = algos.take_1d(lev._values, level_codes, fill_value=lev._na_value) + return lev._shallow_copy(filled, name=name) def get_level_values(self, level): """ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e654685d24d9d..340e964d7c14f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -259,10 +259,10 @@ def get_new_values(self): def get_new_columns(self): if self.value_columns is None: if self.lift == 0: - return self.removed_level + return self.removed_level._shallow_copy(name=self.removed_name) - lev = self.removed_level - return lev.insert(0, lev._na_value) + lev = self.removed_level.insert(0, item=self.removed_level._na_value) + return lev.rename(self.removed_name) stride = len(self.removed_level) + self.lift width = len(self.value_columns) @@ -298,10 +298,10 @@ def get_new_index(self): # construct the new index if len(self.new_index_levels) == 1: - lev, lab = self.new_index_levels[0], result_codes[0] - if (lab == -1).any(): - lev = lev.insert(len(lev), lev._na_value) - return lev.take(lab) + level, level_codes = self.new_index_levels[0], result_codes[0] + if (level_codes == -1).any(): + level = level.insert(len(level), level._na_value) + return level.take(level_codes).rename(self.new_index_names[0]) return MultiIndex( levels=self.new_index_levels, @@ -661,7 +661,8 @@ def _convert_level_number(level_num, columns): new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: - new_columns = unique_groups = this.columns.levels[0] + new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0]) + unique_groups = new_columns # time to ravel the values new_data = {} diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 9016e8a98e5ba..1e27421a55499 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -243,8 +243,10 @@ def build_table_schema(data, index=True, primary_key=None, version=True): if index: if data.index.nlevels > 1: - for level in data.index.levels: - fields.append(convert_pandas_type_to_json_field(level)) + for level, name in zip(data.index.levels, data.index.names): + new_field = convert_pandas_type_to_json_field(level) + new_field["name"] = name + fields.append(new_field) else: fields.append(convert_pandas_type_to_json_field(data.index)) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 017cbea7ec723..b310335be5f65 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -978,7 +978,7 @@ def test_reset_index(self, float_frame): ): values = lev.take(level_codes) name = names[i] - tm.assert_index_equal(values, Index(deleveled[name])) + tm.assert_index_equal(values, Index(deleveled[name].rename(name=None))) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 4adcdd0112b26..f320a89c471bf 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -11,7 +11,7 @@ def 
test_astype(idx): actual = idx.astype("O") assert_copy(actual.levels, expected.levels) assert_copy(actual.codes, expected.codes) - assert [level.name for level in actual.levels] == list(expected.names) + assert actual.names == list(expected.names) with pytest.raises(TypeError, match="^Setting.*dtype.*object"): idx.astype(np.dtype(int)) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 9472d539537ba..993979f31a35b 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -17,7 +17,7 @@ def test_constructor_single_level(): levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] ) assert isinstance(result, MultiIndex) - expected = Index(["foo", "bar", "baz", "qux"], name="first") + expected = Index(["foo", "bar", "baz", "qux"]) tm.assert_index_equal(result.levels[0], expected) assert result.names == ["first"] @@ -292,8 +292,9 @@ def test_from_arrays_empty(): # 1 level result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) assert isinstance(result, MultiIndex) - expected = Index([], name="A") + expected = Index([]) tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] # N levels for N in [2, 3]: @@ -439,8 +440,9 @@ def test_from_product_empty_zero_levels(): def test_from_product_empty_one_level(): result = MultiIndex.from_product([[]], names=["A"]) - expected = pd.Index([], name="A") + expected = pd.Index([]) tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 5856cb56b307b..679e045a68f29 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -27,28 +27,25 @@ def test_index_name_retained(): def test_changing_names(idx): - - # names should be applied to levels - level_names = [level.name for level in idx.levels] - check_level_names(idx, idx.names) + assert [level.name for level in idx.levels] == [None, None] view = idx.view() copy = idx.copy() shallow_copy = idx._shallow_copy() - # changing names should change level names on object + # changing names should not change level names on object new_names = [name + "a" for name in idx.names] idx.names = new_names - check_level_names(idx, new_names) + check_level_names(idx, [None, None]) - # but not on copies - check_level_names(view, level_names) - check_level_names(copy, level_names) - check_level_names(shallow_copy, level_names) + # and not on copies + check_level_names(view, [None, None]) + check_level_names(copy, [None, None]) + check_level_names(shallow_copy, [None, None]) # and copies shouldn't change original shallow_copy.names = [name + "c" for name in shallow_copy.names] - check_level_names(idx, new_names) + check_level_names(idx, [None, None]) def test_take_preserve_name(idx): @@ -82,9 +79,9 @@ def test_copy_names(): def test_names(idx, index_names): # names are assigned in setup - names = index_names + assert index_names == ["first", "second"] level_names = [level.name for level in idx.levels] - assert names == level_names + assert level_names == [None, None] # setting bad names on existing index = idx @@ -109,11 +106,10 @@ def test_names(idx, index_names): names=["first", "second", "third"], ) - # names are assigned + # names are assigned on index, but not transferred to the levels index.names = ["a", "b"] - ind_names = list(index.names) level_names = [level.name 
for level in index.levels] - assert ind_names == level_names + assert level_names == [None, None] def test_duplicate_level_names_access_raises(idx): diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 88de4d1e80386..970288e5747c7 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -6,19 +6,17 @@ import pandas.util.testing as tm -def check_level_names(index, names): - assert [level.name for level in index.levels] == list(names) - - def test_reindex(idx): result, indexer = idx.reindex(list(idx[:4])) assert isinstance(result, MultiIndex) - check_level_names(result, idx[:4].names) + assert result.names == ["first", "second"] + assert [level.name for level in result.levels] == [None, None] result, indexer = idx.reindex(list(idx)) assert isinstance(result, MultiIndex) assert indexer is None - check_level_names(result, idx.names) + assert result.names == ["first", "second"] + assert [level.name for level in result.levels] == [None, None] def test_reindex_level(idx): diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index a30e6f33d1499..e79f212f30078 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -15,10 +15,11 @@ def test_insert(idx): # key not contained in all levels new_index = idx.insert(0, ("abc", "three")) - exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") + exp0 = Index(list(idx.levels[0]) + ["abc"]) tm.assert_index_equal(new_index.levels[0], exp0) + assert new_index.names == ["first", "second"] - exp1 = Index(list(idx.levels[1]) + ["three"], name="second") + exp1 = Index(list(idx.levels[1]) + ["three"]) tm.assert_index_equal(new_index.levels[1], exp1) assert new_index[0] == ("abc", "three") diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 13f0f14014a31..33cbaaed1848d 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1219,8 +1219,10 @@ def test_concat_keys_specific_levels(self): names=["group_key"], ) - tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) - assert result.columns.names[0] == "group_key" + tm.assert_index_equal(result.columns.levels[0], Index(level)) + tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) + + assert result.columns.names == ["group_key", None] def test_concat_dataframe_keys_bug(self, sort): t1 = DataFrame( @@ -1409,10 +1411,8 @@ def test_concat_keys_and_levels(self): keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], names=["first", "second"], ) - assert result.index.names == ("first", "second") + (None,) - tm.assert_index_equal( - result.index.levels[0], Index(["baz", "foo"], name="first") - ) + assert result.index.names == ("first", "second", None) + tm.assert_index_equal(result.index.levels[0], Index(["baz", "foo"])) def test_concat_keys_levels_no_overlap(self): # GH #1406 diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index e2c6f7d1c8feb..0b9392a0eeb5b 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -618,16 +618,15 @@ def test_reshaping_multi_index_categorical(self): df.index.names = ["major", "minor"] df["str"] = "foo" - dti = df.index.levels[0] - df["category"] = df["str"].astype("category") result = df["category"].unstack() + dti = df.index.levels[0] c = Categorical(["foo"] * len(dti)) 
expected = DataFrame( {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, columns=Index(list("ABCD"), name="minor"), - index=dti, + index=dti.rename("major"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e641d6f842d87..76436f4480809 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -335,7 +335,7 @@ def test_count_level_corner(self): df = self.frame[:0] result = df.count(level=0) expected = ( - DataFrame(index=s.index.levels[0], columns=df.columns) + DataFrame(index=s.index.levels[0].set_names(["first"]), columns=df.columns) .fillna(0) .astype(np.int64) ) @@ -976,13 +976,11 @@ def test_count(self): result = series.count(level="b") expect = self.series.count(level=1) - tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == "b" + tm.assert_series_equal(result, expect) result = series.count(level="a") expect = self.series.count(level=0) - tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == "a" + tm.assert_series_equal(result, expect) msg = "Level x not found" with pytest.raises(KeyError, match=msg): @@ -1014,6 +1012,8 @@ def test_frame_group_ops(self, op, level, axis, skipna, sort): self.frame.iloc[1, [1, 2]] = np.nan self.frame.iloc[7, [0, 1]] = np.nan + level_name = self.frame.index.names[level] + if axis == 0: frame = self.frame else: @@ -1034,7 +1034,7 @@ def aggf(x): frame = frame.sort_index(level=level, axis=axis) # for good measure, groupby detail - level_index = frame._get_axis(axis).levels[level] + level_index = frame._get_axis(axis).levels[level].rename(level_name) tm.assert_index_equal(leftside._get_axis(axis), level_index) tm.assert_index_equal(rightside._get_axis(axis), level_index) @@ -1639,12 +1639,18 @@ def test_constructor_with_tz(self): ) result = MultiIndex.from_arrays([index, columns]) - tm.assert_index_equal(result.levels[0], index) - tm.assert_index_equal(result.levels[1], columns) + + assert result.names == ["dt1", "dt2"] + # levels don't have names set, so set name of index/columns to None in checks + tm.assert_index_equal(result.levels[0], index.rename(name=None)) + tm.assert_index_equal(result.levels[1], columns.rename(name=None)) result = MultiIndex.from_arrays([Series(index), Series(columns)]) - tm.assert_index_equal(result.levels[0], index) - tm.assert_index_equal(result.levels[1], columns) + + assert result.names == ["dt1", "dt2"] + # levels don't have names set, so set name of index/columns to None in checks + tm.assert_index_equal(result.levels[0], index.rename(name=None)) + tm.assert_index_equal(result.levels[1], columns.rename(name=None)) def test_set_index_datetime(self): # GH 3950 @@ -1666,18 +1672,19 @@ def test_set_index_datetime(self): df.index = df.index.tz_convert("US/Pacific") expected = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - name="datetime", + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"] ) expected = expected.tz_localize("UTC").tz_convert("US/Pacific") df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) - tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"])) + assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) - tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) + tm.assert_index_equal(df.index.levels[0], 
Index(["a", "b"])) tm.assert_index_equal(df.index.levels[1], expected) + assert df.index.names == ["label", "datetime"] df = DataFrame(np.random.random(6)) idx1 = pd.DatetimeIndex( From 5ad908eb7adf384989f564f196da34f68bf497b0 Mon Sep 17 00:00:00 2001 From: Oktay Sabak Date: Wed, 16 Oct 2019 19:24:57 +0300 Subject: [PATCH 077/119] DOC: fix code-block in the reshaping docs (#28838) --- doc/source/user_guide/reshaping.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index dd6d3062a8f0a..b2ee252495f23 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -728,14 +728,14 @@ Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, ``row`` values are the index, and the mean of ``val0`` are the values? In particular, the resulting DataFrame should look like: -.. note:: - - col col0 col1 col2 col3 col4 - row - row0 0.77 0.605 NaN 0.860 0.65 - row2 0.13 NaN 0.395 0.500 0.25 - row3 NaN 0.310 NaN 0.545 NaN - row4 NaN 0.100 0.395 0.760 0.24 +.. code-block:: text + + col col0 col1 col2 col3 col4 + row + row0 0.77 0.605 NaN 0.860 0.65 + row2 0.13 NaN 0.395 0.500 0.25 + row3 NaN 0.310 NaN 0.545 NaN + row4 NaN 0.100 0.395 0.760 0.24 This solution uses :func:`~pandas.pivot_table`. Also note that ``aggfunc='mean'`` is the default. It is included here to be explicit. From fdc322a5b111882a4404cc87ff485c6254ad4cbb Mon Sep 17 00:00:00 2001 From: Max Chen Date: Thu, 17 Oct 2019 02:43:32 +0800 Subject: [PATCH 078/119] CLN: Fix mypy error in pandas/tests/computation/test_eval.py (#29007) --- pandas/tests/computation/test_eval.py | 9 +++++++-- setup.cfg | 3 --- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index b6ffd8a83e409..4d40cd3a2d4ca 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2,6 +2,7 @@ from functools import reduce from itertools import product import operator +from typing import Dict, Type import warnings import numpy as np @@ -19,7 +20,11 @@ from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines import pandas.core.computation.expr as expr -from pandas.core.computation.expr import PandasExprVisitor, PythonExprVisitor +from pandas.core.computation.expr import ( + BaseExprVisitor, + PandasExprVisitor, + PythonExprVisitor, +) from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR from pandas.core.computation.ops import ( _arith_ops_syms, @@ -1884,7 +1889,7 @@ def test_invalid_parser(): "python": PythonExprVisitor, "pytables": pytables.ExprVisitor, "pandas": PandasExprVisitor, -} +} # type: Dict[str, Type[BaseExprVisitor]] @pytest.mark.parametrize("engine", _engines) diff --git a/setup.cfg b/setup.cfg index 462e79dae1039..ca15386b2c429 100644 --- a/setup.cfg +++ b/setup.cfg @@ -145,9 +145,6 @@ ignore_errors=True [mypy-pandas.tests.arrays.test_period] ignore_errors=True -[mypy-pandas.tests.computation.test_eval] -ignore_errors=True - [mypy-pandas.tests.dtypes.test_common] ignore_errors=True From b63f829466fd6999c7c28a6ae81038bed8ae0d94 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 16 Oct 2019 19:53:41 +0100 Subject: [PATCH 079/119] CLN: Clean DirNameMixin (#28957) --- pandas/core/accessor.py | 6 ++---- pandas/core/arrays/categorical.py | 4 +++- pandas/core/base.py | 14 
++++++++++++-- pandas/core/indexes/base.py | 12 ++++++------ pandas/core/series.py | 4 +--- 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index bce6c352ce480..fc60c01d7b808 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -4,7 +4,7 @@ that can be mixed into or pinned onto other pandas classes. """ -from typing import Set +from typing import FrozenSet, Set import warnings from pandas.util._decorators import Appender @@ -12,9 +12,7 @@ class DirNamesMixin: _accessors = set() # type: Set[str] - _deprecations = frozenset( - ["asobject", "base", "data", "flags", "itemsize", "strides"] - ) + _deprecations = frozenset() # type: FrozenSet[str] def _dir_deletions(self): """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ea19808b19fc9..d34cf3e576beb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -331,7 +331,9 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = PandasObject._deprecations | frozenset(["tolist", "get_values"]) + _deprecations = PandasObject._deprecations | frozenset( + ["tolist", "itemsize", "get_values"] + ) _typ = "categorical" def __init__( diff --git a/pandas/core/base.py b/pandas/core/base.py index c461a1509ec78..5ae3926952a67 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,7 +4,7 @@ import builtins from collections import OrderedDict import textwrap -from typing import Dict, Optional +from typing import Dict, FrozenSet, Optional import warnings import numpy as np @@ -651,7 +651,17 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 - _deprecations = frozenset(["item"]) + _deprecations = frozenset( + [ + "tolist", # tolist is not deprecated, just suppressed in the __dir__ + "base", + "data", + "item", + "itemsize", + "flags", + "strides", + ] + ) # type: FrozenSet[str] def transpose(self, *args, **kwargs): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 464cd49f135ae..526b2c2e2c412 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Union +from typing import FrozenSet, Union import warnings import numpy as np @@ -63,7 +63,7 @@ from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core import ops -from pandas.core.accessor import CachedAccessor, DirNamesMixin +from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject @@ -206,10 +206,10 @@ class Index(IndexOpsMixin, PandasObject): # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = ( - IndexOpsMixin._deprecations - | DirNamesMixin._deprecations - | frozenset(["tolist", "contains", "dtype_str", "get_values", "set_value"]) - ) + PandasObject._deprecations + | IndexOpsMixin._deprecations + | frozenset(["asobject", "contains", "dtype_str", "get_values", "set_value"]) + ) # type: FrozenSet[str] # To hand over control to subclasses _join_precedence = 1 diff --git a/pandas/core/series.py b/pandas/core/series.py index 539a09f7046ac..1039e9af929d4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,7 +54,7 @@ import pandas 
as pd from pandas.core import algorithms, base, generic, nanops, ops -from pandas.core.accessor import CachedAccessor, DirNamesMixin +from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor @@ -178,10 +178,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _deprecations = ( base.IndexOpsMixin._deprecations | generic.NDFrame._deprecations - | DirNamesMixin._deprecations | frozenset( [ - "tolist", # tolist is not deprecated, just suppressed in the __dir__ "asobject", "compress", "valid", From bff90a34dc98405755ba83efa2a71452e4551f61 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 12:09:04 -0700 Subject: [PATCH 080/119] REF: de-duplicate groupby_helper code (#28934) --- pandas/_libs/groupby.pyx | 3 +- pandas/_libs/groupby_helper.pxi.in | 139 +++++++++++------------------ 2 files changed, 54 insertions(+), 88 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3069bbbf34bb7..c9994812462b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -372,7 +372,8 @@ def group_any_all(uint8_t[:] out, const uint8_t[:] mask, object val_test, bint skipna): - """Aggregated boolean values to show truthfulness of group elements + """ + Aggregated boolean values to show truthfulness of group elements. Parameters ---------- diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index f052feea0bbf3..c837c6c5c6519 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -20,6 +20,18 @@ ctypedef fused rank_t: object +cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: + if rank_t is object: + # Should never be used, but we need to avoid the `val != val` below + # or else cython will raise about gil acquisition. + raise NotImplementedError + + elif rank_t is int64_t: + return is_datetimelike and val == NPY_NAT + else: + return val != val + + @cython.wraparound(False) @cython.boundscheck(False) def group_last(rank_t[:, :] out, @@ -61,24 +73,16 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - # not nan - if rank_t is int64_t: - # need a special notna check - if val != NPY_NAT: - nobs[lab, j] += 1 - resx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + resx[lab, j] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - else: - out[i, j] = NAN + out[i, j] = NAN else: out[i, j] = resx[i, j] else: @@ -92,16 +96,10 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - # not nan - if rank_t is int64_t: - # need a special notna check - if val != NPY_NAT: - nobs[lab, j] += 1 - resx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + resx[lab, j] = val for i in range(ncounts): for j in range(K): @@ -113,6 +111,7 @@ def group_last(rank_t[:, :] out, break else: out[i, j] = NAN + else: out[i, j] = resx[i, j] @@ -121,7 +120,6 @@ def group_last(rank_t[:, :] out, # block. 
raise RuntimeError("empty group with uint64_t") - group_last_float64 = group_last["float64_t"] group_last_float32 = group_last["float32_t"] group_last_int64 = group_last["int64_t"] @@ -169,8 +167,9 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - # not nan if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -193,18 +192,11 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - # not nan - if rank_t is int64_t: - # need a special notna check - if val != NPY_NAT: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val for i in range(ncounts): for j in range(K): @@ -487,17 +479,11 @@ def group_max(groupby_t[:, :] out, for j in range(K): val = values[i, j] - # not nan - if groupby_t is int64_t: - if val != nan_val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val for i in range(ncounts): for j in range(K): @@ -563,17 +549,11 @@ def group_min(groupby_t[:, :] out, for j in range(K): val = values[i, j] - # not nan - if groupby_t is int64_t: - if val != nan_val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val for i in range(ncounts): for j in range(K): @@ -643,21 +623,13 @@ def group_cummin(groupby_t[:, :] out, for j in range(K): val = values[i, j] - # val = nan - if groupby_t is int64_t: - if is_datetimelike and val == NPY_NAT: - out[i, j] = NPY_NAT - else: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if _treat_as_na(val, is_datetimelike): + out[i, j] = val else: - if val == val: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval @cython.boundscheck(False) @@ -712,17 +684,10 @@ def group_cummax(groupby_t[:, :] out, for j in range(K): val = values[i, j] - if groupby_t is int64_t: - if is_datetimelike and val == NPY_NAT: - out[i, j] = NPY_NAT - else: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval + if _treat_as_na(val, is_datetimelike): + out[i, j] = val else: - if val == val: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval From 30a0e2ee9d0c2d3eec45a26400d568d9a189f38f Mon Sep 17 00:00:00 2001 From: saskakarsi <44523813+saskakarsi@users.noreply.github.com> Date: Thu, 17 Oct 2019 00:24:53 +0300 Subject: [PATCH 081/119] CLN: fix mypy error pandas/tests/plotting/test_backend.py (#29016) * CLN: fix mypy error pandas/tests/plotting/test_backend.py GH28926, mypy didn't like setting non-existant attribute on module. 
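In short, the fix works because a plain attribute assignment on a module object is rejected by mypy, while an equivalent setattr() call is not. A minimal runnable sketch of the pattern (illustrative only; `pandas_dummy_backend` mirrors the fixture module created in the diff below):

    import types

    # Build a stand-in plotting backend as a bare module object, as the
    # test fixture does.
    dummy_backend = types.ModuleType("pandas_dummy_backend")

    # mypy rejects a direct assignment, since ModuleType declares no
    # attribute named "plot":
    #     dummy_backend.plot = lambda *args, **kwargs: None
    # Routing the assignment through setattr() is equivalent at runtime
    # but opaque to the static checker:
    setattr(dummy_backend, "plot", lambda *args, **kwargs: None)

    assert dummy_backend.plot(1, color="k") is None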
Dummy function, circumvented with __setattr__ --- pandas/tests/plotting/test_backend.py | 2 +- setup.cfg | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index 6511d94aa4c09..41b1a88b15acb 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -9,7 +9,7 @@ import pandas dummy_backend = types.ModuleType("pandas_dummy_backend") -dummy_backend.plot = lambda *args, **kwargs: None +setattr(dummy_backend, "plot", lambda *args, **kwargs: None) @pytest.fixture diff --git a/setup.cfg b/setup.cfg index ca15386b2c429..4353c0065b94b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -202,9 +202,6 @@ ignore_errors=True [mypy-pandas.tests.io.json.test_ujson] ignore_errors=True -[mypy-pandas.tests.plotting.test_backend] -ignore_errors=True - [mypy-pandas.tests.series.test_constructors] ignore_errors=True From 143eb3880cfbfa43963ecfbb0fb70be57798227b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 16:37:29 -0700 Subject: [PATCH 082/119] REF: re-raise AssertionError unchanged (#28959) --- pandas/_libs/groupby.pyx | 8 ++++---- pandas/core/groupby/generic.py | 12 +++++++++++ pandas/core/groupby/groupby.py | 21 ++++---------------- pandas/core/groupby/ops.py | 2 ++ pandas/core/resample.py | 2 ++ pandas/tests/groupby/aggregate/test_other.py | 2 +- 6 files changed, 25 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c9994812462b1..4f7488c88630b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -442,7 +442,7 @@ def _group_add(floating[:, :] out, floating[:, :] sumx, nobs if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -492,7 +492,7 @@ def _group_prod(floating[:, :] out, floating[:, :] prodx, nobs if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) prodx = np.ones_like(out) @@ -542,7 +542,7 @@ def _group_var(floating[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) mean = np.zeros_like(out) @@ -597,7 +597,7 @@ def _group_mean(floating[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) sumx = np.zeros_like(out) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index aa817ec451aa5..8cd727e744519 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -261,6 +261,8 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) + except AssertionError: + raise except Exception: result = self._aggregate_named(func, *args, **kwargs) @@ -887,6 +889,8 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) + except AssertionError: + raise except Exception: result = self._aggregate_frame(func) else: @@ -1036,6 +1040,8 @@ def _aggregate_frame(self, func, *args, **kwargs): for name, data in self: fres = func(data, 
*args, **kwargs) result[name] = self._try_cast(fres, data) + except AssertionError: + raise except Exception: return self._aggregate_item_by_item(func, *args, **kwargs) else: @@ -1043,6 +1049,8 @@ def _aggregate_frame(self, func, *args, **kwargs): data = self.get_group(name, obj=obj) try: fres = func(data, *args, **kwargs) + except AssertionError: + raise except Exception: wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) @@ -1398,6 +1406,8 @@ def _choose_path(self, fast_path, slow_path, group): # if we make it here, test if we can use the fast path try: res_fast = fast_path(group) + except AssertionError: + raise except Exception: # Hard to know ex-ante what exceptions `fast_path` might raise return path, res @@ -1422,6 +1432,8 @@ def _transform_item_by_item(self, obj, wrapper): for i, col in enumerate(obj): try: output[col] = self[col].transform(wrapper) + except AssertionError: + raise except Exception: pass else: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 92ea733cc3447..6f2868482b798 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -44,13 +44,7 @@ class providing the base-class of operations. from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical -from pandas.core.base import ( - DataError, - GroupByError, - PandasObject, - SelectionMixin, - SpecificationError, -) +from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import DataFrame @@ -862,8 +856,6 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): result, names = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue - except AssertionError as e: - raise GroupByError(str(e)) if self._transform_should_cast(how): output[name] = self._try_cast(result, obj) else: @@ -890,12 +882,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): if numeric_only and not is_numeric: continue - try: - result, names = self.grouper.aggregate( - obj.values, how, min_count=min_count - ) - except AssertionError as e: - raise GroupByError(str(e)) + result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) output[name] = self._try_cast(result, obj) if len(output) == 0: @@ -1353,8 +1340,8 @@ def f(self, **kwargs): # try a cython aggregation if we can try: return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError as e: - raise SpecificationError(str(e)) + except AssertionError: + raise except DataError: pass except Exception: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 40517eefe4d5d..27415a1bacdbd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -647,6 +647,8 @@ def _transform( def agg_series(self, obj, func): try: return self._aggregate_series_fast(obj, func) + except AssertionError: + raise except Exception: return self._aggregate_series_pure_python(obj, func) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 545bc21dd6d1b..5185d95cfac4c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -360,6 +360,8 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) + except AssertionError: + raise except Exception: # we have 
a non-reducing function diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 7e3cbed09c6d7..5dad868c8c3aa 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -19,7 +19,7 @@ date_range, period_range, ) -from pandas.core.groupby.groupby import SpecificationError +from pandas.core.base import SpecificationError import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing From 9486f044ee57a00bd964160c1f234fe92d02e735 Mon Sep 17 00:00:00 2001 From: lukasbk Date: Thu, 17 Oct 2019 01:52:13 +0200 Subject: [PATCH 083/119] tests/indexing/test_coercion.py typefix (#28990) --- pandas/tests/indexing/test_coercion.py | 3 ++- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 05b58b0eca9b8..4f38d7beb9c0b 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1,4 +1,5 @@ import itertools +from typing import Dict, List import numpy as np import pytest @@ -928,7 +929,7 @@ class TestReplaceSeriesCoercion(CoercionBase): klasses = ["series"] method = "replace" - rep = {} + rep = {} # type: Dict[str, List] rep["object"] = ["a", "b"] rep["int64"] = [4, 5] rep["float64"] = [1.1, 2.2] diff --git a/setup.cfg b/setup.cfg index 4353c0065b94b..3562ece5acad3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -193,9 +193,6 @@ ignore_errors=True [mypy-pandas.tests.indexes.timedeltas.test_timedelta] ignore_errors=True -[mypy-pandas.tests.indexing.test_coercion] -ignore_errors=True - [mypy-pandas.tests.indexing.test_loc] ignore_errors=True From da3d0d92ee7d09011ab893871694cc0d84c66f02 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 16:53:50 -0700 Subject: [PATCH 084/119] DEPR: remove previously-deprecated broadcast/reduce kwargs from DataFrame.apply (#29017) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/apply.py | 46 +------------------------------- pandas/core/frame.py | 40 ++------------------------- pandas/tests/frame/test_apply.py | 11 -------- 4 files changed, 4 insertions(+), 94 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7692651db840e..48c1173a372a7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -227,6 +227,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) +- Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) - .. 
_whatsnew_1000.performance: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 91f3e878c3807..f402154dc91ca 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,5 +1,4 @@ import inspect -import warnings import numpy as np @@ -21,9 +20,7 @@ def frame_apply( obj, func, axis=0, - broadcast=None, raw=False, - reduce=None, result_type=None, ignore_failures=False, args=None, @@ -40,9 +37,7 @@ def frame_apply( return klass( obj, func, - broadcast=broadcast, raw=raw, - reduce=reduce, result_type=result_type, ignore_failures=ignore_failures, args=args, @@ -51,18 +46,7 @@ def frame_apply( class FrameApply: - def __init__( - self, - obj, - func, - broadcast, - raw, - reduce, - result_type, - ignore_failures, - args, - kwds, - ): + def __init__(self, obj, func, raw, result_type, ignore_failures, args, kwds): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures @@ -75,34 +59,6 @@ def __init__( "of {None, 'reduce', 'broadcast', 'expand'}" ) - if broadcast is not None: - warnings.warn( - "The broadcast argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='broadcast' to broadcast the result " - "to the original dimensions", - FutureWarning, - stacklevel=4, - ) - if broadcast: - result_type = "broadcast" - - if reduce is not None: - warnings.warn( - "The reduce argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='reduce' to try to reduce the result " - "to the original dimensions", - FutureWarning, - stacklevel=4, - ) - if reduce: - - if result_type is not None: - raise ValueError("cannot pass both reduce=True and result_type") - - result_type = "reduce" - self.result_type = result_type # curry if needed diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a3c839f6b13a1..7880acb1b78da 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6648,15 +6648,7 @@ def transform(self, func, axis=0, *args, **kwargs): return super().transform(func, *args, **kwargs) def apply( - self, - func, - axis=0, - broadcast=None, - raw=False, - reduce=None, - result_type=None, - args=(), - **kwds + self, func, axis=0, raw=False, reduce=None, result_type=None, args=(), **kwds ): """ Apply a function along an axis of the DataFrame. @@ -6676,21 +6668,9 @@ def apply( * 0 or 'index': apply function to each column. * 1 or 'columns': apply function to each row. - broadcast : bool, optional - Only relevant for aggregation functions: - - * ``False`` or ``None`` : returns a Series whose length is the - length of the index or the number of columns (based on the - `axis` parameter) - * ``True`` : results will be broadcast to the original shape - of the frame, the original index and columns will be retained. - - .. deprecated:: 0.23.0 - This argument will be removed in a future version, replaced - by result_type='broadcast'. raw : bool, default False - Determines if row or column is passed as a Series or ndarry object: + Determines if row or column is passed as a Series or ndarray object: * ``False`` : passes each row or column as a Series to the function. @@ -6698,20 +6678,6 @@ def apply( instead. If you are just applying a NumPy reduction function this will achieve much better performance. - reduce : bool or None, default None - Try to apply reduction procedures. If the DataFrame is empty, - `apply` will use `reduce` to determine whether the result - should be a Series or a DataFrame. 
If ``reduce=None`` (the - default), `apply`'s return value will be guessed by calling - `func` on an empty Series - (note: while guessing, exceptions raised by `func` will be - ignored). - If ``reduce=True`` a Series will always be returned, and if - ``reduce=False`` a DataFrame will always be returned. - - .. deprecated:: 0.23.0 - This argument will be removed in a future version, replaced - by ``result_type='reduce'``. result_type : {'expand', 'reduce', 'broadcast', None}, default None These only act when ``axis=1`` (columns): @@ -6825,9 +6791,7 @@ def apply( self, func=func, axis=axis, - broadcast=broadcast, raw=raw, - reduce=reduce, result_type=result_type, args=args, kwds=kwds, diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 0328232213588..fe034504b8161 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -137,13 +137,6 @@ def test_nunique_empty(self): expected = Series([], index=pd.Index([])) assert_series_equal(result, expected) - def test_apply_deprecate_reduce(self): - empty_frame = DataFrame() - - x = [] - with tm.assert_produces_warning(FutureWarning): - empty_frame.apply(x.append, axis=1, reduce=True) - def test_apply_standard_nonunique(self): df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) @@ -170,10 +163,6 @@ def test_apply_with_string_funcs(self, float_frame, func, args, kwds): expected = getattr(float_frame, func)(*args, **kwds) tm.assert_series_equal(result, expected) - def test_apply_broadcast_deprecated(self, float_frame): - with tm.assert_produces_warning(FutureWarning): - float_frame.apply(np.mean, broadcast=True) - def test_apply_broadcast(self, float_frame, int_frame_const_col): # scalars From 9d45934af87ce4bdf204836a2e9cfcc3a7e5c279 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Oct 2019 05:37:47 -0700 Subject: [PATCH 085/119] BUG: Fix TypeError in _cython_agg_blocks (#29035) --- pandas/core/groupby/generic.py | 21 +++++++++++++++++++-- pandas/core/groupby/groupby.py | 15 ++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8cd727e744519..8e53972c95275 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -970,6 +970,11 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # call our grouper again with only this block obj = self.obj[data.items[locs]] + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] + s = groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) @@ -978,17 +983,29 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # continue and exclude the block deleted_items.append(locs) continue + + # unwrap DataFrame to get array + assert len(result._data.blocks) == 1 + result = result._data.blocks[0].values + if result.ndim == 1 and isinstance(result, np.ndarray): + result = result.reshape(1, -1) + finally: + assert not isinstance(result, DataFrame) + if result is not no_result: # see if we can cast the block back to the original dtype result = maybe_downcast_numeric(result, block.dtype) - if result.ndim == 1 and isinstance(result, np.ndarray): + if block.is_extension and isinstance(result, np.ndarray): # e.g. 
block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible result = type(block.values)._from_sequence( - result, dtype=block.values.dtype + result.ravel(), dtype=block.values.dtype ) except ValueError: # reshape to be valid for non-Extension Block diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6f2868482b798..fa65179469840 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1344,13 +1344,14 @@ def f(self, **kwargs): raise except DataError: pass - except Exception: - # TODO: the remaining test cases that get here are from: - # - AttributeError from _cython_agg_blocks bug passing - # DataFrame to make_block; see GH#28275 - # - TypeError in _cython_operation calling ensure_float64 - # on object array containing complex numbers; - # see test_groupby_complex, test_max_nan_bug + except (TypeError, NotImplementedError): + # TODO: + # - TypeError: this is reached via test_groupby_complex + # and can be fixed by implementing _group_add for + # complex dtypes + # - NotImplementedError: reached in test_max_nan_bug, + # raised in _get_cython_function and should probably + # be handled inside _cython_agg_blocks pass # apply a non-cython aggregation From 509eb14683bd78d932a17a096e13c0d8d9f19e91 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Oct 2019 11:25:47 -0700 Subject: [PATCH 086/119] CI: xfail on numpy 1.18 (#29057) * xfail on numpy 1.18 * CI: try using numpy wheel --- ci/build38.sh | 8 +------- pandas/compat/numpy/__init__.py | 1 + pandas/tests/series/test_analytics.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/ci/build38.sh b/ci/build38.sh index 5c798c17301e0..903016536d240 100644 --- a/ci/build38.sh +++ b/ci/build38.sh @@ -6,13 +6,7 @@ pip install --no-deps -U pip wheel setuptools pip install python-dateutil pytz cython pytest pytest-xdist hypothesis # Possible alternative for getting numpy: -# pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy -git clone https://github.com/numpy/numpy -cd numpy -python setup.py build_ext --inplace -python setup.py install -cd .. -rm -rf numpy +pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy python setup.py build_ext -inplace python -m pip install --no-build-isolation -e . 
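Background for the next hunks: pandas computes NumPy version comparisons once at import time and exposes them as module-level booleans, which tests then use to gate expected failures. A standalone sketch of that convention (a sketch only, not the actual pandas module, which defines several sibling flags; the decorator in the comment echoes the test changes below):

    from distutils.version import LooseVersion

    import numpy as np

    # Parse the installed NumPy version once; consumers compare against
    # precomputed booleans instead of re-parsing version strings.
    _nlv = LooseVersion(np.__version__)
    _np_version_under1p18 = _nlv < LooseVersion("1.18")

    # Tests then mark known breakage on newer NumPy, for example:
    #     @pytest.mark.xfail(
    #         not _np_version_under1p18,
    #         reason="numpy 1.18 changed min/max behavior for NaT",
    #     )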
diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py
index ce56c08d3ec14..402ed62f2df65 100644
--- a/pandas/compat/numpy/__init__.py
+++ b/pandas/compat/numpy/__init__.py
@@ -12,6 +12,7 @@
 _np_version_under1p15 = _nlv < LooseVersion("1.15")
 _np_version_under1p16 = _nlv < LooseVersion("1.16")
 _np_version_under1p17 = _nlv < LooseVersion("1.17")
+_np_version_under1p18 = _nlv < LooseVersion("1.18")
 _is_numpy_dev = ".dev" in str(_nlv)
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index d60cd3029e5a8..c8e1c04f3e3fb 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat.numpy import _np_version_under1p18
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -160,6 +161,9 @@ def test_cummax(self, datetime_series):
 
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(
+        not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT"
+    )
     def test_cummin_datetime64(self):
         s = pd.Series(
             pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"])
         )
@@ -179,6 +183,9 @@ def test_cummin_datetime64(self):
         result = s.cummin(skipna=False)
         tm.assert_series_equal(expected, result)
 
+    @pytest.mark.xfail(
+        not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT"
+    )
     def test_cummax_datetime64(self):
         s = pd.Series(
             pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"])
         )
@@ -198,6 +205,9 @@ def test_cummax_datetime64(self):
         result = s.cummax(skipna=False)
         tm.assert_series_equal(expected, result)
 
+    @pytest.mark.xfail(
+        not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT"
+    )
     def test_cummin_timedelta64(self):
         s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"]))
@@ -213,6 +223,9 @@ def test_cummin_timedelta64(self):
 
         result = s.cummin(skipna=False)
         tm.assert_series_equal(expected, result)
 
+    @pytest.mark.xfail(
+        not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT"
+    )
     def test_cummax_timedelta64(self):
         s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"]))

From 6d35836ec25b33990e6d962aff52e388652f65ce Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Oct 2019 15:17:13 -0700
Subject: [PATCH 087/119] Stop catching TypeError in groupby methods (#29060)

---
 pandas/_libs/groupby.pyx       | 33 ++++++++++++++++++++++++---------
 pandas/core/groupby/groupby.py | 21 ++++++++++-----------
 pandas/core/groupby/ops.py     |  8 +++++++-
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 4f7488c88630b..68c21139e7384 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -8,7 +8,7 @@ import numpy as np
 cimport numpy as cnp
 from numpy cimport (ndarray,
                     int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
-                    uint32_t, uint64_t, float32_t, float64_t)
+                    uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t)
 cnp.import_array()
@@ -421,16 +421,23 @@ def group_any_all(uint8_t[:] out,
             if values[i] == flag_val:
                 out[lab] = flag_val
 
+
 # ----------------------------------------------------------------------
 # group_add, group_prod, group_var, group_mean, group_ohlc
 # ----------------------------------------------------------------------
 
+ctypedef fused complexfloating_t:
+    float64_t
+    float32_t
+    complex64_t
+    complex128_t
+
+
 @cython.wraparound(False)
@cython.boundscheck(False) -def _group_add(floating[:, :] out, +def _group_add(complexfloating_t[:, :] out, int64_t[:] counts, - floating[:, :] values, + complexfloating_t[:, :] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -438,13 +445,14 @@ def _group_add(floating[:, :] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, :] sumx, nobs + complexfloating_t val, count + complexfloating_t[:, :] sumx + int64_t[:, :] nobs if len(values) != len(labels): raise ValueError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((len(out), out.shape[1]), dtype=np.int64) sumx = np.zeros_like(out) N, K = (values).shape @@ -462,7 +470,12 @@ def _group_add(floating[:, :] out, # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + if (complexfloating_t is complex64_t or + complexfloating_t is complex128_t): + # clang errors if we use += with these dtypes + sumx[lab, j] = sumx[lab, j] + val + else: + sumx[lab, j] += val for i in range(ncounts): for j in range(K): @@ -472,8 +485,10 @@ def _group_add(floating[:, :] out, out[i, j] = sumx[i, j] -group_add_float32 = _group_add['float'] -group_add_float64 = _group_add['double'] +group_add_float32 = _group_add['float32_t'] +group_add_float64 = _group_add['float64_t'] +group_add_complex64 = _group_add['float complex'] +group_add_complex128 = _group_add['double complex'] @cython.wraparound(False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fa65179469840..b27d5bb05ee8f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1340,19 +1340,18 @@ def f(self, **kwargs): # try a cython aggregation if we can try: return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError: - raise except DataError: pass - except (TypeError, NotImplementedError): - # TODO: - # - TypeError: this is reached via test_groupby_complex - # and can be fixed by implementing _group_add for - # complex dtypes - # - NotImplementedError: reached in test_max_nan_bug, - # raised in _get_cython_function and should probably - # be handled inside _cython_agg_blocks - pass + except NotImplementedError as err: + if "function is not implemented for this dtype" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for test_decimal:test_in_numeric_groupby + pass + else: + raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 27415a1bacdbd..e380cf5930f97 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -526,7 +526,13 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: - values = ensure_float64(values) + try: + values = ensure_float64(values) + except TypeError: + if lib.infer_dtype(values, skipna=False) == "complex": + values = values.astype(complex) + else: + raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise From b0f33b3f40d5742531e31aef90573b9592489592 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 18 Oct 2019 07:43:34 -0500 Subject: [PATCH 088/119] API: Restore getting name from MultiIndex level (#29061) * API: Restore getting name from 
MultiIndex level xref https://issues.apache.org/jira/browse/ARROW-6922 / https://github.com/pandas-dev/pandas/pull/27242#issuecomment-543302582 / https://github.com/pandas-dev/pandas/issues/29032 No docs yet, since it isn't clear how this will eventually sort out. But we at least want to preserve this behavior for 1.0 * fixups --- pandas/core/indexes/multi.py | 7 ++++-- pandas/tests/frame/test_alter_axes.py | 2 +- .../tests/indexes/multi/test_constructor.py | 6 ++--- pandas/tests/indexes/multi/test_names.py | 23 ++++++++++++------- pandas/tests/indexes/multi/test_reindex.py | 4 ++-- pandas/tests/indexes/multi/test_reshape.py | 4 ++-- pandas/tests/reshape/test_concat.py | 6 +++-- pandas/tests/test_multilevel.py | 21 ++++++++--------- 8 files changed, 42 insertions(+), 31 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b0a1ed0650f7c..fda5c78a61e53 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -639,7 +639,10 @@ def from_frame(cls, df, sortorder=None, names=None): @property def levels(self): - return self._levels + result = [ + x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) + ] + return FrozenList(result) @property def _values(self): @@ -830,7 +833,7 @@ def _set_codes( if level is None: new_codes = FrozenList( _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() - for lev, level_codes in zip(self.levels, codes) + for lev, level_codes in zip(self._levels, codes) ) else: level = [self._get_level_number(l) for l in level] diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index b310335be5f65..017cbea7ec723 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -978,7 +978,7 @@ def test_reset_index(self, float_frame): ): values = lev.take(level_codes) name = names[i] - tm.assert_index_equal(values, Index(deleveled[name].rename(name=None))) + tm.assert_index_equal(values, Index(deleveled[name])) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 993979f31a35b..ff98da85cfb2d 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -17,7 +17,7 @@ def test_constructor_single_level(): levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] ) assert isinstance(result, MultiIndex) - expected = Index(["foo", "bar", "baz", "qux"]) + expected = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["first"] @@ -292,7 +292,7 @@ def test_from_arrays_empty(): # 1 level result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) assert isinstance(result, MultiIndex) - expected = Index([]) + expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["A"] @@ -440,7 +440,7 @@ def test_from_product_empty_zero_levels(): def test_from_product_empty_one_level(): result = MultiIndex.from_product([[]], names=["A"]) - expected = pd.Index([]) + expected = pd.Index([], name="A") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["A"] diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 679e045a68f29..5c3a48c9dd481 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ 
-27,7 +27,7 @@ def test_index_name_retained(): def test_changing_names(idx): - assert [level.name for level in idx.levels] == [None, None] + assert [level.name for level in idx.levels] == ["first", "second"] view = idx.view() copy = idx.copy() @@ -36,16 +36,16 @@ def test_changing_names(idx): # changing names should not change level names on object new_names = [name + "a" for name in idx.names] idx.names = new_names - check_level_names(idx, [None, None]) + check_level_names(idx, ["firsta", "seconda"]) # and not on copies - check_level_names(view, [None, None]) - check_level_names(copy, [None, None]) - check_level_names(shallow_copy, [None, None]) + check_level_names(view, ["first", "second"]) + check_level_names(copy, ["first", "second"]) + check_level_names(shallow_copy, ["first", "second"]) # and copies shouldn't change original shallow_copy.names = [name + "c" for name in shallow_copy.names] - check_level_names(idx, [None, None]) + check_level_names(idx, ["firsta", "seconda"]) def test_take_preserve_name(idx): @@ -81,7 +81,7 @@ def test_names(idx, index_names): # names are assigned in setup assert index_names == ["first", "second"] level_names = [level.name for level in idx.levels] - assert level_names == [None, None] + assert level_names == index_names # setting bad names on existing index = idx @@ -109,7 +109,7 @@ def test_names(idx, index_names): # names are assigned on index, but not transferred to the levels index.names = ["a", "b"] level_names = [level.name for level in index.levels] - assert level_names == [None, None] + assert level_names == ["a", "b"] def test_duplicate_level_names_access_raises(idx): @@ -117,3 +117,10 @@ def test_duplicate_level_names_access_raises(idx): idx.names = ["foo", "foo"] with pytest.raises(ValueError, match="name foo occurs multiple times"): idx._get_level_number("foo") + + +def test_get_names_from_levels(): + idx = pd.MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + + assert idx.levels[0].name == "a" + assert idx.levels[1].name == "b" diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 970288e5747c7..513efa8941de8 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -10,13 +10,13 @@ def test_reindex(idx): result, indexer = idx.reindex(list(idx[:4])) assert isinstance(result, MultiIndex) assert result.names == ["first", "second"] - assert [level.name for level in result.levels] == [None, None] + assert [level.name for level in result.levels] == ["first", "second"] result, indexer = idx.reindex(list(idx)) assert isinstance(result, MultiIndex) assert indexer is None assert result.names == ["first", "second"] - assert [level.name for level in result.levels] == [None, None] + assert [level.name for level in result.levels] == ["first", "second"] def test_reindex_level(idx): diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index e79f212f30078..37df420e9ea2e 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -15,11 +15,11 @@ def test_insert(idx): # key not contained in all levels new_index = idx.insert(0, ("abc", "three")) - exp0 = Index(list(idx.levels[0]) + ["abc"]) + exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") tm.assert_index_equal(new_index.levels[0], exp0) assert new_index.names == ["first", "second"] - exp1 = Index(list(idx.levels[1]) + ["three"]) + exp1 = Index(list(idx.levels[1]) + ["three"], name="second") 
tm.assert_index_equal(new_index.levels[1], exp1) assert new_index[0] == ("abc", "three") diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 33cbaaed1848d..eda7bc0ec4df7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1219,7 +1219,7 @@ def test_concat_keys_specific_levels(self): names=["group_key"], ) - tm.assert_index_equal(result.columns.levels[0], Index(level)) + tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) assert result.columns.names == ["group_key", None] @@ -1412,7 +1412,9 @@ def test_concat_keys_and_levels(self): names=["first", "second"], ) assert result.index.names == ("first", "second", None) - tm.assert_index_equal(result.index.levels[0], Index(["baz", "foo"])) + tm.assert_index_equal( + result.index.levels[0], Index(["baz", "foo"], name="first") + ) def test_concat_keys_levels_no_overlap(self): # GH #1406 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 76436f4480809..79c9fe2b60bd9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -975,11 +975,11 @@ def test_count(self): series.index.names = ["a", "b"] result = series.count(level="b") - expect = self.series.count(level=1) + expect = self.series.count(level=1).rename_axis("b") tm.assert_series_equal(result, expect) result = series.count(level="a") - expect = self.series.count(level=0) + expect = self.series.count(level=0).rename_axis("a") tm.assert_series_equal(result, expect) msg = "Level x not found" @@ -1641,16 +1641,14 @@ def test_constructor_with_tz(self): result = MultiIndex.from_arrays([index, columns]) assert result.names == ["dt1", "dt2"] - # levels don't have names set, so set name of index/columns to None in checks - tm.assert_index_equal(result.levels[0], index.rename(name=None)) - tm.assert_index_equal(result.levels[1], columns.rename(name=None)) + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) result = MultiIndex.from_arrays([Series(index), Series(columns)]) assert result.names == ["dt1", "dt2"] - # levels don't have names set, so set name of index/columns to None in checks - tm.assert_index_equal(result.levels[0], index.rename(name=None)) - tm.assert_index_equal(result.levels[1], columns.rename(name=None)) + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) def test_set_index_datetime(self): # GH 3950 @@ -1672,17 +1670,18 @@ def test_set_index_datetime(self): df.index = df.index.tz_convert("US/Pacific") expected = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"] + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", ) expected = expected.tz_localize("UTC").tz_convert("US/Pacific") df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) - tm.assert_index_equal(df.index.levels[1], Index(["a", "b"])) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) - tm.assert_index_equal(df.index.levels[0], Index(["a", "b"])) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) assert df.index.names == ["label", "datetime"] From 709436d49029ecbb708e8ce0e4e63faabbe7a885 Mon Sep 17 00:00:00 2001 
From: Rajhans Jadhao
Date: Fri, 18 Oct 2019 20:45:30 +0530
Subject: [PATCH 089/119] fixed issue of mypy for test_ujson (#29022)

---
 pandas/tests/io/json/test_ujson.py | 5 +----
 setup.cfg                          | 3 ---
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index d6572ac7b7bfe..20e2690084e2a 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -1,10 +1,7 @@
-try:
-    import json
-except ImportError:
-    import simplejson as json
 import calendar
 import datetime
 import decimal
+import json
 import locale
 import math
 import re
diff --git a/setup.cfg b/setup.cfg
index 3562ece5acad3..257f67c69ba15 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -196,9 +196,6 @@ ignore_errors=True
 [mypy-pandas.tests.indexing.test_loc]
 ignore_errors=True
 
-[mypy-pandas.tests.io.json.test_ujson]
-ignore_errors=True
-
 [mypy-pandas.tests.series.test_constructors]
 ignore_errors=True

From f556a71d3aae1a1b946d3fe7bc896fc219c14aaa Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 18 Oct 2019 10:27:29 -0700
Subject: [PATCH 090/119] CLN: derivation of nogil param (#29047)

---
 pandas/_libs/algos_take_helper.pxi.in | 42 ++++++++++++++-------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
index f10061a417c03..e7ee212065c5b 100644
--- a/pandas/_libs/algos_take_helper.pxi.in
+++ b/pandas/_libs/algos_take_helper.pxi.in
@@ -12,26 +12,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 # name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil
 dtypes = [
-    ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True),
+    ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True),
     ('bool', 'object', 'uint8_t', 'object',
-     'True if ', ' > 0 else False', False, False),
+     'True if ', ' > 0 else False', False),
-    ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False),
-    ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True),
-    ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True),
-    ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True),
-    ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True),
-    ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True),
-    ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True),
-    ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True),
-    ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True),
-    ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True),
-    ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True),
-    ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True),
-    ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True),
-    ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True),
-    ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True),
-    ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True),
-    ('object', 'object', 'object', 'object', '', '', False, False)]
+    ('int8', 'int8', 'int8_t', 'int8_t', '', '', True),
+    ('int8', 'int32', 'int8_t', 'int32_t', '', '', False),
+    ('int8', 'int64', 'int8_t', 'int64_t', '', '', False),
+    ('int8', 'float64', 'int8_t', 'float64_t', '', '', False),
+    ('int16', 'int16', 'int16_t', 'int16_t', '', '', True),
+    ('int16', 'int32', 'int16_t', 'int32_t', '', '', False),
+    ('int16', 'int64', 'int16_t', 'int64_t', '', '', False),
+    ('int16', 'float64', 'int16_t', 'float64_t', '', '', False),
+    ('int32', 'int32', 'int32_t', 'int32_t', '', '', True),
+    ('int32', 'int64', 'int32_t', 'int64_t', '', '', False),
+    ('int32', 'float64', 'int32_t', 'float64_t', '', '', False),
+    ('int64', 'int64', 'int64_t', 'int64_t', '', '', True),
+    ('int64', 'float64', 'int64_t', 'float64_t', '', '', False),
+    ('float32', 'float32', 'float32_t', 'float32_t', '', '', True),
+    ('float32', 'float64', 'float32_t', 'float64_t', '', '', False),
+    ('float64', 'float64', 'float64_t', 'float64_t', '', '', True),
+    ('object', 'object', 'object', 'object', '', '', False)]
 
 
 def get_dispatch(dtypes):
@@ -118,7 +118,9 @@
     """
 
    for (name, dest, c_type_in, c_type_out, preval, postval,
-         can_copy, nogil) in dtypes:
+         can_copy) in dtypes:
+
+        nogil = c_type_out != "object"
 
         if nogil:
             nogil_str = "with nogil:"
             tab = ' '

From 827440afe8db3367f768335bb3d9c21105bb1ae1 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 18 Oct 2019 10:28:03 -0700
Subject: [PATCH 091/119] REF: remove groupby_helper (#29040)

---
 pandas/_libs/groupby.pyx           | 689 +++++++++++++++++++++++++++-
 pandas/_libs/groupby_helper.pxi.in | 693 -----------------------------
 setup.py                           |   3 +-
 3 files changed, 688 insertions(+), 697 deletions(-)
 delete mode 100644 pandas/_libs/groupby_helper.pxi.in

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 68c21139e7384..8a417d8fe3a92 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -11,6 +11,8 @@ from numpy cimport (ndarray,
                     uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t)
 cnp.import_array()
 
+cdef extern from "numpy/npy_math.h":
+    float64_t NAN "NPY_NAN"
 
 from pandas._libs.util cimport numeric, get_nat
@@ -21,6 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64,
                                 groupsort_indexer, tiebreakers)
 
 cdef int64_t NPY_NAT = get_nat()
+_int64_max = np.iinfo(np.int64).max
 
 cdef float64_t NaN = np.NaN
@@ -804,5 +807,687 @@ def group_quantile(ndarray[float64_t] out,
 
             grp_start += grp_sz
 
-# generated from template
-include "groupby_helper.pxi"
+# ----------------------------------------------------------------------
+# group_nth, group_last, group_rank
+# ----------------------------------------------------------------------
+
+ctypedef fused rank_t:
+    float64_t
+    float32_t
+    int64_t
+    uint64_t
+    object
+
+
+cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
+    if rank_t is object:
+        # Should never be used, but we need to avoid the `val != val` below
+        # or else cython will raise about gil acquisition.
+ raise NotImplementedError + + elif rank_t is int64_t: + return is_datetimelike and val == NPY_NAT + else: + return val != val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. 
+ nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +group_nth_float64 = group_nth["float64_t"] +group_nth_float32 = group_nth["float32_t"] +group_nth_int64 = group_nth["int64_t"] +group_nth_object = group_nth["object"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank(float64_t[:, :] out, + rank_t[:, :] values, + const int64_t[:] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): + """ + Provides the rank of values within each group. + + Parameters + ---------- + out : array of float64_t values which this method will write its results to + values : array of rank_t values to be ranked + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + is_datetimelike : bool, default False + unused in this method but provided for call compatibility with other + Cython transformations + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default + 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + ascending : boolean, default True + False for ranks by high (1) to low (N) + na_option : {'keep', 'top', 'bottom'}, default 'keep' + pct : boolean, default False + Compute percentage rank of data within each group + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + TiebreakEnumType tiebreak + Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 + ndarray[int64_t] _as + ndarray[float64_t, ndim=2] grp_sizes + ndarray[rank_t] masked_vals + ndarray[uint8_t] mask + bint keep_na + rank_t nan_fill_val + + if rank_t is object: + raise NotImplementedError("Cant do nogil") + + tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' + N, K = (values).shape + grp_sizes = np.ones_like(out) + + # Copy values into new array in order to fill missing data + # with mask, without obfuscating location of missing data + # in values array + masked_vals = np.array(values[:, 0], copy=True) + if rank_t is int64_t: + mask = (masked_vals == NPY_NAT).astype(np.uint8) + else: + mask = np.isnan(masked_vals).astype(np.uint8) + + if ascending ^ (na_option == 'top'): + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = 
np.iinfo(np.uint64).max + else: + nan_fill_val = np.inf + order = (masked_vals, mask, labels) + else: + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).min + elif rank_t is uint64_t: + nan_fill_val = 0 + else: + nan_fill_val = -np.inf + + order = (masked_vals, ~mask, labels) + np.putmask(masked_vals, mask, nan_fill_val) + + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + _as = np.lexsort(order).astype(np.int64, copy=False) + + if not ascending: + _as = _as[::-1] + + with nogil: + # Loop over the length of the value array + # each incremental i value can be looked up in the _as array + # that we sorted previously, which gives us the location of + # that sorted value for retrieval back from the original + # values / masked_vals arrays + for i in range(N): + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the the starting index of the current group (grp_start) + # and the current index + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]]) or + (labels[_as[i]] != labels[_as[i+1]])): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j], 0] = j + 1 - grp_start + else: + out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]])): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). 
also be + # sure to reset any of the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (i - grp_start + 1 - + grp_na_count) + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 + + if pct: + for i in range(N): + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. + if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + elif grp_sizes[i, 0] != 0: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] +# Note: we do not have a group_rank_object because that would require a +# not-nogil implementation, see GH#19560 + + +# ---------------------------------------------------------------------- +# group_min, group_max +# ---------------------------------------------------------------------- + +# TODO: consider implementing for more dtypes +ctypedef fused groupby_t: + float64_t + float32_t + int64_t + uint64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + if groupby_t is int64_t: + # Note: evaluated at compile-time + maxx[:] = -_int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 + else: + maxx[:] = -np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = maxx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. 
+ raise RuntimeError("empty group with uint64_t") + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + if groupby_t is int64_t: + minx[:] = _int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max + else: + minx[:] = np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = minx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative minimum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummin in. + values : array + Values to take cummin of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max + else: + accum[:] = np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative maximum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummax in. + values : array + Values to take cummax of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. 
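A short doctest sketch, on hypothetical data, of the grouped cumulative
minimum these kernels back at the Python level (cummin shown; cummax is
symmetric):

>>> import pandas as pd
>>> df = pd.DataFrame({"key": ["a", "a", "a", "b"], "val": [2, 1, 3, 0]})
>>> df.groupby("key")["val"].cummin()
0    2
1    1
2    1
3    0
Name: val, dtype: int64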
+ + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 + else: + accum[:] = -np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in deleted file mode 100644 index c837c6c5c6519..0000000000000 --- a/pandas/_libs/groupby_helper.pxi.in +++ /dev/null @@ -1,693 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - float64_t NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -# ---------------------------------------------------------------------- -# group_nth, group_last, group_rank -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - float64_t - float32_t - int64_t - uint64_t - object - - -cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: - if rank_t is object: - # Should never be used, but we need to avoid the `val != val` below - # or else cython will raise about gil acquisition. - raise NotImplementedError - - elif rank_t is int64_t: - return is_datetimelike and val == NPY_NAT - else: - return val != val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? 
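# The behaviour removed here is preserved by the fused-type replacement in
# groupby.pyx: ``last`` keeps the most recent non-NA value per group. A
# doctest sketch on hypothetical data:
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 2.0]})
# >>> df.groupby("key")["val"].last()
# key
# a    1.0
# b    2.0
# Name: val, dtype: float64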
- nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - -group_last_float64 = group_last["float64_t"] -group_last_float32 = group_last["float32_t"] -group_last_int64 = group_last["int64_t"] -group_last_object = group_last["object"] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -group_nth_float64 = group_nth["float64_t"] -group_nth_float32 = group_nth["float32_t"] -group_nth_int64 = group_nth["int64_t"] -group_nth_object = group_nth["object"] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_rank(float64_t[:, :] out, - rank_t[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): - """ - Provides the rank of values within each group. 
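A usage sketch of the semantics this kernel implements, exercising the
``na_option`` handling described under Parameters (hypothetical data):

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series([1.0, np.nan, 2.0])
>>> s.groupby([0, 0, 0]).rank(na_option="keep")
0    1.0
1    NaN
2    2.0
dtype: float64
>>> s.groupby([0, 0, 0]).rank(na_option="bottom")
0    1.0
1    3.0
2    2.0
dtype: float64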
- - Parameters - ---------- - out : array of float64_t values which this method will write its results to - values : array of rank_t values to be ranked - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - is_datetimelike : bool, default False - unused in this method but provided for call compatibility with other - Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default - 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups - ascending : boolean, default True - False for ranks by high (1) to low (N) - na_option : {'keep', 'top', 'bottom'}, default 'keep' - pct : boolean, default False - Compute percentage rank of data within each group - na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na - rank_t nan_fill_val - - if rank_t is object: - raise NotImplementedError("Cant do nogil") - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = (values).shape - grp_sizes = np.ones_like(out) - - # Copy values into new array in order to fill missing data - # with mask, without obfuscating location of missing data - # in values array - masked_vals = np.array(values[:, 0], copy=True) - if rank_t is int64_t: - mask = (masked_vals == NPY_NAT).astype(np.uint8) - else: - mask = np.isnan(masked_vals).astype(np.uint8) - - if ascending ^ (na_option == 'top'): - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).min - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) - - if not ascending: - _as = _as[::-1] - - with nogil: - # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of - # that sorted value for retrieval back from the original - # values / masked_vals arrays - for i in range(N): - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. 
- # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the the starting index of the current group (grp_start) - # and the current index - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]]) or - (labels[_as[i]] != labels[_as[i+1]])): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (i - grp_start + 1 - - grp_na_count) - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - - if pct: - for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. 
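# Concretely (hypothetical data): the NaN entry keeps a NaN percentage,
# and the denominator for the other entries excludes it, so ranks 1..3
# are divided by a group size of 3, not 4:
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> s = pd.Series([3.0, 1.0, np.nan, 2.0])
# >>> s.groupby([0, 0, 0, 0]).rank(pct=True)
# 0    1.000000
# 1    0.333333
# 2         NaN
# 3    0.666667
# dtype: float64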
- if out[i, 0] != out[i, 0] or out[i, 0] == NAN: - out[i, 0] = NAN - elif grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - - -group_rank_float64 = group_rank["float64_t"] -group_rank_float32 = group_rank["float32_t"] -group_rank_int64 = group_rank["int64_t"] -group_rank_uint64 = group_rank["uint64_t"] -# Note: we do not have a group_rank_object because that would require a -# not-nogil implementation, see GH#19560 - - -# ---------------------------------------------------------------------- -# group_min, group_max -# ---------------------------------------------------------------------- - -# TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - if groupby_t is int64_t: - # Note: evaluated at compile-time - maxx[:] = -_int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - maxx[:] = 0 - else: - maxx[:] = -np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = maxx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - if groupby_t is int64_t: - minx[:] = _int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - minx[:] = np.iinfo(np.uint64).max - else: - minx[:] = np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? 
- nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = minx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative minimum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummin in. - values : array - Values to take cummin of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. - """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = _int64_max - elif groupby_t is uint64_t: - accum[:] = np.iinfo(np.uint64).max - else: - accum[:] = np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative maximum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummax in. - values : array - Values to take cummax of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. 
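An illustrative doctest, on hypothetical data, for the grouped cumulative
maximum this kernel provides (the behaviour is unchanged by the move out
of the template):

>>> import pandas as pd
>>> df = pd.DataFrame({"key": ["a", "a", "a", "b"], "val": [2, 1, 3, 5]})
>>> df.groupby("key")["val"].cummax()
0    2
1    2
2    3
3    5
Name: val, dtype: int64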
- """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = -_int64_max - elif groupby_t is uint64_t: - accum[:] = 0 - else: - accum[:] = -np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval diff --git a/setup.py b/setup.py index c35a0e75ecb80..2892cd0b2e294 100755 --- a/setup.py +++ b/setup.py @@ -88,7 +88,6 @@ def is_platform_mac(): "_libs/algos_take_helper.pxi.in", "_libs/algos_rank_helper.pxi.in", ], - "groupby": ["_libs/groupby_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -564,7 +563,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ext_data = { "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, - "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby"}, "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", From 1e4fe0a12eb61986ce397f2cdfb12e647025424b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon-Martin=20Schr=C3=B6der?= Date: Fri, 18 Oct 2019 19:28:57 +0200 Subject: [PATCH 092/119] ENH: Informative dtype message for for assert_series_equal (#28993) --- pandas/tests/util/test_assert_frame_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 2 +- pandas/util/testing.py | 7 +++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 9571e8027ccf7..86e5d506e0779 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -141,7 +141,7 @@ def test_empty_dtypes(check_dtype): df1["col1"] = df1["col1"].astype("int64") if check_dtype: - msg = "Attributes are different" + msg = r"Attributes of DataFrame\..* are different" with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, **kwargs) else: diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index a12d9386eb159..bad3f2e67f8bb 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -179,7 +179,7 @@ def test_series_equal_values_mismatch(check_less_precise): def test_series_equal_categorical_mismatch(check_categorical): - msg = """Attributes are different + msg = """Attributes of Series are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4cf2776f5aa7c..73535e55d4fa5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1156,7 +1156,9 @@ def assert_series_equal( ): pass else: - assert_attr_equal("dtype", left, right) + assert_attr_equal( + "dtype", left, right, obj="Attributes of {obj}".format(obj=obj) + ) if check_exact: assert_numpy_array_equal( @@ -1315,8 +1317,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2) Traceback (most recent call last): - AssertionError: Attributes are different ... 
+ AssertionError: Attributes of DataFrame.iloc[:, 1] are different + Attribute "dtype" are different [left]: int64 [right]: float64 From 2683954f95573531872d59254d866192edee0d8f Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Fri, 18 Oct 2019 17:32:51 +0000 Subject: [PATCH 093/119] TST: add regression test for all-none-groupby (#29067) Closes #21624 --- pandas/tests/groupby/test_groupby.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6212a37472000..dff5baa9b5984 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1944,3 +1944,13 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): result = getattr(grouped, op)() expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) assert_frame_equal(result, expected) + + +def test_groupby_only_none_group(): + # see GH21624 + # this was crashing with "ValueError: Length of passed values is 1, index implies 0" + df = pd.DataFrame({"g": [None], "x": 1}) + actual = df.groupby("g")["x"].transform("sum") + expected = pd.Series([np.nan], name="x") + + assert_series_equal(actual, expected) From 45dc6d3d1998325688307ebb47aae8f0e26f2b50 Mon Sep 17 00:00:00 2001 From: Luke Date: Fri, 18 Oct 2019 11:56:06 -0600 Subject: [PATCH 094/119] Fix mypy errors for pandas\tests\*: test_convert_to.py (#28965) --- pandas/tests/frame/test_convert_to.py | 4 ++-- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 3f0768ad5bdac..c9a7507969f5b 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -575,9 +575,9 @@ def test_frame_to_dict_tz(self): ), ), ( - defaultdict(list), + defaultdict(dict), defaultdict( - list, + dict, { 0: {"int_col": 1, "float_col": 1.0}, 1: {"int_col": 2, "float_col": 2.0}, diff --git a/setup.cfg b/setup.cfg index 257f67c69ba15..ca1ca4a7b5733 100644 --- a/setup.cfg +++ b/setup.cfg @@ -163,9 +163,6 @@ ignore_errors=True [mypy-pandas.tests.frame.test_constructors] ignore_errors=True -[mypy-pandas.tests.frame.test_convert_to] -ignore_errors=True - [mypy-pandas.tests.indexes.datetimes.test_datetimelike] ignore_errors=True From 58d34d91bc92ce5eb4023bfcdbff916cef5ef5f0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 10:57:54 -0700 Subject: [PATCH 095/119] CLN: catch less in groupby (#29077) --- pandas/core/groupby/generic.py | 20 +++++++++++++++++--- pandas/core/resample.py | 19 ++++++++++++++++++- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8e53972c95275..8191c3519a36a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -889,9 +889,23 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - except AssertionError: - raise - except Exception: + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) + except NotImplementedError as err: + if "axis other than 0 is not supported" in str(err): + # raised directly by _aggregate_multiple_funcs + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for DecimalArray tests + pass + else: + raise + # FIXME: this is raised in a bunch of + # 
test_whitelist.test_regression_whitelist_methods tests, + # can be avoided result = self._aggregate_frame(func) else: result.columns = Index( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5185d95cfac4c..d4ae3767f6157 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas.core.algorithms as algos +from pandas.core.base import DataError from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy @@ -362,7 +363,23 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped.aggregate(how, *args, **kwargs) except AssertionError: raise - except Exception: + except DataError: + # we have a non-reducing function; try to evaluate + result = grouped.apply(how, *args, **kwargs) + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named + pass + elif "len(index) != len(labels)" in str(err): + # raised in libgroupby validation + pass + elif "No objects to concatenate" in str(err): + # raised in concat call + # In tests this is reached via either + # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique + pass + else: + raise # we have a non-reducing function # try to evaluate From e54b995c03dcde65b49639f9924fd6d1e54f1025 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 12:34:44 -0700 Subject: [PATCH 096/119] CLN: simplify take_2d_multi (#29065) --- pandas/core/algorithms.py | 79 ++++++++++++++++----------------------- pandas/core/generic.py | 2 +- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e5ab0d182aff..717c2eb26be8b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1304,7 +1304,7 @@ def get_indexer(current_indexer, other_indexer): return frame.sort_values(columns, ascending=ascending, kind="mergesort") -# ------- ## ---- # +# ---- # # take # # ---- # @@ -1712,59 +1712,44 @@ def take_nd( take_1d = take_nd -def take_2d_multi( - arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True -): +def take_2d_multi(arr, indexer, fill_value=np.nan): """ Specialized Cython take which sets NaN values in one pass """ - if indexer is None or (indexer[0] is None and indexer[1] is None): - row_idx = np.arange(arr.shape[0], dtype=np.int64) - col_idx = np.arange(arr.shape[1], dtype=np.int64) - indexer = row_idx, col_idx - dtype, fill_value = arr.dtype, arr.dtype.type() - else: - row_idx, col_idx = indexer - if row_idx is None: - row_idx = np.arange(arr.shape[0], dtype=np.int64) - else: - row_idx = ensure_int64(row_idx) - if col_idx is None: - col_idx = np.arange(arr.shape[1], dtype=np.int64) - else: - col_idx = ensure_int64(col_idx) - indexer = row_idx, col_idx - if not allow_fill: + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. 
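# A hedged sketch of the caller-visible behaviour: reindexing a frame on
# both axes fills missing labels and promotes the dtype so the fill value
# fits (whether the _reindex_multi fast path is actually taken depends on
# the frame's internal block layout):
#
# >>> import pandas as pd
# >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["x", "y"])
# >>> df.reindex(index=["a", "c"], columns=["x", "z"])
#      x   z
# a  1.0 NaN
# c  NaN NaN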
+ assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_int64(row_idx) + col_idx = ensure_int64(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - if mask_info is not None: - (row_mask, col_mask), (row_needs, col_needs) = mask_info - else: - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - mask_info = (row_mask, col_mask), (row_needs, col_needs) - if row_needs or col_needs: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value - if out is None: - out_shape = len(row_idx), len(col_idx) - out = np.empty(out_shape, dtype=dtype) + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) if func is None and arr.dtype != out.dtype: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e97772a418982..e3e59639de56b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4606,7 +4606,7 @@ def _needs_reindex_multi(self, axes, method, level): ) def _reindex_multi(self, axes, copy, fill_value): - return NotImplemented + raise AbstractMethodError(self) def _reindex_with_indexers( self, reindexers, fill_value=None, copy=False, allow_dups=False From 2701f524661a82cbcb205e377ebe91d02fc66cb4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 14:29:56 -0700 Subject: [PATCH 097/119] REF: use fused types in algos_rank_helper (#29044) --- pandas/_libs/algos_rank_helper.pxi.in | 418 ++++++++++++++------------ 1 file changed, 231 insertions(+), 187 deletions(-) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 1ba1667b687be..d5a31b6a13010 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -8,24 +8,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # rank_1d, rank_2d # ---------------------------------------------------------------------- -{{py: - -# dtype ctype pos_nan_value neg_nan_value -dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), - ('float64', 'float64_t', 'np.inf', 
'-np.inf'), - ('uint64', 'uint64_t', '', ''), - ('int64', 'int64_t', 'np.iinfo(np.int64).max', - 'np.iinfo(np.int64).min')] - -}} - -{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t @cython.wraparound(False) @cython.boundscheck(False) -def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -33,85 +26,86 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - {{if dtype == 'object'}} - ndarray sorted_data, values - {{else}} - ndarray[{{ctype}}] sorted_data, values - {{endif}} + ndarray[rank_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted ndarray[uint8_t, cast=True] sorted_mask - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 - bint isnan + bint isnan, condition float64_t count = 0.0 + tiebreak = tiebreakers[ties_method] - {{if dtype == 'float64'}} - values = np.asarray(in_arr).copy() - {{elif dtype == 'object'}} - values = np.array(in_arr, copy=True) + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) - if values.dtype != np.object_: - values = values.astype('O') - {{else}} - values = np.asarray(in_arr) - {{endif}} + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) keep_na = na_option == 'keep' - {{if dtype == 'object'}} - mask = missing.isnaobj(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - {{endif}} + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() # double sort first by mask and then by values to ensure nan values are # either at the beginning or the end. 
mask/(~mask) controls padding at # tail or the head - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - order = (values, mask) + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) else: - nan_value = {{neg_nan_value}} - order = (values, ~mask) - np.putmask(values, mask, nan_value) - {{else}} - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - {{endif}} + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) n = len(values) ranks = np.empty(n, dtype='f8') - {{if dtype == 'object'}} - _as = np.lexsort(keys=order) - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here + if rank_t is object: _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING else: - _as = np.lexsort(keys=order) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) if not ascending: _as = _as[::-1] @@ -122,38 +116,32 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', non_na_idx = _indices[0] if len(_indices) > 0 else -1 argsorted = _as.astype('i8') - {{if dtype == 'object'}} - if True: - {{else}} - with nogil: - {{endif}} - # TODO: why does the 2d version not have a nogil block? 
+ if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil for i in range(n): sum_ranks += i + 1 dups += 1 - {{if dtype == 'object'}} - val = util.get_value_at(sorted_data, i) - {{else}} val = sorted_data[i] - {{endif}} - {{if dtype != 'uint64'}} - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - {{endif}} + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue count += 1.0 - {{if dtype == 'object'}} - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val) or - i == non_na_idx): - {{else}} - if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx): - {{endif}} + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -165,13 +153,12 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported for ' - 'non-numeric data') - {{else}} - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 @@ -180,6 +167,60 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? 
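# The tiebreak branches below are what Series.rank exposes via ``method``;
# a minimal sketch on hypothetical numeric data:
#
# >>> import pandas as pd
# >>> s = pd.Series([1, 2, 2, 3])
# >>> s.rank(method="average")
# 0    1.0
# 1    2.5
# 2    2.5
# 3    4.0
# dtype: float64
# >>> s.rank(method="min")
# 0    1.0
# 1    2.0
# 2    2.0
# 3    4.0
# dtype: float64
#
# and method="first" on object dtype raises, matching the branch above:
#
# >>> pd.Series(["a", "b", "b"]).rank(method="first")
# Traceback (most recent call last):
# ValueError: first not supported for non-numeric data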
+ for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: if tiebreak == TIEBREAK_DENSE: return ranks / total_tie_count @@ -189,8 +230,14 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', return ranks -def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -198,29 +245,20 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - {{if dtype == 'object'}} Py_ssize_t infs - {{endif}} ndarray[float64_t, ndim=2] ranks - {{if dtype == 'int64' or dtype == 'uint64'}} - ndarray[{{ctype}}, ndim=2, cast=True] values - {{else}} - ndarray[{{ctype}}, ndim=2] values - {{endif}} + ndarray[rank_t, ndim=2] values ndarray[int64_t, ndim=2] argsorted - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 float64_t count = 0.0 + bint condition, skip_condition tiebreak = tiebreakers[ties_method] @@ -231,103 +269,106 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', else: values = np.asarray(in_arr).copy() - {{if dtype == 'object'}} - if values.dtype != np.object_: - values = values.astype('O') - {{endif}} - - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - else: - nan_value = {{neg_nan_value}} + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') - {{if dtype == 'object'}} - mask = missing.isnaobj2d(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT - {{endif}} + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max - np.putmask(values, mask, nan_value) - {{endif}} + else: + 
if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) n, k = (values).shape ranks = np.empty((n, k), dtype='f8') - {{if dtype == 'object'}} - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks else: - _as = values.argsort(1) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) if not ascending: _as = _as[:, ::-1] - values = _take_2d_{{dtype}}(values, _as) + values = _take_2d(values, _as) argsorted = _as.astype('i8') for i in range(n): - {{if dtype == 'object'}} - dups = sum_ranks = infs = 0 - {{else}} - dups = sum_ranks = 0 - {{endif}} + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 total_tie_count = 0 count = 0.0 for j in range(k): - {{if dtype != 'object'}} - sum_ranks += j + 1 - dups += 1 - {{endif}} + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 val = values[i, j] - {{if dtype != 'uint64'}} - {{if dtype == 'object'}} - if (val is nan_value) and keep_na: - {{else}} - if (val == nan_value) and keep_na: - {{endif}} - ranks[i, argsorted[i, j]] = NaN + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN - {{if dtype == 'object'}} - infs += 1 - {{endif}} + if rank_t is object: + infs += 1 - continue - {{endif}} + continue count += 1.0 - {{if dtype == 'object'}} - sum_ranks += (j - infs) + 1 - dups += 1 - {{endif}} + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 - {{if dtype == 'object'}} - if j == k - 1 or are_diff(values[i, j + 1], val): - {{else}} - if j == k - 1 or values[i, j + 1] != val: - {{endif}} + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups @@ -338,13 +379,12 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported ' - 'for non-numeric data') - {{else}} - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + 
else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 @@ -363,4 +403,8 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', else: return ranks -{{endfor}} + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] From 09a9f5f01ad68f9f0fc457ebe200928d236cde2b Mon Sep 17 00:00:00 2001 From: Javad Date: Sat, 19 Oct 2019 03:27:38 +0330 Subject: [PATCH 098/119] DOC: updated categorical docstring (#29068) --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d34cf3e576beb..795986127cde7 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -295,7 +295,7 @@ class Categorical(ExtensionArray, PandasObject): See Also -------- - api.types.CategoricalDtype : Type for categorical data. + CategoricalDtype : Type for categorical data. CategoricalIndex : An Index with an underlying ``Categorical``. Notes From b372ac421e83c85b69869a15b101a3c2aa987813 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 19:22:59 -0700 Subject: [PATCH 099/119] CLN: tighten exception catching in indexes (#29078) --- pandas/core/indexes/base.py | 17 ++++------------- pandas/core/indexes/period.py | 10 ++++++++-- pandas/core/indexes/timedeltas.py | 3 ++- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 526b2c2e2c412..1a08609ccd99a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3142,16 +3142,7 @@ def is_int(v): elif is_positional: indexer = key else: - try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + indexer = self.slice_indexer(start, stop, step, kind=kind) return indexer @@ -4676,11 +4667,11 @@ def get_value(self, series, key): raise InvalidIndexError(key) else: raise e1 - except Exception: # pragma: no cover + except Exception: raise e1 except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover + # e.g. 
"[False] is an invalid key" + if is_scalar(key): raise IndexError(key) raise InvalidIndexError(key) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0fc74f4e78c9f..f085dff84462d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -457,7 +457,11 @@ def __contains__(self, key): try: self.get_loc(key) return True - except Exception: + except (ValueError, TypeError, KeyError): + # TypeError can be reached if we pass a tuple that is not hashable + # ValueError can be reached if pass a 2-tuple and parse_time_string + # raises with the wrong number of return values + # TODO: the latter is a bug in parse_time_string return False @cache_readonly @@ -765,7 +769,9 @@ def _maybe_cast_slice_bound(self, label, side, kind): _, parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 if side == "left" else 1] - except Exception: + except ValueError: + # string cannot be parsed as datetime-like + # TODO: we need tests for this case raise KeyError(label) elif is_integer(label) or is_float(label): self._invalid_indexer("slice", label) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 755992c881fe5..62a74fefa6577 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -630,7 +630,8 @@ def insert(self, loc, item): if _is_convertible_to_td(item): try: item = Timedelta(item) - except Exception: + except ValueError: + # e.g. str that can't be parsed to timedelta pass elif is_scalar(item) and isna(item): # GH 18295 From 09fc1b4b86799a790449f0cd03f44e8b42f51973 Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sat, 19 Oct 2019 23:39:35 +0800 Subject: [PATCH 100/119] explain set_levels().levels examples + formatting --- pandas/core/indexes/multi.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6b7019f4685c7..aabbd7efe1c35 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -719,16 +719,19 @@ def _set_levels( def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ - Set levels on MultiIndex by passing a new value for each - index in the level. Defaults to returning new - index. + Set levels on MultiIndex. Defaults to returning new index. + ``set_levels`` passes a new value for each index in the level, + where the new values to be passed in each level are defined in a list. It is assumed that a new value is provided for each code describing - values in the level. If the number of values passed is more than - the number of index values in the level, ``set_levels`` will still - pass the values to the level. The passed values are stored in the - MultiIndex FrozenList even though the index values may be truncated - in the MultiIndex output from set_levels. + values in the level. + + If the number of values passed is more than the number of index + values in the level, ``set_levels`` will still pass the values + to the level. The passed values are stored in the FrozenList + representing the levels attribute of the MultiIndex, even + though the index values may be truncated in the MultiIndex + output from set_levels. 
Parameters ---------- @@ -751,6 +754,14 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (2, 'one'), (2, 'two'), (3, 'one'), (3, 'two')], names=['foo', 'bar']) + >>> idx + MultiIndex([('1', one), + ('1', two), + ('2', one), + ('2', two), + ('3', one), + ('3', two)], + names=['foo', 'bar']) >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), @@ -775,6 +786,13 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (3, 'a'), (3, 'b')], names=['foo', 'bar']) + + ``set_levels()`` passes values into the levels attribute that is + represented by a FrozenList containing list of values for each + level in the MultiIndex, even when the number of values passed + for a level is more than the number of indexes available in the + MultiIndex itself. + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), From 5bdd7db3737523562037cf5790c0c61f150da487 Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sat, 19 Oct 2019 23:39:35 +0800 Subject: [PATCH 101/119] explain set_levels().levels examples + formatting resolve pep8 issues explain set_levels().levels examples + formatting --- pandas/core/indexes/multi.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6b7019f4685c7..54955a41f8e8e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -719,16 +719,19 @@ def _set_levels( def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ - Set levels on MultiIndex by passing a new value for each - index in the level. Defaults to returning new - index. + Set levels on MultiIndex. Defaults to returning new index. + ``set_levels`` passes a new value for each index in the level, + where the new values to be passed in each level are defined in a list. It is assumed that a new value is provided for each code describing - values in the level. If the number of values passed is more than - the number of index values in the level, ``set_levels`` will still - pass the values to the level. The passed values are stored in the - MultiIndex FrozenList even though the index values may be truncated - in the MultiIndex output from set_levels. + values in the level. + + If the number of values passed is more than the number of index + values in the level, ``set_levels`` will still pass the values + to the level. The passed values are stored in the FrozenList + representing the levels attribute of the MultiIndex, even + though the index values may be truncated in the MultiIndex + output from set_levels. Parameters ---------- @@ -751,6 +754,14 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (2, 'one'), (2, 'two'), (3, 'one'), (3, 'two')], names=['foo', 'bar']) + >>> idx + MultiIndex([('1', one), + ('1', two), + ('2', one), + ('2', two), + ('3', one), + ('3', two)], + names=['foo', 'bar']) >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), @@ -775,6 +786,13 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (3, 'a'), (3, 'b')], names=['foo', 'bar']) + + ``set_levels()`` passes values into the levels attribute that is + represented by a FrozenList containing list of values for each + level in the MultiIndex, even when the number of values passed + for a level is more than the number of indexes available in the + MultiIndex itself. 
+ >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), From 9c4a371e616ab1c3a795d6d1dfd1d0baf762ed27 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Sat, 19 Oct 2019 17:50:56 +0200 Subject: [PATCH 102/119] [#22550] Remove TestData from series-tests test_quantile.py (#29096) --- pandas/tests/series/test_quantile.py | 51 +++++++++++++++------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index b001312fa37f3..1a4a3f523cbbe 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -8,24 +8,22 @@ from pandas.core.indexes.datetimes import Timestamp import pandas.util.testing as tm -from .common import TestData +class TestSeriesQuantile: + def test_quantile(self, datetime_series): -class TestSeriesQuantile(TestData): - def test_quantile(self): + q = datetime_series.quantile(0.1) + assert q == np.percentile(datetime_series.dropna(), 10) - q = self.ts.quantile(0.1) - assert q == np.percentile(self.ts.dropna(), 10) - - q = self.ts.quantile(0.9) - assert q == np.percentile(self.ts.dropna(), 90) + q = datetime_series.quantile(0.9) + assert q == np.percentile(datetime_series.dropna(), 90) # object dtype - q = Series(self.ts, dtype=object).quantile(0.9) - assert q == np.percentile(self.ts.dropna(), 90) + q = Series(datetime_series, dtype=object).quantile(0.9) + assert q == np.percentile(datetime_series.dropna(), 90) # datetime64[ns] dtype - dts = self.ts.index.to_series() + dts = datetime_series.index.to_series() q = dts.quantile(0.2) assert q == Timestamp("2000-01-10 19:12:00") @@ -41,20 +39,23 @@ def test_quantile(self): msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): - self.ts.quantile(invalid) + datetime_series.quantile(invalid) - def test_quantile_multi(self): + def test_quantile_multi(self, datetime_series): qs = [0.1, 0.9] - result = self.ts.quantile(qs) + result = datetime_series.quantile(qs) expected = pd.Series( - [np.percentile(self.ts.dropna(), 10), np.percentile(self.ts.dropna(), 90)], + [ + np.percentile(datetime_series.dropna(), 10), + np.percentile(datetime_series.dropna(), 90), + ], index=qs, - name=self.ts.name, + name=datetime_series.name, ) tm.assert_series_equal(result, expected) - dts = self.ts.index.to_series() + dts = datetime_series.index.to_series() dts.name = "xxx" result = dts.quantile((0.2, 0.2)) expected = Series( @@ -64,18 +65,20 @@ def test_quantile_multi(self): ) tm.assert_series_equal(result, expected) - result = self.ts.quantile([]) - expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float)) + result = datetime_series.quantile([]) + expected = pd.Series( + [], name=datetime_series.name, index=Index([], dtype=float) + ) tm.assert_series_equal(result, expected) - def test_quantile_interpolation(self): + def test_quantile_interpolation(self, datetime_series): # see gh-10174 # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation="linear") - assert q == np.percentile(self.ts.dropna(), 10) - q1 = self.ts.quantile(0.1) - assert q1 == np.percentile(self.ts.dropna(), 10) + q = datetime_series.quantile(0.1, interpolation="linear") + assert q == np.percentile(datetime_series.dropna(), 10) + q1 = datetime_series.quantile(0.1) + assert q1 == np.percentile(datetime_series.dropna(), 10) # test with and without interpolation keyword assert q == q1 From 
a2f5ae2ca2f8855899f59565271ebf591707d79c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:03:50 -0700 Subject: [PATCH 103/119] comments, catch less (#29088) --- pandas/core/groupby/generic.py | 12 ++++-------- pandas/core/groupby/ops.py | 10 +++++++++- pandas/core/series.py | 1 + 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8191c3519a36a..a78857423e7e0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1067,14 +1067,9 @@ def _aggregate_frame(self, func, *args, **kwargs): result = OrderedDict() if axis != obj._info_axis_number: - try: - for name, data in self: - fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) - except AssertionError: - raise - except Exception: - return self._aggregate_item_by_item(func, *args, **kwargs) + for name, data in self: + fres = func(data, *args, **kwargs) + result[name] = self._try_cast(fres, data) else: for name in self.indices: data = self.get_group(name, obj=obj) @@ -1441,6 +1436,7 @@ def _choose_path(self, fast_path, slow_path, group): raise except Exception: # Hard to know ex-ante what exceptions `fast_path` might raise + # TODO: no test cases get here return path, res # verify fast path does not change columns (and names), otherwise diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e380cf5930f97..fcc646dec89d9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -655,7 +655,15 @@ def agg_series(self, obj, func): return self._aggregate_series_fast(obj, func) except AssertionError: raise - except Exception: + except ValueError as err: + if "No result." in str(err): + # raised in libreduction + pass + elif "Function does not reduce" in str(err): + # raised in libreduction + pass + else: + raise return self._aggregate_series_pure_python(obj, func) def _aggregate_series_fast(self, obj, func): diff --git a/pandas/core/series.py b/pandas/core/series.py index 1039e9af929d4..ea48b3603623a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1288,6 +1288,7 @@ def _set_with(self, key, value): else: if isinstance(key, tuple): try: + # TODO: no test cases that get here self._set_values(key, value) except Exception: pass From cb99e2400c32e5bc63ff6681f38c241564667aaa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:05:31 -0700 Subject: [PATCH 104/119] REF: remove algos_rank_helper (#29086) --- pandas/_libs/algos.pyx | 415 +++++++++++++++++++++++++- pandas/_libs/algos_rank_helper.pxi.in | 410 ------------------------- setup.py | 1 - 3 files changed, 414 insertions(+), 412 deletions(-) delete mode 100644 pandas/_libs/algos_rank_helper.pxi.in diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0f91f612994c7..cab8bc8e799d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -771,7 +771,420 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic +# ---------------------------------------------------------------------- +# rank_1d, rank_2d +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, 
j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 + + ndarray[rank_t] sorted_data, values + + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + ndarray[uint8_t, cast=True] sorted_mask + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + bint isnan, condition + float64_t count = 0.0 + + tiebreak = tiebreakers[ties_method] + + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) + + keep_na = na_option == 'keep' + + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() + + # double sort first by mask and then by values to ensure nan values are + # either at the beginning or the end. mask/(~mask) controls padding at + # tail or the head + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) + else: + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + if rank_t is object: + _as = np.lexsort(keys=order) + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + sorted_mask = mask.take(_as) + _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] + non_na_idx = _indices[0] if len(_indices) > 0 else -1 + argsorted = _as.astype('i8') + + if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = ( + i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx + ) + else: + condition = ( + i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx + ) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count 
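+                # each tiebreak branch above ranks the run of `dups` tied
+                # values ending at position i; the running counters reset below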
+ sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = ( + i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx + ) + else: + condition = ( + i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx + ) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + if pct: + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count + else: + return ranks + + +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + + Py_ssize_t infs + + ndarray[float64_t, ndim=2] ranks + ndarray[rank_t, ndim=2] values + + ndarray[int64_t, ndim=2] argsorted + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float64_t count = 0.0 + bint condition, skip_condition + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + if axis == 0: + values = np.asarray(in_arr).T.copy() + else: + values = np.asarray(in_arr).copy() + + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') + + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) + + n, k = (values).shape + ranks = np.empty((n, k), dtype='f8') + + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = 
TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 + + total_tie_count = 0 + count = 0.0 + for j in range(k): + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 + + val = values[i, j] + + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN + + if rank_t is object: + infs += 1 + + continue + + count += 1.0 + + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 + + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] + + # generated from template include "algos_common_helper.pxi" -include "algos_rank_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in deleted file mode 100644 index d5a31b6a13010..0000000000000 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ /dev/null @@ -1,410 +0,0 @@ -""" -Template for each `dtype` helper function for rank - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# rank_1d, rank_2d -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - object - float64_t - uint64_t - int64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def rank_1d(rank_t[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - - ndarray[rank_t] sorted_data, values - - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - ndarray[uint8_t, cast=True] sorted_mask - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - bint isnan, condition - float64_t count = 0.0 - - tiebreak = tiebreakers[ties_method] - - if rank_t is float64_t: - values = np.asarray(in_arr).copy() - elif rank_t is object: - 
values = np.array(in_arr, copy=True) - - if values.dtype != np.object_: - values = values.astype('O') - else: - values = np.asarray(in_arr) - - keep_na = na_option == 'keep' - - if rank_t is object: - mask = missing.isnaobj(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - - # double sort first by mask and then by values to ensure nan values are - # either at the beginning or the end. mask/(~mask) controls padding at - # tail or the head - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - order = (values, mask) - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).min - - order = (values, ~mask) - np.putmask(values, mask, nan_value) - else: - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - if rank_t is object: - _as = np.lexsort(keys=order) - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = np.lexsort(keys=order) - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - sorted_mask = mask.take(_as) - _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] - non_na_idx = _indices[0] if len(_indices) > 0 else -1 - argsorted = _as.astype('i8') - - if rank_t is object: - # TODO: de-duplicate once cython supports conditional nogil - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - else: - with nogil: - # TODO: why does the 2d version not have a nogil block? 
- for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - if pct: - if tiebreak == TIEBREAK_DENSE: - return ranks / total_tie_count - else: - return ranks / count - else: - return ranks - - -rank_1d_object = rank_1d["object"] -rank_1d_float64 = rank_1d["float64_t"] -rank_1d_uint64 = rank_1d["uint64_t"] -rank_1d_int64 = rank_1d["int64_t"] - - -def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - - Py_ssize_t infs - - ndarray[float64_t, ndim=2] ranks - ndarray[rank_t, ndim=2] values - - ndarray[int64_t, ndim=2] argsorted - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float64_t count = 0.0 - bint condition, skip_condition - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - if axis == 0: - values = np.asarray(in_arr).T.copy() - else: - values = np.asarray(in_arr).copy() - - if rank_t is object: - if values.dtype != np.object_: - values = values.astype('O') - - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = NPY_NAT - - if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - np.putmask(values, mask, nan_value) - - n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') - - if rank_t is object: - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = 
_take_2d(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - if rank_t is object: - dups = sum_ranks = infs = 0 - else: - dups = sum_ranks = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - if rank_t is not object: - sum_ranks += j + 1 - dups += 1 - - val = values[i, j] - - if rank_t is not uint64_t: - if rank_t is object: - skip_condition = (val is nan_value) and keep_na - else: - skip_condition = (val == nan_value) and keep_na - if skip_condition: - ranks[i, argsorted[i, j]] = NaN - - if rank_t is object: - infs += 1 - - continue - - count += 1.0 - - if rank_t is object: - sum_ranks += (j - infs) + 1 - dups += 1 - - if rank_t is object: - condition = j == k - 1 or are_diff(values[i, j + 1], val) - else: - condition = j == k - 1 or values[i, j + 1] != val - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported ' - 'for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -rank_2d_object = rank_2d["object"] -rank_2d_float64 = rank_2d["float64_t"] -rank_2d_uint64 = rank_2d["uint64_t"] -rank_2d_int64 = rank_2d["int64_t"] diff --git a/setup.py b/setup.py index 2892cd0b2e294..0dd1980088db8 100755 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ def is_platform_mac(): "algos": [ "_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in", - "_libs/algos_rank_helper.pxi.in", ], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", From 930dd84002f384b47297967a3e0a117eb409d582 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:06:21 -0700 Subject: [PATCH 105/119] CLN: Exception in DataFrame._reduce (#29085) --- pandas/core/frame.py | 52 +++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7880acb1b78da..c90bf4ba7151f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -68,7 +68,6 @@ infer_dtype_from_object, is_bool_dtype, is_datetime64_any_dtype, - is_datetime64tz_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -7784,20 +7783,9 @@ def _reduce( def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - # exclude timedelta/datetime unless we are uniform types - if ( - axis == 1 - and self._is_datelike_mixed_type - and ( - not self._is_homogeneous_type - and not is_datetime64tz_dtype(self.dtypes[0]) - ) - ): - numeric_only = True - if numeric_only is None: + values = self.values try: - values = self.values result = f(values) if filter_type == "bool" and is_object_dtype(values) and axis is None: @@ -7809,27 +7797,23 @@ def f(x): # try by-column first if filter_type is None and axis == 0: - try: - - # this 
can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0] - return result - except Exception: - pass + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + from pandas.core.apply import frame_apply + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0] + return result if filter_type is None or filter_type == "numeric": data = self._get_numeric_data() From 7c8c8c8120abd5f93608cea28ec56724c0df22e3 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Sat, 19 Oct 2019 19:07:11 +0200 Subject: [PATCH 106/119] [#22550] Remove TestData from series-tests test_operators.py (#29084) --- pandas/tests/series/test_operators.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 6bfcc02ca633a..942ab0db37a57 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -16,8 +16,6 @@ assert_series_equal, ) -from .common import TestData - class TestSeriesLogicalOps: @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) @@ -746,7 +744,7 @@ def test_comparison_flex_alignment_fill(self): assert_series_equal(left.gt(right, fill_value=0), exp) -class TestSeriesOperators(TestData): +class TestSeriesOperators: def test_operators_empty_int_corner(self): s1 = Series([], [], dtype=np.int32) s2 = Series({"x": 0.0}) @@ -768,12 +766,10 @@ def test_ops_datetimelike_align(self): result = (dt2.to_frame() - dt.to_frame())[0] assert_series_equal(result, expected) - def test_operators_corner(self): - series = self.ts - + def test_operators_corner(self, datetime_series): empty = Series([], index=Index([])) - result = series + empty + result = datetime_series + empty assert np.isnan(result).all() result = empty + Series([], index=Index([])) @@ -786,10 +782,12 @@ def test_operators_corner(self): # deltas = deltas + sub_deltas # float + int - int_ts = self.ts.astype(int)[:-5] - added = self.ts + int_ts + int_ts = datetime_series.astype(int)[:-5] + added = datetime_series + int_ts expected = Series( - self.ts.values[:-5] + int_ts.values, index=self.ts.index[:-5], name="ts" + datetime_series.values[:-5] + int_ts.values, + index=datetime_series.index[:-5], + name="ts", ) tm.assert_series_equal(added[:-5], expected) From e4afa45c380b0d8f2c39e2e53a9e8415f685ba44 Mon Sep 17 00:00:00 2001 From: qudade <7327644+qudade@users.noreply.github.com> Date: Sat, 19 Oct 2019 19:08:36 +0200 Subject: [PATCH 107/119] TST: 2d index when constructing dataframe (#25416). 
(#29083) --- pandas/tests/frame/test_constructors.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ebffeeaa3063e..583093af6d3e6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -424,6 +424,25 @@ def test_constructor_multi_index(self): df = DataFrame(index=mi, columns=mi) assert pd.isna(df).values.ravel().all() + def test_constructor_2d_index(self): + # GH 25416 + # handling of 2d index in construction + df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2]) + expected = pd.DataFrame( + [1, 1], + index=pd.Int64Index([1, 2], dtype="int64"), + columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + ) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]]) + expected = pd.DataFrame( + [1, 1], + index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), + columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + ) + tm.assert_frame_equal(df, expected) + def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." # passing an empty array with columns specified. From cb76dcb6e16eed7e4df4844ba94f54383c3425f8 Mon Sep 17 00:00:00 2001 From: qudade <7327644+qudade@users.noreply.github.com> Date: Sat, 19 Oct 2019 19:11:15 +0200 Subject: [PATCH 108/119] TST: regression test for groupby with datetime and timedelta (#15562) (#29063) --- pandas/tests/groupby/test_apply.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4d0063b773bc5..1af4768b7381e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -543,6 +543,33 @@ def predictions(tool): tm.assert_series_equal(expected, result) +def test_apply_aggregating_timedelta_and_datetime(): + # Regression test for GH 15562 + # The following groupby caused ValueErrors and IndexErrors pre 0.20.0 + + df = pd.DataFrame( + { + "clientid": ["A", "B", "C"], + "datetime": [np.datetime64("2017-02-01 00:00:00")] * 3, + } + ) + df["time_delta_zero"] = df.datetime - df.datetime + result = df.groupby("clientid").apply( + lambda ddf: pd.Series( + dict(clientid_age=ddf.time_delta_zero.min(), date=ddf.datetime.min()) + ) + ) + expected = pd.DataFrame( + { + "clientid": ["A", "B", "C"], + "clientid_age": [np.timedelta64(0, "D")] * 3, + "date": [np.datetime64("2017-02-01 00:00:00")] * 3, + } + ).set_index("clientid") + + tm.assert_frame_equal(result, expected) + + def test_time_field_bug(): # Test a fix for the following error related to GH issue 11324 When # non-key fields in a group-by dataframe contained time-based fields From 693105169f514fe1ac2372bca787ad48c50bd421 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:13:05 -0700 Subject: [PATCH 109/119] CLN: cython and docstring cleanups (#29089) --- pandas/_libs/algos_common_helper.pxi.in | 5 +- pandas/_libs/algos_take_helper.pxi.in | 62 ++++++++++++----------- pandas/_libs/hashtable_func_helper.pxi.in | 6 ++- pandas/_libs/join.pyx | 24 ++++----- pandas/_libs/sparse_op_helper.pxi.in | 2 +- pandas/_libs/window.pyx | 8 +-- pandas/core/arrays/base.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/indexes/base.py | 5 +- pandas/core/indexes/datetimes.py | 3 +- pandas/core/indexes/timedeltas.py | 4 +- pandas/core/missing.py | 2 +- pandas/core/ops/docstrings.py | 4 +- pandas/core/strings.py | 2 +- 
pandas/io/pytables.py | 2 +- pandas/tests/plotting/common.py | 4 +- 17 files changed, 67 insertions(+), 72 deletions(-) diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 91599fa223b57..eb6d689899073 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -17,12 +17,11 @@ dtypes = [('float64', 'float64_t', 'float64_t'), def get_dispatch(dtypes): for name, c_type, dest_type, in dtypes: - dest_name = dest_type[:-2] # i.e. strip "_t" - yield name, c_type, dest_type, dest_name + yield name, c_type, dest_type }} -{{for name, c_type, dest_type, dest_name +{{for name, c_type, dest_type in get_dispatch(dtypes)}} diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index e7ee212065c5b..bd5a488722f6d 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -10,28 +10,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil +# c_type_in, c_type_out, preval, postval dtypes = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), - ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), - ('object', 'object', 'object', 'object', '', '', False)] + ('uint8_t', 'uint8_t', '', ''), + ('uint8_t', 'object', 'True if ', ' > 0 else False'), + ('int8_t', 'int8_t', '', ''), + ('int8_t', 'int32_t', '', ''), + ('int8_t', 'int64_t', '', ''), + ('int8_t', 'float64_t', '', ''), + ('int16_t', 'int16_t', '', ''), + ('int16_t', 'int32_t', '', ''), + ('int16_t', 'int64_t', '', ''), + ('int16_t', 'float64_t', '', ''), + ('int32_t', 'int32_t', '', ''), + ('int32_t', 'int64_t', '', ''), + ('int32_t', 'float64_t', '', ''), + ('int64_t', 'int64_t', '', ''), + ('int64_t', 'float64_t', '', ''), + ('float32_t', 'float32_t', '', ''), + ('float32_t', 'float64_t', '', ''), + ('float64_t', 'float64_t', '', ''), + ('object', 'object', '', ''), +] def get_dispatch(dtypes): @@ -117,9 +117,9 @@ def get_dispatch(dtypes): out[i, j] = %(preval)svalues[i, idx]%(postval)s """ - for (name, dest, c_type_in, c_type_out, preval, postval, - can_copy) in dtypes: + for (c_type_in, c_type_out, preval, postval) in dtypes: + can_copy = c_type_in == c_type_out != "object" nogil = c_type_out != "object" if nogil: nogil_str = "with nogil:" @@ -128,6 +128,16 @@ def get_dispatch(dtypes): nogil_str = '' tab = '' + def get_name(dtype_name): + if dtype_name == "object": + return "object" + if dtype_name == 
"uint8_t": + return "bool" + return dtype_name[:-2] + + name = get_name(c_type_in) + dest = get_name(c_type_out) + args = dict(name=name, dest=dest, c_type_in=c_type_in, c_type_out=c_type_out, preval=preval, postval=postval, can_copy=can_copy, nogil_str=nogil_str, tab=tab) @@ -291,9 +301,3 @@ cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): for j in range(K): result[i, j] = values[i, indexer[i, j]] return result - - -_take_2d_object = _take_2d[object] -_take_2d_float64 = _take_2d[float64_t] -_take_2d_int64 = _take_2d[int64_t] -_take_2d_uint64 = _take_2d[uint64_t] diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index e400ec0e608f0..f6af93f85bd5a 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -151,12 +151,14 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} - for i from n > i >= 0: + for i in range(n - 1, -1, -1): + # equivalent: range(n)[::-1], which cython doesnt like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: - for i from n > i >= 0: + for i in range(n - 1, -1, -1): + # equivalent: range(n)[::-1], which cython doesnt like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{endif}} diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 238bfd0be0aa7..caf730389008a 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -13,6 +13,7 @@ from pandas._libs.algos import ( ) +@cython.boundscheck(False) def inner_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: @@ -20,6 +21,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, ndarray[int64_t] left_count, right_count, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset # NA group in location 0 @@ -34,11 +37,6 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, if rc > 0 and lc > 0: count += lc * rc - # group 0 is the NA group - cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] @@ -64,6 +62,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, _get_result_indexer(right_sorter, right_indexer)) +@cython.boundscheck(False) def left_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups, sort=True): cdef: @@ -72,6 +71,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, ndarray rev ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset # NA group in location 0 @@ -85,11 +86,6 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, else: count += left_count[i] - # group 0 is the NA group - cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] @@ -137,6 +133,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, return left_indexer, right_indexer +@cython.boundscheck(False) def full_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: @@ -144,6 +141,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, ndarray[int64_t] left_count, right_count, left_sorter, right_sorter 
ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + int64_t left_pos = 0, right_pos = 0 + Py_ssize_t offset, position = 0 # NA group in location 0 @@ -160,11 +159,6 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, else: count += lc + rc - # group 0 is the NA group - cdef: - int64_t left_pos = 0, right_pos = 0 - Py_ssize_t offset, position = 0 - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 5949a3fd0ed81..62ea477167b72 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -86,7 +86,7 @@ def get_op(tup): 'and': '{0} & {1}', # logical op 'or': '{0} | {1}'} - return ops_dict[opname].format(lval, rval, dtype) + return ops_dict[opname].format(lval, rval) def get_dispatch(dtypes): diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 8de593ce36c86..a2096d389823f 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1296,7 +1296,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, # The original impl didn't deal with variable window sizes # So the code was optimized for that - for i from starti[0] <= i < endi[0]: + for i in range(starti[0], endi[0]): ai = init_mm(values[i], &nobs, is_max) # Discard previous entries if we find new min or max @@ -1644,7 +1644,7 @@ def roll_generic(object obj, else: # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, N) - offset): + for i in range((int_min(win, N) - offset)): if counts[i] >= minp: output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs) else: @@ -1654,7 +1654,7 @@ def roll_generic(object obj, buf = arr.data bufarr = np.empty(win, dtype=float) oldbuf = bufarr.data - for i from (win - offset) <= i < (N - offset): + for i in range((win - offset), (N - offset)): buf = buf + 1 bufarr.data = buf if counts[i] >= minp: @@ -1664,7 +1664,7 @@ def roll_generic(object obj, bufarr.data = oldbuf # truncated windows at the end - for i from int_max(N - offset, 0) <= i < N: + for i in range(int_max(N - offset, 0), N): if counts[i] >= minp: output[i] = func(arr[int_max(i + offset - win + 1, 0): N], *args, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 53755695c97e3..08901df963f20 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1104,7 +1104,7 @@ def _create_method(cls, op, coerce_to_dtype=True): ---------- op : function An operator that takes arguments op(a, b) - coerce_to_dtype : bool, default True + coerce_to_dtype : bool, default True boolean indicating whether to attempt to convert the result to the underlying ExtensionArray dtype. If it's not possible to create a new ExtensionArray with the diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e3e59639de56b..a300748ee5bc8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2481,7 +2481,7 @@ def to_hdf(self, path_or_buf, key, **kwargs): like searching / selecting subsets of the data. append : bool, default False For Table formats, append the input data to the existing. - data_columns : list of columns or True, optional + data_columns : list of columns or True, optional List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See :ref:`io.hdf5-query-data-columns`. 
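Since ``data_columns`` is what enables the on-disk queries this docstring
refers to, a short usage sketch may help; it assumes PyTables is installed
and that ``store.h5`` is a writable path (both assumptions, not part of the
patch):

    import pandas as pd

    df = pd.DataFrame({"A": range(5), "B": list("abcde")})
    # format="table" is required for indexed data columns
    df.to_hdf("store.h5", key="df", format="table", data_columns=["A"])
    # besides the index, only columns named in data_columns can be queried
    print(pd.read_hdf("store.h5", key="df", where="A > 2"))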
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b27d5bb05ee8f..f622480cfe4b7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2080,7 +2080,7 @@ def rank( * dense: like 'min', but rank always increases by 1 between groups ascending : bool, default True False for ranks by high (1) to low (N). - na_option : {'keep', 'top', 'bottom'}, default 'keep' + na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1a08609ccd99a..9d6487f7a8ae4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2031,7 +2031,7 @@ def fillna(self, value=None, downcast=None): Parameters ---------- - how : {'any', 'all'}, default 'any' + how : {'any', 'all'}, default 'any' If the Index is a MultiIndex, drop the value when any or all levels are NaN. @@ -5016,12 +5016,11 @@ def _validate_indexer(self, form, key, kind): Returns ------- - label : object + label : object Notes ----- Value of `side` parameter should be validated in caller. - """ @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 477525d7ab272..49c11c5505d00 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1079,12 +1079,11 @@ def _maybe_cast_slice_bound(self, label, side, kind): Returns ------- - label : object + label : object Notes ----- Value of `side` parameter should be validated in caller. - """ assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 62a74fefa6577..c404e205e603c 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -550,7 +550,6 @@ def _maybe_cast_slice_bound(self, label, side, kind): """ If label is a string, cast it to timedelta according to resolution. - Parameters ---------- label : object @@ -559,8 +558,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): Returns ------- - label : object - + label : object """ assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/missing.py b/pandas/core/missing.py index bc81fbb7e1ce0..f2655c126b9e5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -420,7 +420,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): ---------- xi : array_like A sorted list of x-coordinates, of length N. - yi : array_like + yi : array_like A 1-D array of real values. `yi`'s length along the interpolation axis must be equal to the length of `xi`. If N-D array, use axis parameter to select correct axis. diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 93f197366cf32..5d3f9cd92aa1a 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -387,7 +387,7 @@ def _make_flex_doc(op_name, typ): ---------- other : scalar, sequence, Series, or DataFrame Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}} +axis : {{0 or 'index', 1 or 'columns'}} Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). For Series input, axis to match Series index on. 
level : int or label @@ -541,7 +541,7 @@ def _make_flex_doc(op_name, typ): ---------- other : scalar, sequence, Series, or DataFrame Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). level : int or label diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2f2e7234999f2..e50da168af4d2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1343,7 +1343,7 @@ def str_pad(arr, width, side="left", fillchar=" "): character. Equivalent to ``Series.str.pad(side='right')``. Series.str.center : Fills boths sides of strings with an arbitrary character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' + Series.str.zfill : Pad strings in the Series/Index by prepending '0' character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. Examples diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c87cad5472bd9..94f863d8970f1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1027,7 +1027,7 @@ def append( / selecting subsets of the data append : bool, default True Append the input data to the existing. - data_columns : list of columns, or True, default None + data_columns : list of columns, or True, default None List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 65d0c3d9fb17d..f0ba5f14d59c6 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -311,7 +311,7 @@ def _check_ax_scales(self, axes, xaxis="linear", yaxis="linear"): axes : matplotlib Axes object, or its list-like xaxis : {'linear', 'log'} expected xaxis scale - yaxis : {'linear', 'log'} + yaxis : {'linear', 'log'} expected yaxis scale """ axes = self._flatten_visible(axes) @@ -329,7 +329,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None): axes_num : number expected number of axes. Unnecessary axes should be set to invisible. - layout : tuple + layout : tuple expected layout, (expected number of rows , columns) figsize : tuple expected figsize. default is matplotlib default From 04893a954b91574279c402e8730a4b5fae2ae9e1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 11:14:22 -0700 Subject: [PATCH 110/119] BUG: parse_time_string failing to raise TypeError (#29098) --- pandas/_libs/tslibs/parsing.pyx | 4 ++-- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/period.py | 9 ++------- pandas/core/indexes/timedeltas.py | 2 +- pandas/tests/tslibs/test_parsing.py | 6 ++++++ 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index bf0a0ae5a3fe9..796d1400194fd 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -233,7 +233,7 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False, return dt -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): +def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): """ Try hard to parse datetime string, leveraging dateutil plus some extra goodies like quarter recognition. 
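A small sketch of the contract this patch enforces, assuming the three-value
return spelled out in the docstring above (``parse_time_string`` is a private
helper, so the import path is version-specific):

    from pandas._libs.tslibs.parsing import parse_time_string

    # quarter strings are among the "extra goodies" beyond plain dateutil
    dt, parsed, reso = parse_time_string("4Q1984")
    print(dt, reso)  # 1984-10-01 00:00:00 quarter

    # non-str input now raises instead of being returned unchanged
    try:
        parse_time_string((4, 5))
    except TypeError:
        print("TypeError, per the new isinstance check")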
@@ -253,7 +253,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): datetime, datetime/dateutil.parser._result, str """ if not isinstance(arg, str): - return arg + raise TypeError("parse_time_string argument must be str") if getattr(freq, "_typ", None) == "dateoffset": freq = freq.rule_code diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 49c11c5505d00..3535682bf182d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1106,7 +1106,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): else: return label - def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) _, parsed, reso = parsing.parse_time_string(key, freq) loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f085dff84462d..a20290e77023a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -457,11 +457,8 @@ def __contains__(self, key): try: self.get_loc(key) return True - except (ValueError, TypeError, KeyError): + except (TypeError, KeyError): # TypeError can be reached if we pass a tuple that is not hashable - # ValueError can be reached if pass a 2-tuple and parse_time_string - # raises with the wrong number of return values - # TODO: the latter is a bug in parse_time_string return False @cache_readonly @@ -608,7 +605,7 @@ def get_value(self, series, key): try: return com.maybe_box(self, super().get_value(s, key), series, key) except (KeyError, IndexError): - try: + if isinstance(key, str): asdt, parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) @@ -634,8 +631,6 @@ def get_value(self, series, key): ) else: raise KeyError(key) - except TypeError: - pass period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c404e205e603c..983e68f38a4b9 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -528,7 +528,7 @@ def get_loc(self, key, method=None, tolerance=None): # the try/except clauses below tolerance = self._convert_tolerance(tolerance, np.asarray(key)) - if _is_convertible_to_td(key): + if _is_convertible_to_td(key) or key is NaT: key = Timedelta(key) return Index.get_loc(self, key, method, tolerance) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 126a1bd12ad59..9b6ed86bc2770 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -23,6 +23,12 @@ def test_parse_time_string(): assert parsed == parsed_lower +def test_parse_time_string_invalid_type(): + # Raise on invalid input, don't just return it + with pytest.raises(TypeError): + parse_time_string((4, 5)) + + @pytest.mark.parametrize( "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")] ) From 971d1910467dc51638e93b57b305c866ccca5a15 Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sat, 19 Oct 2019 23:39:35 +0800 Subject: [PATCH 111/119] explain set_levels().levels examples + formatting resolve trailing whitespaces --- pandas/core/indexes/multi.py | 41 +++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git 
a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 6b7019f4685c7..db4310aced7ca 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -719,16 +719,19 @@ def _set_levels(
 
     def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
         """
-        Set levels on MultiIndex by passing a new value for each
-        index in the level. Defaults to returning new
-        index.
+        Set levels on MultiIndex. Defaults to returning new index.
 
+        ``set_levels`` passes a new value for each index in the level,
+        where the new values to be passed in each level are defined in a list.
         It is assumed that a new value is provided for each code describing
-        values in the level. If the number of values passed is more than
-        the number of index values in the level, ``set_levels`` will still
-        pass the values to the level. The passed values are stored in the
-        MultiIndex FrozenList even though the index values may be truncated
-        in the MultiIndex output from set_levels.
+        values in the level.
+
+        If the number of values passed is more than the number of index
+        values in the level, ``set_levels`` will still pass the values
+        to the level. The passed values are stored in the FrozenList
+        representing the levels attribute of the MultiIndex, even
+        though the index values may be truncated in the MultiIndex
+        output from ``set_levels``.
 
         Parameters
         ----------
@@ -751,6 +754,14 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
             (2, 'one'), (2, 'two'),
             (3, 'one'), (3, 'two')],
             names=['foo', 'bar'])
+        >>> idx
+        MultiIndex([(1, 'one'),
+            (1, 'two'),
+            (2, 'one'),
+            (2, 'two'),
+            (3, 'one'),
+            (3, 'two')],
+            names=['foo', 'bar'])
         >>> idx.set_levels([['a', 'b', 'c'], [1, 2]])
         MultiIndex([('a', 1),
             ('a', 2),
@@ -775,6 +786,20 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
             (3, 'a'),
             (3, 'b')],
             names=['foo', 'bar'])
+
+<<<<<<< HEAD
+        ``set_levels()`` passes values into the levels attribute that is
+        represented by a FrozenList containing list of values for each
+        level in the MultiIndex, even when the number of values passed
+        for a level is more than the number of indexes available in the
+=======
+        ``set_levels()`` passes values into the levels attribute that is
+        represented by a FrozenList containing list of values for each
+        level in the MultiIndex, even when the number of values passed
+        for a level is more than the number of indexes available in the
+>>>>>>> 09fc1b4b8... explain set_levels().levels examples + formatting
+        MultiIndex itself.
+ >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), From 8ccc7c727552a448b5688b50330da2fd7a9e156b Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sun, 20 Oct 2019 09:49:58 +0800 Subject: [PATCH 112/119] formatting changes --- pandas/core/indexes/multi.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index db4310aced7ca..92b602d5f6fc9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -787,17 +787,10 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (3, 'b')], names=['foo', 'bar']) -<<<<<<< HEAD ``set_levels()`` passes values into the levels attribute that is represented by a FrozenList containing list of values for each level in the MultiIndex, even when the number of values passed for a level is more than the number of indexes available in the -======= - ``set_levels()`` passes values into the levels attribute that is - represented by a FrozenList containing list of values for each - level in the MultiIndex, even when the number of values passed - for a level is more than the number of indexes available in the ->>>>>>> 09fc1b4b8... explain set_levels().levels examples + formatting MultiIndex itself. >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) From e623f0fbf9a89cbf566255189c843b7ff9a79f55 Mon Sep 17 00:00:00 2001 From: Grigorios Giannakopoulos Date: Sun, 20 Oct 2019 16:49:54 +0300 Subject: [PATCH 113/119] Add a regression test for the timezone issue (#29097) --- pandas/tests/frame/test_apply.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe034504b8161..4b7439cd40023 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1346,3 +1346,17 @@ def test_frequency_is_original(self, num_cols): df = DataFrame(1, index=index, columns=range(num_cols)) df.apply(lambda x: x) assert index.freq == original.freq + + def test_apply_datetime_tz_issue(self): + # GH 29052 + + timestamps = [ + pd.Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1) + expected = pd.Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) From d2d8785c71cf75ae8d6363410156e96a1785f8f7 Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Sun, 20 Oct 2019 18:26:40 -0400 Subject: [PATCH 114/119] Fix typing errors (#29115) Thanks, @AbhijeetKrishnan --- pandas/tests/frame/test_constructors.py | 6 +++--- setup.cfg | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 583093af6d3e6..aa00cf234d9ee 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -50,13 +50,13 @@ class TestDataFrameConstructors: lambda: DataFrame({}), lambda: DataFrame(()), lambda: DataFrame([]), - lambda: DataFrame((x for x in [])), + lambda: DataFrame((_ for _ in [])), lambda: DataFrame(range(0)), lambda: DataFrame(data=None), lambda: DataFrame(data={}), lambda: DataFrame(data=()), lambda: DataFrame(data=[]), - lambda: DataFrame(data=(x for x in [])), + lambda: DataFrame(data=(_ for _ in [])), lambda: DataFrame(data=range(0)), ], ) @@ -72,7 +72,7 @@ 
def test_empty_constructor(self, constructor): [ ([[]], RangeIndex(1), RangeIndex(0)), ([[], []], RangeIndex(2), RangeIndex(0)), - ([(x for x in [])], RangeIndex(1), RangeIndex(0)), + ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)), ], ) def test_emptylike_constructor(self, emptylike, expected_index, expected_columns): diff --git a/setup.cfg b/setup.cfg index ca1ca4a7b5733..de251bafd34fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -160,9 +160,6 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True -[mypy-pandas.tests.frame.test_constructors] -ignore_errors=True - [mypy-pandas.tests.indexes.datetimes.test_datetimelike] ignore_errors=True From 9f0383707d20e788d28d950f14a3eabf58d978f4 Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Sun, 20 Oct 2019 18:27:41 -0400 Subject: [PATCH 115/119] Fix typing errors (#29114) --- pandas/tests/dtypes/test_inference.py | 7 +++++-- setup.cfg | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index cfa6304909bb7..60afd768195d9 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -75,7 +75,7 @@ def coerce(request): (iter([1, 2]), True, "iterator"), # noqa: E241 (iter([]), True, "iterator-empty"), # noqa: E241 ((x for x in [1, 2]), True, "generator"), # noqa: E241 - ((x for x in []), True, "generator-empty"), # noqa: E241 + ((_ for _ in []), True, "generator-empty"), # noqa: E241 (Series([1]), True, "Series"), # noqa: E241 (Series([]), True, "Series-empty"), # noqa: E241 (Series(["a"]).str, True, "StringMethods"), # noqa: E241 @@ -288,7 +288,10 @@ class MockFile: assert not is_file(data) -@pytest.mark.parametrize("ll", [collections.namedtuple("Test", list("abc"))(1, 2, 3)]) +test_tuple = collections.namedtuple("Test", ["a", "b", "c"]) + + +@pytest.mark.parametrize("ll", [test_tuple(1, 2, 3)]) def test_is_names_tuple_passes(ll): assert inference.is_named_tuple(ll) diff --git a/setup.cfg b/setup.cfg index de251bafd34fb..199ad34626011 100644 --- a/setup.cfg +++ b/setup.cfg @@ -148,9 +148,6 @@ ignore_errors=True [mypy-pandas.tests.dtypes.test_common] ignore_errors=True -[mypy-pandas.tests.dtypes.test_inference] -ignore_errors=True - [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True From 9026a79dec643046eee78c244598205297b828b6 Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sat, 5 Oct 2019 10:37:22 +0800 Subject: [PATCH 116/119] DOC: added docs for MultiIndex.set_levels (#28294) shortened lines to pass checks --- doc/source/user_guide/advanced.rst | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 4949dd580414f..d123dcb751887 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -177,6 +177,37 @@ completely analogous way to selecting a column in a regular DataFrame: See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. +.. _advanced.set_levels: + +Set values in levels +~~~~~~~~~~~~~~~~~~~~~ + +The method :meth:`~MultiIndex.set_levels` changes the ``levels`` attribute by +passing a new value for each index in the level. It is assumed that a new value +is provided for each code describing values in the level. +For example: + +.. 
ipython:: python + + df.columns # original MultiIndex columns + + df.columns.levels # original MultiIndex column levels + + df.columns.set_levels([1, 3, 5, 7], level=0) + + df.columns.set_levels([1, 3, 5, 7], level=0).levels + +If you pass more values than the number of index values in the level, +``set_levels`` will still pass the values to the level. The passed values +are stored in the MultiIndex ``FrozenList`` even though the index values +may be truncated in the MultiIndex output from ``set_levels``. + +.. ipython:: python + + df.columns.set_levels([1, 3, 5, 7], level=1) + + df.columns.set_levels([1, 3, 5, 7], level=1).levels + .. _advanced.shown_levels: Defined levels From 4f5e38d05f2fdba8a33c527e106796907a7c388f Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Tue, 8 Oct 2019 23:48:32 +0800 Subject: [PATCH 117/119] edit set_levels docstring with additional examples edit set_levels docstring with additional examples edit set_levels docstring with additional examples --- doc/source/user_guide/advanced.rst | 31 -------------------- pandas/core/indexes/multi.py | 45 ++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index d123dcb751887..4949dd580414f 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -177,37 +177,6 @@ completely analogous way to selecting a column in a regular DataFrame: See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. -.. _advanced.set_levels: - -Set values in levels -~~~~~~~~~~~~~~~~~~~~~ - -The method :meth:`~MultiIndex.set_levels` changes the ``levels`` attribute by -passing a new value for each index in the level. It is assumed that a new value -is provided for each code describing values in the level. -For example: - -.. ipython:: python - - df.columns # original MultiIndex columns - - df.columns.levels # original MultiIndex column levels - - df.columns.set_levels([1, 3, 5, 7], level=0) - - df.columns.set_levels([1, 3, 5, 7], level=0).levels - -If you pass more values than the number of index values in the level, -``set_levels`` will still pass the values to the level. The passed values -are stored in the MultiIndex ``FrozenList`` even though the index values -may be truncated in the MultiIndex output from ``set_levels``. - -.. ipython:: python - - df.columns.set_levels([1, 3, 5, 7], level=1) - - df.columns.set_levels([1, 3, 5, 7], level=1).levels - .. _advanced.shown_levels: Defined levels diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fda5c78a61e53..88637ccbe1e47 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -736,8 +736,16 @@ def _set_levels( def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ - Set new levels on MultiIndex. Defaults to returning - new index. + Set levels on MultiIndex by passing a new value for each + index in the level. Defaults to returning new + index. + + It is assumed that a new value is provided for each code describing + values in the level. If the number of values passed is more than + the number of index values in the level, ``set_levels`` will still + pass the values to the level. The passed values are stored in the + MultiIndex FrozenList even though the index values may be truncated + in the MultiIndex output from set_levels. 
Parameters ---------- @@ -757,32 +765,51 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')], + (2, 'one'), (2, 'two'), + (3, 'one'), (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), - ('b', 2)], + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level=0) + >>> idx.set_levels(['a', 'b', 'c'], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')], + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], names=['foo', 'bar']) >>> idx.set_levels(['a', 'b'], level='bar') MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), - (2, 'b')], + (2, 'b'), + (3, 'a'), + (3, 'b')], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]], level=[0, 1]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) From c00cc9de84d082956671682df3050dd77101df9c Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sat, 19 Oct 2019 23:39:35 +0800 Subject: [PATCH 118/119] explain set_levels().levels examples + formatting --- pandas/core/indexes/multi.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 88637ccbe1e47..269cf4547d971 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -736,16 +736,19 @@ def _set_levels( def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ - Set levels on MultiIndex by passing a new value for each - index in the level. Defaults to returning new - index. + Set levels on MultiIndex. Defaults to returning new index. + ``set_levels`` passes a new value for each index in the level, + where the new values to be passed in each level are defined in a list. It is assumed that a new value is provided for each code describing - values in the level. If the number of values passed is more than - the number of index values in the level, ``set_levels`` will still - pass the values to the level. The passed values are stored in the - MultiIndex FrozenList even though the index values may be truncated - in the MultiIndex output from set_levels. + values in the level. + + If the number of values passed is more than the number of index + values in the level, ``set_levels`` will still pass the values + to the level. The passed values are stored in the FrozenList + representing the levels attribute of the MultiIndex, even + though the index values may be truncated in the MultiIndex + output from set_levels. 
Parameters ---------- @@ -768,6 +771,14 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (2, 'one'), (2, 'two'), (3, 'one'), (3, 'two')], names=['foo', 'bar']) + >>> idx + MultiIndex([('1', one), + ('1', two), + ('2', one), + ('2', two), + ('3', one), + ('3', two)], + names=['foo', 'bar']) >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), @@ -792,6 +803,13 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (3, 'a'), (3, 'b')], names=['foo', 'bar']) + + ``set_levels()`` passes values into the levels attribute that is + represented by a FrozenList containing list of values for each + level in the MultiIndex, even when the number of values passed + for a level is more than the number of indexes available in the + MultiIndex itself. + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), From 211574b32dbceac4be1603472c9ab477303818ea Mon Sep 17 00:00:00 2001 From: Chin Hwee Date: Sat, 19 Oct 2019 23:39:35 +0800 Subject: [PATCH 119/119] explain set_levels().levels examples + formatting resolve trailing whitespaces --- pandas/core/indexes/multi.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 269cf4547d971..1640fabb40daa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -738,16 +738,16 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ Set levels on MultiIndex. Defaults to returning new index. - ``set_levels`` passes a new value for each index in the level, - where the new values to be passed in each level are defined in a list. + ``set_levels`` passes a new value for each index in the level, + where the new values to be passed in each level are defined in a list. It is assumed that a new value is provided for each code describing - values in the level. - - If the number of values passed is more than the number of index - values in the level, ``set_levels`` will still pass the values - to the level. The passed values are stored in the FrozenList - representing the levels attribute of the MultiIndex, even - though the index values may be truncated in the MultiIndex + values in the level. + + If the number of values passed is more than the number of index + values in the level, ``set_levels`` will still pass the values + to the level. The passed values are stored in the FrozenList + representing the levels attribute of the MultiIndex, even + though the index values may be truncated in the MultiIndex output from set_levels. Parameters @@ -804,10 +804,10 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): (3, 'b')], names=['foo', 'bar']) - ``set_levels()`` passes values into the levels attribute that is - represented by a FrozenList containing list of values for each - level in the MultiIndex, even when the number of values passed - for a level is more than the number of indexes available in the + ``set_levels()`` passes values into the levels attribute that is + represented by a FrozenList containing list of values for each + level in the MultiIndex, even when the number of values passed + for a level is more than the number of indexes available in the MultiIndex itself. >>> idx.set_levels([['a', 'b', 'c'], [1, 2]], level=[0, 1])
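
The behaviour that the set_levels patches above keep re-documenting is easy to
check interactively. The snippet below is a minimal sketch, assuming a pandas
build contemporary with this series (circa 0.25); the variable name ``wider``
is illustrative only, and the expected reprs are noted in the comments:

    # Build the kind of two-level index used in the docstring examples.
    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [(1, "one"), (1, "two"), (2, "one"), (2, "two")],
        names=["foo", "bar"],
    )

    # Level 0 has two distinct values, so two replacements suffice.
    print(idx.set_levels(["a", "b"], level=0))

    # Passing four values for level 0 is not an error: all four are kept
    # in the levels attribute (a FrozenList), even though only the first
    # two are referenced by the codes and appear in the printed tuples.
    wider = idx.set_levels(["a", "b", "c", "d"], level=0)
    print(wider)         # tuples contain only 'a' and 'b'
    print(wider.levels)  # FrozenList([['a', 'b', 'c', 'd'], ['one', 'two']])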