From 8be97f8f3ce2db4ed85db2c6c68d8c39508df3a6 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 14:34:23 +0200 Subject: [PATCH 001/191] Fix numpy boolean subtraction error in Series.diff Numpy no longer allows subtraction of boolean values. Not sure if Numpy version checking should be done before this code... Note: I haven't actually run this code, feel free to check it, but should be correct. --------------------------------------------------------------------------- TypeError Traceback (most recent call last) in 1 data = pd.Series([0,-1,-2,-3,-4,-3,-2,-1,0,-1,-1,0,-1,-2,-3,-2,0]) 2 filtered = data.between(-2,0, inclusive = True) ----> 3 filtered.diff() 4 print(filtered) ~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in diff(self, periods) 2191 dtype: float64 2192 """ -> 2193 result = algorithms.diff(com.values_from_object(self), periods) 2194 return self._constructor(result, index=self.index).__finalize__(self) 2195 ~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\algorithms.py in diff(arr, n, axis) 1817 out_arr[res_indexer] = result 1818 else: -> 1819 out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] 1820 1821 if is_timedelta: TypeError: numpy boolean subtract, the `-` operator, is deprecated, use the bitwise_xor, the `^` operator, or the logical_xor function instead. --- pandas/core/algorithms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 21d12d02c9008..70971a5f8aef7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1933,6 +1933,7 @@ def diff(arr, n, axis=0): elif is_bool_dtype(dtype): dtype = np.object_ + is_bool = True elif is_integer_dtype(dtype): dtype = np.float64 @@ -1972,6 +1973,8 @@ def diff(arr, n, axis=0): result = res - lag result[mask] = na out_arr[res_indexer] = result + elif is_bool: + out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer] else: out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] From 7c982d20a2fc61a472aa1be134aa69e04a2537a9 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 15:18:53 +0200 Subject: [PATCH 002/191] Added is_bool outside if function --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 70971a5f8aef7..6e1c2f1c178b8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1925,6 +1925,7 @@ def diff(arr, n, axis=0): dtype = arr.dtype is_timedelta = False + is_bool = False if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view("i8") From 97509e9ddae3c895fcdc6e576a211f12850fef87 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 5 Aug 2019 15:27:41 +0200 Subject: [PATCH 003/191] Slightly rephrase SPSS doc (#27754) --- doc/source/user_guide/io.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 8e5352c337072..947bf15a49c7a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5491,30 +5491,29 @@ The top-level function :func:`read_spss` can read (but not write) SPSS `sav` (.sav) and `zsav` (.zsav) format files. SPSS files contain column names. By default the -whole file is read, categorical columns are converted into ``pd.Categorical`` +whole file is read, categorical columns are converted into ``pd.Categorical``, and a ``DataFrame`` with all columns is returned. 
-Specify a ``usecols`` to obtain a subset of columns. Specify ``convert_categoricals=False`` +Specify the ``usecols`` parameter to obtain a subset of columns. Specify ``convert_categoricals=False`` to avoid converting categorical columns into ``pd.Categorical``. -Read a spss file: +Read an SPSS file: .. code-block:: python - df = pd.read_spss('spss_data.zsav') + df = pd.read_spss('spss_data.sav') -Extract a subset of columns ``usecols`` from SPSS file and +Extract a subset of columns contained in ``usecols`` from an SPSS file and avoid converting categorical columns into ``pd.Categorical``: .. code-block:: python - df = pd.read_spss('spss_data.zsav', usecols=['foo', 'bar'], + df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'], convert_categoricals=False) -More info_ about the sav and zsav file format is available from the IBM -web site. +More information about the `sav` and `zsav` file format is available here_. -.. _info: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm +.. _here: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm .. _io.other: From b3c2453c2293c604d1ca3231c1c77d8cf0999140 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 5 Aug 2019 07:32:30 -0600 Subject: [PATCH 004/191] DOC: update compiling instructions and link. (#27717) --- doc/source/development/contributing.rst | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 80dc8b0d8782b..b38f7767ae073 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -133,22 +133,11 @@ Installing a C compiler Pandas uses C extensions (mostly written using Cython) to speed up certain operations. To install pandas from source, you need to compile these C extensions, which means you need a C compiler. This process depends on which -platform you're using. Follow the `CPython contributing guide -`_ for getting a -compiler installed. You don't need to do any of the ``./configure`` or ``make`` -steps; you only need to install the compiler. - -For Windows developers, when using Python 3.5 and later, it is sufficient to -install `Visual Studio 2017 `_ with the -**Python development workload** and the **Python native development tools** -option. Otherwise, the following links may be helpful. - -* https://blogs.msdn.microsoft.com/pythonengineering/2017/03/07/python-support-in-vs2017/ -* https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ -* https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit -* https://cowboyprogrammer.org/building-python-wheels-for-windows/ -* https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ -* https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy +platform you're using. + +* Windows: https://devguide.python.org/setup/#windows-compiling +* Mac: https://devguide.python.org/setup/#macos +* Unix: https://devguide.python.org/setup/#unix-compiling Let us know if you have any difficulties by opening an issue or reaching out on `Gitter`_. 
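Background on the NumPy behavior that PATCH 001/002 work around: NumPy 1.13+ raises a TypeError for ``-`` on boolean arrays (see the traceback quoted in PATCH 001), while ``^`` (xor) gives exactly the element-wise "value changed" answer that ``diff`` needs. A minimal sketch, plain NumPy only:

    import numpy as np

    a = np.array([True, False, True, True])
    b = np.array([True, True, False, True])

    # a - b  would raise TypeError on NumPy >= 1.13 (boolean subtract removed)
    a ^ b    # array([False,  True,  True, False]), True where the value flipped

This is why the patched ``algorithms.diff`` routes bool blocks through ``arr[res_indexer] ^ arr[lag_indexer]`` rather than the generic subtraction branch.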
From 45f9c60fd0eb069d54b0ee7e6845d91a91baf6fe Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 15:56:45 +0200 Subject: [PATCH 005/191] Added test_diff function --- pandas/tests/series/test_diff.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 pandas/tests/series/test_diff.py diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py new file mode 100644 index 0000000000000..d6c443d2c5b62 --- /dev/null +++ b/pandas/tests/series/test_diff.py @@ -0,0 +1,17 @@ +from pandas import ( + Series +) + +from numpy import nan + +def test_diff(): + data = Series([0,-1,-2,-3,-4,-3,-2,-1,0,-1,-1,0,-1,-2,-3,-2,0]) + + filtered = data.between(-2,0, inclusive = True) + diff_boolean = filtered.diff() + expected_boolean = Series([nan, False, False, True, False, False, True, False, False, False, False, False, False, False, True, True, False]) + assert diff_boolean.equals(expected_boolean) + + diff_data = data.diff() + expected_data = Series([nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 1.0, -1.0, -1.0, -1.0, 1.0, 2.0]) + assert diff_data.equals(expected_data) From 83907a2f1cb5ec2a4e04f22196af7b5fe41e6f22 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 15:58:40 +0200 Subject: [PATCH 006/191] Update test_diff.py --- pandas/tests/series/test_diff.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index d6c443d2c5b62..404688b673de6 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -6,12 +6,10 @@ def test_diff(): data = Series([0,-1,-2,-3,-4,-3,-2,-1,0,-1,-1,0,-1,-2,-3,-2,0]) - filtered = data.between(-2,0, inclusive = True) diff_boolean = filtered.diff() expected_boolean = Series([nan, False, False, True, False, False, True, False, False, False, False, False, False, False, True, True, False]) assert diff_boolean.equals(expected_boolean) - diff_data = data.diff() expected_data = Series([nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 1.0, -1.0, -1.0, -1.0, 1.0, 2.0]) assert diff_data.equals(expected_data) From 6c3875e5b640e2eee9139737864171bcac7a10b6 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 16:04:56 +0200 Subject: [PATCH 007/191] Changing style to match PEP --- pandas/tests/series/test_diff.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 404688b673de6..8eeed64ec432a 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -4,12 +4,21 @@ from numpy import nan + def test_diff(): - data = Series([0,-1,-2,-3,-4,-3,-2,-1,0,-1,-1,0,-1,-2,-3,-2,0]) - filtered = data.between(-2,0, inclusive = True) + data = Series( + [0, -1, -2, -3, -4, -3, -2, -1, 0, -1, -1, 0, -1, -2, -3, -2, 0] + ) + filtered = data.between(-2, 0, inclusive=True) diff_boolean = filtered.diff() - expected_boolean = Series([nan, False, False, True, False, False, True, False, False, False, False, False, False, False, True, True, False]) + expected_boolean = Series( + [nan, False, False, True, False, False, True, False, False, \ + False, False, False, False, False, True, True, False] + ) assert diff_boolean.equals(expected_boolean) diff_data = data.diff() - expected_data = Series([nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 1.0, -1.0, -1.0, -1.0, 1.0, 2.0]) + expected_data = Series( + [nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, \ + -1.0, 0.0, 1.0, 
-1.0, -1.0, -1.0, 1.0, 2.0] + ) assert diff_data.equals(expected_data) From ecf1c04a27e73514ec7e6fc4e6518c6e78a69530 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 16:06:48 +0200 Subject: [PATCH 008/191] Update test_diff.py --- pandas/tests/series/test_diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 8eeed64ec432a..564f45f4c5bc6 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -12,13 +12,13 @@ def test_diff(): filtered = data.between(-2, 0, inclusive=True) diff_boolean = filtered.diff() expected_boolean = Series( - [nan, False, False, True, False, False, True, False, False, \ + [nan, False, False, True, False, False, True, False, False, False, False, False, False, False, True, True, False] ) assert diff_boolean.equals(expected_boolean) diff_data = data.diff() expected_data = Series( - [nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, \ + [nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 1.0, -1.0, -1.0, -1.0, 1.0, 2.0] ) assert diff_data.equals(expected_data) From 0e7ed3d94ad44e4ec3eaf2c1a6429c5d17b4728a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 08:03:44 -0700 Subject: [PATCH 009/191] missed from #27720 (#27759) --- pandas/tests/indexing/test_coercion.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ed80e249220fd..05b58b0eca9b8 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1038,10 +1038,6 @@ def test_replace_series(self, how, to_key, from_key): "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"] ) def test_replace_series_datetime_tz(self, how, to_key, from_key): - how = "series" - from_key = "datetime64[ns, US/Eastern]" - to_key = "timedelta64[ns]" - index = pd.Index([3, 4], name="xyz") obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key From dd8ff9839fa515b8e04dd56b55ed40081127d62a Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 5 Aug 2019 17:04:48 +0200 Subject: [PATCH 010/191] Using tm to assert series differences --- pandas/tests/series/test_diff.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 564f45f4c5bc6..9f6545c5cecbe 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -1,24 +1,23 @@ -from pandas import ( - Series -) - from numpy import nan +from pandas import Series +import pandas.util.testing as tm def test_diff(): + ''' + Tests the pd.Series diff function on boolean series. 
+ ''' data = Series( - [0, -1, -2, -3, -4, -3, -2, -1, 0, -1, -1, 0, -1, -2, -3, -2, 0] + [0, -1, -2, -3, -4, -3, -2, -1, 0] ) filtered = data.between(-2, 0, inclusive=True) diff_boolean = filtered.diff() expected_boolean = Series( - [nan, False, False, True, False, False, True, False, False, - False, False, False, False, False, True, True, False] + [nan, False, False, True, False, False, True, False, False] ) - assert diff_boolean.equals(expected_boolean) + tm.assert_series_equal(diff_boolean, expected_boolean) diff_data = data.diff() expected_data = Series( - [nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, - -1.0, 0.0, 1.0, -1.0, -1.0, -1.0, 1.0, 2.0] + [nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0] ) - assert diff_data.equals(expected_data) + tm.assert_series_equal(diff_data, expected_data) From f9f95c0762af2c114ed27d21288c9d25ab49634d Mon Sep 17 00:00:00 2001 From: Adam Klaum <37817979+Adam-Klaum@users.noreply.github.com> Date: Mon, 5 Aug 2019 11:09:28 -0400 Subject: [PATCH 011/191] ENH: Validation to only allow positive integers for options (#27382) --- doc/source/whatsnew/v1.0.0.rst | 6 ++++++ pandas/_config/config.py | 27 +++++++++++++++++++++++++++ pandas/core/config_init.py | 15 ++++----------- pandas/tests/config/test_config.py | 13 +++++++++++++ 4 files changed, 50 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 04cd5e4c2c918..c7f8bb70e3461 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -195,6 +195,12 @@ ExtensionArray - - + +Other +^^^^^ +- Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) + + .. _whatsnew_1000.contributors: Contributors diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 4f0720abd1445..890db5b41907e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -787,6 +787,7 @@ def is_instance_factory(_type): ValueError if x is not an instance of `_type` """ + if isinstance(_type, (tuple, list)): _type = tuple(_type) type_repr = "|".join(map(str, _type)) @@ -820,6 +821,32 @@ def inner(x): return inner +def is_nonnegative_int(value): + """ + Verify that value is None or a positive int. + + Parameters + ---------- + value : None or int + The `value` to be checked. + + Raises + ------ + ValueError + When the value is not None or is a negative integer + """ + + if value is None: + return + + elif isinstance(value, int): + if value >= 0: + return + + msg = "Value must be a nonnegative integer or None" + raise ValueError(msg) + + # common type validators, for convenience # usage: register_option(... 
, validator = is_int) is_int = is_type_factory(int) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index be6086dd360f2..08dce6aca6e6d 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -17,6 +17,7 @@ is_callable, is_instance_factory, is_int, + is_nonnegative_int, is_one_of_factory, is_text, ) @@ -319,7 +320,7 @@ def is_terminal(): with cf.config_prefix("display"): - cf.register_option("precision", 6, pc_precision_doc, validator=is_int) + cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int) cf.register_option( "float_format", None, @@ -333,12 +334,7 @@ def is_terminal(): pc_max_info_rows_doc, validator=is_instance_factory((int, type(None))), ) - cf.register_option( - "max_rows", - 60, - pc_max_rows_doc, - validator=is_instance_factory([type(None), int]), - ) + cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int) cf.register_option( "min_rows", 10, @@ -352,10 +348,7 @@ def is_terminal(): else: max_cols = 20 # cannot determine optimal number of columns cf.register_option( - "max_columns", - max_cols, - pc_max_cols_doc, - validator=is_instance_factory([type(None), int]), + "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int ) cf.register_option( "large_repr", diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index 3f12d1d7a292d..efaeb7b1471ec 100644 --- a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -208,13 +208,16 @@ def test_set_option_multiple(self): def test_validation(self): self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("d", 1, "doc", validator=self.cf.is_nonnegative_int) self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_text) + msg = "Value must have type ''" with pytest.raises(ValueError, match=msg): self.cf.register_option("a.b.c.d2", "NO", "doc", validator=self.cf.is_int) self.cf.set_option("a", 2) # int is_int self.cf.set_option("b.c", "wurld") # str is_str + self.cf.set_option("d", 2) # None not is_int with pytest.raises(ValueError, match=msg): @@ -222,6 +225,16 @@ def test_validation(self): with pytest.raises(ValueError, match=msg): self.cf.set_option("a", "ab") + msg = "Value must be a nonnegative integer or None" + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + "a.b.c.d3", "NO", "doc", validator=self.cf.is_nonnegative_int + ) + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + "a.b.c.d3", -2, "doc", validator=self.cf.is_nonnegative_int + ) + msg = r"Value must be an instance of \|" with pytest.raises(ValueError, match=msg): self.cf.set_option("b.c", 1) From 2f775b0bf45603e6ae85812bdd9548c8f196146d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 08:52:10 -0700 Subject: [PATCH 012/191] REF: separate out invalid ops (#27735) --- pandas/core/arrays/datetimelike.py | 27 +++++++------ pandas/core/arrays/datetimes.py | 9 +++-- pandas/core/arrays/timedeltas.py | 8 ++-- pandas/core/indexes/base.py | 3 +- pandas/core/ops/__init__.py | 63 ++---------------------------- pandas/core/ops/invalid.py | 61 +++++++++++++++++++++++++++++ 6 files changed, 90 insertions(+), 81 deletions(-) create mode 100644 pandas/core/ops/invalid.py diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2747b1d7dd9f1..770870a466aa9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -44,9 +44,10 @@ from 
pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas._typing import DatetimeLikeScalar -from pandas.core import missing, nanops, ops +from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com +from pandas.core.ops.invalid import make_invalid_op from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick @@ -921,18 +922,18 @@ def _is_unique(self): # pow is invalid for all three subclasses; TimedeltaArray will override # the multiplication and division ops - __pow__ = ops.make_invalid_op("__pow__") - __rpow__ = ops.make_invalid_op("__rpow__") - __mul__ = ops.make_invalid_op("__mul__") - __rmul__ = ops.make_invalid_op("__rmul__") - __truediv__ = ops.make_invalid_op("__truediv__") - __rtruediv__ = ops.make_invalid_op("__rtruediv__") - __floordiv__ = ops.make_invalid_op("__floordiv__") - __rfloordiv__ = ops.make_invalid_op("__rfloordiv__") - __mod__ = ops.make_invalid_op("__mod__") - __rmod__ = ops.make_invalid_op("__rmod__") - __divmod__ = ops.make_invalid_op("__divmod__") - __rdivmod__ = ops.make_invalid_op("__rdivmod__") + __pow__ = make_invalid_op("__pow__") + __rpow__ = make_invalid_op("__rpow__") + __mul__ = make_invalid_op("__mul__") + __rmul__ = make_invalid_op("__rmul__") + __truediv__ = make_invalid_op("__truediv__") + __rtruediv__ = make_invalid_op("__rtruediv__") + __floordiv__ = make_invalid_op("__floordiv__") + __rfloordiv__ = make_invalid_op("__rfloordiv__") + __mod__ = make_invalid_op("__mod__") + __rmod__ = make_invalid_op("__rmod__") + __divmod__ = make_invalid_op("__divmod__") + __rdivmod__ = make_invalid_op("__rdivmod__") def _add_datetimelike_scalar(self, other): # Overriden by TimedeltaArray diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 061ee4b90d0e9..28537124536e7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -53,6 +53,7 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com +from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset from pandas.tseries.offsets import Day, Tick @@ -171,13 +172,13 @@ def wrapper(self, other): other = _to_M8(other, tz=self.tz) except ValueError: # string that cannot be parsed to Timestamp - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) result = op(self.asi8, other.view("i8")) if isna(other): result.fill(nat_result) elif lib.is_scalar(other) or np.ndim(other) == 0: - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: @@ -191,7 +192,7 @@ def wrapper(self, other): ): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) if is_object_dtype(other): # We have to use _comp_method_OBJECT_ARRAY instead of numpy @@ -204,7 +205,7 @@ def wrapper(self, other): o_mask = isna(other) elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. 
is_timedelta64_dtype(other) - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) if isinstance(other, (ABCIndexClass, ABCSeries)): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index afd1e8203059e..94dd561fc96f7 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -41,9 +41,9 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr import pandas.core.common as com +from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick @@ -90,14 +90,14 @@ def wrapper(self, other): other = Timedelta(other) except ValueError: # failed to parse as timedelta - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) result = op(self.view("i8"), other.value) if isna(other): result.fill(nat_result) elif not is_list_like(other): - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") @@ -106,7 +106,7 @@ def wrapper(self, other): try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) result = op(self.view("i8"), other.view("i8")) result = com.values_from_object(result) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2271ff643bc15..57e84282aed72 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -70,7 +70,8 @@ from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing -from pandas.core.ops import get_op_result_name, make_invalid_op +from pandas.core.ops import get_op_result_name +from pandas.core.ops.invalid import make_invalid_op import pandas.core.sorting as sorting from pandas.core.strings import StringMethods diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 48b3d74e8d02c..4ab1941e3493f 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -49,15 +49,15 @@ import pandas as pd from pandas._typing import ArrayLike from pandas.core.construction import extract_array - -from . import missing -from .docstrings import ( +from pandas.core.ops import missing +from pandas.core.ops.docstrings import ( _arith_doc_FRAME, _flex_comp_doc_FRAME, _make_flex_doc, _op_descriptions, ) -from .roperator import ( # noqa:F401 +from pandas.core.ops.invalid import invalid_comparison +from pandas.core.ops.roperator import ( # noqa:F401 radd, rand_, rdiv, @@ -185,29 +185,6 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # ----------------------------------------------------------------------------- -def make_invalid_op(name): - """ - Return a binary method that always raises a TypeError. - - Parameters - ---------- - name : str - - Returns - ------- - invalid_op : function - """ - - def invalid_op(self, other=None): - raise TypeError( - "cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__) - ) - - invalid_op.__name__ = name - return invalid_op - - def _gen_eval_kwargs(name): """ Find the keyword arguments to pass to numexpr for the given operation. 
@@ -476,38 +453,6 @@ def masked_arith_op(x, y, op): return result -def invalid_comparison(left, right, op): - """ - If a comparison has mismatched types and is not necessarily meaningful, - follow python3 conventions by: - - - returning all-False for equality - - returning all-True for inequality - - raising TypeError otherwise - - Parameters - ---------- - left : array-like - right : scalar, array-like - op : operator.{eq, ne, lt, le, gt} - - Raises - ------ - TypeError : on inequality comparisons - """ - if op is operator.eq: - res_values = np.zeros(left.shape, dtype=bool) - elif op is operator.ne: - res_values = np.ones(left.shape, dtype=bool) - else: - raise TypeError( - "Invalid comparison between dtype={dtype} and {typ}".format( - dtype=left.dtype, typ=type(right).__name__ - ) - ) - return res_values - - # ----------------------------------------------------------------------------- # Dispatch logic diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py new file mode 100644 index 0000000000000..013ff7689b221 --- /dev/null +++ b/pandas/core/ops/invalid.py @@ -0,0 +1,61 @@ +""" +Templates for invalid operations. +""" +import operator + +import numpy as np + + +def invalid_comparison(left, right, op): + """ + If a comparison has mismatched types and is not necessarily meaningful, + follow python3 conventions by: + + - returning all-False for equality + - returning all-True for inequality + - raising TypeError otherwise + + Parameters + ---------- + left : array-like + right : scalar, array-like + op : operator.{eq, ne, lt, le, gt} + + Raises + ------ + TypeError : on inequality comparisons + """ + if op is operator.eq: + res_values = np.zeros(left.shape, dtype=bool) + elif op is operator.ne: + res_values = np.ones(left.shape, dtype=bool) + else: + raise TypeError( + "Invalid comparison between dtype={dtype} and {typ}".format( + dtype=left.dtype, typ=type(right).__name__ + ) + ) + return res_values + + +def make_invalid_op(name: str): + """ + Return a binary method that always raises a TypeError. 
+ + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + + def invalid_op(self, other=None): + raise TypeError( + "cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self).__name__) + ) + + invalid_op.__name__ = name + return invalid_op From ac693331400bce4747c63aa76c53c6e3488933ae Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 08:52:44 -0700 Subject: [PATCH 013/191] CLN: collected cleanups from other branches (#27723) --- pandas/_libs/index.pyx | 4 -- pandas/core/arrays/numpy_.py | 6 ++- pandas/core/arrays/timedeltas.py | 4 +- pandas/core/indexes/base.py | 2 +- pandas/core/internals/blocks.py | 54 +++++++++++++++++------- pandas/core/internals/managers.py | 2 +- pandas/core/resample.py | 9 ++-- pandas/core/window.py | 2 + pandas/tests/groupby/test_categorical.py | 2 +- 9 files changed, 54 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f704ceffa662e..7424c4ddc3d92 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -47,10 +47,6 @@ cpdef get_value_at(ndarray arr, object loc, object tz=None): return util.get_value_at(arr, loc) -def get_value_box(arr: ndarray, loc: object) -> object: - return get_value_at(arr, loc, tz=None) - - # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1000000 diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 39529177b9e35..667fb4501ed95 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -125,7 +125,11 @@ def __init__(self, values, copy=False): if isinstance(values, type(self)): values = values._ndarray if not isinstance(values, np.ndarray): - raise ValueError("'values' must be a NumPy array.") + raise ValueError( + "'values' must be a NumPy array, not {typ}".format( + typ=type(values).__name__ + ) + ) if values.ndim != 1: raise ValueError("PandasArray must be 1-dimensional.") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 94dd561fc96f7..6899e47045c1c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -173,8 +173,8 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): "ceil", ] - # Needed so that NaT.__richcmp__(DateTimeArray) operates pointwise - ndim = 1 + # Note: ndim must be defined to ensure NaT.__richcmp(TimedeltaArray) + # operates pointwise. @property def _box_func(self): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 57e84282aed72..356ae20b2240a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4714,7 +4714,7 @@ def get_value(self, series, key): raise try: - return libindex.get_value_box(s, key) + return libindex.get_value_at(s, key) except IndexError: raise except TypeError: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6a2aebe5db246..9f3aa699cfaf4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -434,7 +434,7 @@ def f(m, v, i): return self.split_and_operate(mask, f, inplace) - def split_and_operate(self, mask, f, inplace): + def split_and_operate(self, mask, f, inplace: bool): """ split the block per-column, and apply the callable f per-column, return a new block for each. 
Handle @@ -493,17 +493,15 @@ def make_a_block(nv, ref_loc): return new_blocks - def _maybe_downcast(self, blocks, downcast=None): + def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: # no need to downcast our float # unless indicated - if downcast is None and self.is_float: - return blocks - elif downcast is None and (self.is_timedelta or self.is_datetime): + if downcast is None and ( + self.is_float or self.is_timedelta or self.is_datetime + ): return blocks - if not isinstance(blocks, list): - blocks = [blocks] return _extend_blocks([b.downcast(downcast) for b in blocks]) def downcast(self, dtypes=None): @@ -1343,7 +1341,15 @@ def shift(self, periods, axis=0, fill_value=None): return [self.make_block(new_values)] - def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1442,7 +1448,7 @@ def func(cond, values, other): if try_cast: result = self._try_cast_result(result) - return self.make_block(result) + return [self.make_block(result)] # might need to separate out blocks axis = cond.ndim - 1 @@ -1474,9 +1480,9 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): new_columns : Index All columns of the unstacked BlockManager. n_rows : int - Only used in ExtensionBlock.unstack + Only used in ExtensionBlock._unstack fill_value : int - Only used in ExtensionBlock.unstack + Only used in ExtensionBlock._unstack Returns ------- @@ -1550,7 +1556,7 @@ def quantile(self, qs, interpolation="linear", axis=0): result = result[..., 0] result = lib.item_from_zerodim(result) - ndim = getattr(result, "ndim", None) or 0 + ndim = np.ndim(result) return make_block(result, placement=np.arange(len(result)), ndim=ndim) def _replace_coerce( @@ -1923,7 +1929,15 @@ def shift( ) ] - def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: if isinstance(other, ABCDataFrame): # ExtensionArrays are 1-D, so if we get here then # `other` should be a DataFrame with a single column. @@ -1968,7 +1982,7 @@ def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0) np.where(cond, self.values, other), dtype=dtype ) - return self.make_block_same_class(result, placement=self.mgr_locs) + return [self.make_block_same_class(result, placement=self.mgr_locs)] @property def _ftype(self): @@ -2706,7 +2720,7 @@ def f(m, v, i): return blocks - def _maybe_downcast(self, blocks, downcast=None): + def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: if downcast is not None: return blocks @@ -3031,7 +3045,15 @@ def concat_same_type(self, to_concat, placement=None): values, placement=placement or slice(0, len(values), 1), ndim=self.ndim ) - def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: # TODO(CategoricalBlock.where): # This can all be deleted in favor of ExtensionBlock.where once # we enforce the deprecation. 
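An aside on ``pandas.core.ops.invalid``, the module split out in the preceding patch (REF: separate out invalid ops): the comparison fallback follows Python 3 conventions, all-False for ``==``, all-True for ``!=``, and a TypeError for ordering operators. A usage sketch against the new import path (assumes this patch series is applied):

    import operator
    import numpy as np
    from pandas.core.ops.invalid import invalid_comparison

    left = np.arange(3)
    invalid_comparison(left, "not comparable", operator.eq)  # array([False, False, False])
    invalid_comparison(left, "not comparable", operator.ne)  # array([ True,  True,  True])
    invalid_comparison(left, "not comparable", operator.lt)  # raises TypeError

``make_invalid_op`` is the arithmetic counterpart: it builds a method that always raises, which the datetimelike arrays bind to ``__pow__``, ``__mod__`` and friends.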
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e5acd23b77d5d..b30ddbc383906 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1823,7 +1823,7 @@ def _simple_blockify(tuples, dtype): """ values, placement = _stack_arrays(tuples, dtype) - # CHECK DTYPE? + # TODO: CHECK DTYPE? if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 66878c3b1026c..a5d0e2cb3b58f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1630,15 +1630,14 @@ def _get_period_bins(self, ax): def _take_new_index(obj, indexer, new_index, axis=0): - from pandas.core.api import Series, DataFrame - if isinstance(obj, Series): + if isinstance(obj, ABCSeries): new_values = algos.take_1d(obj.values, indexer) - return Series(new_values, index=new_index, name=obj.name) - elif isinstance(obj, DataFrame): + return obj._constructor(new_values, index=new_index, name=obj.name) + elif isinstance(obj, ABCDataFrame): if axis == 1: raise NotImplementedError("axis 1 is not supported") - return DataFrame( + return obj._constructor( obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) ) else: diff --git a/pandas/core/window.py b/pandas/core/window.py index 4b6a1cf2e9a04..a7425bc1466c3 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -265,6 +265,8 @@ def _wrap_result(self, result, block=None, obj=None): # coerce if necessary if block is not None: if is_timedelta64_dtype(block.values.dtype): + # TODO: do we know what result.dtype is at this point? + # i.e. can we just do an astype? from pandas import to_timedelta result = to_timedelta(result.ravel(), unit="ns").values.reshape( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9b8c8e6d8a077..ce724f5a60beb 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -506,7 +506,7 @@ def test_datetime(): desc_result = grouped.describe() idx = cats.codes.argsort() - ord_labels = cats.take_nd(idx) + ord_labels = cats.take(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels, observed=False).describe() assert_frame_equal(desc_result, expected) From 61819aba14dd7b3996336aaed84d07cd936d92b5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 13:29:16 -0700 Subject: [PATCH 014/191] BUG: fix to_datetime(dti, utc=True) (#27733) * BUG: fix to_datetime(dti, utc=True) * whatsnew, suggested edits * parametrize * backquotes --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/core/tools/datetimes.py | 3 +++ pandas/tests/indexes/datetimes/test_tools.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 4d9ee4c676759..943a6adb7944e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -31,7 +31,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - +- Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`) - - - diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 172084e97a959..b07647cf5b5fb 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -334,6 +334,9 @@ def _convert_listlike_datetimes( return DatetimeIndex(arg, tz=tz, name=name) 
except ValueError: pass + elif tz: + # DatetimeArray, DatetimeIndex + return arg.tz_localize(tz) return arg diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 8db15709da35d..9af0f47f6dce9 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1620,6 +1620,18 @@ def test_dayfirst(self, cache): tm.assert_index_equal(expected, idx5) tm.assert_index_equal(expected, idx6) + @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) + def test_to_datetime_dta_tz(self, klass): + # GH#27733 + dti = date_range("2015-04-05", periods=3).rename("foo") + expected = dti.tz_localize("UTC") + + obj = klass(dti) + expected = klass(expected) + + result = to_datetime(obj, utc=True) + tm.assert_equal(result, expected) + class TestGuessDatetimeFormat: @td.skip_if_not_us_locale From e123486314f90a6ef129901114fbeb93283e5233 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 11:21:07 +0200 Subject: [PATCH 015/191] Added diff bug description to changelog --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 4d9ee4c676759..4b0358a893d15 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -153,7 +153,7 @@ ExtensionArray Other ^^^^^ - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) -- +- Bug in :meth:`Series.diff` where a boolean series would cause a TypeError (the - operator is deprecated) when using NumPy >= 0.13.0 - - From 269d148d2f5d88b04d55badf1b5c41edadd33a58 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 11:31:48 +0200 Subject: [PATCH 016/191] Added issue number --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 4b0358a893d15..181b208d70d49 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -153,7 +153,7 @@ ExtensionArray Other ^^^^^ - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) -- Bug in :meth:`Series.diff` where a boolean series would cause a TypeError (the - operator is deprecated) when using NumPy >= 0.13.0 +- Bug in :meth:`Series.diff` where a boolean series would cause a TypeError (the - operator is deprecated) when using NumPy >= 0.13.0 (:issue:`17294`) - - From 3993206038774ae174d1dd7f35fc82df2b8f030e Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 12:15:18 +0200 Subject: [PATCH 017/191] Adding timeseries tests as well --- pandas/tests/series/test_diff.py | 82 +++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 9f6545c5cecbe..af2d078ed2d0c 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -1,23 +1,59 @@ -from numpy import nan -from pandas import Series -import pandas.util.testing as tm - - -def test_diff(): - ''' - Tests the pd.Series diff function on boolean series. 
- ''' - data = Series( - [0, -1, -2, -3, -4, -3, -2, -1, 0] - ) - filtered = data.between(-2, 0, inclusive=True) - diff_boolean = filtered.diff() - expected_boolean = Series( - [nan, False, False, True, False, False, True, False, False] - ) - tm.assert_series_equal(diff_boolean, expected_boolean) - diff_data = data.diff() - expected_data = Series( - [nan, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0] - ) - tm.assert_series_equal(diff_data, expected_data) +from pandas import ( + Series, + date_range, +) +from pandas.tests.series.common import TestData +from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.util.testing import assert_series_equal + +class TestDiff(TestData): + def test_diff(self): + # Just run the function + self.ts.diff() + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = s.diff() + assert rs[1] == 1 + + # neg n + rs = self.ts.diff(-1) + xp = self.ts - self.ts.shift(-1) + assert_series_equal(rs, xp) + + # 0 + rs = self.ts.diff(0) + xp = self.ts - self.ts + assert_series_equal(rs, xp) + + # datetime diff (GH3100) + s = Series(date_range("20130102", periods=5)) + rs = s - s.shift(1) + xp = s.diff() + assert_series_equal(rs, xp) + + # timedelta diff + nrs = rs - rs.shift(1) + nxp = xp.diff() + assert_series_equal(nrs, nxp) + + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s.diff() + assert_series_equal( + result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + ) + + # boolean series + s = Series( + [False, True, True, False, False] + ) + result = s.diff() + assert_series_equal( + result, Series[nan, True, False, True, False] + ) From b610dd9fec32d88a8653cd7f264c94498de51411 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 12:16:56 +0200 Subject: [PATCH 018/191] Removed test_diff, it is added to test_diff.py I copied the data over to the other file, diff tests should be in one file and the boolean test cannot be added to this one as it is not a time series. 
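For reference, the behavior the moved tests pin down, assuming the xor fix from PATCH 001/002 is applied: ``Series.diff`` on a boolean series returns object dtype, NaN in the first slot, then element-wise "changed?" flags:

    import numpy as np
    import pandas as pd

    s = pd.Series([False, True, True, False, False])
    s.diff()
    # expected: Series([nan, True, False, True, False], dtype=object)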
--- pandas/tests/series/test_timeseries.py | 42 -------------------------- 1 file changed, 42 deletions(-) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index d0ca5d82c6b33..fbe3f929cf5b5 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -355,48 +355,6 @@ def test_asfreq_datetimeindex_empty_series(self): ) tm.assert_index_equal(expected.index, result.index) - def test_diff(self): - # Just run the function - self.ts.diff() - - # int dtype - a = 10000000000000000 - b = a + 1 - s = Series([a, b]) - - rs = s.diff() - assert rs[1] == 1 - - # neg n - rs = self.ts.diff(-1) - xp = self.ts - self.ts.shift(-1) - assert_series_equal(rs, xp) - - # 0 - rs = self.ts.diff(0) - xp = self.ts - self.ts - assert_series_equal(rs, xp) - - # datetime diff (GH3100) - s = Series(date_range("20130102", periods=5)) - rs = s - s.shift(1) - xp = s.diff() - assert_series_equal(rs, xp) - - # timedelta diff - nrs = rs - rs.shift(1) - nxp = xp.diff() - assert_series_equal(nrs, nxp) - - # with tz - s = Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s.diff() - assert_series_equal( - result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - ) - def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) From 92075fbc3cec3426c71752d71ab6739e620cae5b Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 13:06:10 +0200 Subject: [PATCH 019/191] Removed too much code, importing nan again --- pandas/tests/series/test_diff.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index af2d078ed2d0c..dc77e15fdbec5 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -1,3 +1,4 @@ +from numpy import nan from pandas import ( Series, date_range, From 1b906575f2606a66e307c5bc99dcbb2de379b23e Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 13:08:38 +0200 Subject: [PATCH 020/191] STYLE: formatting... 
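The reformatted tests also exercise the datetime case from GH3100: ``diff`` on datetime64 data yields timedeltas. For reference:

    import pandas as pd

    s = pd.Series(pd.date_range("20130102", periods=3))
    s.diff()
    # 0      NaT
    # 1   1 days
    # 2   1 days
    # dtype: timedelta64[ns]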
--- pandas/tests/series/test_diff.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index dc77e15fdbec5..1bb9c57701748 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -1,12 +1,10 @@ from numpy import nan -from pandas import ( - Series, - date_range, -) +from pandas import Series, date_range from pandas.tests.series.common import TestData from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.util.testing import assert_series_equal + class TestDiff(TestData): def test_diff(self): # Just run the function @@ -51,10 +49,6 @@ def test_diff(self): ) # boolean series - s = Series( - [False, True, True, False, False] - ) + s = Series([False, True, True, False, False]) result = s.diff() - assert_series_equal( - result, Series[nan, True, False, True, False] - ) + assert_series_equal(result, Series[nan, True, False, True, False]) From 182cbc25b5535d65fd7284b95d4f893c6ea36782 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 13:37:34 +0200 Subject: [PATCH 021/191] Splitting into two functions --- pandas/tests/series/test_diff.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 1bb9c57701748..a37f64956f834 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -6,7 +6,7 @@ class TestDiff(TestData): - def test_diff(self): + def test_ts_diff(self): # Just run the function self.ts.diff() @@ -47,7 +47,8 @@ def test_diff(self): assert_series_equal( result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") ) - + + def test_boolean_diff(self): # boolean series s = Series([False, True, True, False, False]) result = s.diff() From 3ba0958f98d15f8bf2a72a94a018400be9b8489b Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 13:48:35 +0200 Subject: [PATCH 022/191] Split into 3 functions --- pandas/tests/series/test_diff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index a37f64956f834..692b0c5ae9c1a 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -27,7 +27,8 @@ def test_ts_diff(self): rs = self.ts.diff(0) xp = self.ts - self.ts assert_series_equal(rs, xp) - + + def test_datetime_diff(self): # datetime diff (GH3100) s = Series(date_range("20130102", periods=5)) rs = s - s.shift(1) From 47feb3056a2d1447d46adcdb82732604d579a414 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 14:34:21 +0200 Subject: [PATCH 023/191] Update test_diff.py --- pandas/tests/series/test_diff.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 692b0c5ae9c1a..a15c72360aada 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -6,7 +6,7 @@ class TestDiff(TestData): - def test_ts_diff(self): + def test_diff(self): # Just run the function self.ts.diff() @@ -28,7 +28,6 @@ def test_ts_diff(self): xp = self.ts - self.ts assert_series_equal(rs, xp) - def test_datetime_diff(self): # datetime diff (GH3100) s = Series(date_range("20130102", periods=5)) rs = s - s.shift(1) @@ -49,7 +48,6 @@ def test_datetime_diff(self): result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") ) - def test_boolean_diff(self): # boolean series s = Series([False, 
True, True, False, False]) result = s.diff() From d425cdf273d13c53010a8b8ba6194b2d56d4c3b7 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 15:25:20 +0200 Subject: [PATCH 024/191] Adding nan test --- pandas/tests/series/test_diff.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index a15c72360aada..43d7a9d41cf77 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -51,4 +51,9 @@ def test_diff(self): # boolean series s = Series([False, True, True, False, False]) result = s.diff() - assert_series_equal(result, Series[nan, True, False, True, False]) + assert_series_equal(result, Series([nan, True, False, True, False])) + + # boolean nan series + s = Series([False, True, nan, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype = "object")) From 050b8cb424e31fd7118354b033f3c40759999a30 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 16:05:54 +0200 Subject: [PATCH 025/191] STYLE: Black formatting --- pandas/tests/series/test_diff.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 43d7a9d41cf77..7e303be7ee770 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -27,7 +27,7 @@ def test_diff(self): rs = self.ts.diff(0) xp = self.ts - self.ts assert_series_equal(rs, xp) - + # datetime diff (GH3100) s = Series(date_range("20130102", periods=5)) rs = s - s.shift(1) @@ -47,13 +47,13 @@ def test_diff(self): assert_series_equal( result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") ) - + # boolean series s = Series([False, True, True, False, False]) result = s.diff() assert_series_equal(result, Series([nan, True, False, True, False])) - + # boolean nan series s = Series([False, True, nan, False, False]) result = s.diff() - assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype = "object")) + assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) From e028aefa828dfa89a68fc90d3a16764826b36e32 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Tue, 6 Aug 2019 17:02:33 +0200 Subject: [PATCH 026/191] Import order changed --- pandas/tests/series/test_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 7e303be7ee770..1d8c17dfa989b 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -1,7 +1,7 @@ from numpy import nan from pandas import Series, date_range -from pandas.tests.series.common import TestData from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.tests.series.common import TestData from pandas.util.testing import assert_series_equal From f669f94a186ea444cc771985a915e90eecf218a9 Mon Sep 17 00:00:00 2001 From: kernc Date: Tue, 6 Aug 2019 17:39:19 +0200 Subject: [PATCH 027/191] BUG: Fix windowing over read-only arrays (#27767) --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/core/window.py | 6 ++++-- pandas/tests/window/test_rolling.py | 8 ++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 943a6adb7944e..66b760a76dad3 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -118,6 +118,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in 
:meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) +- Bug in windowing over read-only arrays (:issue:`27766`) - - diff --git a/pandas/core/window.py b/pandas/core/window.py index a7425bc1466c3..3e3f17369db7b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -246,8 +246,10 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: except (ValueError, TypeError): raise TypeError("cannot handle this type -> {0}".format(values.dtype)) - # Always convert inf to nan - values[np.isinf(values)] = np.NaN + # Convert inf to nan for C funcs + inf = np.isinf(values) + if inf.any(): + values = np.where(inf, np.nan, values) return values diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index c7177e1d3914f..f0787ab3d191f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -326,3 +326,11 @@ def test_rolling_axis_count(self, axis_frame): result = df.rolling(2, axis=axis_frame).count() tm.assert_frame_equal(result, expected) + + def test_readonly_array(self): + # GH-27766 + arr = np.array([1, 3, np.nan, 3, 5]) + arr.setflags(write=False) + result = pd.Series(arr).rolling(2).mean() + expected = pd.Series([np.nan, 2, np.nan, np.nan, 4]) + tm.assert_series_equal(result, expected) From 341043d25e38f6f6f84b4609b2ab7feb96df7789 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Aug 2019 08:40:39 -0700 Subject: [PATCH 028/191] CLN/REF: Remove _try_cast_result, _try_coerce_and_cast_result (#27764) --- pandas/core/groupby/generic.py | 10 ++++--- pandas/core/groupby/ops.py | 2 ++ pandas/core/internals/blocks.py | 46 ++++----------------------------- 3 files changed, 13 insertions(+), 45 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2ad85903b916b..ea2bd22cccc3d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,7 +21,11 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype +from pandas.core.dtypes.cast import ( + maybe_convert_objects, + maybe_downcast_numeric, + maybe_downcast_to_dtype, +) from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, @@ -180,10 +184,8 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): continue finally: if result is not no_result: - dtype = block.values.dtype - # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result, dtype=dtype) + result = maybe_downcast_numeric(result, block.dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 676f243c9c8d3..b0c629f017dd3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -591,6 +591,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) + elif is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) return result, names diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9f3aa699cfaf4..8c3cf7cc51495 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,6 +18,7 
@@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, + maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, @@ -55,7 +56,6 @@ ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, - ABCIndexClass, ABCPandasArray, ABCSeries, ) @@ -685,28 +685,6 @@ def _can_hold_element(self, element): return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def _try_cast_result(self, result, dtype=None): - """ try to cast the result to our original type, we may have - roundtripped thru object in the mean-time - """ - if dtype is None: - dtype = self.dtype - - if self.is_integer or self.is_bool or self.is_datetime: - pass - elif self.is_float and result.dtype == self.dtype: - # protect against a bool/object showing up here - if isinstance(dtype, str) and dtype == "infer": - return result - - # This is only reached via Block.setitem, where dtype is always - # either "infer", self.dtype, or values.dtype. - assert dtype == self.dtype, (dtype, self.dtype) - return result - - # may need to change the dtype here - return maybe_downcast_to_dtype(result, dtype) - def _try_coerce_args(self, other): """ provide coercion to our input arguments """ @@ -729,10 +707,6 @@ def _try_coerce_args(self, other): return other - def _try_coerce_and_cast_result(self, result, dtype=None): - result = self._try_cast_result(result, dtype=dtype) - return result - def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -925,8 +899,6 @@ def setitem(self, indexer, value): else: values[indexer] = value - # coerce and try to infer the dtypes of the result - values = self._try_coerce_and_cast_result(values, dtype) if transpose: values = values.T block = self.make_block(values) @@ -1444,10 +1416,6 @@ def func(cond, values, other): if transpose: result = result.T - # try to cast if requested - if try_cast: - result = self._try_cast_result(result) - return [self.make_block(result)] # might need to separate out blocks @@ -1459,7 +1427,7 @@ def func(cond, values, other): for m in [mask, ~mask]: if m.any(): taken = result.take(m.nonzero()[0], axis=axis) - r = self._try_cast_result(taken) + r = maybe_downcast_numeric(taken, self.dtype) nb = self.make_block(r.T, placement=self.mgr_locs[m]) result_blocks.append(nb) @@ -1692,9 +1660,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new_values[mask] = new return [self.make_block(values=new_values)] - def _try_cast_result(self, result, dtype=None): - return result - def _get_unstack_items(self, unstacker, new_columns): """ Get the placement, values, and mask for a Block unstack. @@ -1746,7 +1711,8 @@ def __init__(self, values, placement, ndim=None): super().__init__(values, placement, ndim) def _maybe_coerce_values(self, values): - """Unbox to an extension array. + """ + Unbox to an extension array. This will unbox an ExtensionArray stored in an Index or Series. ExtensionArrays pass through. No dtype coercion is done. 
@@ -1759,9 +1725,7 @@ def _maybe_coerce_values(self, values):
         -------
         ExtensionArray
         """
-        if isinstance(values, (ABCIndexClass, ABCSeries)):
-            values = values._values
-        return values
+        return extract_array(values)
 
     @property
     def _holder(self):

From 640d9e1f5fe8ab64d1f6496b8216c28185e53225 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Tue, 6 Aug 2019 17:45:30 +0200
Subject: [PATCH 029/191] BUG: pd.crosstab not working when margin and
 normalize are set together (#27663)

---
 doc/source/whatsnew/v0.25.1.rst    |  1 +
 pandas/core/reshape/pivot.py       | 29 +++++++----
 pandas/tests/reshape/test_pivot.py | 81 ++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 66b760a76dad3..f1d3f152e503d 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -126,6 +126,7 @@ Reshaping
 ^^^^^^^^^
 
 - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`)
+- Bug in :func:`pandas.crosstab` where an error was raised when ``margins`` was set to ``True`` and ``normalize`` was not ``False`` (:issue:`27500`)
 - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`)
 -
 
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 79716520f6654..d653dd87308cf 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -611,13 +611,21 @@ def _normalize(table, normalize, margins, margins_name="All"):
         table = table.fillna(0)
 
     elif margins is True:
-
-        column_margin = table.loc[:, margins_name].drop(margins_name)
-        index_margin = table.loc[margins_name, :].drop(margins_name)
-        table = table.drop(margins_name, axis=1).drop(margins_name)
-        # to keep index and columns names
-        table_index_names = table.index.names
-        table_columns_names = table.columns.names
+        # keep index and columns of the pivoted table
+        table_index = table.index
+        table_columns = table.columns
+
+        # check that the margin name is contained in (for MI cases) or equal
+        # to the last index/column label; raise otherwise, then save margins
+        if (margins_name not in table.iloc[-1, :].name) | (
+            margins_name != table.iloc[:, -1].name
+        ):
+            raise ValueError("{} not in pivoted DataFrame".format(margins_name))
+        column_margin = table.iloc[:-1, -1]
+        index_margin = table.iloc[-1, :-1]
+
+        # keep the core table
+        table = table.iloc[:-1, :-1]
 
         # Normalize core
         table = _normalize(table, normalize=normalize, margins=False)
@@ -627,11 +635,13 @@ def _normalize(table, normalize, margins, margins_name="All"):
             column_margin = column_margin / column_margin.sum()
             table = concat([table, column_margin], axis=1)
             table = table.fillna(0)
+            table.columns = table_columns
 
         elif normalize == "index":
             index_margin = index_margin / index_margin.sum()
             table = table.append(index_margin)
             table = table.fillna(0)
+            table.index = table_index
 
         elif normalize == "all" or normalize is True:
             column_margin = column_margin / column_margin.sum()
@@ -641,13 +651,12 @@ def _normalize(table, normalize, margins, margins_name="All"):
 
             table = table.append(index_margin)
             table = table.fillna(0)
+            table.index = table_index
+            table.columns = table_columns
 
         else:
             raise ValueError("Not a valid normalize argument")
 
-        table.index.names = table_index_names
-        table.columns.names = table_columns_names
-
     else:
         raise ValueError("Not a valid margins argument")
 
diff --git
a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index be82e7f595f8c..03b15d2df1a26 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2447,3 +2447,84 @@ def test_crosstab_unsorted_order(self): [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns ) tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = pd.DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = pd.DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = pd.DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) From 54e58039fddc79492e598e85279c42e85d06967c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 6 Aug 2019 22:55:16 +0200 Subject: [PATCH 030/191] DOC: Validate docstring directives (#27630) Closes gh-27629 --- ci/code_checks.sh | 4 +-- scripts/tests/test_validate_docstrings.py | 32 ++++++++++++++++++++++- scripts/validate_docstrings.py | 10 +++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 06d45e38bfcdb..333136ddfddd9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -263,8 +263,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, 
PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index f3364e6725a20..35aaf10458f44 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -200,7 +200,7 @@ def contains(self, pat, case=True, na=np.nan): def mode(self, axis, numeric_only): """ - Ensure sphinx directives don't affect checks for trailing periods. + Ensure reST directives don't affect checks for leading periods. Parameters ---------- @@ -447,6 +447,27 @@ def deprecation_in_wrong_order(self): def method_wo_docstrings(self): pass + def directives_without_two_colons(self, first, second): + """ + Ensure reST directives have trailing colons. + + Parameters + ---------- + first : str + Sentence ending in period, followed by single directive w/o colons. + + .. versionchanged 0.1.2 + + second : bool + Sentence ending in period, followed by multiple directives w/o + colons. + + .. versionadded 0.1.2 + .. deprecated 0.00.0 + + """ + pass + class BadSummaries: def wrong_line(self): @@ -840,6 +861,7 @@ def test_bad_class(self, capsys): "plot", "method", "private_classes", + "directives_without_two_colons", ], ) def test_bad_generic_functions(self, capsys, func): @@ -879,6 +901,14 @@ def test_bad_generic_functions(self, capsys, func): "deprecation_in_wrong_order", ("Deprecation warning should precede extended summary",), ), + ( + "BadGenericDocStrings", + "directives_without_two_colons", + ( + "reST directives ['versionchanged', 'versionadded', " + "'deprecated'] must be followed by two colons", + ), + ), ( "BadSeeAlso", "desc_no_period", diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 37623d32db685..bf5d861281a36 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -59,6 +59,7 @@ PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] DIRECTIVES = ["versionadded", "versionchanged", "deprecated"] +DIRECTIVE_PATTERN = re.compile(rf"^\s*\.\. ({'|'.join(DIRECTIVES)})(?!::)", re.I | re.M) ALLOWED_SECTIONS = [ "Parameters", "Attributes", @@ -93,6 +94,7 @@ "GL07": "Sections are in the wrong order. Correct order is: " "{correct_sections}", "GL08": "The object does not have a docstring", "GL09": "Deprecation warning should precede extended summary", + "GL10": "reST directives {directives} must be followed by two colons", "SS01": "No summary found (a short summary in a single line should be " "present at the beginning of the docstring)", "SS02": "Summary does not start with a capital letter", @@ -478,6 +480,10 @@ def parameter_mismatches(self): def correct_parameters(self): return not bool(self.parameter_mismatches) + @property + def directives_without_two_colons(self): + return DIRECTIVE_PATTERN.findall(self.raw_doc) + def parameter_type(self, param): return self.doc_parameters[param][0] @@ -697,6 +703,10 @@ def get_validation_data(doc): if doc.deprecated and not doc.extended_summary.startswith(".. 
deprecated:: "): errs.append(error("GL09")) + directives_without_two_colons = doc.directives_without_two_colons + if directives_without_two_colons: + errs.append(error("GL10", directives=directives_without_two_colons)) + if not doc.summary: errs.append(error("SS01")) else: From a45760fd45b434caf9107bb19f1536636cc3fbd8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 7 Aug 2019 07:23:25 +0100 Subject: [PATCH 031/191] DEPR: Removed the previously deprecated ExtensionArray._formatting_values (#27774) --- doc/source/reference/extensions.rst | 1 - doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/arrays/base.py | 16 ----------- pandas/core/internals/blocks.py | 27 +------------------ pandas/core/internals/managers.py | 4 --- pandas/core/series.py | 7 ----- pandas/io/formats/format.py | 9 ++++--- .../tests/extension/decimal/test_decimal.py | 11 -------- pandas/tests/extension/test_external_block.py | 21 +-------------- 9 files changed, 8 insertions(+), 90 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 407aab4bb1f1b..78e8734e9b5ff 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -34,7 +34,6 @@ objects. api.extensions.ExtensionArray._concat_same_type api.extensions.ExtensionArray._formatter - api.extensions.ExtensionArray._formatting_values api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c7f8bb70e3461..bca7bf8cbefbd 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -65,7 +65,7 @@ Removal of prior version deprecations/changes - Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) -- +- Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) .. _whatsnew_1000.performance: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e517be4f03a16..00e1d092ffa22 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -66,7 +66,6 @@ class ExtensionArray: unique _concat_same_type _formatter - _formatting_values _from_factorized _from_sequence _from_sequence_of_strings @@ -908,21 +907,6 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: return str return repr - def _formatting_values(self) -> np.ndarray: - # At the moment, this has to be an array since we use result.dtype - """ - An array of values to be printed in, e.g. the Series repr - - .. deprecated:: 0.24.0 - - Use :meth:`ExtensionArray._formatter` instead. 
- - Returns - ------- - array : ndarray - """ - return np.array(self) - # ------------------------------------------------------------------------ # Reshaping # ------------------------------------------------------------------------ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8c3cf7cc51495..f035906e68ab8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -68,13 +68,7 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ( - Categorical, - DatetimeArray, - ExtensionArray, - PandasDtype, - TimedeltaArray, -) +from pandas.core.arrays import Categorical, DatetimeArray, PandasDtype, TimedeltaArray from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -209,10 +203,6 @@ def internal_values(self, dtype=None): """ return self.values - def formatting_values(self): - """Return the internal values used by the DataFrame/SeriesFormatter""" - return self.internal_values() - def get_values(self, dtype=None): """ return an internal format, currently just the ndarray @@ -1831,21 +1821,6 @@ def _slice(self, slicer): return self.values[slicer] - def formatting_values(self): - # Deprecating the ability to override _formatting_values. - # Do the warning here, it's only user in pandas, since we - # have to check if the subclass overrode it. - fv = getattr(type(self.values), "_formatting_values", None) - if fv and fv != ExtensionArray._formatting_values: - msg = ( - "'ExtensionArray._formatting_values' is deprecated. " - "Specify 'ExtensionArray._formatter' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=10) - return self.values._formatting_values() - - return self.values - def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b30ddbc383906..1c31542daa5de 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1582,10 +1582,6 @@ def external_values(self): def internal_values(self): return self._block.internal_values() - def formatting_values(self): - """Return the internal values used by the DataFrame/SeriesFormatter""" - return self._block.formatting_values() - def get_values(self): """ return a dense type view """ return np.array(self._block.to_dense(), copy=False) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e317d365ccb8..4e64a25e430eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -562,13 +562,6 @@ def _values(self): """ return self._data.internal_values() - def _formatting_values(self): - """ - Return the values that can be formatted (used by SeriesFormatter - and DataFrameFormatter). - """ - return self._data.formatting_values() - def get_values(self): """ Same as values (but handles sparseness conversions); is a view. 
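
With `_formatting_values` removed, `_formatter` is the single remaining display
hook for extension arrays. A minimal sketch of how a third-party array would
customize its rendering after this change (required abstract methods elided;
the class and format string are illustrative only):

    from pandas.api.extensions import ExtensionArray

    class MyArray(ExtensionArray):
        # _from_sequence, __getitem__, dtype, etc. omitted for brevity

        def _formatter(self, boxed=False):
            # return a callable mapping each scalar to its string form;
            # `boxed` is True when rendering inside a Series/DataFrame repr
            return lambda x: "Val: {}".format(x)
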
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d8a370d77ea31..61af935bd8227 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -336,9 +336,11 @@ def _get_formatted_index(self) -> Tuple[List[str], bool]: return fmt_index, have_header def _get_formatted_values(self) -> List[str]: - values_to_format = self.tr_series._formatting_values() return format_array( - values_to_format, None, float_format=self.float_format, na_rep=self.na_rep + self.tr_series._values, + None, + float_format=self.float_format, + na_rep=self.na_rep, ) def to_string(self) -> str: @@ -903,9 +905,8 @@ def to_latex( def _format_col(self, i: int) -> List[str]: frame = self.tr_frame formatter = self._get_formatter(i) - values_to_format = frame.iloc[:, i]._formatting_values() return format_array( - values_to_format, + frame.iloc[:, i]._values, formatter, float_format=self.float_format, na_rep=self.na_rep, diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 9dec023f4073a..3ac9d37ccf4f3 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -392,17 +392,6 @@ def test_ufunc_fallback(data): tm.assert_series_equal(result, expected) -def test_formatting_values_deprecated(): - class DecimalArray2(DecimalArray): - def _formatting_values(self): - return np.array(self) - - ser = pd.Series(DecimalArray2([decimal.Decimal("1.0")])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - repr(ser) - - def test_array_ufunc(): a = to_decimal([1, 2, 3]) result = np.exp(a) diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 1a4f84e2c0fd2..6311070cfe2bb 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas.core.internals import BlockManager, SingleBlockManager +from pandas.core.internals import BlockManager from pandas.core.internals.blocks import Block, NonConsolidatableMixIn @@ -10,9 +10,6 @@ class CustomBlock(NonConsolidatableMixIn, Block): _holder = np.ndarray - def formatting_values(self): - return np.array(["Val: {}".format(i) for i in self.values]) - def concat_same_type(self, to_concat, placement=None): """ Always concatenate disregarding self.ndim as the values are @@ -35,22 +32,6 @@ def df(): return pd.DataFrame(block_manager) -def test_custom_repr(): - values = np.arange(3, dtype="int64") - - # series - block = CustomBlock(values, placement=slice(0, 3)) - - s = pd.Series(SingleBlockManager(block, pd.RangeIndex(3))) - assert repr(s) == "0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64" - - # dataframe - block = CustomBlock(values, placement=slice(0, 1)) - blk_mgr = BlockManager([block], [["col"], range(3)]) - df = pd.DataFrame(blk_mgr) - assert repr(df) == " col\n0 Val: 0\n1 Val: 1\n2 Val: 2" - - def test_concat_series(): # GH17728 values = np.arange(3, dtype="int64") From 820e09e0336b7011edb7c44f722cf9d0f2c2ee52 Mon Sep 17 00:00:00 2001 From: Bhavani Ravi Date: Wed, 7 Aug 2019 15:08:41 +0530 Subject: [PATCH 032/191] CI: Fix setting PATH in azure pipelines (#27787) --- azure-pipelines.yml | 22 ++++------------------ ci/azure/posix.yml | 7 ++----- ci/azure/windows.yml | 5 +++-- 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cfd7f6546833d..263a87176a9c9 100644 --- a/azure-pipelines.yml +++ 
b/azure-pipelines.yml @@ -22,22 +22,17 @@ jobs: timeoutInMinutes: 90 steps: - script: | - # XXX next command should avoid redefining the path in every step, but - # made the process crash as it couldn't find deactivate - #echo '##vso[task.prependpath]$HOME/miniconda3/bin' + echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' echo '##vso[task.setvariable variable=AZURE]true' displayName: 'Setting environment variables' # Do not require a conda environment - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - ci/code_checks.sh patterns + - script: ci/code_checks.sh patterns displayName: 'Looking for unwanted patterns' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH sudo apt-get install -y libc6-dev-i386 ci/setup_env.sh displayName: 'Setup environment and build pandas' @@ -45,14 +40,12 @@ jobs: # Do not require pandas - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh lint displayName: 'Linting' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh dependencies displayName: 'Dependencies consistency' @@ -60,42 +53,36 @@ jobs: # Require pandas - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh code displayName: 'Checks on imported code' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh doctests displayName: 'Running doctests' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh docstrings displayName: 'Docstring validation' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh typing displayName: 'Typing validation' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev pytest --capture=no --strict scripts - displayName: 'Testing docstring validaton script' + displayName: 'Testing docstring validation script' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev cd asv_bench asv check -E existing @@ -124,16 +111,15 @@ jobs: steps: - script: | echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' + echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' displayName: 'Setting environment variables' - script: | - export PATH=$HOME/miniconda3/bin:$PATH sudo apt-get install -y libc6-dev-i386 ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev # Next we should simply have `doc/make.py --warnings-are-errors`, everything else is required because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547) doc/make.py --warnings-are-errors | tee sphinx.log ; SPHINX_RET=${PIPESTATUS[0]} diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 39f862290e720..6093df46ffb60 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -56,17 +56,15 @@ jobs: steps: - script: | if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi + echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' echo "Creating Environment" ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/run_tests.sh displayName: 'Test' - - script: | - export 
PATH=$HOME/miniconda3/bin:$PATH - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - task: PublishTestResults@2 inputs: testResultsFiles: 'test-data-*.xml' @@ -97,7 +95,6 @@ jobs: } displayName: 'Check for test failures' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 20cad1bb4af96..dfa82819b9826 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -17,7 +17,9 @@ jobs: CONDA_PY: "37" steps: - - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" + - powershell: | + Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" + Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" displayName: 'Add conda to PATH' - script: conda update -q -n base conda displayName: Update conda @@ -52,7 +54,6 @@ jobs: } displayName: 'Check for test failures' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' From 9d947dbd6f18a563e882a567dc6035382c8d30cf Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Wed, 7 Aug 2019 13:08:01 +0200 Subject: [PATCH 033/191] Update test_diff.py --- pandas/tests/series/test_diff.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 1d8c17dfa989b..6df0955b96c3d 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -51,9 +51,4 @@ def test_diff(self): # boolean series s = Series([False, True, True, False, False]) result = s.diff() - assert_series_equal(result, Series([nan, True, False, True, False])) - - # boolean nan series - s = Series([False, True, nan, False, False]) - result = s.diff() - assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) + assert_series_equal(result, Series[nan, True, False, True, False]) From b115c9ecf42675ea702bad823f8269e7f5fa763b Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Wed, 7 Aug 2019 13:13:27 +0200 Subject: [PATCH 034/191] Update test_diff.py --- pandas/tests/series/test_diff.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py index 6df0955b96c3d..1d8c17dfa989b 100644 --- a/pandas/tests/series/test_diff.py +++ b/pandas/tests/series/test_diff.py @@ -51,4 +51,9 @@ def test_diff(self): # boolean series s = Series([False, True, True, False, False]) result = s.diff() - assert_series_equal(result, Series[nan, True, False, True, False]) + assert_series_equal(result, Series([nan, True, False, True, False])) + + # boolean nan series + s = Series([False, True, nan, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) From 6cde0b304733940d746776835c305ed9b29222cb Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 7 Aug 2019 07:16:49 -0600 Subject: [PATCH 035/191] DEPR: Remove previously deprecated IntervalIndex.from_intervals (#27793) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/interval.py | 44 ------------------- pandas/core/indexes/interval.py | 16 ------- .../indexes/interval/test_construction.py | 29 ------------ 4 files changed, 1 insertion(+), 89 deletions(-) diff --git 
a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index bca7bf8cbefbd..f5ca843e1a6f7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -66,6 +66,7 @@ Removal of prior version deprecations/changes - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) .. _whatsnew_1000.performance: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2b3c02bd1cade..4ab75090c34d0 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -358,50 +358,6 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left, right, closed, copy=copy, dtype=dtype, verify_integrity=True ) - _interval_shared_docs[ - "from_intervals" - ] = """ - Construct an %(klass)s from a 1d array of Interval objects - - .. deprecated:: 0.23.0 - - Parameters - ---------- - data : array-like (1-dimensional) - Array of Interval objects. All intervals must be closed on the same - sides. - copy : boolean, default False - by-default copy the data, this is compat only and ignored - dtype : dtype or None, default None - If None, dtype will be inferred - - ..versionadded:: 0.23.0 - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array. - %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits. - %(klass)s.from_tuples : Construct an %(klass)s from an - array-like of tuples. - - Examples - -------- - >>> pd.%(qualname)s.from_intervals([pd.Interval(0, 1), - ... pd.Interval(1, 2)]) - %(klass)s([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') - - The generic Index constructor work identically when it infers an array - of all intervals: - - >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) - %(klass)s([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') - """ - _interval_shared_docs[ "from_tuples" ] = """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7a444683ffcb2..9361408290bb1 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -269,22 +269,6 @@ def from_arrays( ) return cls._simple_new(array, name=name) - @classmethod - @Appender(_interval_shared_docs["from_intervals"] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None): - msg = ( - "IntervalIndex.from_intervals is deprecated and will be " - "removed in a future version; Use IntervalIndex(...) 
instead" - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) - - if name is None and isinstance(data, cls): - name = data.name - - return cls._simple_new(array, name=name) - @classmethod @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs) def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None): diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index e2abb4531525a..82a10d24dad30 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -421,32 +421,3 @@ def test_index_mixed_closed(self): result = Index(intervals) expected = Index(intervals, dtype=object) tm.assert_index_equal(result, expected) - - -class TestFromIntervals(TestClassConstructors): - """ - Tests for IntervalIndex.from_intervals, which is deprecated in favor of the - IntervalIndex constructor. Same tests as the IntervalIndex constructor, - plus deprecation test. Should only need to delete this class when removed. - """ - - @pytest.fixture - def constructor(self): - def from_intervals_ignore_warnings(*args, **kwargs): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - return IntervalIndex.from_intervals(*args, **kwargs) - - return from_intervals_ignore_warnings - - def test_deprecated(self): - ivs = [Interval(0, 1), Interval(1, 2)] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - IntervalIndex.from_intervals(ivs) - - @pytest.mark.skip(reason="parent class test that is not applicable") - def test_index_object_dtype(self): - pass - - @pytest.mark.skip(reason="parent class test that is not applicable") - def test_index_mixed_closed(self): - pass From 38d2372d512284e4f3b140284b815c6473e28d69 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Aug 2019 15:25:36 +0200 Subject: [PATCH 036/191] BUG: fix construction of NonConsolidatableBlock with inconsistent ndim (#27786) --- pandas/core/internals/blocks.py | 2 ++ pandas/tests/extension/base/getitem.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f035906e68ab8..8eea46af2c353 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -263,6 +263,8 @@ def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): ) if placement is None: placement = self.mgr_locs + if ndim is None: + ndim = self.ndim return make_block( values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype ) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index e02586eacfea7..d56cc50f4739c 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -260,3 +260,9 @@ def test_reindex_non_na_fill_value(self, data_missing): expected = pd.Series(data_missing._from_sequence([na, valid, valid])) self.assert_series_equal(result, expected) + + def test_loc_len1(self, data): + # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim + df = pd.DataFrame({"A": data}) + res = df.loc[[0], "A"] + assert res._data._block.ndim == 1 From 9724ace04769ce75fcc265103bcee8bbff3938d9 Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Wed, 7 Aug 2019 06:27:03 -0700 Subject: [PATCH 037/191] [BLD] Add script that fails build if git tags do not 
exist (#27770) --- .travis.yml | 4 ++-- ci/check_git_tags.sh | 28 ++++++++++++++++++++++++++++ pandas/tests/test_common.py | 11 +++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100755 ci/check_git_tags.sh diff --git a/.travis.yml b/.travis.yml index 9be4291d10874..79fecc41bec0d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ env: git: # for cloning - depth: 2000 + depth: false matrix: fast_finish: true @@ -63,7 +63,7 @@ before_install: - pwd - uname -a - git --version - - git tag + - ./ci/check_git_tags.sh # Because travis runs on Google Cloud and has a /etc/boto.cfg, # it breaks moto import, see: # https://github.com/spulec/moto/issues/1771 diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh new file mode 100755 index 0000000000000..9dbcd4f98683e --- /dev/null +++ b/ci/check_git_tags.sh @@ -0,0 +1,28 @@ +set -e + +if [[ ! $(git tag) ]]; then + echo "No git tags in clone, please sync your git tags with upstream using:" + echo " git fetch --tags upstream" + echo " git push --tags origin" + echo "" + echo "If the issue persists, the clone depth needs to be increased in .travis.yml" + exit 1 +fi + +# This will error if there are no tags and we omit --always +DESCRIPTION=$(git describe --long --tags) +echo "$DESCRIPTION" + +if [[ "$DESCRIPTION" == *"untagged"* ]]; then + echo "Unable to determine most recent tag, aborting build" + exit 1 +else + if [[ "$DESCRIPTION" != *"g"* ]]; then + # A good description will have the hash prefixed by g, a bad one will be + # just the hash + echo "Unable to determine most recent tag, aborting build" + exit 1 + else + echo "$(git tag)" + fi +fi diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 479e55c86fcd1..65b2dab1b02a8 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,4 +1,5 @@ import collections +from distutils.version import LooseVersion from functools import partial import string @@ -117,3 +118,13 @@ def test_git_version(): git_version = pd.__git_version__ assert len(git_version) == 40 assert all(c in string.hexdigits for c in git_version) + + +def test_version_tag(): + version = pd.__version__ + try: + version > LooseVersion("0.0.1") + except TypeError: + raise ValueError( + "No git tags exist, please sync tags between upstream and your repo" + ) From 3bf35c6b5deac4b44c46fc8db10cf90293e9d620 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 7 Aug 2019 06:32:47 -0700 Subject: [PATCH 038/191] REF: pandas/core/window.py into multiple files (#27736) --- doc/source/reference/window.rst | 7 +- pandas/core/generic.py | 29 +- pandas/core/window/__init__.py | 3 + pandas/core/window/common.py | 276 ++++++ pandas/core/window/ewm.py | 388 ++++++++ pandas/core/window/expanding.py | 260 +++++ pandas/core/{window.py => window/rolling.py} | 951 +------------------ pandas/tests/window/test_ewm.py | 4 +- pandas/tests/window/test_expanding.py | 4 +- pandas/tests/window/test_moments.py | 4 +- pandas/tests/window/test_rolling.py | 4 +- pandas/tests/window/test_window.py | 4 +- 12 files changed, 979 insertions(+), 955 deletions(-) create mode 100644 pandas/core/window/__init__.py create mode 100644 pandas/core/window/common.py create mode 100644 pandas/core/window/ewm.py create mode 100644 pandas/core/window/expanding.py rename pandas/core/{window.py => window/rolling.py} (66%) diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 9e1374a3bd8e4..2f6addf607877 100644 --- a/doc/source/reference/window.rst +++ 
b/doc/source/reference/window.rst @@ -5,7 +5,6 @@ ====== Window ====== -.. currentmodule:: pandas.core.window Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc. Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc. @@ -13,6 +12,8 @@ EWM objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func: Standard moving window functions -------------------------------- +.. currentmodule:: pandas.core.window.rolling + .. autosummary:: :toctree: api/ @@ -38,6 +39,8 @@ Standard moving window functions Standard expanding window functions ----------------------------------- +.. currentmodule:: pandas.core.window.expanding + .. autosummary:: :toctree: api/ @@ -59,6 +62,8 @@ Standard expanding window functions Exponentially-weighted moving window functions ---------------------------------------------- +.. currentmodule:: pandas.core.window.ewm + .. autosummary:: :toctree: api/ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1b39f9225a0ed..4d29f19cc01ed 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10683,9 +10683,9 @@ def _add_series_or_dataframe_operations(cls): the doc strings again. """ - from pandas.core import window as rwindow + from pandas.core.window import EWM, Expanding, Rolling, Window - @Appender(rwindow.rolling.__doc__) + @Appender(Rolling.__doc__) def rolling( self, window, @@ -10697,7 +10697,20 @@ def rolling( closed=None, ): axis = self._get_axis_number(axis) - return rwindow.rolling( + + if win_type is not None: + return Window( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + return Rolling( self, window=window, min_periods=min_periods, @@ -10710,16 +10723,14 @@ def rolling( cls.rolling = rolling - @Appender(rwindow.expanding.__doc__) + @Appender(Expanding.__doc__) def expanding(self, min_periods=1, center=False, axis=0): axis = self._get_axis_number(axis) - return rwindow.expanding( - self, min_periods=min_periods, center=center, axis=axis - ) + return Expanding(self, min_periods=min_periods, center=center, axis=axis) cls.expanding = expanding - @Appender(rwindow.ewm.__doc__) + @Appender(EWM.__doc__) def ewm( self, com=None, @@ -10732,7 +10743,7 @@ def ewm( axis=0, ): axis = self._get_axis_number(axis) - return rwindow.ewm( + return EWM( self, com=com, span=span, diff --git a/pandas/core/window/__init__.py b/pandas/core/window/__init__.py new file mode 100644 index 0000000000000..dcf58a4c0dd5b --- /dev/null +++ b/pandas/core/window/__init__.py @@ -0,0 +1,3 @@ +from pandas.core.window.ewm import EWM # noqa:F401 +from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 +from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py new file mode 100644 index 0000000000000..0f2920b3558c9 --- /dev/null +++ b/pandas/core/window/common.py @@ -0,0 +1,276 @@ +"""Common utility functions for rolling operations""" +from collections import defaultdict +import warnings + +import numpy as np + +from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +import pandas.core.common as com +from pandas.core.generic import _shared_docs +from pandas.core.groupby.base import GroupByMixin +from pandas.core.index import MultiIndex + +_shared_docs = 
dict(**_shared_docs) +_doc_template = """ + Returns + ------- + Series or DataFrame + Return type is determined by the caller. + + See Also + -------- + Series.%(name)s : Series %(name)s. + DataFrame.%(name)s : DataFrame %(name)s. +""" + + +class _GroupByMixin(GroupByMixin): + """ + Provide the groupby facilities. + """ + + def __init__(self, obj, *args, **kwargs): + parent = kwargs.pop("parent", None) # noqa + groupby = kwargs.pop("groupby", None) + if groupby is None: + groupby, obj = obj, obj.obj + self._groupby = groupby + self._groupby.mutated = True + self._groupby.grouper.mutated = True + super().__init__(obj, *args, **kwargs) + + count = GroupByMixin._dispatch("count") + corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) + cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) + + def _apply( + self, func, name=None, window=None, center=None, check_minp=None, **kwargs + ): + """ + Dispatch to apply; we are stripping all of the _apply kwargs and + performing the original function call on the grouped object. + """ + + def f(x, name=name, *args): + x = self._shallow_copy(x) + + if isinstance(name, str): + return getattr(x, name)(*args, **kwargs) + + return x.apply(name, *args, **kwargs) + + return self._groupby.apply(f) + + +def _flex_binary_moment(arg1, arg2, f, pairwise=False): + + if not ( + isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) + and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) + ): + raise TypeError( + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" + ) + + if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( + arg2, (np.ndarray, ABCSeries) + ): + X, Y = _prep_binary(arg1, arg2) + return f(X, Y) + + elif isinstance(arg1, ABCDataFrame): + from pandas import DataFrame + + def dataframe_from_int_dict(data, frame_template): + result = DataFrame(data, index=frame_template.index) + if len(result.columns) > 0: + result.columns = frame_template.columns[result.columns] + return result + + results = {} + if isinstance(arg2, ABCDataFrame): + if pairwise is False: + if arg1 is arg2: + # special case in order to handle duplicate column names + for i, col in enumerate(arg1.columns): + results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) + return dataframe_from_int_dict(results, arg1) + else: + if not arg1.columns.is_unique: + raise ValueError("'arg1' columns are not unique") + if not arg2.columns.is_unique: + raise ValueError("'arg2' columns are not unique") + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + X, Y = arg1.align(arg2, join="outer") + X = X + 0 * Y + Y = Y + 0 * X + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + return DataFrame(results, index=X.index, columns=res_columns) + elif pairwise is True: + results = defaultdict(dict) + for i, k1 in enumerate(arg1.columns): + for j, k2 in enumerate(arg2.columns): + if j < i and arg2 is arg1: + # Symmetric case + results[i][j] = results[j][i] + else: + results[i][j] = f( + *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + ) + + from pandas import concat + + result_index = arg1.index.union(arg2.index) + if len(result_index): + + # construct result frame + result = concat( + [ + concat( + [results[i][j] for j, c in enumerate(arg2.columns)], + ignore_index=True, + ) + for i, c in enumerate(arg1.columns) + ], + 
ignore_index=True, + axis=1, + ) + result.columns = arg1.columns + + # set the index and reorder + if arg2.columns.nlevels > 1: + result.index = MultiIndex.from_product( + arg2.columns.levels + [result_index] + ) + result = result.reorder_levels([2, 0, 1]).sort_index() + else: + result.index = MultiIndex.from_product( + [range(len(arg2.columns)), range(len(result_index))] + ) + result = result.swaplevel(1, 0).sort_index() + result.index = MultiIndex.from_product( + [result_index] + [arg2.columns] + ) + else: + + # empty result + result = DataFrame( + index=MultiIndex( + levels=[arg1.index, arg2.columns], codes=[[], []] + ), + columns=arg2.columns, + dtype="float64", + ) + + # reset our index names to arg1 names + # reset our column names to arg2 names + # careful not to mutate the original names + result.columns = result.columns.set_names(arg1.columns.names) + result.index = result.index.set_names( + result_index.names + arg2.columns.names + ) + + return result + + else: + raise ValueError("'pairwise' is not True/False") + else: + results = { + i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns) + } + return dataframe_from_int_dict(results, arg1) + + else: + return _flex_binary_moment(arg2, arg1, f) + + +def _get_center_of_mass(comass, span, halflife, alpha): + valid_count = com.count_not_none(comass, span, halflife, alpha) + if valid_count > 1: + raise ValueError("comass, span, halflife, and alpha are mutually exclusive") + + # Convert to center of mass; domain checks ensure 0 < alpha <= 1 + if comass is not None: + if comass < 0: + raise ValueError("comass must satisfy: comass >= 0") + elif span is not None: + if span < 1: + raise ValueError("span must satisfy: span >= 1") + comass = (span - 1) / 2.0 + elif halflife is not None: + if halflife <= 0: + raise ValueError("halflife must satisfy: halflife > 0") + decay = 1 - np.exp(np.log(0.5) / halflife) + comass = 1 / decay - 1 + elif alpha is not None: + if alpha <= 0 or alpha > 1: + raise ValueError("alpha must satisfy: 0 < alpha <= 1") + comass = (1.0 - alpha) / alpha + else: + raise ValueError("Must pass one of comass, span, halflife, or alpha") + + return float(comass) + + +def _offset(window, center): + if not is_integer(window): + window = len(window) + offset = (window - 1) / 2.0 if center else 0 + try: + return int(offset) + except TypeError: + return offset.astype(int) + + +def _require_min_periods(p): + def _check_func(minp, window): + if minp is None: + return window + else: + return max(p, minp) + + return _check_func + + +def _use_window(minp, window): + if minp is None: + return window + else: + return minp + + +def _zsqrt(x): + with np.errstate(all="ignore"): + result = np.sqrt(x) + mask = x < 0 + + if isinstance(x, ABCDataFrame): + if mask.values.any(): + result[mask] = 0 + else: + if mask.any(): + result[mask] = 0 + + return result + + +def _prep_binary(arg1, arg2): + if not isinstance(arg2, type(arg1)): + raise Exception("Input arrays must be of the same type!") + + # mask out values, this also makes a common index... 
+    X = arg1 + 0 * arg2
+    Y = arg2 + 0 * arg1
+
+    return X, Y
diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py
new file mode 100644
index 0000000000000..0ce6d5ddec2ad
--- /dev/null
+++ b/pandas/core/window/ewm.py
@@ -0,0 +1,388 @@
+from textwrap import dedent
+
+import numpy as np
+
+import pandas._libs.window as libwindow
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.generic import ABCDataFrame
+
+from pandas.core.base import DataError
+from pandas.core.window.common import _doc_template, _get_center_of_mass, _shared_docs
+from pandas.core.window.rolling import _flex_binary_moment, _Rolling, _zsqrt
+
+_bias_template = """
+    Parameters
+    ----------
+    bias : bool, default False
+        Use a standard estimation bias correction.
+    *args, **kwargs
+        Arguments and keyword arguments to be passed into func.
+"""
+
+_pairwise_template = """
+    Parameters
+    ----------
+    other : Series, DataFrame, or ndarray, optional
+        If not supplied then will default to self and produce pairwise
+        output.
+    pairwise : bool, default None
+        If False then only matching columns between self and other will be
+        used and the output will be a DataFrame.
+        If True then all pairwise combinations will be calculated and the
+        output will be a MultiIndex DataFrame in the case of DataFrame
+        inputs. In the case of missing elements, only complete pairwise
+        observations will be used.
+    bias : bool, default False
+        Use a standard estimation bias correction.
+    **kwargs
+        Keyword arguments to be passed into func.
+"""
+
+
+class EWM(_Rolling):
+    r"""
+    Provide exponential weighted functions.
+
+    Parameters
+    ----------
+    com : float, optional
+        Specify decay in terms of center of mass,
+        :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`.
+    span : float, optional
+        Specify decay in terms of span,
+        :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`.
+    halflife : float, optional
+        Specify decay in terms of half-life,
+        :math:`\alpha = 1 - \exp(\log(0.5) / halflife),\text{ for } halflife > 0`.
+    alpha : float, optional
+        Specify smoothing factor :math:`\alpha` directly,
+        :math:`0 < \alpha \leq 1`.
+    min_periods : int, default 0
+        Minimum number of observations in window required to have a value
+        (otherwise result is NA).
+    adjust : bool, default True
+        Divide by decaying adjustment factor in beginning periods to account
+        for imbalance in relative weightings
+        (viewing EWMA as a moving average).
+    ignore_na : bool, default False
+        Ignore missing values when calculating weights;
+        specify True to reproduce pre-0.15.0 behavior.
+    axis : {0 or 'index', 1 or 'columns'}, default 0
+        The axis to use. The value 0 identifies the rows, and 1
+        identifies the columns.
+
+    Returns
+    -------
+    DataFrame
+        A Window sub-classed for the particular operation.
+
+    See Also
+    --------
+    rolling : Provides rolling window calculations.
+    expanding : Provides expanding transformations.
+
+    Notes
+    -----
+    Exactly one of center of mass, span, half-life, and alpha must be provided.
+    Allowed values and relationship between the parameters are specified in the
+    parameter descriptions above; see the link at the end of this section for
+    a detailed explanation.
+
+    When adjust is True (default), weighted averages are calculated using
+    weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1.
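
As a quick cross-check of the decay parameterizations documented above, using
the `_get_center_of_mass` helper added to common.py in this same patch (an
internal function; the `(comass, span, halflife, alpha)` argument order is
taken from its definition earlier in the diff):

    from pandas.core.window.common import _get_center_of_mass

    # span=9 is equivalent to com = (9 - 1) / 2 = 4.0,
    # which is the same decay as alpha = 1 / (1 + 4) = 0.2
    print(_get_center_of_mass(None, 9, None, None))    # 4.0
    print(_get_center_of_mass(None, None, None, 0.2))  # 4.0
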
+ + When adjust is False, weighted averages are calculated recursively as: + weighted_average[0] = arg[0]; + weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. + + When ignore_na is False (default), weights are based on absolute positions. + For example, the weights of x and y used in calculating the final weighted + average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and + (1-alpha)**2 and alpha (if adjust is False). + + When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of x and y used in + calculating the final weighted average of [x, None, y] are 1-alpha and 1 + (if adjust is True), and 1-alpha and alpha (if adjust is False). + + More details can be found at + http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + """ + _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] + + def __init__( + self, + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): + self.obj = obj + self.com = _get_center_of_mass(com, span, halflife, alpha) + self.min_periods = min_periods + self.adjust = adjust + self.ignore_na = ignore_na + self.axis = axis + self.on = None + + @property + def _constructor(self): + return EWM + + _agg_see_also_doc = dedent( + """ + See Also + -------- + pandas.DataFrame.rolling.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.464856 0.569633 -0.490089 + 2 -0.207700 0.149687 -1.135379 + 3 -0.471677 -0.645305 -0.906555 + 4 -0.355635 -0.203033 -0.904111 + 5 1.076417 1.503943 -1.146293 + 6 -0.041654 1.925562 -0.588728 + 7 0.680292 0.132049 0.548693 + 8 0.067236 0.948257 0.163353 + 9 -0.286980 0.618493 -0.694496 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, arg, *args, **kwargs): + return super().aggregate(arg, *args, **kwargs) + + agg = aggregate + + def _apply(self, func, **kwargs): + """ + Rolling statistical measure using supplied function. Designed to be + used with passed-in Cython array-based functions. 
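
A small numeric check of the adjusted average that this dispatch ultimately
computes (plain user-facing API, independent of the Cython internals):

    import pandas as pd

    s = pd.Series([1.0, 2.0])
    # with adjust=True and alpha=0.5 the weights are (1 - 0.5) and 1, so the
    # second value is (0.5 * 1.0 + 1 * 2.0) / (0.5 + 1) = 1.666...
    print(s.ewm(alpha=0.5).mean().iloc[1])  # 1.6666666666666667
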
+ + Parameters + ---------- + func : str/callable to apply + + Returns + ------- + y : same type as input argument + """ + blocks, obj = self._create_blocks() + block_list = list(blocks) + + results = [] + exclude = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError("No numeric types to aggregate") + + if values.size == 0: + results.append(values.copy()) + continue + + # if we have a string function name, wrap it + if isinstance(func, str): + cfunc = getattr(libwindow, func, None) + if cfunc is None: + raise ValueError( + "we do not support this function " + "in libwindow.{func}".format(func=func) + ) + + def func(arg): + return cfunc( + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + ) + + results.append(np.apply_along_axis(func, self.axis, values)) + + return self._wrap_results(results, block_list, obj, exclude) + + @Substitution(name="ewm") + @Appender(_doc_template) + def mean(self, *args, **kwargs): + """ + Exponential weighted moving average. + + Parameters + ---------- + *args, **kwargs + Arguments and keyword arguments to be passed into func. + """ + nv.validate_window_func("mean", args, kwargs) + return self._apply("ewma", **kwargs) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_bias_template) + def std(self, bias=False, *args, **kwargs): + """ + Exponential weighted moving stddev. + """ + nv.validate_window_func("std", args, kwargs) + return _zsqrt(self.var(bias=bias, **kwargs)) + + vol = std + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_bias_template) + def var(self, bias=False, *args, **kwargs): + """ + Exponential weighted moving variance. + """ + nv.validate_window_func("var", args, kwargs) + + def f(arg): + return libwindow.ewmcov( + arg, + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) + + return self._apply(f, **kwargs) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_pairwise_template) + def cov(self, other=None, pairwise=None, bias=False, **kwargs): + """ + Exponential weighted sample covariance. + """ + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + + def _get_cov(X, Y): + X = self._shallow_copy(X) + Y = self._shallow_copy(Y) + cov = libwindow.ewmcov( + X._prep_values(), + Y._prep_values(), + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) + return X._wrap_result(cov) + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_pairwise_template) + def corr(self, other=None, pairwise=None, **kwargs): + """ + Exponential weighted sample correlation. 
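
A usage sketch for the pairwise path handled below (exact values depend on the
inputs, so only the shape and the degenerate case are noted):

    import pandas as pd

    df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [2.0, 4.0, 6.0]})
    # pairwise=True returns a MultiIndex frame with one column-by-column
    # block per row of the input; here y is exactly 2 * x, so the pairwise
    # correlations are 1.0 once more than one observation has been seen
    corr = df.ewm(com=0.5).corr(pairwise=True)
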
+        """
+        if other is None:
+            other = self._selected_obj
+        # only default unset
+        pairwise = True if pairwise is None else pairwise
+        other = self._shallow_copy(other)
+
+        def _get_corr(X, Y):
+            X = self._shallow_copy(X)
+            Y = self._shallow_copy(Y)
+
+            def _cov(x, y):
+                return libwindow.ewmcov(
+                    x,
+                    y,
+                    self.com,
+                    int(self.adjust),
+                    int(self.ignore_na),
+                    int(self.min_periods),
+                    1,
+                )
+
+            x_values = X._prep_values()
+            y_values = Y._prep_values()
+            with np.errstate(all="ignore"):
+                cov = _cov(x_values, y_values)
+                x_var = _cov(x_values, x_values)
+                y_var = _cov(y_values, y_values)
+                corr = cov / _zsqrt(x_var * y_var)
+            return X._wrap_result(corr)
+
+        return _flex_binary_moment(
+            self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise)
+        )
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
new file mode 100644
index 0000000000000..c43ca6b0565f3
--- /dev/null
+++ b/pandas/core/window/expanding.py
@@ -0,0 +1,260 @@
+from textwrap import dedent
+
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.window.common import _doc_template, _GroupByMixin, _shared_docs
+from pandas.core.window.rolling import _Rolling_and_Expanding
+
+
+class Expanding(_Rolling_and_Expanding):
+    """
+    Provide expanding transformations.
+
+    Parameters
+    ----------
+    min_periods : int, default 1
+        Minimum number of observations in window required to have a value
+        (otherwise result is NA).
+    center : bool, default False
+        Set the labels at the center of the window.
+    axis : int or str, default 0
+
+    Returns
+    -------
+    a Window sub-classed for the particular operation
+
+    See Also
+    --------
+    rolling : Provides rolling window calculations.
+    ewm : Provides exponential weighted functions.
+
+    Notes
+    -----
+    By default, the result is set to the right edge of the window. This can be
+    changed to the center of the window by setting ``center=True``.
+
+    Examples
+    --------
+
+    >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+    >>> df
+         B
+    0  0.0
+    1  1.0
+    2  2.0
+    3  NaN
+    4  4.0
+
+    >>> df.expanding(2).sum()
+         B
+    0  NaN
+    1  1.0
+    2  3.0
+    3  3.0
+    4  7.0
+    """
+
+    _attributes = ["min_periods", "center", "axis"]
+
+    def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs):
+        super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis)
+
+    @property
+    def _constructor(self):
+        return Expanding
+
+    def _get_window(self, other=None, **kwargs):
+        """
+        Get the window length over which to perform some operation.
+
+        Parameters
+        ----------
+        other : object, default None
+            The other object that is involved in the operation.
+            Such an object is involved for operations like covariance.
+
+        Returns
+        -------
+        window : int
+            The window length.
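
A short illustration of the expanding semantics this window length feeds into,
mirroring the class-level example above:

    import numpy as np
    import pandas as pd

    s = pd.Series([0, 1, 2, np.nan, 4])
    # each point aggregates all observations seen so far; min_periods=2 masks
    # positions where fewer than two non-NA values have been seen
    print(s.expanding(min_periods=2).sum().tolist())
    # [nan, 1.0, 3.0, 3.0, 7.0]
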
+ """ + axis = self.obj._get_axis(self.axis) + length = len(axis) + (other is not None) * len(axis) + + other = self.min_periods or -1 + return max(length, other) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.expanding.aggregate + DataFrame.rolling.aggregate + DataFrame.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.464856 0.569633 -0.490089 + 2 -0.207700 0.149687 -1.135379 + 3 -0.471677 -0.645305 -0.906555 + 4 -0.355635 -0.203033 -0.904111 + 5 1.076417 1.503943 -1.146293 + 6 -0.041654 1.925562 -0.588728 + 7 0.680292 0.132049 0.548693 + 8 0.067236 0.948257 0.163353 + 9 -0.286980 0.618493 -0.694496 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, arg, *args, **kwargs): + return super().aggregate(arg, *args, **kwargs) + + agg = aggregate + + @Substitution(name="expanding") + @Appender(_shared_docs["count"]) + def count(self, **kwargs): + return super().count(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["apply"]) + def apply(self, func, raw=None, args=(), kwargs={}): + return super().apply(func, raw=raw, args=args, kwargs=kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["sum"]) + def sum(self, *args, **kwargs): + nv.validate_expanding_func("sum", args, kwargs) + return super().sum(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["max"]) + def max(self, *args, **kwargs): + nv.validate_expanding_func("max", args, kwargs) + return super().max(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["min"]) + def min(self, *args, **kwargs): + nv.validate_expanding_func("min", args, kwargs) + return super().min(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["mean"]) + def mean(self, *args, **kwargs): + nv.validate_expanding_func("mean", args, kwargs) + return super().mean(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["median"]) + def median(self, **kwargs): + return super().median(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func("std", args, kwargs) + return super().std(ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["var"]) + def var(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func("var", args, kwargs) + return super().var(ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["skew"]) + def skew(self, **kwargs): + return super().skew(**kwargs) + + _agg_doc = dedent( + """ + Examples + -------- + + The example below will show an expanding calculation with a window size of + four matching the equivalent function call using `scipy.stats`. 
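+    The first three expanding values are ``NaN`` because ``expanding(4)``
+    sets ``min_periods=4``.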
+
+    >>> arr = [1, 2, 3, 4, 999]
+    >>> import scipy.stats
+    >>> fmt = "{0:.6f}"  # limit the printed precision to 6 digits
+    >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False)))
+    -1.200000
+    >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False)))
+    4.999874
+    >>> s = pd.Series(arr)
+    >>> s.expanding(4).kurt()
+    0         NaN
+    1         NaN
+    2         NaN
+    3   -1.200000
+    4    4.999874
+    dtype: float64
+    """
+    )
+
+    @Appender(_agg_doc)
+    @Substitution(name="expanding")
+    @Appender(_shared_docs["kurt"])
+    def kurt(self, **kwargs):
+        return super().kurt(**kwargs)
+
+    @Substitution(name="expanding")
+    @Appender(_shared_docs["quantile"])
+    def quantile(self, quantile, interpolation="linear", **kwargs):
+        return super().quantile(
+            quantile=quantile, interpolation=interpolation, **kwargs
+        )
+
+    @Substitution(name="expanding")
+    @Appender(_doc_template)
+    @Appender(_shared_docs["cov"])
+    def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
+        return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs)
+
+    @Substitution(name="expanding")
+    @Appender(_shared_docs["corr"])
+    def corr(self, other=None, pairwise=None, **kwargs):
+        return super().corr(other=other, pairwise=pairwise, **kwargs)
+
+
+class ExpandingGroupby(_GroupByMixin, Expanding):
+    """
+    Provide an expanding groupby implementation.
+    """
+
+    @property
+    def _constructor(self):
+        return Expanding
diff --git a/pandas/core/window.py b/pandas/core/window/rolling.py
similarity index 66%
rename from pandas/core/window.py
rename to pandas/core/window/rolling.py
index 3e3f17369db7b..323089b3fdf6b 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window/rolling.py
@@ -2,7 +2,6 @@
 Provide a generic structure to support window functions,
 similar to how we have a Groupby object.
 """
-from collections import defaultdict
 from datetime import timedelta
 from textwrap import dedent
 from typing import Callable, List, Optional, Set, Union
@@ -38,22 +37,17 @@
 from pandas._typing import Axis, FrameOrSeries, Scalar
 from pandas.core.base import DataError, PandasObject, SelectionMixin
 import pandas.core.common as com
-from pandas.core.generic import _shared_docs
-from pandas.core.groupby.base import GroupByMixin
-from pandas.core.index import Index, MultiIndex, ensure_index
-
-_shared_docs = dict(**_shared_docs)
-_doc_template = """
-        Returns
-        -------
-        Series or DataFrame
-            Return type is determined by the caller.
-
-        See Also
-        --------
-        Series.%(name)s : Series %(name)s.
-        DataFrame.%(name)s : DataFrame %(name)s.
-"""
+from pandas.core.index import Index, ensure_index
+from pandas.core.window.common import (
+    _doc_template,
+    _flex_binary_moment,
+    _GroupByMixin,
+    _offset,
+    _require_min_periods,
+    _shared_docs,
+    _use_window,
+    _zsqrt,
+)
 
 
 class _Window(PandasObject, SelectionMixin):
@@ -121,6 +115,8 @@ def validate(self):
             "neither",
         ]:
             raise ValueError("closed must be 'right', 'left', 'both' or 'neither'")
+        if not isinstance(self.obj, (ABCSeries, ABCDataFrame)):
+            raise TypeError("invalid type: {}".format(type(self)))
 
     def _create_blocks(self):
         """
@@ -929,44 +925,6 @@ def mean(self, *args, **kwargs):
         return self._apply("roll_weighted_mean", **kwargs)
 
 
-class _GroupByMixin(GroupByMixin):
-    """
-    Provide the groupby facilities.
- """ - - def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop("parent", None) # noqa - groupby = kwargs.pop("groupby", None) - if groupby is None: - groupby, obj = obj, obj.obj - self._groupby = groupby - self._groupby.mutated = True - self._groupby.grouper.mutated = True - super().__init__(obj, *args, **kwargs) - - count = GroupByMixin._dispatch("count") - corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) - cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) - - def _apply( - self, func, name=None, window=None, center=None, check_minp=None, **kwargs - ): - """ - Dispatch to apply; we are stripping all of the _apply kwargs and - performing the original function call on the grouped object. - """ - - def f(x, name=name, *args): - x = self._shallow_copy(x) - - if isinstance(name, str): - return getattr(x, name)(*args, **kwargs) - - return x.apply(name, *args, **kwargs) - - return self._groupby.apply(f) - - class _Rolling(_Window): @property def _constructor(self): @@ -1949,6 +1907,9 @@ def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) +Rolling.__doc__ = Window.__doc__ + + class RollingGroupby(_GroupByMixin, Rolling): """ Provide a rolling groupby implementation. @@ -1976,883 +1937,3 @@ def _validate_monotonic(self): level. """ pass - - -class Expanding(_Rolling_and_Expanding): - """ - Provide expanding transformations. - - Parameters - ---------- - min_periods : int, default 1 - Minimum number of observations in window required to have a value - (otherwise result is NA). - center : bool, default False - Set the labels at the center of the window. - axis : int or str, default 0 - - Returns - ------- - a Window sub-classed for the particular operation - - See Also - -------- - rolling : Provides rolling window calculations. - ewm : Provides exponential weighted functions. - - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. - - Examples - -------- - - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - >>> df.expanding(2).sum() - B - 0 NaN - 1 1.0 - 2 3.0 - 3 3.0 - 4 7.0 - """ - - _attributes = ["min_periods", "center", "axis"] - - def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): - super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) - - @property - def _constructor(self): - return Expanding - - def _get_window(self, other=None, **kwargs): - """ - Get the window length over which to perform some operation. - - Parameters - ---------- - other : object, default None - The other object that is involved in the operation. - Such an object is involved for operations like covariance. - - Returns - ------- - window : int - The window length. 
- """ - axis = self.obj._get_axis(self.axis) - length = len(axis) + (other is not None) * len(axis) - - other = self.min_periods or -1 - return max(length, other) - - _agg_see_also_doc = dedent( - """ - See Also - -------- - DataFrame.expanding.aggregate - DataFrame.rolling.aggregate - DataFrame.aggregate - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) - >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 - - >>> df.ewm(alpha=0.5).mean() - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 - """ - ) - - @Substitution( - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded="", - klass="Series/Dataframe", - axis="", - ) - @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) - - agg = aggregate - - @Substitution(name="expanding") - @Appender(_shared_docs["count"]) - def count(self, **kwargs): - return super().count(**kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["apply"]) - def apply(self, func, raw=None, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["sum"]) - def sum(self, *args, **kwargs): - nv.validate_expanding_func("sum", args, kwargs) - return super().sum(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_doc_template) - @Appender(_shared_docs["max"]) - def max(self, *args, **kwargs): - nv.validate_expanding_func("max", args, kwargs) - return super().max(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["min"]) - def min(self, *args, **kwargs): - nv.validate_expanding_func("min", args, kwargs) - return super().min(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["mean"]) - def mean(self, *args, **kwargs): - nv.validate_expanding_func("mean", args, kwargs) - return super().mean(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["median"]) - def median(self, **kwargs): - return super().median(**kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["std"]) - def std(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func("std", args, kwargs) - return super().std(ddof=ddof, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["var"]) - def var(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func("var", args, kwargs) - return super().var(ddof=ddof, **kwargs) - - @Substitution(name="expanding") - @Appender(_doc_template) - @Appender(_shared_docs["skew"]) - def skew(self, **kwargs): - return super().skew(**kwargs) - - _agg_doc = dedent( - """ - Examples - -------- - - The example below will show an expanding calculation with a window size of - four matching the equivalent function call using `scipy.stats`. 
- - >>> arr = [1, 2, 3, 4, 999] - >>> import scipy.stats - >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits - >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) - -1.200000 - >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False))) - 4.999874 - >>> s = pd.Series(arr) - >>> s.expanding(4).kurt() - 0 NaN - 1 NaN - 2 NaN - 3 -1.200000 - 4 4.999874 - dtype: float64 - """ - ) - - @Appender(_agg_doc) - @Substitution(name="expanding") - @Appender(_shared_docs["kurt"]) - def kurt(self, **kwargs): - return super().kurt(**kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["quantile"]) - def quantile(self, quantile, interpolation="linear", **kwargs): - return super().quantile( - quantile=quantile, interpolation=interpolation, **kwargs - ) - - @Substitution(name="expanding") - @Appender(_doc_template) - @Appender(_shared_docs["cov"]) - def cov(self, other=None, pairwise=None, ddof=1, **kwargs): - return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["corr"]) - def corr(self, other=None, pairwise=None, **kwargs): - return super().corr(other=other, pairwise=pairwise, **kwargs) - - -class ExpandingGroupby(_GroupByMixin, Expanding): - """ - Provide a expanding groupby implementation. - """ - - @property - def _constructor(self): - return Expanding - - -_bias_template = """ - Parameters - ---------- - bias : bool, default False - Use a standard estimation bias correction. - *args, **kwargs - Arguments and keyword arguments to be passed into func. -""" - -_pairwise_template = """ - Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - If not supplied then will default to self and produce pairwise - output. - pairwise : bool, default None - If False then only matching columns between self and other will be - used and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndex DataFrame in the case of DataFrame - inputs. In the case of missing elements, only complete pairwise - observations will be used. - bias : bool, default False - Use a standard estimation bias correction. - **kwargs - Keyword arguments to be passed into func. -""" - - -class EWM(_Rolling): - r""" - Provide exponential weighted functions. - - Parameters - ---------- - com : float, optional - Specify decay in terms of center of mass, - :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. - span : float, optional - Specify decay in terms of span, - :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. - halflife : float, optional - Specify decay in terms of half-life, - :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. - alpha : float, optional - Specify smoothing factor :math:`\alpha` directly, - :math:`0 < \alpha \leq 1`. - min_periods : int, default 0 - Minimum number of observations in window required to have a value - (otherwise result is NA). - adjust : bool, default True - Divide by decaying adjustment factor in beginning periods to account - for imbalance in relative weightings - (viewing EWMA as a moving average). - ignore_na : bool, default False - Ignore missing values when calculating weights; - specify True to reproduce pre-0.15.0 behavior. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. The value 0 identifies the rows, and 1 - identifies the columns. - - Returns - ------- - DataFrame - A Window sub-classed for the particular operation. 
- - See Also - -------- - rolling : Provides rolling window calculations. - expanding : Provides expanding transformations. - - Notes - ----- - Exactly one of center of mass, span, half-life, and alpha must be provided. - Allowed values and relationship between the parameters are specified in the - parameter descriptions above; see the link at the end of this section for - a detailed explanation. - - When adjust is True (default), weighted averages are calculated using - weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. - - When adjust is False, weighted averages are calculated recursively as: - weighted_average[0] = arg[0]; - weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. - - When ignore_na is False (default), weights are based on absolute positions. - For example, the weights of x and y used in calculating the final weighted - average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and - (1-alpha)**2 and alpha (if adjust is False). - - When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based - on relative positions. For example, the weights of x and y used in - calculating the final weighted average of [x, None, y] are 1-alpha and 1 - (if adjust is True), and 1-alpha and alpha (if adjust is False). - - More details can be found at - http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows - - Examples - -------- - - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - >>> df - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - >>> df.ewm(com=0.5).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - """ - _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] - - def __init__( - self, - obj, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - ): - self.obj = obj - self.com = _get_center_of_mass(com, span, halflife, alpha) - self.min_periods = min_periods - self.adjust = adjust - self.ignore_na = ignore_na - self.axis = axis - self.on = None - - @property - def _constructor(self): - return EWM - - _agg_see_also_doc = dedent( - """ - See Also - -------- - pandas.DataFrame.rolling.aggregate - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) - >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 - - >>> df.ewm(alpha=0.5).mean() - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 - """ - ) - - @Substitution( - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded="", - klass="Series/Dataframe", - axis="", - ) - @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) - - agg = aggregate - - def _apply(self, func, **kwargs): - """ - Rolling statistical measure using supplied function. 
Designed to be - used with passed-in Cython array-based functions. - - Parameters - ---------- - func : str/callable to apply - - Returns - ------- - y : same type as input argument - """ - blocks, obj = self._create_blocks() - block_list = list(blocks) - - results = [] - exclude = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError): - if isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") - - if values.size == 0: - results.append(values.copy()) - continue - - # if we have a string function name, wrap it - if isinstance(func, str): - cfunc = getattr(libwindow, func, None) - if cfunc is None: - raise ValueError( - "we do not support this function " - "in libwindow.{func}".format(func=func) - ) - - def func(arg): - return cfunc( - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - ) - - results.append(np.apply_along_axis(func, self.axis, values)) - - return self._wrap_results(results, block_list, obj, exclude) - - @Substitution(name="ewm") - @Appender(_doc_template) - def mean(self, *args, **kwargs): - """ - Exponential weighted moving average. - - Parameters - ---------- - *args, **kwargs - Arguments and keyword arguments to be passed into func. - """ - nv.validate_window_func("mean", args, kwargs) - return self._apply("ewma", **kwargs) - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_bias_template) - def std(self, bias=False, *args, **kwargs): - """ - Exponential weighted moving stddev. - """ - nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(bias=bias, **kwargs)) - - vol = std - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_bias_template) - def var(self, bias=False, *args, **kwargs): - """ - Exponential weighted moving variance. - """ - nv.validate_window_func("var", args, kwargs) - - def f(arg): - return libwindow.ewmcov( - arg, - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias), - ) - - return self._apply(f, **kwargs) - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_pairwise_template) - def cov(self, other=None, pairwise=None, bias=False, **kwargs): - """ - Exponential weighted sample covariance. - """ - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) - - def _get_cov(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) - cov = libwindow.ewmcov( - X._prep_values(), - Y._prep_values(), - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias), - ) - return X._wrap_result(cov) - - return _flex_binary_moment( - self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) - ) - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_pairwise_template) - def corr(self, other=None, pairwise=None, **kwargs): - """ - Exponential weighted sample correlation. 
- """ - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) - - def _get_corr(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) - - def _cov(x, y): - return libwindow.ewmcov( - x, - y, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - 1, - ) - - x_values = X._prep_values() - y_values = Y._prep_values() - with np.errstate(all="ignore"): - cov = _cov(x_values, y_values) - x_var = _cov(x_values, x_values) - y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) - return X._wrap_result(corr) - - return _flex_binary_moment( - self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) - ) - - -# Helper Funcs - - -def _flex_binary_moment(arg1, arg2, f, pairwise=False): - - if not ( - isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) - and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) - ): - raise TypeError( - "arguments to moment function must be of type " - "np.ndarray/Series/DataFrame" - ) - - if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( - arg2, (np.ndarray, ABCSeries) - ): - X, Y = _prep_binary(arg1, arg2) - return f(X, Y) - - elif isinstance(arg1, ABCDataFrame): - from pandas import DataFrame - - def dataframe_from_int_dict(data, frame_template): - result = DataFrame(data, index=frame_template.index) - if len(result.columns) > 0: - result.columns = frame_template.columns[result.columns] - return result - - results = {} - if isinstance(arg2, ABCDataFrame): - if pairwise is False: - if arg1 is arg2: - # special case in order to handle duplicate column names - for i, col in enumerate(arg1.columns): - results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) - return dataframe_from_int_dict(results, arg1) - else: - if not arg1.columns.is_unique: - raise ValueError("'arg1' columns are not unique") - if not arg2.columns.is_unique: - raise ValueError("'arg2' columns are not unique") - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - X, Y = arg1.align(arg2, join="outer") - X = X + 0 * Y - Y = Y + 0 * X - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - res_columns = arg1.columns.union(arg2.columns) - for col in res_columns: - if col in X and col in Y: - results[col] = f(X[col], Y[col]) - return DataFrame(results, index=X.index, columns=res_columns) - elif pairwise is True: - results = defaultdict(dict) - for i, k1 in enumerate(arg1.columns): - for j, k2 in enumerate(arg2.columns): - if j < i and arg2 is arg1: - # Symmetric case - results[i][j] = results[j][i] - else: - results[i][j] = f( - *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) - ) - - from pandas import concat - - result_index = arg1.index.union(arg2.index) - if len(result_index): - - # construct result frame - result = concat( - [ - concat( - [results[i][j] for j, c in enumerate(arg2.columns)], - ignore_index=True, - ) - for i, c in enumerate(arg1.columns) - ], - ignore_index=True, - axis=1, - ) - result.columns = arg1.columns - - # set the index and reorder - if arg2.columns.nlevels > 1: - result.index = MultiIndex.from_product( - arg2.columns.levels + [result_index] - ) - result = result.reorder_levels([2, 0, 1]).sort_index() - else: - result.index = MultiIndex.from_product( - [range(len(arg2.columns)), range(len(result_index))] - ) - result = result.swaplevel(1, 0).sort_index() - result.index = MultiIndex.from_product( - [result_index] + 
[arg2.columns] - ) - else: - - # empty result - result = DataFrame( - index=MultiIndex( - levels=[arg1.index, arg2.columns], codes=[[], []] - ), - columns=arg2.columns, - dtype="float64", - ) - - # reset our index names to arg1 names - # reset our column names to arg2 names - # careful not to mutate the original names - result.columns = result.columns.set_names(arg1.columns.names) - result.index = result.index.set_names( - result_index.names + arg2.columns.names - ) - - return result - - else: - raise ValueError("'pairwise' is not True/False") - else: - results = { - i: f(*_prep_binary(arg1.iloc[:, i], arg2)) - for i, col in enumerate(arg1.columns) - } - return dataframe_from_int_dict(results, arg1) - - else: - return _flex_binary_moment(arg2, arg1, f) - - -def _get_center_of_mass(comass, span, halflife, alpha): - valid_count = com.count_not_none(comass, span, halflife, alpha) - if valid_count > 1: - raise ValueError("comass, span, halflife, and alpha are mutually exclusive") - - # Convert to center of mass; domain checks ensure 0 < alpha <= 1 - if comass is not None: - if comass < 0: - raise ValueError("comass must satisfy: comass >= 0") - elif span is not None: - if span < 1: - raise ValueError("span must satisfy: span >= 1") - comass = (span - 1) / 2.0 - elif halflife is not None: - if halflife <= 0: - raise ValueError("halflife must satisfy: halflife > 0") - decay = 1 - np.exp(np.log(0.5) / halflife) - comass = 1 / decay - 1 - elif alpha is not None: - if alpha <= 0 or alpha > 1: - raise ValueError("alpha must satisfy: 0 < alpha <= 1") - comass = (1.0 - alpha) / alpha - else: - raise ValueError("Must pass one of comass, span, halflife, or alpha") - - return float(comass) - - -def _offset(window, center): - if not is_integer(window): - window = len(window) - offset = (window - 1) / 2.0 if center else 0 - try: - return int(offset) - except TypeError: - return offset.astype(int) - - -def _require_min_periods(p): - def _check_func(minp, window): - if minp is None: - return window - else: - return max(p, minp) - - return _check_func - - -def _use_window(minp, window): - if minp is None: - return window - else: - return minp - - -def _zsqrt(x): - with np.errstate(all="ignore"): - result = np.sqrt(x) - mask = x < 0 - - if isinstance(x, ABCDataFrame): - if mask.values.any(): - result[mask] = 0 - else: - if mask.any(): - result[mask] = 0 - - return result - - -def _prep_binary(arg1, arg2): - if not isinstance(arg2, type(arg1)): - raise Exception("Input arrays must be of the same type!") - - # mask out values, this also makes a common index... 
- X = arg1 + 0 * arg2 - Y = arg2 + 0 * arg1 - - return X, Y - - -# Top-level exports - - -def rolling(obj, win_type=None, **kwds): - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: %s" % type(obj)) - - if win_type is not None: - return Window(obj, win_type=win_type, **kwds) - - return Rolling(obj, **kwds) - - -rolling.__doc__ = Window.__doc__ - - -def expanding(obj, **kwds): - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: %s" % type(obj)) - - return Expanding(obj, **kwds) - - -expanding.__doc__ = Expanding.__doc__ - - -def ewm(obj, **kwds): - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: %s" % type(obj)) - - return EWM(obj, **kwds) - - -ewm.__doc__ = EWM.__doc__ diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index a05b567adad7a..1683fda500f85 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -4,7 +4,7 @@ from pandas.errors import UnsupportedFunctionCall from pandas import DataFrame, Series -import pandas.core.window as rwindow +from pandas.core.window import EWM from pandas.tests.window.common import Base @@ -60,7 +60,7 @@ def test_constructor(self, which): @pytest.mark.parametrize("method", ["std", "mean", "var"]) def test_numpy_compat(self, method): # see gh-12811 - e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) + e = EWM(Series([2, 4, 6]), alpha=0.5) msg = "numpy operations are not valid with window objects" diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 1e92c981964c5..098acdff93ac6 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.core.window as rwindow +from pandas.core.window import Expanding from pandas.tests.window.common import Base import pandas.util.testing as tm @@ -42,7 +42,7 @@ def test_constructor(self, which): @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 - e = rwindow.Expanding(Series([2, 4, 6]), window=2) + e = Expanding(Series([2, 4, 6]), window=2) msg = "numpy operations are not valid with window objects" diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index d860859958254..3d6cd7d10bd10 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, concat, isna, notna -import pandas.core.window as rwindow +from pandas.core.window.common import _flex_binary_moment from pandas.tests.window.common import Base import pandas.util.testing as tm @@ -1878,7 +1878,7 @@ def test_flex_binary_moment(self): " np.ndarray/Series/DataFrame" ) with pytest.raises(TypeError, match=msg): - rwindow._flex_binary_moment(5, 6, None) + _flex_binary_moment(5, 6, None) def test_corr_sanity(self): # GH 3155 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f0787ab3d191f..b4787bf25e3bb 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.core.window as rwindow +from pandas.core.window import Rolling from pandas.tests.window.common import Base import pandas.util.testing as tm @@ -101,7 +101,7 @@ def 
test_constructor_timedelta_window_and_minperiods(self, window, raw): @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 - r = rwindow.Rolling(Series([2, 4, 6]), window=2) + r = Rolling(Series([2, 4, 6]), window=2) msg = "numpy operations are not valid with window objects" diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index a6a56c98a9377..5692404205012 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import Series -import pandas.core.window as rwindow +from pandas.core.window import Window from pandas.tests.window.common import Base @@ -50,7 +50,7 @@ def test_constructor_with_win_type(self, which, win_types): @pytest.mark.parametrize("method", ["sum", "mean"]) def test_numpy_compat(self, method): # see gh-12811 - w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) + w = Window(Series([2, 4, 6]), window=[0, 2]) msg = "numpy operations are not valid with window objects" From 3bdac3e78786bdcdeea49f9da183ff826e91e98c Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 7 Aug 2019 15:57:33 +0200 Subject: [PATCH 039/191] TST: Add tests for groupby categorical values with axis=1 (#27788) --- pandas/tests/groupby/test_categorical.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ce724f5a60beb..756de3edd33dd 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1163,3 +1163,13 @@ def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): lambda x: OrderedDict([("min", x.min()), ("max", x.max())]) ) assert_series_equal(result, expected) + + +@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])]) +def test_groupby_categorical_axis_1(code): + # GH 13420 + df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]}) + cat = pd.Categorical.from_codes(code, categories=list("abc")) + result = df.groupby(cat, axis=1).mean() + expected = df.T.groupby(cat, axis=0).mean().T + assert_frame_equal(result, expected) From 358107330d6805829d3af8dcdd49f2e828aaa81d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Aug 2019 09:54:42 -0700 Subject: [PATCH 040/191] CLN: remove nested error handling (#27792) --- pandas/core/internals/blocks.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8eea46af2c353..c035e1174bb27 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1366,20 +1366,8 @@ def func(cond, values, other): # np.where will cast integer array to floats in this case other = self._try_coerce_args(other) - try: - fastres = expressions.where(cond, values, other) - return fastres - except Exception as detail: - if errors == "raise": - raise TypeError( - "Could not operate [{other!r}] with block values " - "[{detail!s}]".format(other=other, detail=detail) - ) - else: - # return the values - result = np.empty(values.shape, dtype="float64") - result.fill(np.nan) - return result + fastres = expressions.where(cond, values, other) + return fastres if cond.ravel().all(): result = values From c0ff67a22df9c18da1172766e313732ed2ab6c30 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 7 Aug 2019 23:09:53 +0200 Subject: [PATCH 041/191] BUG: Add mapping for pyqt for successful 
package installation (#27645) * Add mapping for pyqt --- doc/source/whatsnew/v1.0.0.rst | 1 + environment.yml | 2 +- requirements-dev.txt | 2 +- scripts/generate_pip_deps_from_conda.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f5ca843e1a6f7..58918f2d8c40e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -188,6 +188,7 @@ Sparse Build Changes ^^^^^^^^^^^^^ +- Fixed pyqt development dependency issue because of different pyqt package name in conda and PyPI (:issue:`26838`) ExtensionArray diff --git a/environment.yml b/environment.yml index 93e8302b498a0..6d2cd701c3854 100644 --- a/environment.yml +++ b/environment.yml @@ -71,7 +71,7 @@ dependencies: - lxml # pandas.read_html - openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - pyarrow>=0.9.0 # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - - pyqt # pandas.read_clipbobard + - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - python-snappy # required by pyarrow - s3fs # pandas.read_csv... when using 's3://...' path diff --git a/requirements-dev.txt b/requirements-dev.txt index e49ad10bfc99d..cf11a3ee28258 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -45,7 +45,7 @@ html5lib lxml openpyxl pyarrow>=0.9.0 -pyqt +pyqt5>=5.9.2 tables>=3.4.2 python-snappy s3fs diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index ac73859b22598..6ae10c2cb07d2 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -20,7 +20,7 @@ EXCLUDE = {"python=3"} -RENAME = {"pytables": "tables"} +RENAME = {"pytables": "tables", "pyqt": "pyqt5"} def conda_package_to_pip(package): From 3ae18d5b51543569b27ee41d2dc79a7aa1adcd42 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Aug 2019 06:00:22 -0500 Subject: [PATCH 042/191] CI: remove pytest pins (#27809) --- ci/deps/azure-36-locale.yaml | 4 ++-- ci/deps/travis-36-slow.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 8f8273f57c3fe..6a77b5dbedc61 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -20,8 +20,8 @@ dependencies: - xlsxwriter=0.9.8 - xlwt=1.2.0 # universal - - pytest>=4.0.2,<5.0.0 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 538a82f66e4c8..9564bf5bb3a9f 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -25,8 +25,8 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest>=4.0.2,<5.0.0 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - moto - hypothesis>=3.58.0 From fae84ec6af8bd94d7de7f21f0398abd72e5b1787 Mon Sep 17 00:00:00 2001 From: Samesh Lakhotia <43701530+sameshl@users.noreply.github.com> Date: Thu, 8 Aug 2019 17:12:54 +0530 Subject: [PATCH 043/191] DOC: add print statement in to_latex example (#27798) --- pandas/core/generic.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4d29f19cc01ed..c38489c7b85ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2993,10 +2993,15 @@ def to_latex( >>> df = pd.DataFrame({'name': ['Raphael', 
'Donatello'], ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}) - >>> df.to_latex(index=False) # doctest: +NORMALIZE_WHITESPACE - '\\begin{tabular}{lll}\n\\toprule\n name & mask & weapon - \\\\\n\\midrule\n Raphael & red & sai \\\\\n Donatello & - purple & bo staff \\\\\n\\bottomrule\n\\end{tabular}\n' + >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + \begin{tabular}{lll} + \toprule + name & mask & weapon \\ + \midrule + Raphael & red & sai \\ + Donatello & purple & bo staff \\ + \bottomrule + \end{tabular} """ # Get defaults from the pandas config if self.ndim == 1: From c729b7302f62fb540248f67249dc20de99d0ba05 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Aug 2019 05:33:12 -0700 Subject: [PATCH 044/191] REF: separate method-pinning functions (#27811) --- pandas/core/ops/__init__.py | 223 +------------------------------- pandas/core/ops/methods.py | 249 ++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+), 219 deletions(-) create mode 100644 pandas/core/ops/methods.py diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4ab1941e3493f..01de1428e290c 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -41,7 +41,6 @@ ABCIndex, ABCIndexClass, ABCSeries, - ABCSparseArray, ABCSparseSeries, ) from pandas.core.dtypes.missing import isna, notna @@ -57,6 +56,10 @@ _op_descriptions, ) from pandas.core.ops.invalid import invalid_comparison +from pandas.core.ops.methods import ( # noqa:F401 + add_flex_arithmetic_methods, + add_special_arithmetic_methods, +) from pandas.core.ops.roperator import ( # noqa:F401 radd, rand_, @@ -587,224 +590,6 @@ def dispatch_to_extension_op(op, left, right): return res_values -# ----------------------------------------------------------------------------- -# Functions that add arithmetic methods to objects, given arithmetic factory -# methods - - -def _get_method_wrappers(cls): - """ - Find the appropriate operation-wrappers to use when defining flex/special - arithmetic, boolean, and comparison operations with the given class. 
- - Parameters - ---------- - cls : class - - Returns - ------- - arith_flex : function or None - comp_flex : function or None - arith_special : function - comp_special : function - bool_special : function - - Notes - ----- - None is only returned for SparseArray - """ - if issubclass(cls, ABCSparseSeries): - # Be sure to catch this before ABCSeries and ABCSparseArray, - # as they will both come see SparseSeries as a subclass - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SPARSE_SERIES - comp_special = _arith_method_SPARSE_SERIES - bool_special = _bool_method_SERIES - # TODO: I don't think the functions defined by bool_method are tested - elif issubclass(cls, ABCSeries): - # Just Series; SparseSeries is caught above - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SERIES - comp_special = _comp_method_SERIES - bool_special = _bool_method_SERIES - elif issubclass(cls, ABCDataFrame): - # Same for DataFrame and SparseDataFrame - arith_flex = _arith_method_FRAME - comp_flex = _flex_comp_method_FRAME - arith_special = _arith_method_FRAME - comp_special = _comp_method_FRAME - bool_special = _arith_method_FRAME - return arith_flex, comp_flex, arith_special, comp_special, bool_special - - -def _create_methods(cls, arith_method, comp_method, bool_method, special): - # creates actual methods based upon arithmetic, comp and bool method - # constructors. - - have_divmod = issubclass(cls, ABCSeries) - # divmod is available for Series and SparseSeries - - # yapf: disable - new_methods = dict( - add=arith_method(cls, operator.add, special), - radd=arith_method(cls, radd, special), - sub=arith_method(cls, operator.sub, special), - mul=arith_method(cls, operator.mul, special), - truediv=arith_method(cls, operator.truediv, special), - floordiv=arith_method(cls, operator.floordiv, special), - # Causes a floating point exception in the tests when numexpr enabled, - # so for now no speedup - mod=arith_method(cls, operator.mod, special), - pow=arith_method(cls, operator.pow, special), - # not entirely sure why this is necessary, but previously was included - # so it's here to maintain compatibility - rmul=arith_method(cls, rmul, special), - rsub=arith_method(cls, rsub, special), - rtruediv=arith_method(cls, rtruediv, special), - rfloordiv=arith_method(cls, rfloordiv, special), - rpow=arith_method(cls, rpow, special), - rmod=arith_method(cls, rmod, special)) - # yapf: enable - new_methods["div"] = new_methods["truediv"] - new_methods["rdiv"] = new_methods["rtruediv"] - if have_divmod: - # divmod doesn't have an op that is supported by numexpr - new_methods["divmod"] = arith_method(cls, divmod, special) - new_methods["rdivmod"] = arith_method(cls, rdivmod, special) - - new_methods.update( - dict( - eq=comp_method(cls, operator.eq, special), - ne=comp_method(cls, operator.ne, special), - lt=comp_method(cls, operator.lt, special), - gt=comp_method(cls, operator.gt, special), - le=comp_method(cls, operator.le, special), - ge=comp_method(cls, operator.ge, special), - ) - ) - - if bool_method: - new_methods.update( - dict( - and_=bool_method(cls, operator.and_, special), - or_=bool_method(cls, operator.or_, special), - # For some reason ``^`` wasn't used in original. 
- xor=bool_method(cls, operator.xor, special), - rand_=bool_method(cls, rand_, special), - ror_=bool_method(cls, ror_, special), - rxor=bool_method(cls, rxor, special), - ) - ) - - if special: - dunderize = lambda x: "__{name}__".format(name=x.strip("_")) - else: - dunderize = lambda x: x - new_methods = {dunderize(k): v for k, v in new_methods.items()} - return new_methods - - -def add_methods(cls, new_methods): - for name, method in new_methods.items(): - # For most methods, if we find that the class already has a method - # of the same name, it is OK to over-write it. The exception is - # inplace methods (__iadd__, __isub__, ...) for SparseArray, which - # retain the np.ndarray versions. - force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) - if force or name not in cls.__dict__: - setattr(cls, name, method) - - -# ---------------------------------------------------------------------- -# Arithmetic -def add_special_arithmetic_methods(cls): - """ - Adds the full suite of special arithmetic methods (``__add__``, - ``__sub__``, etc.) to the class. - - Parameters - ---------- - cls : class - special methods will be defined and pinned to this class - """ - _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) - new_methods = _create_methods( - cls, arith_method, comp_method, bool_method, special=True - ) - # inplace operators (I feel like these should get passed an `inplace=True` - # or just be removed - - def _wrap_inplace_method(method): - """ - return an inplace wrapper for this method - """ - - def f(self, other): - result = method(self, other) - - # this makes sure that we are aligned like the input - # we are updating inplace so we want to ignore is_copy - self._update_inplace( - result.reindex_like(self, copy=False)._data, verify_is_copy=False - ) - - return self - - f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) - return f - - new_methods.update( - dict( - __iadd__=_wrap_inplace_method(new_methods["__add__"]), - __isub__=_wrap_inplace_method(new_methods["__sub__"]), - __imul__=_wrap_inplace_method(new_methods["__mul__"]), - __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), - __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), - __imod__=_wrap_inplace_method(new_methods["__mod__"]), - __ipow__=_wrap_inplace_method(new_methods["__pow__"]), - ) - ) - - new_methods.update( - dict( - __iand__=_wrap_inplace_method(new_methods["__and__"]), - __ior__=_wrap_inplace_method(new_methods["__or__"]), - __ixor__=_wrap_inplace_method(new_methods["__xor__"]), - ) - ) - - add_methods(cls, new_methods=new_methods) - - -def add_flex_arithmetic_methods(cls): - """ - Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) - to the class. 
- - Parameters - ---------- - cls : class - flex methods will be defined and pinned to this class - """ - flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) - new_methods = _create_methods( - cls, flex_arith_method, flex_comp_method, bool_method=None, special=False - ) - new_methods.update( - dict( - multiply=new_methods["mul"], - subtract=new_methods["sub"], - divide=new_methods["div"], - ) - ) - # opt out of bool flex methods for now - assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) - - add_methods(cls, new_methods=new_methods) - - # ----------------------------------------------------------------------------- # Series diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py new file mode 100644 index 0000000000000..eba0a797a791f --- /dev/null +++ b/pandas/core/ops/methods.py @@ -0,0 +1,249 @@ +""" +Functions to generate methods and pin them to the appropriate classes. +""" +import operator + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) + +from pandas.core.ops.roperator import ( + radd, + rand_, + rdivmod, + rfloordiv, + rmod, + rmul, + ror_, + rpow, + rsub, + rtruediv, + rxor, +) + + +def _get_method_wrappers(cls): + """ + Find the appropriate operation-wrappers to use when defining flex/special + arithmetic, boolean, and comparison operations with the given class. + + Parameters + ---------- + cls : class + + Returns + ------- + arith_flex : function or None + comp_flex : function or None + arith_special : function + comp_special : function + bool_special : function + + Notes + ----- + None is only returned for SparseArray + """ + # TODO: make these non-runtime imports once the relevant functions + # are no longer in __init__ + from pandas.core.ops import ( + _arith_method_FRAME, + _arith_method_SERIES, + _arith_method_SPARSE_SERIES, + _bool_method_SERIES, + _comp_method_FRAME, + _comp_method_SERIES, + _flex_comp_method_FRAME, + _flex_method_SERIES, + ) + + if issubclass(cls, ABCSparseSeries): + # Be sure to catch this before ABCSeries and ABCSparseArray, + # as they will both come see SparseSeries as a subclass + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SPARSE_SERIES + comp_special = _arith_method_SPARSE_SERIES + bool_special = _bool_method_SERIES + # TODO: I don't think the functions defined by bool_method are tested + elif issubclass(cls, ABCSeries): + # Just Series; SparseSeries is caught above + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SERIES + comp_special = _comp_method_SERIES + bool_special = _bool_method_SERIES + elif issubclass(cls, ABCDataFrame): + # Same for DataFrame and SparseDataFrame + arith_flex = _arith_method_FRAME + comp_flex = _flex_comp_method_FRAME + arith_special = _arith_method_FRAME + comp_special = _comp_method_FRAME + bool_special = _arith_method_FRAME + return arith_flex, comp_flex, arith_special, comp_special, bool_special + + +def add_special_arithmetic_methods(cls): + """ + Adds the full suite of special arithmetic methods (``__add__``, + ``__sub__``, etc.) to the class. 
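+    In-place variants (``__iadd__``, ``__isub__``, ...) are attached as
+    well; each is implemented as the out-of-place op followed by an
+    in-place update of the underlying data (see ``_wrap_inplace_method``
+    below).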
+ + Parameters + ---------- + cls : class + special methods will be defined and pinned to this class + """ + _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, arith_method, comp_method, bool_method, special=True + ) + # inplace operators (I feel like these should get passed an `inplace=True` + # or just be removed + + def _wrap_inplace_method(method): + """ + return an inplace wrapper for this method + """ + + def f(self, other): + result = method(self, other) + + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace( + result.reindex_like(self, copy=False)._data, verify_is_copy=False + ) + + return self + + f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) + return f + + new_methods.update( + dict( + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), + ) + ) + + new_methods.update( + dict( + __iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]), + ) + ) + + _add_methods(cls, new_methods=new_methods) + + +def add_flex_arithmetic_methods(cls): + """ + Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) + to the class. + + Parameters + ---------- + cls : class + flex methods will be defined and pinned to this class + """ + flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, flex_arith_method, flex_comp_method, bool_method=None, special=False + ) + new_methods.update( + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) + ) + # opt out of bool flex methods for now + assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) + + _add_methods(cls, new_methods=new_methods) + + +def _create_methods(cls, arith_method, comp_method, bool_method, special): + # creates actual methods based upon arithmetic, comp and bool method + # constructors. 
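+    # Each entry below maps an op name to a method built by the passed-in
+    # factory; when ``special`` is True the names are dunderized at the
+    # end (e.g. "add" -> "__add__").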
+ + have_divmod = issubclass(cls, ABCSeries) + # divmod is available for Series and SparseSeries + + # yapf: disable + new_methods = dict( + add=arith_method(cls, operator.add, special), + radd=arith_method(cls, radd, special), + sub=arith_method(cls, operator.sub, special), + mul=arith_method(cls, operator.mul, special), + truediv=arith_method(cls, operator.truediv, special), + floordiv=arith_method(cls, operator.floordiv, special), + # Causes a floating point exception in the tests when numexpr enabled, + # so for now no speedup + mod=arith_method(cls, operator.mod, special), + pow=arith_method(cls, operator.pow, special), + # not entirely sure why this is necessary, but previously was included + # so it's here to maintain compatibility + rmul=arith_method(cls, rmul, special), + rsub=arith_method(cls, rsub, special), + rtruediv=arith_method(cls, rtruediv, special), + rfloordiv=arith_method(cls, rfloordiv, special), + rpow=arith_method(cls, rpow, special), + rmod=arith_method(cls, rmod, special)) + # yapf: enable + new_methods["div"] = new_methods["truediv"] + new_methods["rdiv"] = new_methods["rtruediv"] + if have_divmod: + # divmod doesn't have an op that is supported by numexpr + new_methods["divmod"] = arith_method(cls, divmod, special) + new_methods["rdivmod"] = arith_method(cls, rdivmod, special) + + new_methods.update( + dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special), + ) + ) + + if bool_method: + new_methods.update( + dict( + and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), + # For some reason ``^`` wasn't used in original. + xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special), + ) + ) + + if special: + dunderize = lambda x: "__{name}__".format(name=x.strip("_")) + else: + dunderize = lambda x: x + new_methods = {dunderize(k): v for k, v in new_methods.items()} + return new_methods + + +def _add_methods(cls, new_methods): + for name, method in new_methods.items(): + # For most methods, if we find that the class already has a method + # of the same name, it is OK to over-write it. The exception is + # inplace methods (__iadd__, __isub__, ...) for SparseArray, which + # retain the np.ndarray versions. + force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) + if force or name not in cls.__dict__: + setattr(cls, name, method) From f00905ecf1b4801a525774b8f8c8c9d94d37d7b6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Aug 2019 05:35:31 -0700 Subject: [PATCH 045/191] REF: Simplify _comp_method_SERIES (#27803) --- pandas/core/ops/__init__.py | 65 +++++++++++++++---------------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 01de1428e290c..a56521b9c9fec 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -814,6 +814,15 @@ def wrapper(self, other, axis=None): self._get_axis_number(axis) res_name = get_op_result_name(self, other) + other = lib.item_from_zerodim(other) + + # TODO: shouldn't we be applying finalize whenever + # not isinstance(other, ABCSeries)? 
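+        # ``finalizer`` re-attaches metadata (e.g. ``name``) from ``self``
+        # when the other operand is a bare ndarray or Index.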
+ finalizer = ( + lambda x: x.__finalize__(self) + if isinstance(other, (np.ndarray, ABCIndexClass)) + else x + ) if isinstance(other, list): # TODO: same for tuples? @@ -826,11 +835,18 @@ def wrapper(self, other, axis=None): elif isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") - elif is_categorical_dtype(self): + elif ( + is_list_like(other) + and len(other) != len(self) + and not isinstance(other, frozenset) + ): + # TODO: why are we treating len-1 frozenset differently? + raise ValueError("Lengths must match to compare") + + if is_categorical_dtype(self): # Dispatch to Categorical implementation; CategoricalIndex # behavior is non-canonical GH#19513 res_values = dispatch_to_extension_op(op, self, other) - return self._constructor(res_values, index=self.index, name=res_name) elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical @@ -838,42 +854,18 @@ def wrapper(self, other, axis=None): from pandas.core.arrays import DatetimeArray res_values = dispatch_to_extension_op(op, DatetimeArray(self), other) - return self._constructor(res_values, index=self.index, name=res_name) elif is_timedelta64_dtype(self): from pandas.core.arrays import TimedeltaArray res_values = dispatch_to_extension_op(op, TimedeltaArray(self), other) - return self._constructor(res_values, index=self.index, name=res_name) elif is_extension_array_dtype(self) or ( is_extension_array_dtype(other) and not is_scalar(other) ): # Note: the `not is_scalar(other)` condition rules out - # e.g. other == "category" + # e.g. other == "category" res_values = dispatch_to_extension_op(op, self, other) - return self._constructor(res_values, index=self.index).rename(res_name) - - elif isinstance(other, ABCSeries): - # By this point we have checked that self._indexed_same(other) - res_values = na_op(self.values, other.values) - # rename is needed in case res_name is None and res_values.name - # is not. - return self._constructor( - res_values, index=self.index, name=res_name - ).rename(res_name) - - elif isinstance(other, (np.ndarray, ABCIndexClass)): - # do not check length of zerodim array - # as it will broadcast - if other.ndim != 0 and len(self) != len(other): - raise ValueError("Lengths must match to compare") - - res_values = na_op(self.values, np.asarray(other)) - result = self._constructor(res_values, index=self.index) - # rename is needed in case res_name is None and self.name - # is not. 
- return result.__finalize__(self).rename(res_name) elif is_scalar(other) and isna(other): # numpy does not like comparisons vs None @@ -881,25 +873,22 @@ def wrapper(self, other, axis=None): res_values = np.ones(len(self), dtype=bool) else: res_values = np.zeros(len(self), dtype=bool) - return self._constructor( - res_values, index=self.index, name=res_name, dtype="bool" - ) else: - values = self.to_numpy() + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) with np.errstate(all="ignore"): - res = na_op(values, other) - if is_scalar(res): + res_values = na_op(lvalues, rvalues) + if is_scalar(res_values): raise TypeError( "Could not compare {typ} type with Series".format(typ=type(other)) ) - # always return a full value series here - res_values = extract_array(res, extract_numpy=True) - return self._constructor( - res_values, index=self.index, name=res_name, dtype="bool" - ) + result = self._constructor(res_values, index=self.index) + # rename is needed in case res_name is None and result.name + # is not. + return finalizer(result).rename(res_name) wrapper.__name__ = op_name return wrapper From d320ef73a76543a8803f1c0639e1f28d725343cd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Aug 2019 05:37:41 -0700 Subject: [PATCH 046/191] CLN: Assorted Cleanups (#27791) --- pandas/core/arrays/categorical.py | 17 +++++++---------- pandas/core/arrays/datetimelike.py | 12 +++++------- pandas/core/arrays/sparse.py | 8 ++++---- pandas/core/indexes/datetimes.py | 3 +-- pandas/core/indexes/period.py | 5 ++++- pandas/io/pytables.py | 6 ++++-- pandas/tests/arrays/test_datetimelike.py | 14 ++++++++++++++ pandas/tests/arrays/test_integer.py | 2 -- pandas/tests/series/test_period.py | 3 +-- 9 files changed, 40 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d22b4bd4d3f2b..69fa956b73f28 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -22,7 +22,6 @@ ensure_int64, ensure_object, ensure_platform_int, - is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetimelike, @@ -2659,18 +2658,18 @@ def _get_codes_for_values(values, categories): return coerce_indexer_dtype(t.lookup(vals), cats) -def _recode_for_categories(codes, old_categories, new_categories): +def _recode_for_categories(codes: np.ndarray, old_categories, new_categories): """ Convert a set of codes for to a new set of categories Parameters ---------- - codes : array + codes : np.ndarray old_categories, new_categories : Index Returns ------- - new_codes : array + new_codes : np.ndarray[np.int64] Examples -------- @@ -2725,17 +2724,15 @@ def _factorize_from_iterable(values): If `values` has a categorical dtype, then `categories` is a CategoricalIndex keeping the categories and order of `values`. 
""" - from pandas.core.indexes.category import CategoricalIndex - if not is_list_like(values): raise TypeError("Input must be list-like") - if is_categorical(values): - values = CategoricalIndex(values) - # The CategoricalIndex level we want to build has the same categories + if is_categorical_dtype(values): + values = extract_array(values) + # The Categorical we want to build has the same categories # as values but its codes are by def [0, ..., len(n_categories) - 1] cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) - categories = values._create_from_codes(cat_codes) + categories = Categorical.from_codes(cat_codes, dtype=values.dtype) codes = values.codes else: # The value of ordered is irrelevant since we don't use cat as such, diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 770870a466aa9..b3548a1dc20d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -161,8 +161,8 @@ def strftime(self, date_format): Returns ------- - Index - Index of formatted strings. + ndarray + NumPy ndarray of formatted strings. See Also -------- @@ -180,9 +180,7 @@ def strftime(self, date_format): 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - from pandas import Index - - return Index(self._format_native_types(date_format=date_format)) + return self._format_native_types(date_format=date_format).astype(object) class TimelikeOps: @@ -1018,9 +1016,9 @@ def _add_delta_tdi(self, other): if isinstance(other, np.ndarray): # ndarray[timedelta64]; wrap in TimedeltaIndex for op - from pandas import TimedeltaIndex + from pandas.core.arrays import TimedeltaArray - other = TimedeltaIndex(other) + other = TimedeltaArray._from_sequence(other) self_i8 = self.asi8 other_i8 = other.asi8 diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 47c7c72051150..476e2aa223d03 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1781,11 +1781,11 @@ def sparse_arithmetic_method(self, other): @classmethod def _create_comparison_method(cls, op): - def cmp_method(self, other): - op_name = op.__name__ + op_name = op.__name__ + if op_name in {"and_", "or_"}: + op_name = op_name[:-1] - if op_name in {"and_", "or_"}: - op_name = op_name[:-1] + def cmp_method(self, other): if isinstance(other, (ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 67de7b0196b8e..9f2b31f23d2fa 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -69,7 +69,7 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. 
- _extra_methods = ["to_period", "to_perioddelta", "to_julian_date"] + _extra_methods = ["to_period", "to_perioddelta", "to_julian_date", "strftime"] _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] _extra_raw_properties = ["_box_func", "tz", "tzinfo"] _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties @@ -1184,7 +1184,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore - strftime = ea_passthrough(DatetimeArray.strftime) _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) @property diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f6b3d1076043e..b0cc386f7783d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -63,7 +63,10 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): _delegate_class = PeriodArray _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = set(PeriodArray._datetimelike_methods) | {"_addsub_int_array"} + _delegated_methods = set(PeriodArray._datetimelike_methods) | { + "_addsub_int_array", + "strftime", + } _raw_properties = {"is_leap_year"} diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index abc8a414eb37a..6af5dd6f1bf37 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3202,7 +3202,9 @@ def read(self, start=None, stop=None, **kwargs): values = self.read_array( "block{idx}_values".format(idx=i), start=_start, stop=_stop ) - blk = make_block(values, placement=items.get_indexer(blk_items)) + blk = make_block( + values, placement=items.get_indexer(blk_items), ndim=len(axes) + ) blocks.append(blk) return self.obj_type(BlockManager(blocks, axes)) @@ -4462,7 +4464,7 @@ def read(self, where=None, columns=None, **kwargs): if values.ndim == 1 and isinstance(values, np.ndarray): values = values.reshape((1, values.shape[0])) - block = make_block(values, placement=np.arange(len(cols_))) + block = make_block(values, placement=np.arange(len(cols_)), ndim=2) mgr = BlockManager([block], [cols_, index_]) frames.append(DataFrame(mgr)) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index ffda2f4de2700..0b3ccc0ae0e2d 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -462,6 +462,13 @@ def test_concat_same_type_different_freq(self): tm.assert_datetime_array_equal(result, expected) + def test_strftime(self, datetime_index): + arr = DatetimeArray(datetime_index) + + result = arr.strftime("%Y %b") + expected = np.array(datetime_index.strftime("%Y %b")) + tm.assert_numpy_array_equal(result, expected) + class TestTimedeltaArray(SharedTests): index_cls = pd.TimedeltaIndex @@ -652,6 +659,13 @@ def test_array_interface(self, period_index): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) + def test_strftime(self, period_index): + arr = PeriodArray(period_index) + + result = arr.strftime("%Y") + expected = np.array(period_index.strftime("%Y")) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "array,casting_nats", diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 8fbfb4c12f4b2..50cd1469e5196 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -379,8 +379,6 @@ def test_compare_array(self, data, 
all_compare_operators):
 
 
 class TestCasting:
-    pass
-
     @pytest.mark.parametrize("dropna", [True, False])
     def test_construct_index(self, all_data, dropna):
         # ensure that we do not coerce to Float64Index, rather
diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py
index 9b34b52bf39b9..4aeb211170d8f 100644
--- a/pandas/tests/series/test_period.py
+++ b/pandas/tests/series/test_period.py
@@ -71,10 +71,9 @@ def test_NaT_scalar(self):
         series[2] = val
         assert pd.isna(series[2])
 
-    @pytest.mark.xfail(reason="PeriodDtype Series not supported yet")
     def test_NaT_cast(self):
         result = Series([np.nan]).astype("period[D]")
-        expected = Series([pd.NaT])
+        expected = Series([pd.NaT], dtype="period[D]")
         tm.assert_series_equal(result, expected)
 
     def test_set_none(self):

From 78c6843dcf40b43a45a0c490acb71f92a80fdfa3 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 8 Aug 2019 08:38:47 -0500
Subject: [PATCH 047/191] DOC: Add expanded index descriptors for specifying
 RangeIndex-as-metadata in Parquet file schema (#25709)

---
 doc/source/development/developer.rst | 60 +++++++++++++++++++---------
 1 file changed, 41 insertions(+), 19 deletions(-)

diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index a283920ae4377..923ef005d5926 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -37,12 +37,19 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
 
 .. code-block:: text
 
-    {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
+    {'index_columns': [<descr0>, <descr1>, ...],
      'column_indexes': [<ci0>, <ci1>, ..., <cin>],
      'columns': [<c0>, <c1>, ...],
-     'pandas_version': $VERSION}
+     'pandas_version': $VERSION,
+     'creator': {
+       'library': $LIBRARY,
+       'version': $LIBRARY_VERSION
+     }}
 
-Here, ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata
+The "descriptor" values ``<descr0>`` in the ``'index_columns'`` field are
+strings (referring to a column) or dictionaries with values as described below.
+
+The ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata
 for each column, *including the index columns*. This has JSON form:
 
 .. code-block:: text
@@ -53,26 +60,37 @@ for each column, *including the index columns*. This has JSON form:
      'numpy_type': numpy_type,
      'metadata': metadata}
 
-.. note::
+See below for the detailed specification for these.
+
+Index Metadata Descriptors
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``RangeIndex`` can be stored as metadata only, not requiring serialization. The
+descriptor format for these is as follows:
 
-    Every index column is stored with a name matching the pattern
-    ``__index_level_\d+__`` and its corresponding column information is can be
-    found with the following code snippet.
+.. code-block:: python
 
-    Following this naming convention isn't strictly necessary, but strongly
-    suggested for compatibility with Arrow.
+    index = pd.RangeIndex(0, 10, 2)
+    {'kind': 'range',
+     'name': index.name,
+     'start': index.start,
+     'stop': index.stop,
+     'step': index.step}
 
-    Here's an example of how the index metadata is structured in pyarrow:
+Other index types must be serialized as data columns along with the other
+DataFrame columns. The metadata for these is a string indicating the name of
+the field in the data columns, for example ``'__index_level_0__'``.
 
-    .. code-block:: python
+If an index has a non-None ``name`` attribute, and there is no other column
+with a name matching that value, then the ``index.name`` value can be used as
+the descriptor. Otherwise (for unnamed indexes and ones with names colliding
+with other column names) a disambiguating name matching the pattern
+``__index_level_\d+__`` should be used. In cases of named indexes as data
+columns, the ``name`` attribute is always stored in the column descriptors as
+above.
 
-    # assuming there's at least 3 levels in the index
-    index_columns = metadata['index_columns']  # noqa: F821
-    columns = metadata['columns']  # noqa: F821
-    ith_index = 2
-    assert index_columns[ith_index] == '__index_level_2__'
-    ith_index_info = columns[-len(index_columns):][ith_index]
-    ith_index_level_name = ith_index_info['name']
+Column Metadata
+~~~~~~~~~~~~~~~
 
 ``pandas_type`` is the logical type of the column, and is one of:
 
@@ -161,4 +179,8 @@ As an example of fully-formed metadata:
        'numpy_type': 'int64',
        'metadata': None}
   ],
- 'pandas_version': '0.20.0'}
+ 'pandas_version': '0.20.0',
+ 'creator': {
+   'library': 'pyarrow',
+   'version': '0.13.0'
+ }}

From 8b6942f2876264b745529a3c3a7cf410ee096c34 Mon Sep 17 00:00:00 2001
From: Marco Neumann
Date: Thu, 8 Aug 2019 22:43:25 +0200
Subject: [PATCH 048/191] PERF: break reference cycle in Index._engine (#27607)

Fixes #27585

---
 asv_bench/benchmarks/index_object.py | 18 ++++++++++++++++++
 doc/source/whatsnew/v0.25.1.rst      |  1 +
 pandas/core/indexes/base.py          |  6 +++++-
 pandas/tests/indexes/test_base.py    |  8 ++++++++
 4 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index 6541ddcb0397d..49834ae94cc38 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -1,3 +1,4 @@
+import gc
 import numpy as np
 import pandas.util.testing as tm
 from pandas import (
@@ -225,4 +226,21 @@ def time_intersection_both_duplicate(self, N):
         self.intv.intersection(self.intv2)
 
 
+class GC:
+    params = [1, 2, 5]
+
+    def create_use_drop(self):
+        idx = Index(list(range(1000 * 1000)))
+        idx._engine
+
+    def peakmem_gc_instances(self, N):
+        try:
+            gc.disable()
+
+            for _ in range(N):
+                self.create_use_drop()
+        finally:
+            gc.enable()
+
+
 from .pandas_vb_common import setup  # noqa: F401
diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index f1d3f152e503d..443baa56374ca 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -83,6 +83,7 @@ Indexing
 ^^^^^^^^
 
 - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
+- Break reference cycle involving :class:`Index` to allow garbage collection of :class:`Index` objects without running the GC. (:issue:`27585`)
 -
 -
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 356ae20b2240a..05565194a7a27 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -691,7 +691,11 @@ def _cleanup(self):
     @cache_readonly
     def _engine(self):
         # property, for now, slow to look up
-        return self._engine_type(lambda: self._ndarray_values, len(self))
+
+        # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so
+        # `self` is not passed into the lambda.
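+        # Illustrative sketch of the cycle being broken (added note, not part
+        # of the original patch): the old lambda closed over `self`, and the
+        # @cache_readonly decorator cached the resulting engine on `self`:
+        #     self -> cached engine -> lambda -> self
+        # so a dropped Index could only be reclaimed by the cyclic GC.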
+ _ndarray_values = self._ndarray_values + return self._engine_type(lambda: _ndarray_values, len(self)) # -------------------------------------------------------------------- # Array-Like Methods diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c40a9bce9385b..34d82525495fc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1,5 +1,6 @@ from collections import defaultdict from datetime import datetime, timedelta +import gc from io import StringIO import math import operator @@ -2424,6 +2425,13 @@ def test_deprecated_contains(self): with tm.assert_produces_warning(FutureWarning): index.contains(1) + def test_engine_reference_cycle(self): + # https://github.com/pandas-dev/pandas/issues/27585 + index = pd.Index([1, 2, 3]) + nrefs_pre = len(gc.get_referrers(index)) + index._engine + assert len(gc.get_referrers(index)) == nrefs_pre + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ From d7dcdf3634394079950234aa76a4349629b64e34 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 9 Aug 2019 09:36:57 +0200 Subject: [PATCH 049/191] COMPAT: restore shape for 'invalid' Index with nd array (#27818) --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/core/indexes/base.py | 5 ++++- pandas/core/indexes/multi.py | 9 +++++++++ pandas/tests/indexes/test_base.py | 14 ++++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 443baa56374ca..637ac5c9c8bd1 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -112,7 +112,7 @@ Plotting ^^^^^^^^ - Added a pandas_plotting_backends entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`). -- +- Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`). - Groupby/resample/rolling diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 05565194a7a27..598c4dca9ce88 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5604,7 +5604,10 @@ def shape(self): """ Return a tuple of the shape of the underlying data. """ - return (len(self),) + # not using "(len(self), )" to return "correct" shape if the values + # consists of a >1 D array (see GH-27775) + # overridden in MultiIndex.shape to avoid materializing the values + return self._values.shape Index._add_numeric_methods_disabled() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 488107690fbd6..b614952ba1e04 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -622,6 +622,15 @@ def _values(self): # We override here, since our parent uses _data, which we dont' use. return self.values + @property + def shape(self): + """ + Return a tuple of the shape of the underlying data. 
+ """ + # overriding the base Index.shape definition to avoid materializing + # the values (GH-27384, GH-27775) + return (len(self),) + @property def array(self): """ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 34d82525495fc..fe1eb96df1e97 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2813,3 +2813,17 @@ def test_deprecated_fastpath(): expected = pd.CategoricalIndex(["a", "b", "c"], name="test") tm.assert_index_equal(idx, expected) + + +def test_shape_of_invalid_index(): + # Currently, it is possible to create "invalid" index objects backed by + # a multi-dimensional array (see https://github.com/pandas-dev/pandas/issues/27125 + # about this). However, as long as this is not solved in general,this test ensures + # that the returned shape is consistent with this underlying array for + # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775) + a = np.arange(8).reshape(2, 2, 2) + idx = pd.Index(a) + assert idx.shape == a.shape + + idx = pd.Index([0, 1, 2, 3]) + assert idx[:, None].shape == (4, 1) From 0227e69bb853fe81d071678732ec57974f91776a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 9 Aug 2019 01:00:53 -0700 Subject: [PATCH 050/191] EA: implement+test EA.view (#27633) --- doc/source/reference/extensions.rst | 1 + pandas/core/arrays/base.py | 24 +++++++++++++++++++++- pandas/core/arrays/categorical.py | 25 +++++------------------ pandas/core/arrays/datetimelike.py | 14 ++----------- pandas/core/arrays/interval.py | 8 ++------ pandas/core/arrays/numpy_.py | 4 ++-- pandas/core/arrays/sparse.py | 8 ++++---- pandas/tests/extension/arrow/test_bool.py | 4 ++++ pandas/tests/extension/base/interface.py | 15 ++++++++++++++ pandas/tests/extension/decimal/array.py | 4 ++-- pandas/tests/extension/json/array.py | 7 +++++-- pandas/tests/extension/test_interval.py | 5 ++++- pandas/tests/extension/test_sparse.py | 4 ++++ 13 files changed, 73 insertions(+), 50 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 78e8734e9b5ff..4b1a99da7cd4c 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -44,6 +44,7 @@ objects. api.extensions.ExtensionArray.argsort api.extensions.ExtensionArray.astype api.extensions.ExtensionArray.copy + api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 00e1d092ffa22..5c121172d0e4f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -64,6 +64,7 @@ class ExtensionArray: shift take unique + view _concat_same_type _formatter _from_factorized @@ -146,7 +147,7 @@ class ExtensionArray: If implementing NumPy's ``__array_ufunc__`` interface, pandas expects that - 1. You defer by raising ``NotImplemented`` when any Series are present + 1. You defer by returning ``NotImplemented`` when any Series are present in `inputs`. Pandas will extract the arrays and call the ufunc again. 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class. Pandas inspect this to determine whether the ufunc is valid for the @@ -861,6 +862,27 @@ def copy(self) -> ABCExtensionArray: """ raise AbstractMethodError(self) + def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: + """ + Return a view on the array. 
+ + Parameters + ---------- + dtype : str, np.dtype, or ExtensionDtype, optional + Default None + + Returns + ------- + ExtensionArray + """ + # NB: + # - This must return a *new* object referencing the same data, not self. + # - The only case that *must* be implemented is with dtype=None, + # giving a view with the same dtype as self. + if dtype is not None: + raise NotImplementedError(dtype) + return self[:] + # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 69fa956b73f28..9862b4b530424 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -516,19 +516,12 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: return self._set_dtype(dtype) return np.array(self, dtype=dtype, copy=copy) - @cache_readonly - def ndim(self) -> int: - """ - Number of dimensions of the Categorical - """ - return self._codes.ndim - @cache_readonly def size(self) -> int: """ return the len of myself """ - return len(self) + return self._codes.size @cache_readonly def itemsize(self) -> int: @@ -1763,18 +1756,10 @@ def ravel(self, order="C"): ) return np.array(self) - def view(self): - """ - Return a view of myself. - - For internal compatibility with numpy arrays. - - Returns - ------- - view : Categorical - Returns `self`! - """ - return self + def view(self, dtype=None): + if dtype is not None: + raise NotImplementedError(dtype) + return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True) def to_dense(self): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b3548a1dc20d6..0372b8f0c080a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -544,18 +544,8 @@ def astype(self, dtype, copy=True): return np.asarray(self, dtype=dtype) def view(self, dtype=None): - """ - New view on this array with the same data. - - Parameters - ---------- - dtype : numpy dtype, optional - - Returns - ------- - ndarray - With the specified `dtype`. - """ + if dtype is None or dtype is self.dtype: + return type(self)(self._data, dtype=self.dtype) return self._data.view(dtype=dtype) # ------------------------------------------------------------------ diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4ab75090c34d0..9cb2721b33634 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -695,18 +695,14 @@ def isna(self): return isna(self.left) @property - def nbytes(self): + def nbytes(self) -> int: return self.left.nbytes + self.right.nbytes @property - def size(self): + def size(self) -> int: # Avoid materializing self.values return self.left.size - @property - def shape(self): - return self.left.shape - def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. 
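Editorial aside (not part of patch 050): a minimal sketch of the ``view``
contract implemented above -- ``pd.array`` with the nullable ``Int64`` dtype is
used purely for illustration; with ``dtype=None`` the base implementation
returns a new object backed by the same data::

    import pandas as pd

    arr = pd.array([1, 2, 3], dtype="Int64")  # any ExtensionArray works here
    v = arr.view()
    assert v is not arr and type(v) is type(arr)
    v[0] = 99            # mutating the view...
    assert arr[0] == 99  # ...is visible through the original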
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 667fb4501ed95..4e2e37d88eb9a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -245,11 +245,11 @@ def __setitem__(self, key, value): else: self._ndarray[key] = value - def __len__(self): + def __len__(self) -> int: return len(self._ndarray) @property - def nbytes(self): + def nbytes(self) -> int: return self._ndarray.nbytes def isna(self): diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 476e2aa223d03..8aa83c3fbc37d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -839,7 +839,7 @@ def fill_value(self, value): self._dtype = SparseDtype(self.dtype.subtype, value) @property - def kind(self): + def kind(self) -> str: """ The kind of sparse index for this array. One of {'integer', 'block'}. """ @@ -854,7 +854,7 @@ def _valid_sp_values(self): mask = notna(sp_vals) return sp_vals[mask] - def __len__(self): + def __len__(self) -> int: return self.sp_index.length @property @@ -868,7 +868,7 @@ def _fill_value_matches(self, fill_value): return self.fill_value == fill_value @property - def nbytes(self): + def nbytes(self) -> int: return self.sp_values.nbytes + self.sp_index.nbytes @property @@ -886,7 +886,7 @@ def density(self): return r @property - def npoints(self): + def npoints(self) -> int: """ The number of non- ``fill_value`` points. diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index cc0deca765b41..9c53210b75d6b 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -41,6 +41,10 @@ def test_copy(self, data): # __setitem__ does not work, so we only have a smoke-test data.copy() + def test_view(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.view() + class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): def test_from_dtype(self, data): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index dee8021f5375f..a29f6deeffae6 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -75,3 +75,18 @@ def test_copy(self, data): data[1] = data[0] assert result[1] != result[0] + + def test_view(self, data): + # view with no dtype should return a shallow copy, *not* the same + # object + assert data[1] != data[0] + + result = data.view() + assert result is not data + assert type(result) == type(data) + + result[1] = result[0] + assert data[1] == data[0] + + # check specifically that the `dtype` kwarg is accepted + data.view(dtype=None) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index c28ff956a33a4..a1988744d76a1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -137,11 +137,11 @@ def __setitem__(self, key, value): value = decimal.Decimal(value) self._data[key] = value - def __len__(self): + def __len__(self) -> int: return len(self._data) @property - def nbytes(self): + def nbytes(self) -> int: n = len(self) if n: return n * sys.getsizeof(self[0]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 21c4ac8f055a2..b64ddbd6ac84d 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -80,6 +80,9 @@ def __getitem__(self, item): elif isinstance(item, abc.Iterable): # fancy indexing return type(self)([self.data[i] for i in 
item]) + elif isinstance(item, slice) and item == slice(None): + # Make sure we get a view + return type(self)(self.data) else: # slice return type(self)(self.data[item]) @@ -103,11 +106,11 @@ def __setitem__(self, key, value): assert isinstance(v, self.dtype.type) self.data[k] = v - def __len__(self): + def __len__(self) -> int: return len(self.data) @property - def nbytes(self): + def nbytes(self) -> int: return sys.getsizeof(self.data) def isna(self): diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 1aab71286b4a6..4fdcf930d224f 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -95,7 +95,10 @@ class TestGrouping(BaseInterval, base.BaseGroupbyTests): class TestInterface(BaseInterval, base.BaseInterfaceTests): - pass + def test_view(self, data): + # __setitem__ incorrectly makes a copy (GH#27147), so we only + # have a smoke-test + data.view() class TestReduce(base.BaseNoReduceTests): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 84d59902d2aa7..6ebe71e173ec2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -103,6 +103,10 @@ def test_copy(self, data): # __setitem__ does not work, so we only have a smoke-test data.copy() + def test_view(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.view() + class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): pass From 62429aed9e76471f81634d4cd3694000064fe514 Mon Sep 17 00:00:00 2001 From: Eliza Mae Saret Date: Fri, 9 Aug 2019 20:33:49 +0800 Subject: [PATCH 051/191] Update sample() documentation (#27833) Clarify the definition of replace parameter --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c38489c7b85ef..5a1b2a5dcd20b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4786,7 +4786,7 @@ def sample( frac : float, optional Fraction of axis items to return. Cannot be used with `n`. replace : bool, default False - Sample with or without replacement. + Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. 
Index From 67de633e0cc4266f3a86abc05adcc8fd21e1e6b7 Mon Sep 17 00:00:00 2001 From: avelineg <50472542+avelineg@users.noreply.github.com> Date: Fri, 9 Aug 2019 20:55:22 +0800 Subject: [PATCH 052/191] Add example for Series.str.slice() (#27832) * Add example for Series.str.slice() --- pandas/core/strings.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 169a3a24c254d..25350119f9df5 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1442,6 +1442,12 @@ def str_slice(arr, start=None, stop=None, step=None): 2 hameleon dtype: object + >>> s.str.slice(start=-1) + 0 a + 1 x + 2 n + dtype: object + >>> s.str.slice(stop=2) 0 ko 1 fo From 7bfa9b85bf3bbab97cd420a7d141da6352f21db1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Aug 2019 09:18:53 -0500 Subject: [PATCH 053/191] Add security policy (#27822) --- .github/SECURITY.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/SECURITY.md diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 0000000000000..f3b059a5d4f13 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1 @@ +To report a security vulnerability to pandas, please go to https://tidelift.com/security and see the instructions there. From be6c3690040642cdcbf0c93025af2527a2fb1df4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Aug 2019 09:19:02 -0500 Subject: [PATCH 054/191] Add tidelift sponsor to FUNDING (#27823) --- .github/FUNDING.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 6912d15abf3d6..944ce9b4fb1f6 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1 +1,2 @@ custom: https://pandas.pydata.org/donate.html +tidelift: pypi/pandas From 865e91388624ebb1da3eab25e18a50f61782262f Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 20:33:06 +0200 Subject: [PATCH 055/191] Moving test_diff.py to test_analytics.py As requested... --- pandas/tests/series/test_analytics.py | 56 +++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 3a5a387b919be..388f1768ffd0c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -20,6 +20,7 @@ from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp +from pandas.core.indexes.timedeltas import TimedeltaIndex import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, @@ -236,7 +237,62 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + + def test_diff(self): + ''' + Combined datetime ranges, normal diff and boolean diff test. 
+ ''' + # Just run the function + self.ts.diff() + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = s.diff() + assert rs[1] == 1 + + # neg n + rs = self.ts.diff(-1) + xp = self.ts - self.ts.shift(-1) + assert_series_equal(rs, xp) + + # 0 + rs = self.ts.diff(0) + xp = self.ts - self.ts + assert_series_equal(rs, xp) + + # datetime diff (GH3100) + s = Series(date_range("20130102", periods=5)) + rs = s - s.shift(1) + xp = s.diff() + assert_series_equal(rs, xp) + + # timedelta diff + nrs = rs - rs.shift(1) + nxp = xp.diff() + assert_series_equal(nrs, nxp) + + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s.diff() + assert_series_equal( + result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + ) + + # boolean series + s = Series([False, True, True, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, True, False, True, False])) + # boolean nan series + s = Series([False, True, nan, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) + def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( From 6f8986b7fe4bd286c8f8116c3ba066762996da9a Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 20:34:58 +0200 Subject: [PATCH 056/191] Removed test_diff.py, added to test_analytics --- pandas/tests/series/test_diff.py | 59 -------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 pandas/tests/series/test_diff.py diff --git a/pandas/tests/series/test_diff.py b/pandas/tests/series/test_diff.py deleted file mode 100644 index 1d8c17dfa989b..0000000000000 --- a/pandas/tests/series/test_diff.py +++ /dev/null @@ -1,59 +0,0 @@ -from numpy import nan -from pandas import Series, date_range -from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.tests.series.common import TestData -from pandas.util.testing import assert_series_equal - - -class TestDiff(TestData): - def test_diff(self): - # Just run the function - self.ts.diff() - - # int dtype - a = 10000000000000000 - b = a + 1 - s = Series([a, b]) - - rs = s.diff() - assert rs[1] == 1 - - # neg n - rs = self.ts.diff(-1) - xp = self.ts - self.ts.shift(-1) - assert_series_equal(rs, xp) - - # 0 - rs = self.ts.diff(0) - xp = self.ts - self.ts - assert_series_equal(rs, xp) - - # datetime diff (GH3100) - s = Series(date_range("20130102", periods=5)) - rs = s - s.shift(1) - xp = s.diff() - assert_series_equal(rs, xp) - - # timedelta diff - nrs = rs - rs.shift(1) - nxp = xp.diff() - assert_series_equal(nrs, nxp) - - # with tz - s = Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s.diff() - assert_series_equal( - result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - ) - - # boolean series - s = Series([False, True, True, False, False]) - result = s.diff() - assert_series_equal(result, Series([nan, True, False, True, False])) - - # boolean nan series - s = Series([False, True, nan, False, False]) - result = s.diff() - assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) From 081c2c8117b58dd2292d19110599b6045472cc82 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 20:43:13 +0200 Subject: [PATCH 057/191] Removing tailing whitespaces --- pandas/tests/series/test_analytics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 388f1768ffd0c..30cf9c6fd2bd8 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -237,7 +237,7 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) - + def test_diff(self): ''' Combined datetime ranges, normal diff and boolean diff test. @@ -292,7 +292,7 @@ def test_diff(self): s = Series([False, True, nan, False, False]) result = s.diff() assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) - + def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( From 96bb1b420b779c7489a0e822429f97520ab325ad Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 22:13:57 +0200 Subject: [PATCH 058/191] STYLE: Black formatting --- pandas/tests/series/test_analytics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 30cf9c6fd2bd8..1d827ced89a08 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -239,9 +239,9 @@ def test_npdiff(self): assert_series_equal(Series([nan, 0, 0, 0, nan]), r) def test_diff(self): - ''' + """ Combined datetime ranges, normal diff and boolean diff test. - ''' + """ # Just run the function self.ts.diff() From 91ebae51144f6d6b66fefd00d4d47bc42e3cee41 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 22:16:04 +0200 Subject: [PATCH 059/191] Update test_analytics.py --- pandas/tests/series/test_analytics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1d827ced89a08..81331eeccdb7e 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -239,9 +239,7 @@ def test_npdiff(self): assert_series_equal(Series([nan, 0, 0, 0, nan]), r) def test_diff(self): - """ - Combined datetime ranges, normal diff and boolean diff test. 
- """ + # Combined datetime diff, normal diff and boolean diff test # Just run the function self.ts.diff() From 74a474f58fd2f6a2677ee31f5e6eee0f6cfaad59 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 22:16:27 +0200 Subject: [PATCH 060/191] Update test_analytics.py --- pandas/tests/series/test_analytics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 81331eeccdb7e..877b4baddf839 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -240,7 +240,6 @@ def test_npdiff(self): def test_diff(self): # Combined datetime diff, normal diff and boolean diff test - # Just run the function self.ts.diff() # int dtype From 40df511e671b8ac2b7538ff1e1db4e59f91db21f Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sat, 10 Aug 2019 22:33:59 +0200 Subject: [PATCH 061/191] Removing self.ts from code --- pandas/tests/series/test_analytics.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 877b4baddf839..c07f6e903d353 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -240,7 +240,8 @@ def test_npdiff(self): def test_diff(self): # Combined datetime diff, normal diff and boolean diff test - self.ts.diff() + ts = pd.Series([1, 2]) + ts.diff() # int dtype a = 10000000000000000 @@ -251,13 +252,13 @@ def test_diff(self): assert rs[1] == 1 # neg n - rs = self.ts.diff(-1) - xp = self.ts - self.ts.shift(-1) + rs = ts.diff(-1) + xp = ts - ts.shift(-1) assert_series_equal(rs, xp) # 0 - rs = self.ts.diff(0) - xp = self.ts - self.ts + rs = ts.diff(0) + xp = ts - ts assert_series_equal(rs, xp) # datetime diff (GH3100) From 18a2e356f644515916f541f781bf3e55ca5876ee Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sun, 11 Aug 2019 00:09:07 +0200 Subject: [PATCH 062/191] Update test_analytics.py --- pandas/tests/series/test_analytics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index c07f6e903d353..e916edc0bc799 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -238,9 +238,9 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) - def test_diff(self): + def test_dt_nm_bool_diff(self): # Combined datetime diff, normal diff and boolean diff test - ts = pd.Series([1, 2]) + ts = Series([1, 2, 3]) ts.diff() # int dtype From 716a70e9fd94828e9c3f6ffb8802e83527dce173 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sun, 11 Aug 2019 00:53:38 +0200 Subject: [PATCH 063/191] Adding tm.makeTimeSeries, as in the original --- pandas/tests/series/test_analytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index e916edc0bc799..d60df28df06bb 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -240,7 +240,7 @@ def test_npdiff(self): def test_dt_nm_bool_diff(self): # Combined datetime diff, normal diff and boolean diff test - ts = Series([1, 2, 3]) + ts = tm.makeTimeSeries(name='ts') ts.diff() # int dtype From e879dd7fc384b5e58677e91c36c53389e23f937f Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Sun, 11 Aug 2019 01:56:41 +0200 Subject: [PATCH 064/191] STYLE: Black --- 
pandas/tests/series/test_analytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d60df28df06bb..d329097ddc666 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -240,7 +240,7 @@ def test_npdiff(self): def test_dt_nm_bool_diff(self): # Combined datetime diff, normal diff and boolean diff test - ts = tm.makeTimeSeries(name='ts') + ts = tm.makeTimeSeries(name="ts") ts.diff() # int dtype From e26fa2b9b7c44ba7a1865c9206ae213e1e908b99 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Aug 2019 21:29:47 -0500 Subject: [PATCH 065/191] CI: pytest, xdist versions (#27845) * CI: pytest, xdist --- ci/deps/azure-37-locale.yaml | 4 ++-- ci/deps/azure-37-numpydev.yaml | 3 ++- ci/deps/azure-macos-35.yaml | 4 ++-- ci/deps/azure-windows-36.yaml | 4 ++-- ci/deps/azure-windows-37.yaml | 4 ++-- ci/deps/travis-36-cov.yaml | 4 ++-- ci/deps/travis-37.yaml | 4 ++-- 7 files changed, 14 insertions(+), 13 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 05adbf0c924dc..26dcd213bbfa0 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -26,8 +26,8 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - pip diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 5cf897c98da10..65c92ec1dcf0d 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -6,7 +6,8 @@ dependencies: - pytz - Cython>=0.28.2 # universal - - pytest>=4.0.2 + # pytest < 5 until defaults has pytest-xdist>=1.29.0 + - pytest>=4.0.2,<5.0 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 98859b596ab2a..cb2ac08cbf758 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -25,8 +25,8 @@ dependencies: - pip: - pyreadstat # universal - - pytest==4.5.0 - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-mock - hypothesis>=3.58.0 # https://github.com/pandas-dev/pandas/issues/27421 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index b0f3f5389ac85..ff9264a36cb12 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -23,8 +23,8 @@ dependencies: - xlwt # universal - cython>=0.28.2 - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 08208d1e2d59a..075234a937035 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -26,8 +26,8 @@ dependencies: - xlwt # universal - cython>=0.28.2 - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index a3f6d5b30f3e1..19002cbb8575e 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -39,8 +39,8 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-cov - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index c9a8c274fb144..9e08c41a3d9c0 100644 --- a/ci/deps/travis-37.yaml +++ 
b/ci/deps/travis-37.yaml @@ -13,8 +13,8 @@ dependencies: - pyarrow - pytz # universal - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - hypothesis>=3.58.0 - s3fs From 2ebab98cf1f36a678256436e7a4f5149536436c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Aug 2019 11:37:49 -0500 Subject: [PATCH 066/191] CI: Pin Python to 3.7.3 (#27868) --- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- environment.yml | 2 +- requirements-dev.txt | 1 + 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 26dcd213bbfa0..437fbc24b9411 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -17,7 +17,7 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.7.* + - python=3.7.3 - pytz - s3fs - scipy diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 65c92ec1dcf0d..e1acda7ecf3c7 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.7.* + - python=3.7.3 - pytz - Cython>=0.28.2 # universal diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 9e08c41a3d9c0..d7bed9e3f03f2 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -4,7 +4,7 @@ channels: - conda-forge - c3i_test dependencies: - - python=3.7.* + - python=3.7.3 - botocore>=1.11 - cython>=0.28.2 - numpy diff --git a/environment.yml b/environment.yml index 6d2cd701c3854..0cbd500209b6c 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: dependencies: # required - numpy>=1.15 - - python=3 + - python=3.7.3 - python-dateutil>=2.6.1 - pytz diff --git a/requirements-dev.txt b/requirements-dev.txt index cf11a3ee28258..f866f1436edc6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ numpy>=1.15 +python==3.7.3 python-dateutil>=2.6.1 pytz asv From 9fd432beaaae56061fdc7282b13c034a8ef12a07 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Aug 2019 12:56:46 -0500 Subject: [PATCH 067/191] BUG: Fix groupby quantile segfault (#27826) * BUG: Fix groupby quantile segfault Validate that q is between 0 and 1. Closes #27470 * prettier --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/_libs/groupby.pyx | 5 +++++ pandas/tests/groupby/test_function.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 637ac5c9c8bd1..7b39810a2cf49 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -120,7 +120,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) - Bug in windowing over read-only arrays (:issue:`27766`) -- +- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`) - Reshaping diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3f18572abca1..3069bbbf34bb7 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -719,6 +719,11 @@ def group_quantile(ndarray[float64_t] out, ndarray[int64_t] counts, non_na_counts, sort_arr assert values.shape[0] == N + + if not (0 <= q <= 1): + raise ValueError("'q' must be between 0 and 1. 
Got"
+                         " '{}' instead".format(q))
+
     inter_methods = {
         'linear': INTERPOLATION_LINEAR,
         'lower': INTERPOLATION_LOWER,
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index efc3142b25b82..3794120281e1f 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1247,6 +1247,17 @@ def test_quantile_raises():
         df.groupby("key").quantile()
 
 
+def test_quantile_out_of_bounds_q_raises():
+    # https://github.com/pandas-dev/pandas/issues/27470
+    df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6)))
+    g = df.groupby([0, 0, 0, 1, 1, 1])
+    with pytest.raises(ValueError, match="Got '50.0' instead"):
+        g.quantile(50)
+
+    with pytest.raises(ValueError, match="Got '-1.0' instead"):
+        g.quantile(-1)
+
+
 # pipe
 # --------------------------------
 
From 2b6c9771019044370fea7f64935620d4c6e2ab38 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 12 Aug 2019 11:53:36 -0700
Subject: [PATCH 068/191] op--> opname (#27849)

---
 pandas/core/arrays/categorical.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 9862b4b530424..13058882084ff 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -77,7 +77,7 @@
 )
 
 
-def _cat_compare_op(op):
+def _cat_compare_op(opname):
     def f(self, other):
         # On python2, you can usually compare any type to any type, and
         # Categoricals can be seen as a custom type, but having different
@@ -90,7 +90,7 @@ def f(self, other):
         other = lib.item_from_zerodim(other)
 
         if not self.ordered:
-            if op in ["__lt__", "__gt__", "__le__", "__ge__"]:
+            if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
                 raise TypeError(
                     "Unordered Categoricals can only compare equality or not"
                 )
@@ -117,7 +117,7 @@ def f(self, other):
             other_codes = other._codes
 
             mask = (self._codes == -1) | (other_codes == -1)
-            f = getattr(self._codes, op)
+            f = getattr(self._codes, opname)
             ret = f(other_codes)
             if mask.any():
                 # In other series, the leads to False, so do that here too
@@ -127,38 +127,38 @@ def f(self, other):
         if is_scalar(other):
             if other in self.categories:
                 i = self.categories.get_loc(other)
-                ret = getattr(self._codes, op)(i)
+                ret = getattr(self._codes, opname)(i)
 
                 # check for NaN in self
                 mask = self._codes == -1
                 ret[mask] = False
                 return ret
             else:
-                if op == "__eq__":
+                if opname == "__eq__":
                     return np.repeat(False, len(self))
-                elif op == "__ne__":
+                elif opname == "__ne__":
                     return np.repeat(True, len(self))
                 else:
                     msg = (
                         "Cannot compare a Categorical for op {op} with a "
                         "scalar, which is not a category."
                     )
-                    raise TypeError(msg.format(op=op))
+                    raise TypeError(msg.format(op=opname))
         else:
 
             # allow categorical vs object dtype array comparisons for equality
             # these are only positional comparisons
-            if op in ["__eq__", "__ne__"]:
-                return getattr(np.array(self), op)(np.array(other))
+            if opname in ["__eq__", "__ne__"]:
+                return getattr(np.array(self), opname)(np.array(other))
 
             msg = (
                 "Cannot compare a Categorical for op {op} with type {typ}."
                 "\nIf you want to compare values, use 'np.asarray(cat) "
                 "<op> other'."
) - raise TypeError(msg.format(op=op, typ=type(other))) + raise TypeError(msg.format(op=opname, typ=type(other))) - f.__name__ = op + f.__name__ = opname return f From 01f90c187f0eec0e8178371d7c066e600c9e105b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 12 Aug 2019 11:58:42 -0700 Subject: [PATCH 069/191] CLN: short-circuit case in Block.replace (#27768) --- pandas/core/internals/blocks.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c035e1174bb27..e24e6e088b92a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -738,8 +738,11 @@ def replace( # If we cannot replace with own dtype, convert to ObjectBlock and # retry if not self._can_hold_element(to_replace): - # TODO: we should be able to infer at this point that there is - # nothing to replace + if not isinstance(to_replace, list): + if inplace: + return [self] + return [self.copy()] + # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): @@ -764,14 +767,27 @@ def replace( filtered_out = ~self.mgr_locs.isin(filter) mask[filtered_out.nonzero()[0]] = False + if not mask.any(): + if inplace: + return [self] + return [self.copy()] + try: blocks = self.putmask(mask, value, inplace=inplace) + # Note: it is _not_ the case that self._can_hold_element(value) + # is always true at this point. In particular, that can fail + # for: + # "2u" with bool-dtype, float-dtype + # 0.5 with int64-dtype + # np.nan with int64-dtype except (TypeError, ValueError): # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. 
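+            # Added note (not part of the original patch): e.g. a TypeError
+            # raised by the putmask call above, when `value` cannot be held
+            # by this block's dtype, lands here; non-object blocks are cast
+            # to object below and the replace is retried.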
if is_object_dtype(self): raise + assert not self._can_hold_element(value), value + # try again with a compatible block block = self.astype(object) return block.replace( @@ -924,6 +940,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: + # FIXME: make sure we have compatible NA new = self.fill_value if self._can_hold_element(new): From 6afa2ad391284aeecdd67340effb0c16bc21dace Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 12 Aug 2019 21:00:02 +0200 Subject: [PATCH 070/191] BUG: Allow plotting boolean values (#27665) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/plotting/_core.py | 2 ++ pandas/plotting/_matplotlib/core.py | 11 ++++++++--- pandas/tests/plotting/test_series.py | 9 +++++++++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 58918f2d8c40e..974d14a4b424c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -163,7 +163,7 @@ I/O Plotting ^^^^^^^^ -- +- Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`) - Groupby/resample/rolling diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a3c1499845c2a..ec5c609c1b267 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -586,6 +586,8 @@ class PlotAccessor(PandasObject): mark_right : bool, default True When using a secondary_y axis, automatically mark the column labels with "(right)" in the legend + include_bool : bool, default is False + If True, boolean values can be plotted `**kwds` : keywords Options to pass to matplotlib plotting method diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c2b37bb297ecb..50f0d16631a15 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -106,6 +106,7 @@ def __init__( colormap=None, table=False, layout=None, + include_bool=False, **kwds ): @@ -191,6 +192,7 @@ def __init__( self.colormap = colormap self.table = table + self.include_bool = include_bool self.kwds = kwds @@ -400,9 +402,12 @@ def _compute_plot_data(self): # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) - numeric_data = data.select_dtypes( - include=[np.number, "datetime", "datetimetz", "timedelta"] - ) + select_include_type = [np.number, "datetime", "datetimetz", "timedelta"] + + # GH23719, allow plotting boolean + if self.include_bool is True: + select_include_type.append(np.bool_) + numeric_data = data.select_dtypes(include=select_include_type) try: is_empty = numeric_data.empty diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 8b4a78e9195b5..111c3a70fc09c 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -167,6 +167,15 @@ def test_label(self): ax.legend() # draw it self._check_legend_labels(ax, labels=["LABEL"]) + def test_boolean(self): + # GH 23719 + s = Series([False, False, True]) + _check_plot_works(s.plot, include_bool=True) + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + _check_plot_works(s.plot) + def test_line_area_nan_series(self): values = [1, 2, np.nan, 3] s = Series(values) From 35821a5794e434117b14797f010602c8e412b36c Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Mon, 12 Aug 2019 15:10:49 -0400 Subject: 
[PATCH 071/191] Avoid calling S3File.s3 (#27777) * Avoid calling S3File.s3 When reading from s3 using fastparquet. This attribute was removed in s3fs 0.3.0. This change avoids accessing it by using a new method get_file_and_filesystem which returns the filesystem in addition to the file. --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/io/parquet.py | 6 ++++-- pandas/io/s3.py | 25 +++++++++++++++++++------ 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 7b39810a2cf49..7c2e488b98509 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -104,7 +104,7 @@ MultiIndex I/O ^^^ -- +- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - - diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 82c460300582b..6fc70e9f4a737 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -184,12 +184,14 @@ def write( def read(self, path, columns=None, **kwargs): if is_s3_url(path): + from pandas.io.s3 import get_file_and_filesystem + # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. - s3, _, _, should_close = get_filepath_or_buffer(path) + s3, filesystem = get_file_and_filesystem(path) try: - parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open) + parquet_file = self.api.ParquetFile(path, open_with=filesystem.open) finally: s3.close() else: diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 0a7c082fec51c..7e0a37e8cba20 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,8 +1,11 @@ """ s3 support for remote file interactivity """ +from typing import IO, Any, Optional, Tuple from urllib.parse import urlparse as parse_url from pandas.compat._optional import import_optional_dependency +from pandas._typing import FilePathOrBuffer + s3fs = import_optional_dependency( "s3fs", extra="The s3fs package is required to handle s3 files." ) @@ -14,9 +17,9 @@ def _strip_schema(url): return result.netloc + result.path -def get_filepath_or_buffer( - filepath_or_buffer, encoding=None, compression=None, mode=None -): +def get_file_and_filesystem( + filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None +) -> Tuple[IO, Any]: from botocore.exceptions import NoCredentialsError if mode is None: @@ -24,7 +27,7 @@ def get_filepath_or_buffer( fs = s3fs.S3FileSystem(anon=False) try: - filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) + file = fs.open(_strip_schema(filepath_or_buffer), mode) except (FileNotFoundError, NoCredentialsError): # boto3 has troubles when trying to access a public file # when credentialed... @@ -33,5 +36,15 @@ def get_filepath_or_buffer( # A NoCredentialsError is raised if you don't have creds # for that bucket. 
fs = s3fs.S3FileSystem(anon=True)
-        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
-    return filepath_or_buffer, None, compression, True
+        file = fs.open(_strip_schema(filepath_or_buffer), mode)
+    return file, fs
+
+
+def get_filepath_or_buffer(
+    filepath_or_buffer: FilePathOrBuffer,
+    encoding: Optional[str] = None,
+    compression: Optional[str] = None,
+    mode: Optional[str] = None,
+) -> Tuple[IO, Optional[str], Optional[str], bool]:
+    file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode)
+    return file, None, compression, True

From aef0804ff9814783eeb6a9408b81bf5bc334ef61 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 13 Aug 2019 03:04:20 +0200
Subject: [PATCH 072/191] BUG: add back check for MultiIndex case and take_split_path (#27855)

* BUG: add back check for MultiIndex case and take_split_path

* update comment

* add simpler test + whatsnew

---
 doc/source/whatsnew/v0.25.1.rst | 2 +-
 pandas/core/indexing.py | 11 ++++++++++
 pandas/tests/indexing/multiindex/test_loc.py | 23 ++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 7c2e488b98509..b97f45efbeae9 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -84,7 +84,7 @@ Indexing
 ^^^^^^^^

 - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
 - Break reference cycle involving :class:`Index` to allow garbage collection of :class:`Index` objects without running the GC. (:issue:`27585`)
--
+- Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
 -

 Missing
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index e308ae03730b3..ea00737f776ee 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -321,6 +321,17 @@ def _setitem_with_indexer(self, indexer, value):
             val = list(value.values()) if isinstance(value, dict) else value
             take_split_path = not blk._can_hold_element(val)

+        # if we have any multi-indexes that have non-trivial slices
+        # (not null slices) then we must take the split path, xref
+        # GH 10360, GH 27841
+        if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes):
+            for i, ax in zip(indexer, self.obj.axes):
+                if isinstance(ax, MultiIndex) and not (
+                    is_integer(i) or com.is_null_slice(i)
+                ):
+                    take_split_path = True
+                    break
+
         if isinstance(indexer, tuple):
             nindexer = []
             for i, idx in enumerate(indexer):
diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py
index a08b2b4c66af2..8b48c2bf7169f 100644
--- a/pandas/tests/indexing/multiindex/test_loc.py
+++ b/pandas/tests/indexing/multiindex/test_loc.py
@@ -390,3 +390,26 @@ def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data):
     expected = 0
     result = df.sort_index().loc[("bar", "three"), "B"]
     assert result == expected
+
+
+def test_loc_setitem_single_column_slice():
+    # case from https://github.com/pandas-dev/pandas/issues/27841
+    df = DataFrame(
+        "string",
+        index=list("abcd"),
+        columns=MultiIndex.from_product([["Main"], ("another", "one")]),
+    )
+    df["labels"] = "a"
+    df.loc[:, "labels"] = df.index
+    tm.assert_numpy_array_equal(np.asarray(df["labels"]), np.asarray(df.index))
+
+    # test with non-object block
+    df = DataFrame(
+        np.nan,
+        index=range(4),
+        columns=MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]),
+    )
+    expected = 
df.copy() + df.loc[:, "B"] = np.arange(4) + expected.iloc[:, 2] = np.arange(4) + tm.assert_frame_equal(df, expected) From 794be8c7ab6897b7206f2c6ec60d22fea2e440a3 Mon Sep 17 00:00:00 2001 From: Jimmy Callin Date: Tue, 13 Aug 2019 03:08:37 +0200 Subject: [PATCH 073/191] Added missing space to error description (#27866) --- pandas/io/feather_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 6fe22f14c2c5b..25a6db675265d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -39,7 +39,7 @@ def to_feather(df, path): if not isinstance(df.index, Int64Index): raise ValueError( "feather does not support serializing {} " - "for the index; you can .reset_index()" + "for the index; you can .reset_index() " "to make the index into column(s)".format(type(df.index)) ) From 666928b23ec5dd568040beca22ed35f75ddcbc2f Mon Sep 17 00:00:00 2001 From: Ben Thayer Date: Tue, 13 Aug 2019 02:50:40 -0500 Subject: [PATCH 074/191] DOC: Updated Series.items 'See also' section for clarity (#27888) --- pandas/core/series.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4e64a25e430eb..c891298d6e499 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1681,7 +1681,8 @@ def items(self): See Also -------- - DataFrame.items : Equivalent to Series.items for DataFrame. + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. Examples -------- From 23497834392f8a5093c0aa4a2cdce8547c50e7bb Mon Sep 17 00:00:00 2001 From: Ankit Dhankhar Date: Tue, 13 Aug 2019 13:27:19 +0530 Subject: [PATCH 075/191] DOC: clarify see also for DataFrame.iterrows() (#27893) --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02241eeaae7b2..e3edab117668f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -845,8 +845,8 @@ def iterrows(self): See Also -------- - itertuples : Iterate over DataFrame rows as namedtuples of the values. - items : Iterate over (column name, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. Notes ----- From f872bcd621c95341bcf72b588448cf9fbe9f5b9d Mon Sep 17 00:00:00 2001 From: Samesh Lakhotia <43701530+sameshl@users.noreply.github.com> Date: Tue, 13 Aug 2019 15:11:58 +0530 Subject: [PATCH 076/191] DOC: Add append example in to_excel documentation (#27852) --- pandas/core/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5a1b2a5dcd20b..71d5068e2e0fc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2179,6 +2179,12 @@ def _repr_data_resource_(self): ... df1.to_excel(writer, sheet_name='Sheet_name_1') ... df2.to_excel(writer, sheet_name='Sheet_name_2') + ExcelWriter can also be used to append to an existing Excel file: + + >>> with pd.ExcelWriter('output.xlsx', + ... mode='a') as writer: # doctest: +SKIP + ... 
df.to_excel(writer, sheet_name='Sheet_name_3') + To set the library that is used to write the Excel file, you can pass the `engine` keyword (the default engine is automatically chosen depending on the file extension): From 6572497583fa12e1a2a63a52c5626ed85b8fa85f Mon Sep 17 00:00:00 2001 From: Samesh Lakhotia <43701530+sameshl@users.noreply.github.com> Date: Tue, 13 Aug 2019 17:27:28 +0530 Subject: [PATCH 077/191] DOC: Standardize use of "Iterate" and "Iterator" (#27871) changed `Iterator over` --> `Iterate over` closes #27861 --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e3edab117668f..c8cf307171e8d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -770,7 +770,7 @@ def style(self): _shared_docs[ "items" ] = r""" - Iterator over (column name, Series) pairs. + Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. From 3e4b196e9fcd362fc8ba58b740a0f1c238ee9323 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:05:44 -0700 Subject: [PATCH 078/191] REF: Make CategoricalIndex comparison defer to Categorical comparison (#27769) --- pandas/core/arrays/categorical.py | 3 +++ pandas/core/indexes/base.py | 8 ++++++-- pandas/core/indexes/category.py | 25 +++---------------------- pandas/core/ops/__init__.py | 8 +++++++- pandas/tests/indexes/test_category.py | 5 +++++ 5 files changed, 24 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 13058882084ff..368f532ff7d25 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -88,6 +88,9 @@ def f(self, other): return NotImplemented other = lib.item_from_zerodim(other) + if is_list_like(other) and len(other) != len(self): + # TODO: Could this fail if the categories are listlike objects? 
+            raise ValueError("Lengths must match.")

         if not self.ordered:
             if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 598c4dca9ce88..2fe04f4718765 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -48,6 +48,7 @@
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import (
+    ABCCategorical,
     ABCDataFrame,
     ABCDateOffset,
     ABCDatetimeArray,
@@ -99,11 +100,14 @@ def _make_comparison_op(op, cls):
     def cmp_method(self, other):
-        if isinstance(other, (np.ndarray, Index, ABCSeries)):
+        if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)):
             if other.ndim > 0 and len(self) != len(other):
                 raise ValueError("Lengths must match to compare")

-        if is_object_dtype(self) and not isinstance(self, ABCMultiIndex):
+        if is_object_dtype(self) and isinstance(other, ABCCategorical):
+            left = type(other)(self._values, dtype=other.dtype)
+            return op(left, other)
+        elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex):
             # don't pass MultiIndex
             with np.errstate(all="ignore"):
                 result = ops._comp_method_OBJECT_ARRAY(op, self.values, other)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 0f6aa711adc90..8bfa7e8d20b4f 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -899,31 +899,12 @@ def _make_compare(op):
         opname = "__{op}__".format(op=op.__name__)

         def _evaluate_compare(self, other):
-
-            # if we have a Categorical type, then must have the same
-            # categories
-            if isinstance(other, CategoricalIndex):
-                other = other._values
-            elif isinstance(other, Index):
-                other = self._create_categorical(other._values, dtype=self.dtype)
-
-            if isinstance(other, (ABCCategorical, np.ndarray, ABCSeries)):
-                if len(self.values) != len(other):
-                    raise ValueError("Lengths must match to compare")
-
-            if isinstance(other, ABCCategorical):
-                if not self.values.is_dtype_equal(other):
-                    raise TypeError(
-                        "categorical index comparisons must "
-                        "have the same categories and ordered "
-                        "attributes"
-                    )
-
-            result = op(self.values, other)
+            with np.errstate(all="ignore"):
+                result = op(self.array, other)
             if isinstance(result, ABCSeries):
                 # Dispatch to pd.Categorical returned NotImplemented
                 # and we got a Series back; down-cast to ndarray
-                result = result.values
+                result = result._values
             return result

         return compat.set_function_name(_evaluate_compare, opname, cls)
diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
index a56521b9c9fec..b492aa0791b6d 100644
--- a/pandas/core/ops/__init__.py
+++ b/pandas/core/ops/__init__.py
@@ -832,8 +832,14 @@ def wrapper(self, other, axis=None):
             # Defer to DataFrame implementation; fail early
             return NotImplemented

-        elif isinstance(other, ABCSeries) and not self._indexed_same(other):
+        if isinstance(other, ABCSeries) and not self._indexed_same(other):
             raise ValueError("Can only compare identically-labeled Series objects")
+        elif (
+            is_list_like(other)
+            and len(other) != len(self)
+            and not isinstance(other, (set, frozenset))
+        ):
+            raise ValueError("Lengths must match")

         elif (
             is_list_like(other)
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index 280b0a99c7e68..67bf9bd20e716 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -823,6 +823,11 @@ def test_equals_categorical(self):
         msg = (
             "categorical index comparisons must have the same categories"
             " and ordered 
attributes" + "|" + "Categoricals can only be compared if 'categories' are the same. " + "Categories are different lengths" + "|" + "Categoricals can only be compared if 'ordered' is the same" ) with pytest.raises(TypeError, match=msg): ci1 == ci2 From d187d90f954d2d3a42bee86d8f626857adbe27bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:06:26 -0700 Subject: [PATCH 079/191] skip test if running non-installed (#27810) --- pandas/tests/plotting/test_backend.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index e79e7b6239eb3..d126407cfd823 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -46,14 +46,18 @@ def test_backend_is_correct(monkeypatch): @td.skip_if_no_mpl def test_register_entrypoint(): + + dist = pkg_resources.get_distribution("pandas") + if dist.module_path not in pandas.__file__: + # We are running from a non-installed pandas, and this test is invalid + pytest.skip("Testing a non-installed pandas") + mod = types.ModuleType("my_backend") mod.plot = lambda *args, **kwargs: 1 backends = pkg_resources.get_entry_map("pandas") my_entrypoint = pkg_resources.EntryPoint( - "pandas_plotting_backend", - mod.__name__, - dist=pkg_resources.get_distribution("pandas"), + "pandas_plotting_backend", mod.__name__, dist=dist ) backends["pandas_plotting_backends"]["my_backend"] = my_entrypoint # TODO: the docs recommend importlib.util.module_from_spec. But this works for now. From 6a927b00b1d3820d7409d972e451256f08711805 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:13:16 -0700 Subject: [PATCH 080/191] CLN: match standardized dispatch logic (#27830) --- pandas/core/arrays/integer.py | 48 ++++++++++++++++------------- pandas/tests/arrays/test_integer.py | 16 ++++++++-- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 1f14bd169a228..069d661e6af34 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,7 +21,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops @@ -592,25 +592,29 @@ def _values_for_argsort(self) -> np.ndarray: @classmethod def _create_comparison_method(cls, op): - def cmp_method(self, other): + op_name = op.__name__ - op_name = op.__name__ - mask = None + def cmp_method(self, other): - if isinstance(other, (ABCSeries, ABCIndexClass)): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. 
return NotImplemented

+            other = lib.item_from_zerodim(other)
+            mask = None
+
             if isinstance(other, IntegerArray):
                 other, mask = other._data, other._mask

             elif is_list_like(other):
                 other = np.asarray(other)
-                if other.ndim > 0 and len(self) != len(other):
+                if other.ndim > 1:
+                    raise NotImplementedError(
+                        "can only perform ops with 1-d structures"
+                    )
+                if len(self) != len(other):
                     raise ValueError("Lengths must match to compare")

-            other = lib.item_from_zerodim(other)
-
             # numpy will show a DeprecationWarning on invalid elementwise
             # comparisons, this will raise in the future with
             with warnings.catch_warnings():
@@ -683,31 +687,31 @@ def _maybe_mask_result(self, result, mask, other, op_name):

     @classmethod
     def _create_arithmetic_method(cls, op):
-        def integer_arithmetic_method(self, other):
+        op_name = op.__name__

-            op_name = op.__name__
-            mask = None
+        def integer_arithmetic_method(self, other):

-            if isinstance(other, (ABCSeries, ABCIndexClass)):
+            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
                 # Rely on pandas to unbox and dispatch to us.
                 return NotImplemented

-            if getattr(other, "ndim", 0) > 1:
-                raise NotImplementedError("can only perform ops with 1-d structures")
+            other = lib.item_from_zerodim(other)
+            mask = None

             if isinstance(other, IntegerArray):
                 other, mask = other._data, other._mask

-            elif getattr(other, "ndim", None) == 0:
-                other = other.item()
-
             elif is_list_like(other):
                 other = np.asarray(other)
-                if not other.ndim:
-                    other = other.item()
-                elif other.ndim == 1:
-                    if not (is_float_dtype(other) or is_integer_dtype(other)):
-                        raise TypeError("can only perform ops with numeric values")
+                if other.ndim > 1:
+                    raise NotImplementedError(
+                        "can only perform ops with 1-d structures"
+                    )
+                if len(self) != len(other):
+                    raise ValueError("Lengths must match")
+                if not (is_float_dtype(other) or is_integer_dtype(other)):
+                    raise TypeError("can only perform ops with numeric values")
+
             else:
                 if not (is_float(other) or is_integer(other)):
                     raise TypeError("can only perform ops with numeric values")
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index 50cd1469e5196..31a9a0483081e 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -280,7 +280,7 @@ def test_arith_coerce_scalar(self, data, all_arithmetic_operators):
         other = 0.01
         self._check_op(s, op, other)

-    @pytest.mark.parametrize("other", [1.0, 1.0, np.array(1.0), np.array([1.0])])
+    @pytest.mark.parametrize("other", [1.0, np.array(1.0)])
     def test_arithmetic_conversion(self, all_arithmetic_operators, other):
         # if we have a float operand we should have a float result
         # if that is equal to an integer
@@ -290,6 +290,15 @@ def test_arithmetic_conversion(self, all_arithmetic_operators, other):
         result = op(s, other)
         assert result.dtype is np.dtype("float")

+    def test_arith_len_mismatch(self, all_arithmetic_operators):
+        # operating with a list-like with non-matching length raises
+        op = self.get_op_from_name(all_arithmetic_operators)
+        other = np.array([1.0])
+
+        s = pd.Series([1, 2, 3], dtype="Int64")
+        with pytest.raises(ValueError, match="Lengths must match"):
+            op(s, other)
+
     @pytest.mark.parametrize("other", [0, 0.5])
     def test_arith_zero_dim_ndarray(self, other):
         arr = integer_array([1, None, 2])
@@ -322,8 +331,9 @@ def test_error(self, data, all_arithmetic_operators):
             ops(pd.Series(pd.date_range("20180101", periods=len(s))))

         # 2d
-        with pytest.raises(NotImplementedError):
-            opa(pd.DataFrame({"A": s}))
+        result = opa(pd.DataFrame({"A": s}))
+        assert 
result is NotImplemented + with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) From eddeee5944b63579d590f730ef7fdc52e2312379 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:17:37 -0700 Subject: [PATCH 081/191] requested edit to comparison method (#27873) --- pandas/core/ops/__init__.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index b492aa0791b6d..7229b3de4e9f0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -841,13 +841,12 @@ def wrapper(self, other, axis=None): ): raise ValueError("Lengths must match") - elif ( - is_list_like(other) - and len(other) != len(self) - and not isinstance(other, frozenset) - ): - # TODO: why are we treating len-1 frozenset differently? - raise ValueError("Lengths must match to compare") + elif isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): + # TODO: make this treatment consistent across ops and classes. + # We are not catching all listlikes here (e.g. frozenset, tuple) + # The ambiguous case is object-dtype. See GH#27803 + if len(self) != len(other): + raise ValueError("Lengths must match to compare") if is_categorical_dtype(self): # Dispatch to Categorical implementation; CategoricalIndex From 69b25ee82aeca1fc94a04cfa0159c70ad2a625b4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:18:35 -0700 Subject: [PATCH 082/191] CLN: use invalid_comparison for incorrect case in Index comparison (#27879) --- pandas/core/indexes/base.py | 8 +------- pandas/tests/indexes/test_numpy_compat.py | 5 ++++- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2fe04f4718765..36d6f844a85e3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -116,17 +116,11 @@ def cmp_method(self, other): with np.errstate(all="ignore"): result = op(self.values, np.asarray(other)) - # technically we could support bool dtyped Index - # for now just return the indexing array directly if is_bool_dtype(result): return result - try: - return Index(result) - except TypeError: - return result + return ops.invalid_comparison(self, other, op) name = "__{name}__".format(name=op.__name__) - # TODO: docstring? return set_function_name(cmp_method, name, cls) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index f9ca1bca04165..645ad19ea4cc9 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -118,4 +118,7 @@ def test_elementwise_comparison_warning(): # this test. 
idx = Index([1, 2]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - idx == "a" + result = idx == "a" + + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) From e47362a24cfa0729852392f2995b06946b875507 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:20:40 -0700 Subject: [PATCH 083/191] CLN: Index.__new__ (#27883) --- pandas/core/frame.py | 7 +- pandas/core/indexes/base.py | 124 ++++++++++++++---------------------- 2 files changed, 50 insertions(+), 81 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8cf307171e8d..6114c9efbfe28 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3447,15 +3447,14 @@ def _get_info_slice(obj, indexer): if not is_list_like(exclude): exclude = (exclude,) if exclude is not None else () - selection = tuple(map(frozenset, (include, exclude))) + selection = (frozenset(include), frozenset(exclude)) if not any(selection): raise ValueError("at least one of include or exclude must be nonempty") # convert the myriad valid dtypes object to a single representation - include, exclude = map( - lambda x: frozenset(map(infer_dtype_from_object, x)), selection - ) + include = frozenset(infer_dtype_from_object(x) for x in include) + exclude = frozenset(infer_dtype_from_object(x) for x in exclude) for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 36d6f844a85e3..60c3a8b5269cf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -307,12 +307,12 @@ def __new__( elif ( is_datetime64_any_dtype(data) - or (dtype is not None and is_datetime64_any_dtype(dtype)) + or is_datetime64_any_dtype(dtype) or "tz" in kwargs ): from pandas import DatetimeIndex - if dtype is not None and is_dtype_equal(_o_dtype, dtype): + if is_dtype_equal(_o_dtype, dtype): # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, # will raise in the where `data` is already tz-aware. So # we leave it out of this step and cast to object-dtype after @@ -327,12 +327,10 @@ def __new__( ) return result - elif is_timedelta64_dtype(data) or ( - dtype is not None and is_timedelta64_dtype(dtype) - ): + elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex - if dtype is not None and is_dtype_equal(_o_dtype, dtype): + if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) @@ -353,11 +351,9 @@ def __new__( elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): data = np.asarray(data) if not (dtype is None or is_object_dtype(dtype)): - # coerce to the provided dtype - data = dtype.construct_array_type()._from_sequence( - data, dtype=dtype, copy=False - ) + ea_cls = dtype.construct_array_type() + data = ea_cls._from_sequence(data, dtype=dtype, copy=False) # coerce to the object dtype data = data.astype(object) @@ -366,58 +362,48 @@ def __new__( # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): if dtype is not None: - try: - - # we need to avoid having numpy coerce - # things that look like ints/floats to ints unless - # they are actually ints, e.g. 
'0' and 0.0 - # should not be coerced - # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. - from .numeric import Float64Index - - return Float64Index(data, copy=copy, dtype=dtype, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + return cls._try_convert_to_int_index( + data, copy, name, dtype + ) + except ValueError: pass - else: - data = data.astype(dtype) + + # Return an actual float index. + from .numeric import Float64Index + + return Float64Index(data, copy=copy, dtype=dtype, name=name) + + elif inferred == "string": + pass else: - data = np.array(data, dtype=dtype, copy=copy) - - except (TypeError, ValueError) as e: - msg = str(e) - if ( - "cannot convert float" in msg - or "Trying to coerce float values to integer" in msg - ): - raise + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) # maybe coerce to a sub-class from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency @@ -553,16 +539,6 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): Must be careful not to recurse. """ - if not hasattr(values, "dtype"): - if (values is None or not len(values)) and dtype is not None: - values = np.empty(0, dtype=dtype) - else: - values = np.array(values, copy=False) - if is_object_dtype(values): - values = cls( - values, name=name, dtype=dtype, **kwargs - )._ndarray_values - if isinstance(values, (ABCSeries, ABCIndexClass)): # Index._data must always be an ndarray. 
# This is no-copy for when _values is an ndarray, @@ -1860,8 +1836,6 @@ def inferred_type(self): @cache_readonly def is_all_dates(self): - if self._data is None: - return False return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- @@ -3132,13 +3106,9 @@ def _convert_scalar_indexer(self, key, kind=None): """ @Appender(_index_shared_docs["_convert_slice_indexer"]) - def _convert_slice_indexer(self, key, kind=None): + def _convert_slice_indexer(self, key: slice, kind=None): assert kind in ["ix", "loc", "getitem", "iloc", None] - # if we are not a slice, then we are done - if not isinstance(key, slice): - return key - # validate iloc if kind == "iloc": return slice( From de53f6e8c8faf412995b05ff9fd67e90d86fb468 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 05:21:41 -0700 Subject: [PATCH 084/191] remove unnecessary validate_for_numeric_binop (#27886) --- pandas/core/indexes/base.py | 48 +----------------------------------- pandas/core/indexes/range.py | 3 ++- 2 files changed, 3 insertions(+), 48 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 60c3a8b5269cf..7272d4e2752be 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime import operator from textwrap import dedent from typing import Union @@ -50,7 +50,6 @@ from pandas.core.dtypes.generic import ( ABCCategorical, ABCDataFrame, - ABCDateOffset, ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, @@ -5352,51 +5351,6 @@ def _validate_for_numeric_unaryop(self, op, opstr): "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__) ) - def _validate_for_numeric_binop(self, other, op): - """ - Return valid other; evaluate or raise TypeError if we are not of - the appropriate type. - - Notes - ----- - This is an internal method called by ops. - """ - opstr = "__{opname}__".format(opname=op.__name__) - # if we are an inheritor of numeric, - # but not actually numeric (e.g. 
DatetimeIndex/PeriodIndex)
-        if not self._is_numeric_dtype:
-            raise TypeError(
-                "cannot evaluate a numeric op {opstr} "
-                "for type: {typ}".format(opstr=opstr, typ=type(self).__name__)
-            )
-
-        if isinstance(other, Index):
-            if not other._is_numeric_dtype:
-                raise TypeError(
-                    "cannot evaluate a numeric op "
-                    "{opstr} with type: {typ}".format(opstr=opstr, typ=type(other))
-                )
-        elif isinstance(other, np.ndarray) and not other.ndim:
-            other = other.item()
-
-        if isinstance(other, (Index, ABCSeries, np.ndarray)):
-            if len(self) != len(other):
-                raise ValueError("cannot evaluate a numeric op with unequal lengths")
-            other = com.values_from_object(other)
-            if other.dtype.kind not in ["f", "i", "u"]:
-                raise TypeError("cannot evaluate a numeric op with a non-numeric dtype")
-        elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)):
-            # higher up to handle
-            pass
-        elif isinstance(other, (datetime, np.datetime64)):
-            # higher up to handle
-            pass
-        else:
-            if not (is_float(other) or is_integer(other)):
-                raise TypeError("can only perform ops with scalar values")
-
-        return other
-
     @classmethod
     def _add_numeric_methods_binary(cls):
         """
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index d2bea5f68b92d..cfdaf65955dab 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -25,6 +25,7 @@

 from pandas.core import ops
 import pandas.core.common as com
+from pandas.core.construction import extract_array
 import pandas.core.indexes.base as ibase
 from pandas.core.indexes.base import Index, _index_shared_docs
 from pandas.core.indexes.numeric import Int64Index
@@ -782,7 +783,7 @@ def _evaluate_numeric_binop(self, other):
                 # Must be an np.ndarray; GH#22390
                 return op(self._int64index, other)

-            other = self._validate_for_numeric_binop(other, op)
+            other = extract_array(other, extract_numpy=True)

             attrs = self._get_attributes_dict()
             attrs = self._maybe_update_attributes(attrs)

From d32d464c7ecc480734bd1dd00a06b29a8530cd5a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 13 Aug 2019 05:22:40 -0700
Subject: [PATCH 085/191] CLN: remove unnecessary dtype checks (#27889)

---
 pandas/core/arrays/categorical.py | 17 ++++----
 pandas/core/arrays/datetimelike.py | 45 ++++------------------
 pandas/core/arrays/datetimes.py | 2 -
 pandas/tests/arithmetic/test_datetime64.py | 8 +++-
 4 files changed, 25 insertions(+), 47 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 368f532ff7d25..bbbeb812d1fe9 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1,3 +1,4 @@
+import operator
 from shutil import get_terminal_size
 import textwrap
 from typing import Type, Union, cast
@@ -77,7 +78,9 @@
 )


-def _cat_compare_op(opname):
+def _cat_compare_op(op):
+    opname = "__{op}__".format(op=op.__name__)
+
     def f(self, other):
         # On python2, you can usually compare any type to any type, and
         # Categoricals can be seen as a custom type, but having different
@@ -1243,12 +1246,12 @@ def map(self, mapper):
             new_categories = new_categories.insert(len(new_categories), np.nan)
         return np.take(new_categories, self._codes)

-    __eq__ = _cat_compare_op("__eq__")
-    __ne__ = _cat_compare_op("__ne__")
-    __lt__ = _cat_compare_op("__lt__")
-    __gt__ = _cat_compare_op("__gt__")
-    __le__ = _cat_compare_op("__le__")
-    __ge__ = _cat_compare_op("__ge__")
+    __eq__ = _cat_compare_op(operator.eq)
+    __ne__ = _cat_compare_op(operator.ne)
+    __lt__ = _cat_compare_op(operator.lt)
+    __gt__ = _cat_compare_op(operator.gt)
+    __le__ = 
_cat_compare_op(operator.le) + __ge__ = _cat_compare_op(operator.ge) # for Series/ndarray like compat @property diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0372b8f0c080a..1988726edc79b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -22,7 +22,6 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, - is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, @@ -1230,29 +1229,17 @@ def __add__(self, other): if not is_period_dtype(self): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.add) - elif is_float_dtype(other): - # Explicitly catch invalid dtypes - raise TypeError( - "cannot add {dtype}-dtype to {cls}".format( - dtype=other.dtype, cls=type(self).__name__ - ) - ) - elif is_period_dtype(other): - # if self is a TimedeltaArray and other is a PeriodArray with - # a timedelta-like (i.e. Tick) freq, this operation is valid. - # Defer to the PeriodArray implementation. - # In remaining cases, this will end up raising TypeError. - return NotImplemented - elif is_extension_array_dtype(other): - # Categorical op will raise; defer explicitly - return NotImplemented - else: # pragma: no cover + else: + # Includes Categorical, other ExtensionArrays + # For PeriodDtype, if self is a TimedeltaArray and other is a + # PeriodArray with a timedelta-like (i.e. Tick) freq, this + # operation is valid. Defer to the PeriodArray implementation. + # In remaining cases, this will end up raising TypeError. return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray - # TODO: infer freq? return TimedeltaArray(result) return result @@ -1302,29 +1289,13 @@ def __sub__(self, other): if not is_period_dtype(self): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) - elif isinstance(other, ABCIndexClass): - raise TypeError( - "cannot subtract {cls} and {typ}".format( - cls=type(self).__name__, typ=type(other).__name__ - ) - ) - elif is_float_dtype(other): - # Explicitly catch invalid dtypes - raise TypeError( - "cannot subtract {dtype}-dtype from {cls}".format( - dtype=other.dtype, cls=type(self).__name__ - ) - ) - elif is_extension_array_dtype(other): - # Categorical op will raise; defer explicitly - return NotImplemented - else: # pragma: no cover + else: + # Includes ExtensionArrays, float_dtype return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray - # TODO: infer freq? 
return TimedeltaArray(result)
         return result
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 28537124536e7..1aad130d9a3f5 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -223,8 +223,6 @@ def wrapper(self, other):
                 result = op(self.view("i8"), other.view("i8"))
                 o_mask = other._isnan

-            result = com.values_from_object(result)
-
             if o_mask.any():
                 result[o_mask] = nat_result
diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
index 3920cfcc002d7..1fb93a182ba0b 100644
--- a/pandas/tests/arithmetic/test_datetime64.py
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -1097,7 +1097,13 @@ def test_dt64arr_add_timestamp_raises(self, box_with_array):
     def test_dt64arr_add_sub_float(self, other, box_with_array):
         dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D")
         dtarr = tm.box_expected(dti, box_with_array)
-        msg = "|".join(["unsupported operand type", "cannot (add|subtract)"])
+        msg = "|".join(
+            [
+                "unsupported operand type",
+                "cannot (add|subtract)",
+                "ufunc '?(add|subtract)'? cannot use operands with types",
+            ]
+        )
         with pytest.raises(TypeError, match=msg):
             dtarr + other
         with pytest.raises(TypeError, match=msg):

From b623a9df3cf3d69fbdddab0f02a03ad69281c8b0 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Tue, 13 Aug 2019 14:44:10 +0200
Subject: [PATCH 086/191] BUG: boxplot does not work when data has datetime column (#27846)

---
 doc/source/whatsnew/v1.0.0.rst | 1 +
 pandas/plotting/_matplotlib/core.py | 14 +++++++++++---
 pandas/tests/plotting/test_boxplot_method.py | 17 ++++++++++++++++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 974d14a4b424c..b35f230100f8d 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -165,6 +165,7 @@ Plotting
 - Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`)
 -
+- Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data.
These types are now automatically dropped (:issue:`22799`)

 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 50f0d16631a15..a262f89dcc79c 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -402,12 +402,20 @@ def _compute_plot_data(self):
             # GH16953, _convert is needed as fallback, for ``Series``
             # with ``dtype == object``
             data = data._convert(datetime=True, timedelta=True)
-        select_include_type = [np.number, "datetime", "datetimetz", "timedelta"]
+        include_type = [np.number, "datetime", "datetimetz", "timedelta"]

         # GH23719, allow plotting boolean
         if self.include_bool is True:
-            select_include_type.append(np.bool_)
-        numeric_data = data.select_dtypes(include=select_include_type)
+            include_type.append(np.bool_)
+
+        # GH22799, exclude datatime-like type for boxplot
+        exclude_type = None
+        if self._kind == "box":
+            # TODO: change after solving issue 27881
+            include_type = [np.number]
+            exclude_type = ["timedelta"]
+
+        numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type)

         try:
             is_empty = numeric_data.empty
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index cab0efe53f1fc..5bbaff580c356 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -9,7 +9,7 @@

 import pandas.util._test_decorators as td

-from pandas import DataFrame, MultiIndex, Series
+from pandas import DataFrame, MultiIndex, Series, date_range, timedelta_range
 from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
 import pandas.util.testing as tm

@@ -160,6 +160,21 @@ def test_fontsize(self):
             df.boxplot("a", fontsize=16), xlabelsize=16, ylabelsize=16
         )

+    def test_boxplot_numeric_data(self):
+        # GH 22799
+        df = DataFrame(
+            {
+                "a": date_range("2012-01-01", periods=100),
+                "b": np.random.randn(100),
+                "c": np.random.randn(100) + 2,
+                "d": date_range("2012-01-01", periods=100).astype(str),
+                "e": date_range("2012-01-01", periods=100, tz="UTC"),
+                "f": timedelta_range("1 days", periods=100),
+            }
+        )
+        ax = df.plot(kind="box")
+        assert [x.get_text() for x in ax.get_xticklabels()] == ["b", "c"]
+

 @td.skip_if_no_mpl
 class TestDataFrameGroupByPlots(TestPlotBase):

From 80a7a450b43ba85a60529f977b49cfa5f84d359c Mon Sep 17 00:00:00 2001
From: Sparkle Russell-Puleri
Date: Tue, 13 Aug 2019 07:12:44 -0700
Subject: [PATCH 087/191] DOC: Add CoC to the README (#27851)

* update README file

Co-Authored-By: Marc Garcia

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index aeeea1464e1fd..3cde98d3145f2 100644
--- a/README.md
+++ b/README.md
@@ -233,3 +233,5 @@ You can also triage issues which may include reproducing bug reports, or asking
 Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!

 Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
+
+As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct.
More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/pandas/blob/master/.github/CODE_OF_CONDUCT.md) From 5de4e55d60bf8487a2ce64a440b6d5d92345a4bc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 09:15:36 -0700 Subject: [PATCH 088/191] BUG: fix Sparse reduction (#27890) * BUG: fix Sparse reduction --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/core/arrays/sparse.py | 3 +++ pandas/tests/series/test_ufunc.py | 5 +---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index b97f45efbeae9..dfa216b1db56e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -133,7 +133,7 @@ Reshaping Sparse ^^^^^^ - +- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`) - - - diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 8aa83c3fbc37d..2234167fe0193 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1693,6 +1693,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): for sp_value, fv in zip(sp_values, fill_value) ) return arrays + elif is_scalar(sp_values): + # e.g. reductions + return sp_values return self._simple_new( sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index c024e9caba156..8144a3931b9b8 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -252,10 +252,7 @@ def __add__(self, other): "values", [ pd.array([1, 3, 2]), - pytest.param( - pd.array([1, 10, 0], dtype="Sparse[int]"), - marks=pytest.mark.xfail(resason="GH-27080. Bug in SparseArray"), - ), + pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), From 584b154cbf667ec4dd3482025718ea28b5827a46 Mon Sep 17 00:00:00 2001 From: Samesh Lakhotia <43701530+sameshl@users.noreply.github.com> Date: Thu, 15 Aug 2019 00:24:48 +0530 Subject: [PATCH 089/191] DOC:Use of "Yields" for documentation of DataFrame.iteritems() (#27876) --- pandas/core/frame.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6114c9efbfe28..20398069847b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -775,7 +775,8 @@ def style(self): Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. - %s + Yields + ------ label : object The column names for the DataFrame being iterated over. 
content : Series @@ -816,7 +817,7 @@ def style(self): Name: population, dtype: int64 """ - @Appender(_shared_docs["items"] % "Yields\n ------") + @Appender(_shared_docs["items"]) def items(self): if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: @@ -825,9 +826,9 @@ def items(self): for i, k in enumerate(self.columns): yield k, self._ixs(i, axis=1) - @Appender(_shared_docs["items"] % "Returns\n -------") + @Appender(_shared_docs["items"]) def iteritems(self): - return self.items() + yield from self.items() def iterrows(self): """ From 11b28ea584b3647bc8ca5b56889c9199c0af5ec5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Aug 2019 14:34:10 -0500 Subject: [PATCH 090/191] Revert 37 pins (#27907) * Revert "CI: Pin Python to 3.7.3 (#27868)" This reverts commit 2ebab98cf1f36a678256436e7a4f5149536436c9. --- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- environment.yml | 2 +- requirements-dev.txt | 1 - 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 437fbc24b9411..26dcd213bbfa0 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -17,7 +17,7 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.7.3 + - python=3.7.* - pytz - s3fs - scipy diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index e1acda7ecf3c7..65c92ec1dcf0d 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.7.3 + - python=3.7.* - pytz - Cython>=0.28.2 # universal diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index d7bed9e3f03f2..9e08c41a3d9c0 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -4,7 +4,7 @@ channels: - conda-forge - c3i_test dependencies: - - python=3.7.3 + - python=3.7.* - botocore>=1.11 - cython>=0.28.2 - numpy diff --git a/environment.yml b/environment.yml index 0cbd500209b6c..6d2cd701c3854 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: dependencies: # required - numpy>=1.15 - - python=3.7.3 + - python=3 - python-dateutil>=2.6.1 - pytz diff --git a/requirements-dev.txt b/requirements-dev.txt index f866f1436edc6..cf11a3ee28258 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,4 @@ numpy>=1.15 -python==3.7.3 python-dateutil>=2.6.1 pytz asv From fae56d08b5f79d99b2f33ea28a91ffad9bd6f335 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 14 Aug 2019 21:11:34 +0100 Subject: [PATCH 091/191] PERF: Break reference cycle for all Index types (#27840) * Generalize guards for index ref cycles * add issue number --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 8 +++++--- pandas/core/indexes/period.py | 5 ++++- pandas/tests/indexes/common.py | 9 +++++++++ pandas/tests/indexes/test_base.py | 8 -------- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index dfa216b1db56e..21f1fa7ddec1f 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -83,7 +83,7 @@ Indexing ^^^^^^^^ - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`) -- Break reference cycle involving :class:`Index` to allow garbage collection of :class:`Index` 
objects without running the GC. (:issue:`27585`) +- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`) - Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`). - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7272d4e2752be..d13e41eed7ad0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -665,7 +665,7 @@ def _cleanup(self): def _engine(self): # property, for now, slow to look up - # to avoid a refernce cycle, bind `_ndarray_values` to a local variable, so + # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so # `self` is not passed into the lambda. _ndarray_values = self._ndarray_values return self._engine_type(lambda: _ndarray_values, len(self)) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 8bfa7e8d20b4f..82806c7351db6 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -446,9 +446,11 @@ def argsort(self, *args, **kwargs): @cache_readonly def _engine(self): - - # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes, len(self)) + # we are going to look things up with the codes themselves. + # To avoid a reference cycle, bind `codes` to a local variable, so + # `self` is not passed into the lambda. + codes = self.codes + return self._engine_type(lambda: codes, len(self)) # introspection @cache_readonly diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b0cc386f7783d..5a2ca109597e8 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta import warnings +import weakref import numpy as np @@ -441,7 +442,9 @@ def _formatter_func(self): @cache_readonly def _engine(self): - return self._engine_type(lambda: self, len(self)) + # To avoid a reference cycle, pass a weakref of self to _engine_type. 
+ period = weakref.ref(self) + return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) def __contains__(self, key): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 9459069f0ea2d..0e74c87388682 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,3 +1,5 @@ +import gc + import numpy as np import pytest @@ -908,3 +910,10 @@ def test_is_unique(self): # multiple NA should not be unique index_na_dup = index_na.insert(0, np.nan) assert index_na_dup.is_unique is False + + def test_engine_reference_cycle(self): + # GH27585 + index = self.create_index() + nrefs_pre = len(gc.get_referrers(index)) + index._engine + assert len(gc.get_referrers(index)) == nrefs_pre diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index fe1eb96df1e97..d1ed79118d2fa 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1,6 +1,5 @@ from collections import defaultdict from datetime import datetime, timedelta -import gc from io import StringIO import math import operator @@ -2425,13 +2424,6 @@ def test_deprecated_contains(self): with tm.assert_produces_warning(FutureWarning): index.contains(1) - def test_engine_reference_cycle(self): - # https://github.com/pandas-dev/pandas/issues/27585 - index = pd.Index([1, 2, 3]) - nrefs_pre = len(gc.get_referrers(index)) - index._engine - assert len(gc.get_referrers(index)) == nrefs_pre - class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ From 46ff5ee38acbd1e7b724cf938b8aac3bf79164e7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2019 13:21:29 -0700 Subject: [PATCH 092/191] CLN: remove never-True checks (#27908) --- pandas/core/algorithms.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 21d12d02c9008..c0ed198e200f1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -28,13 +28,11 @@ is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -183,8 +181,6 @@ def _reconstruct_data(values, dtype, original): if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) - elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): - values = Index(original)._shallow_copy(values, name=None) elif is_bool_dtype(dtype): values = values.astype(dtype) @@ -1645,19 +1641,13 @@ def take_nd( May be the same type as the input, or cast to an ndarray. 
""" - # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs - # dispatch to internal type takes if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_datetime64tz_dtype(arr): - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_interval_dtype(arr): - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if is_sparse(arr): arr = arr.to_dense() elif isinstance(arr, (ABCIndexClass, ABCSeries)): - arr = arr.values + arr = arr._values arr = np.asarray(arr) From 5c81d8ab661832d3237938ba228f17d506813d41 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2019 13:39:14 -0700 Subject: [PATCH 093/191] REF: implement should_extension_dispatch (#27815) --- pandas/core/ops/__init__.py | 103 +++++++++--------- .../arrays/categorical/test_operators.py | 8 +- pandas/tests/extension/test_categorical.py | 2 +- 3 files changed, 57 insertions(+), 56 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 7229b3de4e9f0..843f12c20b07b 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -47,7 +47,7 @@ import pandas as pd from pandas._typing import ArrayLike -from pandas.core.construction import extract_array +from pandas.core.construction import array, extract_array from pandas.core.ops import missing from pandas.core.ops.docstrings import ( _arith_doc_FRAME, @@ -460,6 +460,33 @@ def masked_arith_op(x, y, op): # Dispatch logic +def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: + """ + Identify cases where Series operation should use dispatch_to_extension_op. + + Parameters + ---------- + left : Series + right : object + + Returns + ------- + bool + """ + if ( + is_extension_array_dtype(left.dtype) + or is_datetime64_dtype(left.dtype) + or is_timedelta64_dtype(left.dtype) + ): + return True + + if is_extension_array_dtype(right) and not is_scalar(right): + # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + return True + + return False + + def should_series_dispatch(left, right, op): """ Identify cases where a DataFrame operation should dispatch to its @@ -564,19 +591,18 @@ def dispatch_to_extension_op(op, left, right): apply the operator defined by op. """ + if left.dtype.kind in "mM": + # We need to cast datetime64 and timedelta64 ndarrays to + # DatetimeArray/TimedeltaArray. But we avoid wrapping others in + # PandasArray as that behaves poorly with e.g. IntegerArray. 
+ left = array(left) + # The op calls will raise TypeError if the op is not defined # on the ExtensionArray # unbox Series and Index to arrays - if isinstance(left, (ABCSeries, ABCIndexClass)): - new_left = left._values - else: - new_left = left - - if isinstance(right, (ABCSeries, ABCIndexClass)): - new_right = right._values - else: - new_right = right + new_left = extract_array(left, extract_numpy=True) + new_right = extract_array(right, extract_numpy=True) try: res_values = op(new_left, new_right) @@ -684,56 +710,27 @@ def wrapper(left, right): res_name = get_op_result_name(left, right) right = maybe_upcast_for_op(right, left.shape) - if is_categorical_dtype(left): - raise TypeError( - "{typ} cannot perform the operation " - "{op}".format(typ=type(left).__name__, op=str_rep) - ) - - elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left): - from pandas.core.arrays import DatetimeArray - - result = dispatch_to_extension_op(op, DatetimeArray(left), right) - return construct_result(left, result, index=left.index, name=res_name) - - elif is_extension_array_dtype(left) or ( - is_extension_array_dtype(right) and not is_scalar(right) - ): - # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + if should_extension_dispatch(left, right): result = dispatch_to_extension_op(op, left, right) - return construct_result(left, result, index=left.index, name=res_name) - elif is_timedelta64_dtype(left): - from pandas.core.arrays import TimedeltaArray - - result = dispatch_to_extension_op(op, TimedeltaArray(left), right) - return construct_result(left, result, index=left.index, name=res_name) - - elif is_timedelta64_dtype(right): - # We should only get here with non-scalar values for right - # upcast by maybe_upcast_for_op + elif is_timedelta64_dtype(right) or isinstance( + right, (ABCDatetimeArray, ABCDatetimeIndex) + ): + # We should only get here with td64 right with non-scalar values + # for right upcast by maybe_upcast_for_op assert not isinstance(right, (np.timedelta64, np.ndarray)) - result = op(left._values, right) - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - return construct_result(left, result, index=left.index, name=res_name) - - elif isinstance(right, (ABCDatetimeArray, ABCDatetimeIndex)): - result = op(left._values, right) - return construct_result(left, result, index=left.index, name=res_name) + else: + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) - lvalues = left.values - rvalues = right - if isinstance(rvalues, (ABCSeries, ABCIndexClass)): - rvalues = rvalues._values + with np.errstate(all="ignore"): + result = na_op(lvalues, rvalues) - with np.errstate(all="ignore"): - result = na_op(lvalues, rvalues) - return construct_result( - left, result, index=left.index, name=res_name, dtype=None - ) + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. 
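+        # Editorial illustration (assumed constructor behavior): an
+        # object-dtype ``result`` holding, say, Timestamp objects is
+        # inferred back to datetime64[ns] by the Series constructor,
+        # which passing an explicit ``dtype=`` would prevent.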
+ return construct_result(left, result, index=left.index, name=res_name) wrapper.__name__ = op_name return wrapper diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 9a09ea8422b1f..22c1d5373372a 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -349,7 +349,9 @@ def test_numeric_like_ops(self): ("__mul__", r"\*"), ("__truediv__", "/"), ]: - msg = r"Series cannot perform the operation {}".format(str_rep) + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) with pytest.raises(TypeError, match=msg): getattr(df, op)(df) @@ -375,7 +377,9 @@ def test_numeric_like_ops(self): ("__mul__", r"\*"), ("__truediv__", "/"), ]: - msg = r"Series cannot perform the operation {}".format(str_rep) + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) with pytest.raises(TypeError, match=msg): getattr(s, op)(2) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index f7456d24ad6d3..0c0e8b0123c03 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -211,7 +211,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_add_series_with_extension_array(self, data): ser = pd.Series(data) - with pytest.raises(TypeError, match="cannot perform"): + with pytest.raises(TypeError, match="cannot perform|unsupported operand"): ser + data def test_divmod_series_array(self): From 603dbdc1b54ad5c746b74cd82ba610814077ce70 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2019 13:39:53 -0700 Subject: [PATCH 094/191] CLN: remove unnecessary validate_for_numeric_unary (#27891) --- pandas/core/indexes/base.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d13e41eed7ad0..4e098b2f8be9b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5341,16 +5341,6 @@ def _maybe_update_attributes(self, attrs): """ return attrs - def _validate_for_numeric_unaryop(self, op, opstr): - """ - Validate if we can perform a numeric unary operation. - """ - if not self._is_numeric_dtype: - raise TypeError( - "cannot evaluate a numeric op " - "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__) - ) - @classmethod def _add_numeric_methods_binary(cls): """ @@ -5383,7 +5373,6 @@ def _add_numeric_methods_unary(cls): def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): - self._validate_for_numeric_unaryop(op, opstr) attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) return Index(op(self.values), **attrs) From 6813d7796e759435e915f3dda84ad9db81ebbadb Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 14 Aug 2019 16:50:45 -0600 Subject: [PATCH 095/191] DOC: Fix section reference placement in whatsnew (#27919) --- doc/source/whatsnew/v1.0.0.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b35f230100f8d..aeed3668fe774 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,27 +21,27 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_1000.enhancements.other: - - - +.. _whatsnew_1000.enhancements.other: + Other enhancements ^^^^^^^^^^^^^^^^^^ -.. 
_whatsnew_1000.api_breaking: - - - +.. _whatsnew_1000.api_breaking: + Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_1000.api.other: - - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - +.. _whatsnew_1000.api.other: + Other API changes ^^^^^^^^^^^^^^^^^ From a656d24ff32d5113cf8db48dd3a1cdfe6e27acf0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2019 05:39:56 -0700 Subject: [PATCH 096/191] TST: parametrize arithmetic tests (#27847) --- pandas/tests/arithmetic/test_datetime64.py | 265 +++++++----------- pandas/tests/arithmetic/test_period.py | 26 +- pandas/tests/arithmetic/test_timedelta64.py | 73 ++--- .../tests/scalar/timestamp/test_timestamp.py | 20 ++ 4 files changed, 154 insertions(+), 230 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 1fb93a182ba0b..5931cd93cc8c5 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -30,6 +30,54 @@ import pandas.util.testing as tm +def assert_invalid_comparison(left, right, box): + """ + Assert that comparison operations with mismatched types behave correctly. + + Parameters + ---------- + left : np.ndarray, ExtensionArray, Index, or Series + right : object + box : {pd.DataFrame, pd.Series, pd.Index, tm.to_array} + """ + # Not for tznaive-tzaware comparison + + # Note: not quite the same as how we do this for tm.box_expected + xbox = box if box is not pd.Index else np.array + + result = left == right + expected = xbox(np.zeros(result.shape, dtype=np.bool_)) + + tm.assert_equal(result, expected) + + result = right == left + tm.assert_equal(result, expected) + + result = left != right + tm.assert_equal(result, ~expected) + + result = right != left + tm.assert_equal(result, ~expected) + + msg = "Invalid comparison between" + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + right < left + with pytest.raises(TypeError, match=msg): + right <= left + with pytest.raises(TypeError, match=msg): + right > left + with pytest.raises(TypeError, match=msg): + right >= left + + def assert_all(obj): """ Test helper to call call obj.all() the appropriate number of times on @@ -47,7 +95,7 @@ def assert_all(obj): class TestDatetime64ArrayLikeComparisons: # Comparison tests for datetime64 vectors fully parametrized over - # DataFrame/Series/DatetimeIndex/DateteimeArray. Ideally all comparison + # DataFrame/Series/DatetimeIndex/DatetimeArray. Ideally all comparison # tests will eventually end up here. def test_compare_zerodim(self, tz_naive_fixture, box_with_array): @@ -59,36 +107,61 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): other = np.array(dti.to_numpy()[0]) - # FIXME: ValueError with transpose on tzaware - dtarr = tm.box_expected(dti, box, transpose=False) + dtarr = tm.box_expected(dti, box) result = dtarr <= other expected = np.array([True, False, False]) - expected = tm.box_expected(expected, xbox, transpose=False) + expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) + def test_dt64arr_cmp_date_invalid(self, tz_naive_fixture, box_with_array): + # GH#19800, GH#19301 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. 
This also matches the behavior + # of stdlib datetime.datetime + tz = tz_naive_fixture -class TestDatetime64DataFrameComparison: - @pytest.mark.parametrize( - "timestamps", - [ - [pd.Timestamp("2012-01-01 13:00:00+00:00")] * 2, - [pd.Timestamp("2012-01-01 13:00:00")] * 2, - ], - ) - def test_tz_aware_scalar_comparison(self, timestamps): - # GH#15966 - df = pd.DataFrame({"test": timestamps}) - expected = pd.DataFrame({"test": [False, False]}) - tm.assert_frame_equal(df == -1, expected) + dti = pd.date_range("20010101", periods=10, tz=tz) + date = dti[0].to_pydatetime().date() + + dtarr = tm.box_expected(dti, box_with_array) + assert_invalid_comparison(dtarr, date, box_with_array) - def test_dt64_nat_comparison(self): + @pytest.mark.parametrize("other", ["foo", -1, 99, 4.0, object(), timedelta(days=2)]) + def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): + # GH#22074, GH#15966 + tz = tz_naive_fixture + + rng = date_range("1/1/2000", periods=10, tz=tz) + dtarr = tm.box_expected(rng, box_with_array) + assert_invalid_comparison(dtarr, other, box_with_array) + + @pytest.mark.parametrize("other", [None, np.nan]) + def test_dt64arr_cmp_na_scalar_invalid( + self, other, tz_naive_fixture, box_with_array + ): + # GH#19301 + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + assert_invalid_comparison(dtarr, other, box_with_array) + + def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly - ts = pd.Timestamp.now() - df = pd.DataFrame([ts, pd.NaT]) - expected = pd.DataFrame([True, False]) + tz = tz_naive_fixture + box = box_with_array + xbox = box if box is not pd.Index else np.ndarray + + ts = pd.Timestamp.now(tz) + ser = pd.Series([ts, pd.NaT]) + + # FIXME: Can't transpose because that loses the tz dtype on + # the NaT column + obj = tm.box_expected(ser, box, transpose=False) - result = df == ts - tm.assert_frame_equal(result, expected) + expected = pd.Series([True, False], dtype=np.bool_) + expected = tm.box_expected(expected, xbox, transpose=False) + + result = obj == ts + tm.assert_equal(result, expected) class TestDatetime64SeriesComparison: @@ -142,35 +215,17 @@ def test_nat_comparisons(self, dtype, box, reverse, pair): expected = Series([False, False, True]) tm.assert_series_equal(left <= right, expected) - def test_comparison_invalid(self, box_with_array): + def test_comparison_invalid(self, tz_naive_fixture, box_with_array): # GH#4968 # invalid date/int comparisons - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - + tz = tz_naive_fixture ser = Series(range(5)) - ser2 = Series(pd.date_range("20010101", periods=5)) + ser2 = Series(pd.date_range("20010101", periods=5, tz=tz)) ser = tm.box_expected(ser, box_with_array) ser2 = tm.box_expected(ser2, box_with_array) - for (x, y) in [(ser, ser2), (ser2, ser)]: - - result = x == y - expected = tm.box_expected([False] * 5, xbox) - tm.assert_equal(result, expected) - - result = x != y - expected = tm.box_expected([True] * 5, xbox) - tm.assert_equal(result, expected) - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - x >= y - with pytest.raises(TypeError, match=msg): - x > y - with pytest.raises(TypeError, match=msg): - x < y - with pytest.raises(TypeError, match=msg): - x <= y + assert_invalid_comparison(ser, ser2, box_with_array) @pytest.mark.parametrize( "data", @@ -227,26 +282,6 @@ def 
test_series_comparison_scalars(self): expected = Series([x > val for x in series]) tm.assert_series_equal(result, expected) - def test_dt64ser_cmp_date_invalid(self, box_with_array): - # GH#19800 datetime.date comparison raises to - # match DatetimeIndex/Timestamp. This also matches the behavior - # of stdlib datetime.datetime - - ser = pd.date_range("20010101", periods=10) - date = ser[0].to_pydatetime().date() - - ser = tm.box_expected(ser, box_with_array) - assert_all(~(ser == date)) - assert_all(ser != date) - with pytest.raises(TypeError): - ser > date - with pytest.raises(TypeError): - ser < date - with pytest.raises(TypeError): - ser >= date - with pytest.raises(TypeError): - ser <= date - @pytest.mark.parametrize( "left,right", [("lt", "gt"), ("le", "ge"), ("eq", "eq"), ("ne", "ne")] ) @@ -388,57 +423,6 @@ def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - def dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array): - # GH#19301 by convention datetime.date is not considered comparable - # to Timestamp or DatetimeIndex. This may change in the future. - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - - other = datetime(2016, 1, 1).date() - assert not (dtarr == other).any() - assert (dtarr != other).all() - with pytest.raises(TypeError): - dtarr < other - with pytest.raises(TypeError): - dtarr <= other - with pytest.raises(TypeError): - dtarr > other - with pytest.raises(TypeError): - dtarr >= other - - @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) - def test_dti_eq_null_scalar(self, other, tz_naive_fixture): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - assert not (dti == other).any() - - @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) - def test_dti_ne_null_scalar(self, other, tz_naive_fixture): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - assert (dti != other).all() - - @pytest.mark.parametrize("other", [None, np.nan]) - def test_dti_cmp_null_scalar_inequality( - self, tz_naive_fixture, other, box_with_array - ): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - dtarr < other - with pytest.raises(TypeError, match=msg): - dtarr <= other - with pytest.raises(TypeError, match=msg): - dtarr > other - with pytest.raises(TypeError, match=msg): - dtarr >= other - @pytest.mark.parametrize("dtype", [None, object]) def test_dti_cmp_nat(self, dtype, box_with_array): if box_with_array is tm.to_array and dtype is object: @@ -728,34 +712,6 @@ def test_dti_cmp_str(self, tz_naive_fixture): expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("other", ["foo", 99, 4.0, object(), timedelta(days=2)]) - def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): - # GH#22074 - tz = tz_naive_fixture - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - - rng = date_range("1/1/2000", periods=10, tz=tz) - rng = tm.box_expected(rng, box_with_array) - - result = rng == other - expected = np.array([False] * 10) - expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) - - result = rng != other - expected = np.array([True] * 10) - 
expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - rng < other - with pytest.raises(TypeError, match=msg): - rng <= other - with pytest.raises(TypeError, match=msg): - rng > other - with pytest.raises(TypeError, match=msg): - rng >= other - def test_dti_cmp_list(self): rng = date_range("1/1/2000", periods=10) @@ -2576,24 +2532,3 @@ def test_shift_months(years, months): raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti] expected = DatetimeIndex(raw) tm.assert_index_equal(actual, expected) - - -# FIXME: this belongs in scalar tests -class SubDatetime(datetime): - pass - - -@pytest.mark.parametrize( - "lh,rh", - [ - (SubDatetime(2000, 1, 1), Timedelta(hours=1)), - (Timedelta(hours=1), SubDatetime(2000, 1, 1)), - ], -) -def test_dt_subclass_add_timedelta(lh, rh): - # GH 25851 - # ensure that subclassed datetime works for - # Timedelta operations - result = lh + rh - expected = SubDatetime(2000, 1, 1, 1) - assert result == expected diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 4b58c290c3cea..ed693d873efb8 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -573,12 +573,19 @@ def test_parr_add_sub_float_raises(self, op, other, box_with_array): @pytest.mark.parametrize( "other", [ + # datetime scalars pd.Timestamp.now(), pd.Timestamp.now().to_pydatetime(), pd.Timestamp.now().to_datetime64(), + # datetime-like arrays + pd.date_range("2016-01-01", periods=3, freq="H"), + pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), + pd.date_range("2016-01-01", periods=3, freq="S")._data, + pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, + # Miscellaneous invalid types ], ) - def test_parr_add_sub_datetime_scalar(self, other, box_with_array): + def test_parr_add_sub_invalid(self, other, box_with_array): # GH#23215 rng = pd.period_range("1/1/2000", freq="D", periods=3) rng = tm.box_expected(rng, box_with_array) @@ -595,23 +602,6 @@ def test_parr_add_sub_datetime_scalar(self, other, box_with_array): # ----------------------------------------------------------------- # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] - def test_parr_add_sub_dt64_array_raises(self, box_with_array): - rng = pd.period_range("1/1/2000", freq="D", periods=3) - dti = pd.date_range("2016-01-01", periods=3) - dtarr = dti.values - - rng = tm.box_expected(rng, box_with_array) - - with pytest.raises(TypeError): - rng + dtarr - with pytest.raises(TypeError): - dtarr + rng - - with pytest.raises(TypeError): - rng - dtarr - with pytest.raises(TypeError): - dtarr - rng - def test_pi_add_sub_td64_array_non_tick_raises(self): rng = pd.period_range("1/1/2000", freq="Q", periods=3) tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 4f5e00bc5a37d..6af4ea18e63ca 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -18,6 +18,7 @@ Timestamp, timedelta_range, ) +from pandas.tests.arithmetic.test_datetime64 import assert_invalid_comparison import pandas.util.testing as tm @@ -61,42 +62,33 @@ def test_compare_timedelta64_zerodim(self, box_with_array): # zero-dim of wrong dtype should still raise tdi >= np.array(4) - -class TestTimedelta64ArrayComparisons: - # TODO: All of these need to be parametrized over box - 
- def test_compare_timedelta_series(self): + @pytest.mark.parametrize( + "td_scalar", + [timedelta(days=1), Timedelta(days=1), Timedelta(days=1).to_timedelta64()], + ) + def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): # regression test for GH#5963 - s = pd.Series([timedelta(days=1), timedelta(days=2)]) - actual = s > timedelta(days=1) + box = box_with_array + xbox = box if box is not pd.Index else np.ndarray + ser = pd.Series([timedelta(days=1), timedelta(days=2)]) + ser = tm.box_expected(ser, box) + actual = ser > td_scalar expected = pd.Series([False, True]) - tm.assert_series_equal(actual, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(actual, expected) - def test_tdi_cmp_str_invalid(self, box_with_array): - # GH#13624 - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - tdi = TimedeltaIndex(["1 day", "2 days"]) - tdarr = tm.box_expected(tdi, box_with_array) + @pytest.mark.parametrize("invalid", [345600000000000, "a"]) + def test_td64_comparisons_invalid(self, box_with_array, invalid): + # GH#13624 for str + box = box_with_array + rng = timedelta_range("1 days", periods=10) + obj = tm.box_expected(rng, box) - for left, right in [(tdarr, "a"), ("a", tdarr)]: - with pytest.raises(TypeError): - left > right - with pytest.raises(TypeError): - left >= right - with pytest.raises(TypeError): - left < right - with pytest.raises(TypeError): - left <= right - - result = left == right - expected = np.array([False, False], dtype=bool) - expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) + assert_invalid_comparison(obj, invalid, box) - result = left != right - expected = np.array([True, True], dtype=bool) - expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) + +class TestTimedelta64ArrayComparisons: + # TODO: All of these need to be parametrized over box @pytest.mark.parametrize("dtype", [None, object]) def test_comp_nat(self, dtype): @@ -191,10 +183,6 @@ def test_comparisons_coverage(self): expected = np.array([True, True, True] + [False] * 7) tm.assert_numpy_array_equal(result, expected) - # raise TypeError for now - with pytest.raises(TypeError): - rng < rng[3].value - result = rng == list(rng) exp = rng == rng tm.assert_numpy_array_equal(result, exp) @@ -835,19 +823,10 @@ def test_timedelta64_ops_nat(self): # ------------------------------------------------------------- # Invalid Operations - def test_td64arr_add_str_invalid(self, box_with_array): - # GH#13624 + @pytest.mark.parametrize("other", ["a", 3.14, np.array([2.0, 3.0])]) + def test_td64arr_add_sub_invalid(self, box_with_array, other): + # GH#13624 for str tdi = TimedeltaIndex(["1 day", "2 days"]) - tdi = tm.box_expected(tdi, box_with_array) - - with pytest.raises(TypeError): - tdi + "a" - with pytest.raises(TypeError): - "a" + tdi - - @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) - def test_td64arr_add_sub_float(self, box_with_array, other): - tdi = TimedeltaIndex(["-1 days", "-1 days"]) tdarr = tm.box_expected(tdi, box_with_array) with pytest.raises(TypeError): diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 401fc285424fe..652dd34ca7ce2 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -1047,3 +1047,23 @@ def test_to_numpy_alias(self): # GH 24653: alias .to_numpy() for scalars ts = Timestamp(datetime.now()) assert ts.to_datetime64() == ts.to_numpy() + + 
+class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "lh,rh", + [ + (SubDatetime(2000, 1, 1), Timedelta(hours=1)), + (Timedelta(hours=1), SubDatetime(2000, 1, 1)), + ], +) +def test_dt_subclass_add_timedelta(lh, rh): + # GH#25851 + # ensure that subclassed datetime works for + # Timedelta operations + result = lh + rh + expected = SubDatetime(2000, 1, 1, 1) + assert result == expected From c1f7b39c09bb6b7e8ce5024bd071247a6076b12d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2019 05:41:32 -0700 Subject: [PATCH 097/191] BUG: fix+test op(NaT, ndarray), also simplify (#27807) --- pandas/_libs/tslibs/nattype.pyx | 104 ++++++++++++++------ pandas/core/indexes/datetimes.py | 6 +- pandas/tests/arithmetic/test_timedelta64.py | 2 +- pandas/tests/scalar/period/test_period.py | 46 ++------- pandas/tests/scalar/test_nat.py | 47 ++++++++- 5 files changed, 131 insertions(+), 74 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 6fab1b5c02be1..020d1acf0b4ce 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -92,6 +92,9 @@ cdef class _NaT(datetime): # int64_t value # object freq + # higher than np.ndarray and np.matrix + __array_priority__ = 100 + def __hash__(_NaT self): # py3k needs this defined here return hash(self.value) @@ -103,61 +106,102 @@ cdef class _NaT(datetime): if ndim == -1: return _nat_scalar_rules[op] - if ndim == 0: + elif util.is_array(other): + result = np.empty(other.shape, dtype=np.bool_) + result.fill(_nat_scalar_rules[op]) + return result + + elif ndim == 0: if is_datetime64_object(other): return _nat_scalar_rules[op] else: raise TypeError('Cannot compare type %r with type %r' % (type(self).__name__, type(other).__name__)) + # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` is invariant under `_reverse_ops`, # rendering it unnecessary. 
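        # Editorial illustration: the rules table is symmetric under
        # operand swapping, e.g. ``NaT == x`` and ``x == NaT`` are both
        # False, ``NaT != x`` and ``x != NaT`` are both True, and the
        # ordering comparisons are False in either direction.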
return PyObject_RichCompare(other, self, op) def __add__(self, other): + if self is not c_NaT: + # cython __radd__ semantics + self, other = other, self + if PyDateTime_Check(other): return c_NaT - + elif PyDelta_Check(other): + return c_NaT + elif is_datetime64_object(other) or is_timedelta64_object(other): + return c_NaT elif hasattr(other, 'delta'): # Timedelta, offsets.Tick, offsets.Week return c_NaT - elif getattr(other, '_typ', None) in ['dateoffset', 'series', - 'period', 'datetimeindex', - 'datetimearray', - 'timedeltaindex', - 'timedeltaarray']: - # Duplicate logic in _Timestamp.__add__ to avoid needing - # to subclass; allows us to @final(_Timestamp.__add__) - return NotImplemented - return c_NaT + + elif is_integer_object(other) or util.is_period_object(other): + # For Period compat + # TODO: the integer behavior is deprecated, remove it + return c_NaT + + elif util.is_array(other): + if other.dtype.kind in 'mM': + # If we are adding to datetime64, we treat NaT as timedelta + # Either way, result dtype is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") + result.fill("NaT") + return result + + return NotImplemented def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing # to subclass; allows us to @final(_Timestamp.__sub__) + cdef: + bint is_rsub = False + + if self is not c_NaT: + # cython __rsub__ semantics + self, other = other, self + is_rsub = True + if PyDateTime_Check(other): - return NaT + return c_NaT elif PyDelta_Check(other): - return NaT + return c_NaT + elif is_datetime64_object(other) or is_timedelta64_object(other): + return c_NaT + elif hasattr(other, 'delta'): + # offsets.Tick, offsets.Week + return c_NaT - elif getattr(other, '_typ', None) == 'datetimeindex': - # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - return -other.__sub__(self) + elif is_integer_object(other) or util.is_period_object(other): + # For Period compat + # TODO: the integer behavior is deprecated, remove it + return c_NaT - elif getattr(other, '_typ', None) == 'timedeltaindex': - # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - return (-other).__add__(self) + elif util.is_array(other): + if other.dtype.kind == 'm': + if not is_rsub: + # NaT - timedelta64 we treat NaT as datetime64, so result + # is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") + result.fill("NaT") + return result + + # timedelta64 - NaT we have to treat NaT as timedelta64 + # for this to be meaningful, and the result is timedelta64 + result = np.empty(other.shape, dtype="timedelta64[ns]") + result.fill("NaT") + return result + + elif other.dtype.kind == 'M': + # We treat NaT as a datetime, so regardless of whether this is + # NaT - other or other - NaT, the result is timedelta64 + result = np.empty(other.shape, dtype="timedelta64[ns]") + result.fill("NaT") + return result - elif hasattr(other, 'delta'): - # offsets.Tick, offsets.Week - neg_other = -other - return self + neg_other - - elif getattr(other, '_typ', None) in ['period', 'series', - 'periodindex', 'dateoffset', - 'datetimearray', - 'timedeltaarray']: - return NotImplemented - return NaT + return NotImplemented def __pos__(self): return NaT diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9f2b31f23d2fa..08f2aa68eca9e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -4,7 +4,7 @@ import numpy as np -from pandas._libs import Timestamp, index as libindex, lib, tslib as libts +from 
pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -1281,7 +1281,9 @@ def insert(self, loc, item): raise ValueError("Passed item and index have different timezone") # check freq can be preserved on edge cases if self.size and self.freq is not None: - if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 6af4ea18e63ca..33a5d45df3885 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1610,7 +1610,7 @@ def test_td64arr_div_nat_invalid(self, box_with_array): rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - with pytest.raises(TypeError, match="'?true_divide'? cannot use operands"): + with pytest.raises(TypeError, match="unsupported operand type"): rng / pd.NaT with pytest.raises(TypeError, match="Cannot divide NaTType by"): pd.NaT / rng diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index b57b817461788..6da4d556ea07e 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1298,23 +1298,13 @@ def test_add_offset_nat(self): timedelta(365), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for freq in ["M", "2M", "3M"]: p = Period("NaT", freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for o in [ offsets.YearBegin(2), @@ -1324,12 +1314,7 @@ def test_add_offset_nat(self): timedelta(365), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT # freq is Tick for freq in ["D", "2D", "3D"]: @@ -1343,12 +1328,7 @@ def test_add_offset_nat(self): timedelta(hours=48), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for o in [ offsets.YearBegin(2), @@ -1358,12 +1338,7 @@ def test_add_offset_nat(self): timedelta(hours=23), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for freq in ["H", "2H", "3H"]: p = Period("NaT", freq=freq) @@ -1376,9 +1351,7 @@ def test_add_offset_nat(self): timedelta(days=4, minutes=180), ]: assert p + o is NaT - - if not isinstance(o, np.timedelta64): - assert o + p is NaT + assert o + p is NaT for o in [ offsets.YearBegin(2), @@ -1388,12 +1361,7 @@ def test_add_offset_nat(self): timedelta(hours=23, minutes=30), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT def test_sub_offset(self): # freq is DateOffset diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 
e7ad76cf95ba0..5b1c4f92bf341 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +import operator import numpy as np import pytest @@ -21,6 +22,7 @@ isna, ) from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray +from pandas.core.ops import roperator from pandas.util import testing as tm @@ -333,8 +335,9 @@ def test_nat_doc_strings(compare): "value,val_type", [ (2, "scalar"), - (1.5, "scalar"), - (np.nan, "scalar"), + (1.5, "floating"), + (np.nan, "floating"), + ("foo", "str"), (timedelta(3600), "timedelta"), (Timedelta("5s"), "timedelta"), (datetime(2014, 1, 1), "timestamp"), @@ -348,6 +351,14 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): # see gh-6873 invalid_ops = { "scalar": {"right_div_left"}, + "floating": { + "right_div_left", + "left_minus_right", + "right_minus_left", + "left_plus_right", + "right_plus_left", + }, + "str": set(_ops.keys()), "timedelta": {"left_times_right", "right_times_left"}, "timestamp": { "left_times_right", @@ -366,6 +377,16 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): and isinstance(value, Timedelta) ): msg = "Cannot multiply" + elif val_type == "str": + # un-specific check here because the message comes from str + # and varies by method + msg = ( + "can only concatenate str|" + "unsupported operand type|" + "can't multiply sequence|" + "Can't convert 'NaTType'|" + "must be str, not NaTType" + ) else: msg = "unsupported operand type" @@ -435,6 +456,28 @@ def test_nat_arithmetic_td64_vector(op_name, box): tm.assert_equal(_ops[op_name](vec, NaT), box_nat) +@pytest.mark.parametrize( + "dtype,op,out_dtype", + [ + ("datetime64[ns]", operator.add, "datetime64[ns]"), + ("datetime64[ns]", roperator.radd, "datetime64[ns]"), + ("datetime64[ns]", operator.sub, "timedelta64[ns]"), + ("datetime64[ns]", roperator.rsub, "timedelta64[ns]"), + ("timedelta64[ns]", operator.add, "datetime64[ns]"), + ("timedelta64[ns]", roperator.radd, "datetime64[ns]"), + ("timedelta64[ns]", operator.sub, "datetime64[ns]"), + ("timedelta64[ns]", roperator.rsub, "timedelta64[ns]"), + ], +) +def test_nat_arithmetic_ndarray(dtype, op, out_dtype): + other = np.arange(10).astype(dtype) + result = op(NaT, other) + + expected = np.empty(other.shape, dtype=out_dtype) + expected.fill("NaT") + tm.assert_numpy_array_equal(result, expected) + + def test_nat_pinned_docstrings(): # see gh-17327 assert NaT.ctime.__doc__ == datetime.ctime.__doc__ From 48dd753de9caa9c7fea8ecae39bf2d5f38244b22 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2019 05:42:37 -0700 Subject: [PATCH 098/191] CLN: remove _maybe_update_attributes (#27896) --- pandas/core/arrays/timedeltas.py | 4 +++- pandas/core/indexes/base.py | 8 -------- pandas/core/indexes/datetimelike.py | 15 +++++++++++++++ pandas/core/indexes/datetimes.py | 8 -------- pandas/core/indexes/range.py | 3 +-- pandas/core/indexes/timedeltas.py | 18 +++++++----------- 6 files changed, 26 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6899e47045c1c..3609c68a26c0f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -776,12 +776,14 @@ def __rdivmod__(self, other): res2 = other - res1 * self return res1, res2 - # Note: TimedeltaIndex overrides this in call to cls._add_numeric_methods def __neg__(self): if self.freq is not None: return type(self)(-self._data, freq=-self.freq) return type(self)(-self._data) + def 
__pos__(self): + return type(self)(self._data, freq=self.freq) + def __abs__(self): # Note: freq is not preserved return type(self)(np.abs(self._data)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4e098b2f8be9b..5f17dde01d2c4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -695,7 +695,6 @@ def __array_wrap__(self, result, context=None): return result attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) return Index(result, **attrs) @cache_readonly @@ -5335,12 +5334,6 @@ def _add_numeric_methods_disabled(cls): cls.__abs__ = make_invalid_op("__abs__") cls.__inv__ = make_invalid_op("__inv__") - def _maybe_update_attributes(self, attrs): - """ - Update Index attributes (e.g. freq) depending on op. - """ - return attrs - @classmethod def _add_numeric_methods_binary(cls): """ @@ -5374,7 +5367,6 @@ def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) return Index(op(self.values), **attrs) _evaluate_numeric_unary.__name__ = opstr diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index af99c7a2754e5..c7664d9777c71 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -15,6 +15,7 @@ from pandas.core.dtypes.common import ( ensure_int64, + is_bool_dtype, is_dtype_equal, is_float, is_integer, @@ -163,6 +164,20 @@ def values(self): def asi8(self): return self._data.asi8 + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc. + """ + result = lib.item_from_zerodim(result) + if is_bool_dtype(result) or lib.is_scalar(result): + return result + + attrs = self._get_attributes_dict() + if not is_period_dtype(self) and attrs["freq"]: + # no need to infer if freq is None + attrs["freq"] = "infer" + return Index(result, **attrs) + # ------------------------------------------------------------------------ def equals(self, other): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 08f2aa68eca9e..51daad3b42649 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -465,14 +465,6 @@ def _convert_for_op(self, value): return _to_M8(value) raise ValueError("Passed item and index have different timezone") - def _maybe_update_attributes(self, attrs): - """ Update Index attributes (e.g. 
freq) depending on op """ - freq = attrs.get("freq", None) - if freq is not None: - # no need to infer if freq is None - attrs["freq"] = "infer" - return attrs - # -------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index cfdaf65955dab..43ed6e7b122ea 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -75,7 +75,7 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine _range = None # type: range - # check whether self._data has benn called + # check whether self._data has been called _cached_data = None # type: np.ndarray # -------------------------------------------------------------------- # Constructors @@ -785,7 +785,6 @@ def _evaluate_numeric_binop(self, other): other = extract_array(other, extract_numpy=True) attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) left, right = self, other diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index f2ce562536b95..d06afa3daa792 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -44,7 +44,12 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # which we we dont' want to expose in the .dt accessor. _delegate_class = TimedeltaArray _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] - _delegated_methods = TimedeltaArray._datetimelike_methods + ["_box_values"] + _delegated_methods = TimedeltaArray._datetimelike_methods + [ + "_box_values", + "__neg__", + "__pos__", + "__abs__", + ] _raw_properties = {"components"} _raw_methods = {"to_pytimedelta"} @@ -56,7 +61,7 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): TimedeltaArray, TimedeltaDelegateMixin._delegated_methods, typ="method", - overwrite=False, + overwrite=True, ) class TimedeltaIndex( DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin @@ -279,14 +284,6 @@ def __setstate__(self, state): _unpickle_compat = __setstate__ - def _maybe_update_attributes(self, attrs): - """ Update Index attributes (e.g. freq) depending on op """ - freq = attrs.get("freq", None) - if freq is not None: - # no need to infer if freq is None - attrs["freq"] = "infer" - return attrs - # ------------------------------------------------------------------- # Rendering Methods @@ -689,7 +686,6 @@ def delete(self, loc): TimedeltaIndex._add_comparison_ops() -TimedeltaIndex._add_numeric_methods_unary() TimedeltaIndex._add_logical_methods_disabled() TimedeltaIndex._add_datetimelike_methods() From 9d7a282baf0702c0fe49544767c4c91e03965aae Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2019 05:44:10 -0700 Subject: [PATCH 099/191] CLN: simplify comparison method, docstring cleanups (#27923) --- pandas/core/computation/common.py | 4 +-- pandas/core/computation/expressions.py | 44 ++++++++++++++------------ pandas/core/computation/ops.py | 28 +++++++++------- pandas/core/ops/__init__.py | 39 ++++------------------- 4 files changed, 47 insertions(+), 68 deletions(-) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index b8e212fd2a32e..2a4277e4e0a20 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -2,7 +2,7 @@ import numpy as np -import pandas as pd +from pandas._config import get_option # A token value Python's tokenizer probably will never use. 
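# Editorial aside: the stdlib ``token``/``tokenize`` modules number their
# tokens with small consecutive integers (all well below 100 at the time
# of writing), so 100 cannot collide with a real token value.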
_BACKTICK_QUOTED_STRING = 100 @@ -11,7 +11,7 @@ def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, (np.bytes_, bytes)): - s = s.decode(pd.get_option("display.encoding")) + s = s.decode(get_option("display.encoding")) return s diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index d9dc194d484ae..0ecf56cf6fe96 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -203,17 +203,18 @@ def _bool_arith_check( def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): - """ evaluate and return the expression of the op on a and b - - Parameters - ---------- - - op : the actual operand - op_str: the string version of the op - a : left operand - b : right operand - use_numexpr : whether to try to use numexpr (default True) - """ + """ + Evaluate and return the expression of the op on a and b. + + Parameters + ---------- + op : the actual operand + op_str : the string version of the op + a : left operand + b : right operand + use_numexpr : bool, default True + Whether to try to use numexpr. + """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: @@ -222,16 +223,17 @@ def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): def where(cond, a, b, use_numexpr=True): - """ evaluate the where condition cond on a and b - - Parameters - ---------- - - cond : a boolean array - a : return if cond is True - b : return if cond is False - use_numexpr : whether to try to use numexpr (default True) - """ + """ + Evaluate the where condition cond on a and b + + Parameters + ---------- + cond : ndarray[bool] + a : return if cond is True + b : return if cond is False + use_numexpr : bool, default True + Whether to try to use numexpr. + """ if use_numexpr: return _where(cond, a, b) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 2bf09a553ce18..b49220ae701bc 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -51,8 +51,9 @@ class UndefinedVariableError(NameError): - - """NameError subclass for local variables.""" + """ + NameError subclass for local variables. + """ def __init__(self, name, is_local): if is_local: @@ -191,8 +192,8 @@ def __repr__(self): class Op: - - """Hold an operator of arbitrary arity + """ + Hold an operator of arbitrary arity. """ def __init__(self, op, operands, *args, **kwargs): @@ -204,8 +205,9 @@ def __iter__(self): return iter(self.operands) def __repr__(self): - """Print a generic n-ary operator and its operands using infix - notation""" + """ + Print a generic n-ary operator and its operands using infix notation. + """ # recurse over the operands parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands) return pprint_thing(" {0} ".format(self.op).join(parened)) @@ -296,7 +298,8 @@ def _not_in(x, y): def _cast_inplace(terms, acceptable_dtypes, dtype): - """Cast an expression inplace. + """ + Cast an expression inplace. Parameters ---------- @@ -304,7 +307,6 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): The expression that should cast. acceptable_dtypes : list of acceptable numpy.dtype Will not cast if term's dtype in this list. - dtype : str or numpy.dtype The dtype to cast to. """ @@ -325,8 +327,8 @@ def is_term(obj): class BinOp(Op): - - """Hold a binary operator and its operands + """ + Hold a binary operator and its operands. 
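+
+    For illustration (editorial note, not in the original docstring):
+    parsing ``"a + b"`` yields ``BinOp('+', lhs, rhs)`` whose operands
+    are ``Term`` instances resolved against the evaluation scope.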
Parameters ---------- @@ -355,7 +357,8 @@ def __init__(self, op, lhs, rhs, **kwargs): ) def __call__(self, env): - """Recursively evaluate an expression in Python space. + """ + Recursively evaluate an expression in Python space. Parameters ---------- @@ -377,7 +380,8 @@ def __call__(self, env): return self.func(left, right) def evaluate(self, env, engine, parser, term_type, eval_in_python): - """Evaluate a binary operation *before* being passed to the engine. + """ + Evaluate a binary operation *before* being passed to the engine. Parameters ---------- diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 843f12c20b07b..26672593f98fb 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -32,7 +32,6 @@ is_period_dtype, is_scalar, is_timedelta64_dtype, - needs_i8_conversion, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -758,17 +757,12 @@ def _comp_method_SERIES(cls, op, special): code duplication. """ op_name = _get_op_name(op, special) - masker = _gen_eval_kwargs(op_name).get("masker", False) def na_op(x, y): # TODO: - # should have guarantess on what x, y can be type-wise + # should have guarantees on what x, y can be type-wise # Extension Dtypes are not called here - # Checking that cases that were once handled here are no longer - # reachable. - assert not (is_categorical_dtype(y) and not is_scalar(y)) - if is_object_dtype(x.dtype): result = _comp_method_OBJECT_ARRAY(op, x, y) @@ -776,32 +770,11 @@ def na_op(x, y): return invalid_comparison(x, y, op) else: - - # we want to compare like types - # we only want to convert to integer like if - # we are not NotImplemented, otherwise - # we would allow datetime64 (but viewed as i8) against - # integer comparisons - - # we have a datetime/timedelta and may need to convert - assert not needs_i8_conversion(x) - mask = None - if not is_scalar(y) and needs_i8_conversion(y): - mask = isna(x) | isna(y) - y = y.view("i8") - x = x.view("i8") - - method = getattr(x, op_name, None) - if method is not None: - with np.errstate(all="ignore"): - result = method(y) - if result is NotImplemented: - return invalid_comparison(x, y, op) - else: - result = op(x, y) - - if mask is not None and mask.any(): - result[mask] = masker + method = getattr(x, op_name) + with np.errstate(all="ignore"): + result = method(y) + if result is NotImplemented: + return invalid_comparison(x, y, op) return result From a818281a45f7b5bd24f050e5d6868894c5108db6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 15 Aug 2019 21:27:17 +0100 Subject: [PATCH 100/191] =?UTF-8?q?TST:=20Raise=20ValueError=20and=20sugge?= =?UTF-8?q?stion=20to=20use=20header=3DNone=20if=20header=3D-1=20is=20pa?= =?UTF-8?q?=E2=80=A6=20(#27878)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/io/parsers.py | 11 ++++++++++ pandas/tests/io/parser/test_header.py | 29 +++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 21f1fa7ddec1f..f4308684e286a 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -105,7 +105,7 @@ I/O ^^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) -- +- Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) - Plotting diff --git 
a/pandas/io/parsers.py b/pandas/io/parsers.py index f4b00b0aac5f7..a3ff837bc7f52 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1393,6 +1393,10 @@ def __init__(self, kwds):
         if isinstance(self.header, (list, tuple, np.ndarray)):
             if not all(map(is_integer, self.header)):
                 raise ValueError("header must be integer or list of integers")
+            if any(i < 0 for i in self.header):
+                raise ValueError(
+                    "cannot specify multi-index header with negative integers"
+                )
             if kwds.get("usecols"):
                 raise ValueError(
                     "cannot specify usecols when specifying a multi-index header"
@@ -1419,6 +1423,13 @@ def __init__(self, kwds):
         elif self.header is not None and not is_integer(self.header):
             raise ValueError("header must be integer or list of integers")
 
+        # GH 27779
+        elif self.header is not None and self.header < 0:
+            raise ValueError(
+                "Passing negative integer to header is invalid. "
+                "For no header, use header=None instead"
+            )
+
         self._name_processed = False
         self._first_chunk = True
 
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
index 99e0181741998..0ecd8be7ddc78 100644
--- a/pandas/tests/io/parser/test_header.py
+++ b/pandas/tests/io/parser/test_header.py
@@ -24,6 +24,35 @@ def test_read_with_bad_header(all_parsers):
         parser.read_csv(s, header=[10])
 
 
+def test_negative_header(all_parsers):
+    # see gh-27779
+    parser = all_parsers
+    data = """1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+"""
+    with pytest.raises(
+        ValueError,
+        match="Passing negative integer to header is invalid. "
+        "For no header, use header=None instead",
+    ):
+        parser.read_csv(StringIO(data), header=-1)
+
+
+@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
+def test_negative_multi_index_header(all_parsers, header):
+    # see gh-27779
+    parser = all_parsers
+    data = """1,2,3,4,5
+    6,7,8,9,10
+    11,12,13,14,15
+    """
+    with pytest.raises(
+        ValueError, match="cannot specify multi-index header with negative integers"
+    ):
+        parser.read_csv(StringIO(data), header=header)
+
+
 @pytest.mark.parametrize("header", [True, False])
 def test_bool_header_arg(all_parsers, header):
     # see gh-6114

From ca5198a6daa7757e398112a17ccadc9e7d078d96 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Fri, 16 Aug 2019 12:22:19 +0100
Subject: [PATCH 101/191] BUG: Ensure that fillna in Categorical only replaces
 null values (#27932)

* Ensure that :func:`fillna` in :class:`Categorical` only replaces null values
---
 doc/source/whatsnew/v0.25.1.rst     |  2 +-
 pandas/core/arrays/categorical.py   |  4 ++--
 pandas/tests/series/test_missing.py | 22 ++++++++++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index f4308684e286a..4350493f8ff91 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -25,7 +25,7 @@ Bug fixes
 Categorical
 ^^^^^^^^^^^
 
--
+- Bug in :meth:`Categorical.fillna` would replace all values, not just those that are ``NaN`` (:issue:`26215`)
 -
 -
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index bbbeb812d1fe9..a895da6184eeb 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1840,8 +1840,8 @@ def fillna(self, value=None, method=None, limit=None):
                 raise ValueError("fill value must be in categories")
 
             values_codes = _get_codes_for_values(value, self.categories)
-            indexer = np.where(values_codes != -1)
-            codes[indexer] = values_codes[values_codes != -1]
+            indexer = np.where(codes == -1)
+
codes[indexer] = values_codes[indexer] # If value is not a dict or Series it should be a scalar elif is_hashable(value): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index f1b84acf68755..ddd2c566f4cda 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -578,6 +578,28 @@ def test_fillna_categorical(self, fill_value, expected_output): exp = Series(Categorical(expected_output, categories=["a", "b"])) tm.assert_series_equal(s.fillna(fill_value), exp) + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), + (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + ( + Series( + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] + ) + ), + ["a", "d", "b", "d", "a"], + ), + ], + ) + def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): + # GH 26215 + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) + exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + tm.assert_series_equal(s.fillna(fill_value), exp) + def test_fillna_categorical_raise(self): data = ["a", np.nan, "b", np.nan, np.nan] s = Series(Categorical(data, categories=["a", "b"])) From 0e244680dfa416618615b65d595ba97709a36675 Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Fri, 16 Aug 2019 06:32:31 -0500 Subject: [PATCH 102/191] DOC: Fix typos in HDFStore docs (#27940) --- doc/source/user_guide/io.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 947bf15a49c7a..5d7a268631778 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3572,7 +3572,7 @@ Closing a Store and using a context manager: Read/write API '''''''''''''' -``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, +``HDFStore`` supports a top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python @@ -3687,7 +3687,7 @@ Hierarchical keys Keys to a store can be specified as a string. These can be in a hierarchical path-name like format (e.g. ``foo/bar/bah``), which will generate a hierarchy of sub-stores (or ``Groups`` in PyTables -parlance). Keys can be specified with out the leading '/' and are **always** +parlance). Keys can be specified without the leading '/' and are **always** absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove everything in the sub-store and **below**, so be *careful*. @@ -3825,7 +3825,7 @@ data. A query is specified using the ``Term`` class under the hood, as a boolean expression. -* ``index`` and ``columns`` are supported indexers of a ``DataFrames``. +* ``index`` and ``columns`` are supported indexers of ``DataFrames``. * if ``data_columns`` are specified, these can be used as additional indexers. Valid comparison operators are: @@ -3917,7 +3917,7 @@ Use boolean expressions, with in-line function evaluation. store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']") -Use and inline column reference +Use inline column reference. .. ipython:: python @@ -4593,8 +4593,8 @@ Performance write chunksize (default is 50000). This will significantly lower your memory usage on writing. 
 * You can pass ``expectedrows=`` to the first ``append``,
-  to set the TOTAL number of expected rows that ``PyTables`` will
-  expected. This will optimize read/write performance.
+  to set the TOTAL number of rows that ``PyTables`` will expect.
+  This will optimize read/write performance.
 * Duplicate rows can be written to tables, but are filtered out in
   selection (with the last items being selected; thus a table is
   unique on major, minor pairs)

From 5d3b492743e199e82e3018bb943545181ed0018f Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Fri, 16 Aug 2019 13:38:46 +0200
Subject: [PATCH 103/191] VIS: Fix DataFrame.plot() produces incorrect legend
 markers (#27808)

---
 doc/source/whatsnew/v1.0.0.rst      |  1 +
 pandas/plotting/_matplotlib/core.py | 16 ++++++++++++----
 pandas/tests/plotting/common.py     | 22 ++++++++++++++++++++++
 pandas/tests/plotting/test_frame.py | 25 +++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index aeed3668fe774..481c9074eda7c 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -165,6 +165,7 @@ Plotting
 
 - Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`)
 -
+- Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
 - Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
 
 Groupby/resample/rolling
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index a262f89dcc79c..287cc2f4130f4 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -562,7 +562,7 @@ def _add_legend_handle(self, handle, label, index=None):
             self.legend_labels.append(label)
 
     def _make_legend(self):
-        ax, leg = self._get_ax_legend(self.axes[0])
+        ax, leg, handle = self._get_ax_legend_handle(self.axes[0])
 
         handles = []
         labels = []
@@ -571,7 +571,8 @@ def _make_legend(self):
         if not self.subplots:
             if leg is not None:
                 title = leg.get_title().get_text()
-                handles = leg.legendHandles
+                # Replace leg.legendHandles because it misses marker info
+                handles.extend(handle)
                 labels = [x.get_text() for x in leg.get_texts()]
 
             if self.legend:
@@ -581,6 +582,7 @@ def _make_legend(self):
                 handles += self.legend_handles
                 labels += self.legend_labels
+
                 if self.legend_title is not None:
                     title = self.legend_title
@@ -592,8 +594,14 @@ def _make_legend(self):
             if ax.get_visible():
                 ax.legend(loc="best")
 
-    def _get_ax_legend(self, ax):
+    def _get_ax_legend_handle(self, ax):
+        """
+        Take in axes and return ax, legend and handle under different scenarios
+        """
         leg = ax.get_legend()
+
+        # Get handle from axes
+        handle, _ = ax.get_legend_handles_labels()
         other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None)
         other_leg = None
         if other_ax is not None:
@@ -601,7 +609,7 @@ def _get_ax_legend(self, ax):
         if leg is None and other_leg is not None:
             leg = other_leg
             ax = other_ax
-        return ax, leg
+        return ax, leg, handle
 
     @cache_readonly
     def plt(self):
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 4929422d20e8a..5a591f72d7361 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -103,6 +103,28 @@ def _check_legend_labels(self, axes, labels=None, visible=True):
         else:
             assert ax.get_legend() is None
 
+    def _check_legend_marker(self, ax, expected_markers=None, visible=True):
""" + Check ax has expected legend markers + + Parameters + ---------- + ax : matplotlib Axes object + expected_markers : list-like + expected legend markers + visible : bool + expected legend visibility. labels are checked only when visible is + True + """ + if visible and (expected_markers is None): + raise ValueError("Markers must be specified when visible is True") + if visible: + handles, _ = ax.get_legend_handles_labels() + markers = [handle.get_marker() for handle in handles] + assert markers == expected_markers + else: + assert ax.get_legend() is None + def _check_data(self, xp, rs): """ Check each axes has identical lines diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 65815bcedebfc..7fdc0252b71e3 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1881,6 +1881,31 @@ def test_df_legend_labels(self): self._check_legend_labels(ax, labels=["LABEL_b", "LABEL_c"]) assert df5.columns.tolist() == ["b", "c"] + def test_missing_marker_multi_plots_on_same_ax(self): + # GH 18222 + df = pd.DataFrame( + data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] + ) + fig, ax = self.plt.subplots(nrows=1, ncols=3) + # Left plot + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[0]) + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[0]) + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[0]) + self._check_legend_labels(ax[0], labels=["r", "g", "b"]) + self._check_legend_marker(ax[0], expected_markers=["o", "x", "o"]) + # Center plot + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[1]) + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[1]) + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[1]) + self._check_legend_labels(ax[1], labels=["b", "r", "g"]) + self._check_legend_marker(ax[1], expected_markers=["o", "o", "x"]) + # Right plot + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[2]) + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[2]) + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[2]) + self._check_legend_labels(ax[2], labels=["g", "b", "r"]) + self._check_legend_marker(ax[2], expected_markers=["x", "o", "o"]) + def test_legend_name(self): multi = DataFrame( randn(4, 4), From 7a0bcc39e8fb81ae60b74c851619c1b5d0d85222 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 07:08:16 -0500 Subject: [PATCH 104/191] BUG: Merge with readonly arrays (#27946) --- doc/source/whatsnew/v0.25.1.rst | 4 ++-- pandas/_libs/hashtable.pyx | 2 +- pandas/tests/reshape/merge/test_merge.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 4350493f8ff91..85da09b9ea0c9 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -127,9 +127,9 @@ Reshaping ^^^^^^^^^ - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`) -- Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. (:issue:`27500`) +- Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. 
(:issue:`27500`) - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`) -- +- Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`) Sparse ^^^^^^ diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 3e620f5934d5e..b8df78e600a46 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -108,7 +108,7 @@ cdef class Int64Factorizer: def get_count(self): return self.count - def factorize(self, int64_t[:] values, sort=False, + def factorize(self, const int64_t[:] values, sort=False, na_sentinel=-1, na_value=None): """ Factorize values with nans replaced by na_sentinel diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b6c6f967333a8..a04f093ee7818 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1340,6 +1340,18 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) + def test_merge_readonly(self): + # https://github.com/pandas-dev/pandas/issues/27943 + data1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + data2 = pd.DataFrame( + np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] + ) + + data1._data.blocks[0].values.flags.writeable = False + data1.merge(data2) # no error + def _check_merge(x, y): for how in ["inner", "left", "outer"]: From e66ad6c77cbd66fe7bc8d7f32a70527f487cc6e2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 16 Aug 2019 05:11:10 -0700 Subject: [PATCH 105/191] cleanups, remove StringMixin (#27939) --- pandas/compat/chainmap.py | 6 ----- pandas/core/computation/align.py | 8 +++--- pandas/core/computation/common.py | 5 ---- pandas/core/computation/engines.py | 14 +++++----- pandas/core/computation/expr.py | 3 ++- pandas/core/computation/expressions.py | 27 +++++++++---------- pandas/core/computation/ops.py | 8 +++--- pandas/core/computation/scope.py | 37 +++++++++++++++----------- 8 files changed, 54 insertions(+), 54 deletions(-) diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index 83f1da597d6a6..84824207de2a9 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -15,9 +15,3 @@ def __delitem__(self, key): del mapping[key] return raise KeyError(key) - - # override because the m parameter is introduced in Python 3.4 - def new_child(self, m=None): - if m is None: - m = {} - return self.__class__(m, *self.maps) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 1046401850963..3e1e5ed89d877 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -9,6 +9,7 @@ from pandas.errors import PerformanceWarning import pandas as pd +from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation.common import _result_type_many @@ -34,7 +35,7 @@ def _zip_axes_from_type(typ, new_axes): def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(isinstance(term.value, pd.core.generic.PandasObject) for term in terms) + return any(isinstance(term.value, PandasObject) for term in terms) def _filter_special_cases(f): @@ -132,7 +133,8 @@ def _align(terms): def _reconstruct_object(typ, obj, axes, dtype): - """Reconstruct an object given its type, raw value, and possibly empty + """ + Reconstruct an 
object given its type, raw value, and possibly empty (None) axes. Parameters @@ -157,7 +159,7 @@ def _reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) - if not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject): + if not isinstance(typ, partial) and issubclass(typ, PandasObject): return typ(obj, dtype=res_t, **axes) # special case for pathological things like ~True/~False diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 2a4277e4e0a20..bd32c8bee1cdf 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -36,8 +36,3 @@ def _remove_spaces_column_name(name): class NameResolutionError(NameError): pass - - -class StringMixin: - # TODO: delete this class. Removing this ATM caused a failure. - pass diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 2c94b142a45b3..3cc34ea1f4ed7 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -17,7 +17,8 @@ class NumExprClobberingError(NameError): def _check_ne_builtin_clash(expr): - """Attempt to prevent foot-shooting in a helpful way. + """ + Attempt to prevent foot-shooting in a helpful way. Parameters ---------- @@ -53,7 +54,8 @@ def convert(self): return printing.pprint_thing(self.expr) def evaluate(self): - """Run the engine on the expression + """ + Run the engine on the expression. This method performs alignment which is necessary no matter what engine is being used, thus its implementation is in the base class. @@ -78,7 +80,8 @@ def _is_aligned(self): @abc.abstractmethod def _evaluate(self): - """Return an evaluated expression. + """ + Return an evaluated expression. Parameters ---------- @@ -94,7 +97,6 @@ def _evaluate(self): class NumExprEngine(AbstractEngine): - """NumExpr engine class""" has_neg_frac = True @@ -127,8 +129,8 @@ def _evaluate(self): class PythonEngine(AbstractEngine): - - """Evaluate an expression in Python space. + """ + Evaluate an expression in Python space. Mostly for testing purposes. """ diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d0d87c23e9346..a58f256cf61d4 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -41,7 +41,8 @@ def tokenize_string(source): - """Tokenize a Python source code string. + """ + Tokenize a Python source code string. 
Parameters ---------- diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 0ecf56cf6fe96..5b6d275001d36 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -99,15 +99,13 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwa result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): - try: - - # we were originally called by a reversed op - # method - if reversed: - a, b = b, a + if reversed: + # we were originally called by a reversed op method + a, b = b, a - a_value = getattr(a, "values", a) - b_value = getattr(b, "values", b) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + try: result = ne.evaluate( "a_value {op} b_value".format(op=op_str), local_dict={"a_value": a_value, "b_value": b_value}, @@ -138,11 +136,11 @@ def _where_numexpr(cond, a, b): result = None if _can_use_numexpr(None, "where", a, b, "where"): + cond_value = getattr(cond, "values", cond) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) try: - cond_value = getattr(cond, "values", cond) - a_value = getattr(a, "values", a) - b_value = getattr(b, "values", b) result = ne.evaluate( "where(cond_value, a_value, b_value)", local_dict={ @@ -209,7 +207,8 @@ def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): Parameters ---------- op : the actual operand - op_str : the string version of the op + op_str : str + The string version of the op. a : left operand b : right operand use_numexpr : bool, default True @@ -224,11 +223,11 @@ def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): def where(cond, a, b, use_numexpr=True): """ - Evaluate the where condition cond on a and b + Evaluate the where condition cond on a and b. Parameters ---------- - cond : ndarray[bool] + cond : np.ndarray[bool] a : return if cond is True b : return if cond is False use_numexpr : bool, default True diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b49220ae701bc..28b6aef693bfe 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -476,8 +476,8 @@ def isnumeric(dtype): class Div(BinOp): - - """Div operator to special case casting. + """ + Div operator to special case casting. Parameters ---------- @@ -508,8 +508,8 @@ def __init__(self, lhs, rhs, truediv, *args, **kwargs): class UnaryOp(Op): - - """Hold a unary operator and its operands + """ + Hold a unary operator and its operands. Parameters ---------- diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 8ddd0dd7622e7..b11411eb2dc66 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -15,9 +15,6 @@ from pandas._libs.tslibs import Timestamp from pandas.compat.chainmap import DeepChainMap -import pandas.core.computation as compu -from pandas.core.computation.common import StringMixin - def _ensure_scope( level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs @@ -67,7 +64,8 @@ def _raw_hex_id(obj): def _get_pretty_string(obj): - """Return a prettier version of obj + """ + Return a prettier version of obj. Parameters ---------- @@ -84,9 +82,9 @@ def _get_pretty_string(obj): return sio.getvalue() -class Scope(StringMixin): - - """Object to hold scope, with a few bells to deal with some custom syntax +class Scope: + """ + Object to hold scope, with a few bells to deal with some custom syntax and contexts added by pandas. 
Parameters @@ -105,7 +103,7 @@ class Scope(StringMixin): temps : dict """ - __slots__ = "level", "scope", "target", "temps" + __slots__ = ["level", "scope", "target", "resolvers", "temps"] def __init__( self, level, global_dict=None, local_dict=None, resolvers=(), target=None @@ -163,7 +161,8 @@ def has_resolvers(self): return bool(len(self.resolvers)) def resolve(self, key, is_local): - """Resolve a variable name in a possibly local context + """ + Resolve a variable name in a possibly local context. Parameters ---------- @@ -198,10 +197,14 @@ def resolve(self, key, is_local): # e.g., df[df > 0] return self.temps[key] except KeyError: - raise compu.ops.UndefinedVariableError(key, is_local) + # runtime import because ops imports from scope + from pandas.core.computation.ops import UndefinedVariableError + + raise UndefinedVariableError(key, is_local) def swapkey(self, old_key, new_key, new_value=None): - """Replace a variable name, with a potentially new value. + """ + Replace a variable name, with a potentially new value. Parameters ---------- @@ -225,7 +228,8 @@ def swapkey(self, old_key, new_key, new_value=None): return def _get_vars(self, stack, scopes): - """Get specifically scoped variables from a list of stack frames. + """ + Get specifically scoped variables from a list of stack frames. Parameters ---------- @@ -247,7 +251,8 @@ def _get_vars(self, stack, scopes): del frame def update(self, level): - """Update the current scope by going back `level` levels. + """ + Update the current scope by going back `level` levels. Parameters ---------- @@ -266,7 +271,8 @@ def update(self, level): del stack[:], stack def add_tmp(self, value): - """Add a temporary variable to the scope. + """ + Add a temporary variable to the scope. Parameters ---------- @@ -297,7 +303,8 @@ def ntemps(self): @property def full_scope(self): - """Return the full scope for use with passing to engines transparently + """ + Return the full scope for use with passing to engines transparently as a mapping. 
Returns From d03beab3b260501a832499fb92dc81a1075048d7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 16 Aug 2019 05:12:27 -0700 Subject: [PATCH 106/191] implement array_ops (#27936) --- pandas/core/arrays/datetimes.py | 4 +- pandas/core/indexes/base.py | 2 +- pandas/core/ops/__init__.py | 123 ++----------------------------- pandas/core/ops/array_ops.py | 127 ++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 121 deletions(-) create mode 100644 pandas/core/ops/array_ops.py diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 1aad130d9a3f5..093334a815938 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -195,11 +195,11 @@ def wrapper(self, other): return invalid_comparison(self, other, op) if is_object_dtype(other): - # We have to use _comp_method_OBJECT_ARRAY instead of numpy + # We have to use comp_method_OBJECT_ARRAY instead of numpy # comparison otherwise it would fail to raise when # comparing tz-aware and tz-naive with np.errstate(all="ignore"): - result = ops._comp_method_OBJECT_ARRAY( + result = ops.comp_method_OBJECT_ARRAY( op, self.astype(object), other ) o_mask = isna(other) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5f17dde01d2c4..b983117478c61 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -109,7 +109,7 @@ def cmp_method(self, other): elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex with np.errstate(all="ignore"): - result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) + result = ops.comp_method_OBJECT_ARRAY(op, self.values, other) else: with np.errstate(all="ignore"): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 26672593f98fb..dbcf09a401f27 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -13,11 +13,7 @@ from pandas.errors import NullFrequencyError from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, - find_common_type, - maybe_upcast_putmask, -) +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, @@ -29,7 +25,6 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_period_dtype, is_scalar, is_timedelta64_dtype, ) @@ -37,7 +32,6 @@ ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, - ABCIndex, ABCIndexClass, ABCSeries, ABCSparseSeries, @@ -47,7 +41,7 @@ import pandas as pd from pandas._typing import ArrayLike from pandas.core.construction import array, extract_array -from pandas.core.ops import missing +from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY, define_na_arithmetic_op from pandas.core.ops.docstrings import ( _arith_doc_FRAME, _flex_comp_doc_FRAME, @@ -398,63 +392,6 @@ def mask_cmp_op(x, y, op): return result -def masked_arith_op(x, y, op): - """ - If the given arithmetic operation fails, attempt it again on - only the non-null elements of the input array(s). - - Parameters - ---------- - x : np.ndarray - y : np.ndarray, Series, Index - op : binary operator - """ - # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes - # the logic valid for both Series and DataFrame ops. 
- xrav = x.ravel() - assert isinstance(x, np.ndarray), type(x) - if isinstance(y, np.ndarray): - dtype = find_common_type([x.dtype, y.dtype]) - result = np.empty(x.size, dtype=dtype) - - # PeriodIndex.ravel() returns int64 dtype, so we have - # to work around that case. See GH#19956 - yrav = y if is_period_dtype(y) else y.ravel() - mask = notna(xrav) & notna(yrav) - - if yrav.shape != mask.shape: - # FIXME: GH#5284, GH#5035, GH#19448 - # Without specifically raising here we get mismatched - # errors in Py3 (TypeError) vs Py2 (ValueError) - # Note: Only = an issue in DataFrame case - raise ValueError("Cannot broadcast operands together.") - - if mask.any(): - with np.errstate(all="ignore"): - result[mask] = op(xrav[mask], yrav[mask]) - - else: - assert is_scalar(y), type(y) - assert isinstance(x, np.ndarray), type(x) - # mask is only meaningful for x - result = np.empty(x.size, dtype=x.dtype) - mask = notna(xrav) - - # 1 ** np.nan is 1. So we have to unmask those. - if op == pow: - mask = np.where(x == 1, False, mask) - elif op == rpow: - mask = np.where(y == 1, False, mask) - - if mask.any(): - with np.errstate(all="ignore"): - result[mask] = op(xrav[mask], y) - - result, changed = maybe_upcast_putmask(result, ~mask, np.nan) - result = result.reshape(x.shape) # 2D compat - return result - - # ----------------------------------------------------------------------------- # Dispatch logic @@ -673,33 +610,7 @@ def _arith_method_SERIES(cls, op, special): _construct_divmod_result if op in [divmod, rdivmod] else _construct_result ) - def na_op(x, y): - """ - Return the result of evaluating op on the passed in values. - - If native types are not compatible, try coersion to object dtype. - - Parameters - ---------- - x : array-like - y : array-like or scalar - - Returns - ------- - array-like - - Raises - ------ - TypeError : invalid operation - """ - import pandas.core.computation.expressions as expressions - - try: - result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) - except TypeError: - result = masked_arith_op(x, y, op) - - return missing.dispatch_fill_zeros(op, x, y, result) + na_op = define_na_arithmetic_op(op, str_rep, eval_kwargs) def wrapper(left, right): if isinstance(right, ABCDataFrame): @@ -735,22 +646,6 @@ def wrapper(left, right): return wrapper -def _comp_method_OBJECT_ARRAY(op, x, y): - if isinstance(y, list): - y = construct_1d_object_array_from_listlike(y) - if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): - if not is_object_dtype(y.dtype): - y = y.astype(np.object_) - - if isinstance(y, (ABCSeries, ABCIndex)): - y = y.values - - result = libops.vec_compare(x, y, op) - else: - result = libops.scalar_compare(x, y, op) - return result - - def _comp_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid @@ -764,7 +659,7 @@ def na_op(x, y): # Extension Dtypes are not called here if is_object_dtype(x.dtype): - result = _comp_method_OBJECT_ARRAY(op, x, y) + result = comp_method_OBJECT_ARRAY(op, x, y) elif is_datetimelike_v_numeric(x, y): return invalid_comparison(x, y, op) @@ -1091,15 +986,7 @@ def _arith_method_FRAME(cls, op, special): eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) - def na_op(x, y): - import pandas.core.computation.expressions as expressions - - try: - result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) - except TypeError: - result = masked_arith_op(x, y, op) - - return missing.dispatch_fill_zeros(op, x, y, result) + na_op = define_na_arithmetic_op(op, 
str_rep, eval_kwargs) if op_name in _op_descriptions: # i.e. include "add" but not "__add__" diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py new file mode 100644 index 0000000000000..a3bfb2e10bb66 --- /dev/null +++ b/pandas/core/ops/array_ops.py @@ -0,0 +1,127 @@ +""" +Functions for arithmetic and comparison operations on NumPy arrays and +ExtensionArrays. +""" +import numpy as np + +from pandas._libs import ops as libops + +from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, + find_common_type, + maybe_upcast_putmask, +) +from pandas.core.dtypes.common import is_object_dtype, is_period_dtype, is_scalar +from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core.dtypes.missing import notna + +from pandas.core.ops import missing +from pandas.core.ops.roperator import rpow + + +def comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + y = construct_1d_object_array_from_listlike(y) + + # TODO: Should the checks below be ABCIndexClass? + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + # TODO: should this be ABCIndexClass?? + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y.values + + result = libops.vec_compare(x, y, op) + else: + result = libops.scalar_compare(x, y, op) + return result + + +def masked_arith_op(x, y, op): + """ + If the given arithmetic operation fails, attempt it again on + only the non-null elements of the input array(s). + + Parameters + ---------- + x : np.ndarray + y : np.ndarray, Series, Index + op : binary operator + """ + # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes + # the logic valid for both Series and DataFrame ops. + xrav = x.ravel() + assert isinstance(x, np.ndarray), type(x) + if isinstance(y, np.ndarray): + dtype = find_common_type([x.dtype, y.dtype]) + result = np.empty(x.size, dtype=dtype) + + # PeriodIndex.ravel() returns int64 dtype, so we have + # to work around that case. See GH#19956 + yrav = y if is_period_dtype(y) else y.ravel() + mask = notna(xrav) & notna(yrav) + + if yrav.shape != mask.shape: + # FIXME: GH#5284, GH#5035, GH#19448 + # Without specifically raising here we get mismatched + # errors in Py3 (TypeError) vs Py2 (ValueError) + # Note: Only = an issue in DataFrame case + raise ValueError("Cannot broadcast operands together.") + + if mask.any(): + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], yrav[mask]) + + else: + assert is_scalar(y), type(y) + assert isinstance(x, np.ndarray), type(x) + # mask is only meaningful for x + result = np.empty(x.size, dtype=x.dtype) + mask = notna(xrav) + + # 1 ** np.nan is 1. So we have to unmask those. + if op == pow: + mask = np.where(x == 1, False, mask) + elif op == rpow: + mask = np.where(y == 1, False, mask) + + if mask.any(): + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], y) + + result, changed = maybe_upcast_putmask(result, ~mask, np.nan) + result = result.reshape(x.shape) # 2D compat + return result + + +def define_na_arithmetic_op(op, str_rep, eval_kwargs): + def na_op(x, y): + """ + Return the result of evaluating op on the passed in values. + + If native types are not compatible, try coersion to object dtype. 
+
+        Parameters
+        ----------
+        x : array-like
+        y : array-like or scalar
+
+        Returns
+        -------
+        array-like
+
+        Raises
+        ------
+        TypeError : invalid operation
+        """
+        import pandas.core.computation.expressions as expressions
+
+        try:
+            result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
+        except TypeError:
+            result = masked_arith_op(x, y, op)
+
+        return missing.dispatch_fill_zeros(op, x, y, result)
+
+    return na_op

From 802f67046bbae0a815b2fe9d20d2217485bbc942 Mon Sep 17 00:00:00 2001
From: LiuSeeker
Date: Fri, 16 Aug 2019 09:14:42 -0300
Subject: [PATCH 107/191] TST: Assert Categorical.fillna raises the correct
 error message for fill values outside the categories (#27933)

* Fixes issue #13628
* Fix-up for issue #13628
* isort fixes for #13628
* Move whatsnew note to 1.0

---
 doc/source/whatsnew/v0.25.1.rst                 | 1 -
 doc/source/whatsnew/v1.0.0.rst                  | 1 +
 pandas/tests/arrays/categorical/test_missing.py | 6 ++++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 85da09b9ea0c9..34b149a6b8261 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -27,7 +27,6 @@ Categorical
 
 - Bug in :meth:`Categorical.fillna` would replace all values, not just those that are ``NaN`` (:issue:`26215`)
 -
--
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 481c9074eda7c..0be4ebc627b30 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -87,6 +87,7 @@ Bug fixes
 Categorical
 ^^^^^^^^^^^
 
+- Added test to assert that :func:`fillna` raises the correct ``ValueError`` message when the fill value isn't among the categories (:issue:`13628`)
 -
 -
 
diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py
index 1b62479530d24..3037ac79cd592 100644
--- a/pandas/tests/arrays/categorical/test_missing.py
+++ b/pandas/tests/arrays/categorical/test_missing.py
@@ -5,7 +5,7 @@
 
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
-from pandas import Categorical, Index, isna
+from pandas import Categorical, Index, Series, isna
 import pandas.util.testing as tm
 
 
@@ -59,11 +59,13 @@ def test_set_item_nan(self):
         ),
         (dict(), "Must specify a fill 'value' or 'method'."),
         (dict(method="bad"), "Invalid fill method. Expecting .* bad"),
+        (dict(value=Series([1, 2, 3, 4, "a"])), "fill value must be in categories"),
     ],
 )
 def test_fillna_raises(self, fillna_kwargs, msg):
     # https://github.com/pandas-dev/pandas/issues/19682
-    cat = Categorical([1, 2, 3])
+    # https://github.com/pandas-dev/pandas/issues/13628
+    cat = Categorical([1, 2, 3, None, None])
 
     with pytest.raises(ValueError, match=msg):
         cat.fillna(**fillna_kwargs)

From f4b4ec2026327021fa646f9f77cc23600feb49e4 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 16 Aug 2019 12:18:52 -0700
Subject: [PATCH 108/191] DOC/CLN: docstring cleanups (#27942)

---
 pandas/core/frame.py     |  2 +-
 pandas/core/generic.py   | 92 ++++++++++++++++++++--------------------
 pandas/core/series.py    |  2 +-
 pandas/plotting/_core.py | 30 ++++++-------
 4 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 20398069847b1..603a615c1f8cb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1190,7 +1190,7 @@ def to_numpy(self, dtype=None, copy=False):
         Parameters
         ----------
         dtype : str or numpy.dtype, optional
-            The dtype to pass to :meth:`numpy.asarray`
+            The dtype to pass to :meth:`numpy.asarray`.
copy : bool, default False Whether to ensure that the returned value is a not a view on another array. Note that ``copy=False`` does not *ensure* that diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 71d5068e2e0fc..ba1c516b9b444 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -131,7 +131,7 @@ class NDFrame(PandasObject, SelectionMixin): ---------- data : BlockManager axes : list - copy : boolean, default False + copy : bool, default False """ _internal_names = [ @@ -280,7 +280,8 @@ def _setup_axes( ns=None, docs=None, ): - """Provide axes setup for the major PandasObjects. + """ + Provide axes setup for the major PandasObjects. Parameters ---------- @@ -288,8 +289,8 @@ def _setup_axes( info_axis_num : the axis of the selector dimension (int) stat_axis_num : the number of axis for the default stats (int) aliases : other names for a single axis (dict) - axes_are_reversed : boolean whether to treat passed axes as - reversed (DataFrame) + axes_are_reversed : bool + Whether to treat passed axes as reversed (DataFrame). build_axes : setup the axis properties (default True) """ @@ -676,7 +677,7 @@ def transpose(self, *args, **kwargs): Parameters ---------- args : %(args_transpose)s - copy : boolean, default False + copy : bool, default False Make a copy of the underlying data. Mixed-dtype data will always result in a copy **kwargs @@ -2257,10 +2258,10 @@ def to_json( Parameters ---------- - path_or_buf : string or file handle, optional + path_or_buf : str or file handle, optional File path or object. If not specified, the result is returned as a string. - orient : string + orient : str Indication of expected JSON string format. * Series @@ -2539,7 +2540,7 @@ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): def to_sql( self, - name, + name: str, con, schema=None, if_exists="fail", @@ -2557,12 +2558,12 @@ def to_sql( Parameters ---------- - name : string + name : str Name of SQL table. con : sqlalchemy.engine.Engine or sqlite3.Connection Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. - schema : string, optional + schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. if_exists : {'fail', 'replace', 'append'}, default 'fail' @@ -2575,7 +2576,7 @@ def to_sql( index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column name in the table. - index_label : string or sequence, default None + index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. @@ -3251,11 +3252,10 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): Parameters ---------- - clear : boolean, default False - clear the item cache - verify_is_copy : boolean, default True - provide is_copy checks - + clear : bool, default False + Clear the item cache. + verify_is_copy : bool, default True + Provide is_copy checks. 
""" cacher = getattr(self, "_cacher", None) @@ -3621,11 +3621,11 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): Parameters ---------- - stacklevel : integer, default 4 + stacklevel : int, default 4 the level to show of the stack when the error is output - t : string, the type of setting error - force : boolean, default False - if True, then force showing an error + t : str, the type of setting error + force : bool, default False + If True, then force showing an error. validate if we are doing a setitem on a chained copy. @@ -3954,9 +3954,8 @@ def _update_inplace(self, result, verify_is_copy=True): Parameters ---------- - verify_is_copy : boolean, default True - provide is_copy checks - + verify_is_copy : bool, default True + Provide is_copy checks. """ # NOTE: This does *not* call __finalize__ and that's an explicit # decision that we may revisit in the future. @@ -4571,9 +4570,9 @@ def filter(self, items=None, like=None, regex=None, axis=None): ---------- items : list-like Keep labels from axis which are in items. - like : string + like : str Keep labels from axis for which "like in label == True". - regex : string (regular expression) + regex : str (regular expression) Keep labels from axis for which re.search(regex, label) == True. axis : int or string axis name The axis to filter on. By default this is the info axis, @@ -5233,8 +5232,8 @@ def _consolidate(self, inplace=False): Parameters ---------- - inplace : boolean, default False - If False return new object, otherwise modify existing object + inplace : bool, default False + If False return new object, otherwise modify existing object. Returns ------- @@ -5680,11 +5679,12 @@ def as_blocks(self, copy=True): Parameters ---------- - copy : boolean, default True + copy : bool, default True Returns ------- - values : a dict of dtype -> Constructor Types + dict + Mapping dtype -> Constructor Types. """ warnings.warn( "as_blocks is deprecated and will be removed in a future version", @@ -5993,17 +5993,17 @@ def _convert( Parameters ---------- - datetime : boolean, default False + datetime : bool, default False If True, convert to date where possible. - numeric : boolean, default False + numeric : bool, default False If True, attempt to convert to numbers (including strings), with unconvertible values becoming NaN. - timedelta : boolean, default False + timedelta : bool, default False If True, convert to timedelta where possible. - coerce : boolean, default False + coerce : bool, default False If True, force conversion with unconvertible values converted to - nulls (NaN or NaT) - copy : boolean, default True + nulls (NaN or NaT). + copy : bool, default True If True, return a copy even if no copy is necessary (e.g. no conversion was done). Note: This is meant for internal use, and should not be confused with inplace. @@ -7870,7 +7870,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): Parameters ---------- - freq : DateOffset object, or string + freq : DateOffset or str method : {'backfill'/'bfill', 'pad'/'ffill'}, default None Method to use for filling holes in reindexed Series (note this does not fill NaNs that already were present): @@ -8671,7 +8671,7 @@ def ranker(data): level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level - copy : boolean, default True + copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. 
fill_value : scalar, default np.NaN @@ -9463,7 +9463,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): Truncate all rows after this index value. axis : {0 or 'index', 1 or 'columns'}, optional Axis to truncate. Truncates the index (rows) by default. - copy : boolean, default is True, + copy : bool, default is True, Return a copy of the truncated section. Returns @@ -9607,13 +9607,13 @@ def tz_convert(self, tz, axis=0, level=None, copy=True): Parameters ---------- - tz : string or pytz.timezone object + tz : str or tzinfo object axis : the axis to convert level : int, str, default None - If axis ia a MultiIndex, convert a specific level. Otherwise - must be None - copy : boolean, default True - Also make a copy of the underlying data + If axis is a MultiIndex, convert a specific level. Otherwise + must be None. + copy : bool, default True + Also make a copy of the underlying data. Returns ------- @@ -9667,12 +9667,12 @@ def tz_localize( Parameters ---------- - tz : string or pytz.timezone object + tz : str or tzinfo axis : the axis to localize level : int, str, default None If axis ia a MultiIndex, localize a specific level. Otherwise must be None - copy : boolean, default True + copy : bool, default True Also make a copy of the underlying data ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. @@ -11009,7 +11009,7 @@ def _doc_parms(cls): ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The index or the name of the axis. 0 is equivalent to None or 'index'. -skipna : boolean, default True +skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. *args, **kwargs : diff --git a/pandas/core/series.py b/pandas/core/series.py index c891298d6e499..3f04970ee4e58 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -675,8 +675,8 @@ def nonzero(self): 3 4 dtype: int64 - >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd']) # same return although index of s is different + >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd']) >>> s.nonzero() (array([1, 3]),) >>> s.iloc[s.nonzero()[0]] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index ec5c609c1b267..2e6a401b49efc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -53,7 +53,7 @@ def hist_series( rotation of y axis labels figsize : tuple, default None figure size in inches by default - bins : integer or sequence, default 10 + bins : int or sequence, default 10 Number of histogram bins to be used. If an integer is given, bins + 1 bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last @@ -116,7 +116,7 @@ def hist_frame( ---------- data : DataFrame The pandas object holding the data. - column : string or sequence + column : str or sequence If passed, will be used to limit data to a subset of columns. by : object, optional If passed, then used to form histograms for separate groups. @@ -148,7 +148,7 @@ def hist_frame( `matplotlib.rcParams` by default. layout : tuple, optional Tuple of (rows, columns) for the layout of the histograms. - bins : integer or sequence, default 10 + bins : int or sequence, default 10 Number of histogram bins to be used. If an integer is given, bins + 1 bin edges are calculated and returned. 
If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last @@ -177,7 +177,7 @@ def hist_frame( >>> df = pd.DataFrame({ ... 'length': [1.5, 0.5, 1.2, 0.9, 3], ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... }, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse']) + ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend() @@ -370,8 +370,8 @@ def boxplot( If ``return_type`` is `None`, a NumPy array of axes with the same shape as ``layout`` is returned: - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... return_type=None) + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type=None) >>> type(boxplot) """ @@ -446,7 +446,7 @@ def boxplot_frame_groupby( * ``True`` - create a subplot for each group column : column name or list of names, or vector Can be any valid input to groupby - fontsize : int or string + fontsize : int or str rot : label rotation angle grid : Setting this to True will show the grid ax : Matplotlib axis object, default None @@ -530,7 +530,7 @@ class PlotAccessor(PandasObject): figsize : a tuple (width, height) in inches use_index : bool, default True Use index as ticks for x axis - title : string or list + title : str or list Title to use for the plot. If a string is passed, print the string at the top of the figure. If a list is passed and `subplots` is True, print each item in the list above the corresponding subplot. @@ -553,16 +553,16 @@ class PlotAccessor(PandasObject): .. versionchanged:: 0.25.0 xticks : sequence - Values to use for the xticks + Values to use for the xticks. yticks : sequence - Values to use for the yticks + Values to use for the yticks. xlim : 2-tuple/list ylim : 2-tuple/list rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal plots) fontsize : int, default None - Font size for xticks and yticks + Font size for xticks and yticks. colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. @@ -587,9 +587,9 @@ class PlotAccessor(PandasObject): When using a secondary_y axis, automatically mark the column labels with "(right)" in the legend include_bool : bool, default is False - If True, boolean values can be plotted + If True, boolean values can be plotted. `**kwds` : keywords - Options to pass to matplotlib plotting method + Options to pass to matplotlib plotting method. Returns ------- @@ -985,7 +985,7 @@ def barh(self, x=None, y=None, **kwargs): .. plot:: :context: close-figs - >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) >>> ax = df.plot.barh(x='lab', y='val') Plot a whole DataFrame to a horizontal bar plot @@ -1049,7 +1049,7 @@ def box(self, by=None, **kwargs): Parameters ---------- - by : string or sequence + by : str or sequence Column in the DataFrame to group by. 
     **kwds : optional
         Additional keywords are documented in

From 9f93d57025d7b906de6956f287b71cd18c17b13d Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 16 Aug 2019 15:06:35 -0500
Subject: [PATCH 109/191] TST: xfail on 37, win (#27956)

* TST: xfail on 37, win

Closes https://github.com/pandas-dev/pandas/issues/27902
---
 pandas/tests/groupby/test_categorical.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 756de3edd33dd..b5c2de267869d 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import PY37, is_platform_windows
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -208,6 +210,9 @@ def test_level_get_group(observed):
 
 
 # GH#21636 previously flaky on py37
+@pytest.mark.xfail(
+    is_platform_windows() and PY37, reason="Flaky, GH-27902", strict=False
+)
 @pytest.mark.parametrize("ordered", [True, False])
 def test_apply(ordered):
     # GH 10138

From 5b0a2a6e361f75c8cbb715cd33ff55dd61c631c9 Mon Sep 17 00:00:00 2001
From: Bhavani Ravi
Date: Sun, 18 Aug 2019 13:53:47 +0530
Subject: [PATCH 110/191] DOC: remove savefig references from the docs v0.7.3
 (#27990)

---
 doc/source/whatsnew/v0.7.3.rst | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst
index a8697f60d7467..020cf3bdc2d59 100644
--- a/doc/source/whatsnew/v0.7.3.rst
+++ b/doc/source/whatsnew/v0.7.3.rst
@@ -25,8 +25,6 @@ New features
 
     from pandas.tools.plotting import scatter_matrix
     scatter_matrix(df, alpha=0.2)  # noqa F821
 
-.. image:: ../savefig/scatter_matrix_kde.png
-   :width: 5in
-
 - Add ``stacked`` argument to Series and DataFrame's ``plot`` method for
   :ref:`stacked bar plots `.
 
 .. code-block:: python
 
    df.plot(kind='bar', stacked=True)  # noqa F821
 
-.. image:: ../savefig/bar_plot_stacked_ex.png
-   :width: 4in
 
 .. code-block:: python
 
    df.plot(kind='barh', stacked=True)  # noqa F821
 
-.. image:: ../savefig/barh_plot_stacked_ex.png
-   :width: 4in
-
 - Add log x and y :ref:`scaling options ` to
   ``DataFrame.plot`` and ``Series.plot``

From 35c44cef7a008308f7590242e6995d4f87dac15d Mon Sep 17 00:00:00 2001
From: Ignacio Santolin
Date: Mon, 19 Aug 2019 08:43:33 -0300
Subject: [PATCH 111/191] DOC: Updated table with read_fwf() (#27920)

---
 doc/source/user_guide/io.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 5d7a268631778..7ba103c5ff996 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -28,6 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
    :delim: ;
 
    text;`CSV `__;:ref:`read_csv`;:ref:`to_csv`
+   text;`TXT `__;:ref:`read_fwf`
    text;`JSON `__;:ref:`read_json`;:ref:`to_json`
    text;`HTML `__;:ref:`read_html`;:ref:`to_html`
    text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard`
@@ -1372,6 +1373,7 @@ should pass the ``escapechar`` option:
    print(data)
    pd.read_csv(StringIO(data), escapechar='\\')
 
+.. _io.fwf_reader:
 .. _io.fwf:
 
 Files with fixed width columns

From ba94f9baaa1a802eee4820a1188db6501a231ae6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guilherme=20Salom=C3=A9?=
Date: Mon, 19 Aug 2019 09:06:14 -0400
Subject: [PATCH 112/191] BUG: lzma is a required part of Python but may be
 missing; make it optional (#27882)

* Importing lzma on a Python that was compiled without its support now
  raises a warning instead of an error (see the sketch below).
* Substituted direct `import lzma` calls with a helper function.
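
A rough illustration of the resulting behavior (hypothetical session; the
warning and error texts are the ones added by this patch):

    # On a Python built without lzma support, importing pandas now only warns:
    import pandas as pd
    # UserWarning: Could not import the lzma module. Your installed Python
    # is incomplete. Attempting to use lzma compression will result in a
    # RuntimeError.

    df = pd.DataFrame({"a": [1, 2, 3]})

    # Only an actual attempt to use xz compression raises:
    df.to_csv("out.csv.xz", compression="xz")
    # RuntimeError: lzma module not available. A Python re-install with the
    # proper dependencies might be required to solve this issue.
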
---
 doc/source/whatsnew/v0.25.1.rst     |  9 +++++++-
 pandas/_libs/parsers.pyx            |  8 +++++---
 pandas/compat/__init__.py           | 30 +++++++++++++++++++++++++++
 pandas/io/common.py                 |  6 ++++--
 pandas/tests/io/test_compression.py | 32 +++++++++++++++++++++++++++++
 pandas/tests/io/test_pickle.py      |  7 ++++---
 pandas/util/testing.py              | 11 +++++-----
 7 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 34b149a6b8261..4e1bfac77fae2 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -102,7 +102,6 @@ MultiIndex
 I/O
 ^^^
 
--
 - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
 - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
 -
@@ -159,6 +158,14 @@ Other
 -
 -
 
+I/O and LZMA
+~~~~~~~~~~~~
+
+Some users may unknowingly have an incomplete Python installation, which lacks the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`).
+Pandas will now warn, rather than raise an ``ImportError``, if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``.
+A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python.
+For example, on macOS installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python.
+
 .. _whatsnew_0.251.contributors:
 
 Contributors
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index cafc31dad3568..6cc9dd22ce7c9 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -2,7 +2,6 @@
 # See LICENSE for the license
 import bz2
 import gzip
-import lzma
 import os
 import sys
 import time
@@ -59,9 +58,12 @@
 from pandas.core.arrays import Categorical
 from pandas.core.dtypes.concat import union_categoricals
 import pandas.io.common as icom
+from pandas.compat import _import_lzma, _get_lzma_file
 from pandas.errors import (ParserError, DtypeWarning,
                            EmptyDataError, ParserWarning)
 
+lzma = _import_lzma()
+
 # Import CParserError as alias of ParserError for backwards compatibility.
 # Ultimately, we want to remove this import. See gh-12665 and gh-14479.
 CParserError = ParserError
@@ -645,9 +647,9 @@ cdef class TextReader:
                                   'zip file %s', str(zip_names))
             elif self.compression == 'xz':
                 if isinstance(source, str):
-                    source = lzma.LZMAFile(source, 'rb')
+                    source = _get_lzma_file(lzma)(source, 'rb')
                 else:
-                    source = lzma.LZMAFile(filename=source)
+                    source = _get_lzma_file(lzma)(filename=source)
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 5ecd641fc68be..b32da8da3a1fb 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -10,6 +10,7 @@
 import platform
 import struct
 import sys
+import warnings
 
 PY35 = sys.version_info[:2] == (3, 5)
 PY36 = sys.version_info >= (3, 6)
@@ -65,3 +66,32 @@ def is_platform_mac():
 
 def is_platform_32bit():
     return struct.calcsize("P") * 8 < 64
+
+
+def _import_lzma():
+    """Attempts to import lzma, warning the user when lzma is not available.
+ """ + try: + import lzma + + return lzma + except ImportError: + msg = ( + "Could not import the lzma module. " + "Your installed Python is incomplete. " + "Attempting to use lzma compression will result in a RuntimeError." + ) + warnings.warn(msg) + + +def _get_lzma_file(lzma): + """Returns the lzma method LZMAFile when the module was correctly imported. + Otherwise, raises a RuntimeError. + """ + if lzma is None: + raise RuntimeError( + "lzma module not available. " + "A Python re-install with the proper " + "dependencies might be required to solve this issue." + ) + return lzma.LZMAFile diff --git a/pandas/io/common.py b/pandas/io/common.py index e01e473047b88..ac57cef372399 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,7 +6,6 @@ import gzip from http.client import HTTPException # noqa from io import BytesIO -import lzma import mmap import os import pathlib @@ -23,6 +22,7 @@ from urllib.request import pathname2url, urlopen import zipfile +from pandas.compat import _get_lzma_file, _import_lzma from pandas.errors import ( # noqa AbstractMethodError, DtypeWarning, @@ -35,6 +35,8 @@ from pandas._typing import FilePathOrBuffer +lzma = _import_lzma() + # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -395,7 +397,7 @@ def _get_handle( # XZ Compression elif compression == "xz": - f = lzma.LZMAFile(path_or_buf, mode) + f = _get_lzma_file(lzma)(path_or_buf, mode) # Unrecognized Compression else: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index ce459ab24afe0..16ca1109f266c 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,5 +1,7 @@ import contextlib import os +import subprocess +import textwrap import warnings import pytest @@ -125,3 +127,33 @@ def test_compression_warning(compression_only): with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): with f: df.to_csv(f, compression=compression_only) + + +def test_with_missing_lzma(): + """Tests if import pandas works when lzma is not present.""" + # https://github.com/pandas-dev/pandas/issues/27575 + code = textwrap.dedent( + """\ + import sys + sys.modules['lzma'] = None + import pandas + """ + ) + subprocess.check_output(["python", "-c", code]) + + +def test_with_missing_lzma_runtime(): + """Tests if RuntimeError is hit when calling lzma without + having the module available.""" + code = textwrap.dedent( + """ + import sys + import pytest + sys.modules['lzma'] = None + import pandas + df = pandas.DataFrame() + with pytest.raises(RuntimeError, match='lzma module'): + df.to_csv('foo.csv', compression='xz') + """ + ) + subprocess.check_output(["python", "-c", code]) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 076d0c9f947c7..30555508f0998 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,7 +13,6 @@ import bz2 import glob import gzip -import lzma import os import pickle import shutil @@ -22,7 +21,7 @@ import pytest -from pandas.compat import is_platform_little_endian +from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian import pandas as pd from pandas import Index @@ -30,6 +29,8 @@ from pandas.tseries.offsets import Day, MonthEnd +lzma = _import_lzma() + @pytest.fixture(scope="module") def current_pickle_data(): @@ -270,7 +271,7 @@ def compress_file(self, src_path, dest_path, compression): with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, 
                    os.path.basename(src_path))
         elif compression == "xz":
-            f = lzma.LZMAFile(dest_path, "w")
+            f = _get_lzma_file(lzma)(dest_path, "w")
         else:
             msg = "Unrecognized compression type: {}".format(compression)
             raise ValueError(msg)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index cf8452cdd0c59..a8f0d0da52e1f 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -5,7 +5,6 @@
 from functools import wraps
 import gzip
 import http.client
-import lzma
 import os
 import re
 from shutil import rmtree
@@ -26,7 +25,7 @@
 )
 import pandas._libs.testing as _testing
 
-from pandas.compat import raise_with_traceback
+from pandas.compat import _get_lzma_file, _import_lzma, raise_with_traceback
 
 from pandas.core.dtypes.common import (
     is_bool,
@@ -70,6 +69,8 @@
 from pandas.io.common import urlopen
 from pandas.io.formats.printing import pprint_thing
 
+lzma = _import_lzma()
+
 N = 30
 K = 4
 _RAISE_NETWORK_ERROR_DEFAULT = False
@@ -211,7 +212,7 @@ def decompress_file(path, compression):
     elif compression == "bz2":
        f = bz2.BZ2File(path, "rb")
     elif compression == "xz":
-        f = lzma.LZMAFile(path, "rb")
+        f = _get_lzma_file(lzma)(path, "rb")
     elif compression == "zip":
         zip_file = zipfile.ZipFile(path)
         zip_names = zip_file.namelist()
@@ -264,9 +265,7 @@ def write_to_compressed(compression, path, data, dest="test"):
         compress_method = bz2.BZ2File
 
     elif compression == "xz":
-        import lzma
-
-        compress_method = lzma.LZMAFile
+        compress_method = _get_lzma_file(lzma)
     else:
         msg = "Unrecognized compression type: {}".format(compression)
         raise ValueError(msg)

From 6e0ab71e5aaddd783399dfddafa76053d37ab152 Mon Sep 17 00:00:00 2001
From: Avi Kelman
Date: Mon, 19 Aug 2019 12:23:40 -0400
Subject: [PATCH 113/191] BUG: Help python csv engine read binary buffers
 (#27925)

* BUG: Help python csv engine read binary buffers

The file buffer given to read_csv could have been opened in binary mode,
but the python csv reader errors on binary buffers.
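
A minimal sketch of the usage this enables (the file path here is
hypothetical):

    import pandas as pd

    # Previously the Python engine raised on a buffer opened in binary
    # mode; text and binary modes now give the same result.
    with open("data.csv", "rb") as f:
        df = pd.read_csv(f, engine="python")
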
closes #23779
---
 doc/source/whatsnew/v1.0.0.rst        |  2 +-
 pandas/io/common.py                   | 12 ++++++-----
 pandas/tests/io/parser/test_common.py | 31 ++++++++++++++++++++++++---
 3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0be4ebc627b30..3ce558ca336c3 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -158,7 +158,7 @@ MultiIndex

 I/O
 ^^^
--
+- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`)
 -

 Plotting

diff --git a/pandas/io/common.py b/pandas/io/common.py
index ac57cef372399..26b68dda7b464 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -5,7 +5,7 @@
 import csv
 import gzip
 from http.client import HTTPException  # noqa
-from io import BytesIO
+from io import BufferedIOBase, BytesIO
 import mmap
 import os
 import pathlib
@@ -344,9 +344,9 @@ def _get_handle(
     try:
         from s3fs import S3File

-        need_text_wrapping = (BytesIO, S3File)
+        need_text_wrapping = (BufferedIOBase, S3File)
     except ImportError:
-        need_text_wrapping = (BytesIO,)
+        need_text_wrapping = BufferedIOBase

     handles = list()
     f = path_or_buf
@@ -422,8 +422,10 @@ def _get_handle(
     if is_text and (compression or isinstance(f, need_text_wrapping)):
         from io import TextIOWrapper

-        f = TextIOWrapper(f, encoding=encoding, newline="")
-        handles.append(f)
+        g = TextIOWrapper(f, encoding=encoding, newline="")
+        if not isinstance(f, BufferedIOBase):
+            handles.append(g)
+        f = g

     if memory_map and hasattr(f, "fileno"):
         try:

diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index b94d5cd497ccf..e5366a8357adb 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -2020,9 +2020,34 @@ def test_file_handles_with_open(all_parsers, csv1):
     # Don't close user provided file handles.
     parser = all_parsers

-    with open(csv1, "r") as f:
-        parser.read_csv(f)
-        assert not f.closed
+    for mode in ["r", "rb"]:
+        with open(csv1, mode) as f:
+            parser.read_csv(f)
+            assert not f.closed
+
+
+@pytest.mark.parametrize(
+    "fname,encoding",
+    [
+        ("test1.csv", "utf-8"),
+        ("unicode_series.csv", "latin-1"),
+        ("sauron.SHIFT_JIS.csv", "shiftjis"),
+    ],
+)
+def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
+    # gh-23779: Python csv engine shouldn't error on files opened in binary.
+    parser = all_parsers
+
+    fpath = os.path.join(csv_dir_path, fname)
+    expected = parser.read_csv(fpath, encoding=encoding)
+
+    with open(fpath, mode="r", encoding=encoding) as fa:
+        result = parser.read_csv(fa)
+    tm.assert_frame_equal(expected, result)
+
+    with open(fpath, mode="rb") as fb:
+        result = parser.read_csv(fb, encoding=encoding)
+    tm.assert_frame_equal(expected, result)


 def test_invalid_file_buffer_class(all_parsers):

From 489d1ff8a1683f7f5549108dc7b04e43b5f32965 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 19 Aug 2019 09:45:41 -0700
Subject: [PATCH 114/191] CLN: missing boilerplate in Sparse op (#27910)

* CLN: missing boilerplate in Sparse op
---
 pandas/core/arrays/sparse.py                   |  9 ++++++---
 pandas/tests/arrays/sparse/test_arithmetics.py | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 2234167fe0193..201174b6b1995 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -39,6 +39,7 @@
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.generic import (
+    ABCDataFrame,
     ABCIndexClass,
     ABCSeries,
     ABCSparseArray,
@@ -1735,13 +1736,15 @@ def sparse_unary_method(self):

     @classmethod
     def _create_arithmetic_method(cls, op):
-        def sparse_arithmetic_method(self, other):
-            op_name = op.__name__
+        op_name = op.__name__

-            if isinstance(other, (ABCSeries, ABCIndexClass)):
+        def sparse_arithmetic_method(self, other):
+            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
                 # Rely on pandas to dispatch to us.
                 return NotImplemented

+            other = lib.item_from_zerodim(other)
+
             if isinstance(other, SparseArray):
                 return _sparse_array_op(self, other, op, op_name)

diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py
index 57e5a35d99e48..cb5b437c962f9 100644
--- a/pandas/tests/arrays/sparse/test_arithmetics.py
+++ b/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -441,6 +441,23 @@ def test_with_list(op):
     tm.assert_sp_array_equal(result, expected)


+def test_with_dataframe():
+    # GH#27910
+    arr = pd.SparseArray([0, 1], fill_value=0)
+    df = pd.DataFrame([[1, 2], [3, 4]])
+    result = arr.__add__(df)
+    assert result is NotImplemented
+
+
+def test_with_zerodim_ndarray():
+    # GH#27910
+    arr = pd.SparseArray([0, 1], fill_value=0)
+
+    result = arr * np.array(2)
+    expected = arr * 2
+    tm.assert_sp_array_equal(result, expected)
+
+
 @pytest.mark.parametrize("ufunc", [np.abs, np.exp])
 @pytest.mark.parametrize(
     "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])]
 )

From fb62fcf91c874e9c24fa83693c4e6e613f35f864 Mon Sep 17 00:00:00 2001
From: Connor Charles
Date: Mon, 19 Aug 2019 18:20:23 +0100
Subject: [PATCH 115/191] BUG: _can_use_numexpr fails when passed large Series (#27773)

* BUG: _can_use_numexpr did not handle Series case correctly
---
 doc/source/whatsnew/v0.25.1.rst        |  2 +-
 pandas/core/computation/expressions.py |  7 ++++---
 pandas/tests/test_expressions.py       | 29 ++++++++++++++++++++++++--
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 4e1bfac77fae2..bb28ce9b67a3e 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -53,7 +53,7 @@ Numeric
 ^^^^^^^
 - Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`)
 - Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`)
--
+- Bug where :class:`DataFrame` arithmetic operators such as :meth:`DataFrame.mul` with a :class:`Series` with axis=1 would raise an ``AttributeError`` on a :class:`DataFrame` larger than the minimum threshold to invoke numexpr (:issue:`27636`)
 -

 Conversion

diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py
index 5b6d275001d36..29c8239fa518f 100644
--- a/pandas/core/computation/expressions.py
+++ b/pandas/core/computation/expressions.py
@@ -76,16 +76,17 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):

         # required min elements (otherwise we are adding overhead)
         if np.prod(a.shape) > _MIN_ELEMENTS:
-
             # check for dtype compatibility
             dtypes = set()
             for o in [a, b]:
-                if hasattr(o, "dtypes"):
+                # Series implements dtypes, check for dimension count as well
+                if hasattr(o, "dtypes") and o.ndim > 1:
                     s = o.dtypes.value_counts()
                     if len(s) > 1:
                         return False
                     dtypes |= set(s.index.astype(str))
-                elif isinstance(o, np.ndarray):
+                # ndarray and Series Case
+                elif hasattr(o, "dtype"):
                     dtypes |= {o.dtype.name}

             # allowed are a superset

diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index 4070624985068..ca514f62f451d 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -66,7 +66,7 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=True):
                 operator_name = "truediv"

             if test_flex:
-                op = lambda x, y: getattr(df, arith)(y)
+                op = lambda x, y: getattr(x, arith)(y)
                 op.__name__ = arith
             else:
                 op = getattr(operator, operator_name)
@@ -318,7 +318,6 @@ def testit():

         for f in [self.frame, self.frame2, self.mixed, self.mixed2]:
             for cond in [True, False]:
-
                 c = np.empty(f.shape, dtype=np.bool_)
                 c.fill(cond)
                 result = expr.where(c, f.values, f.values + 1)
@@ -431,3 +430,29 @@ def test_bool_ops_column_name_dtype(self, test_input, expected):
         # GH 22383 - .ne fails if columns containing column name 'dtype'
         result = test_input.loc[:, ["a", "dtype"]].ne(test_input.loc[:, ["a", "dtype"]])
         assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv")
+    )
+    @pytest.mark.parametrize("axis", (0, 1))
+    def test_frame_series_axis(self, axis, arith):
+        # GH#26736 Dataframe.floordiv(Series, axis=1) fails
+        if axis == 1 and arith == "floordiv":
+            pytest.xfail("'floordiv' does not succeed with axis=1 #27636")
+
+        df = self.frame
+        if axis == 1:
+            other = self.frame.iloc[0, :]
+        else:
+            other = self.frame.iloc[:, 0]
+
+        expr._MIN_ELEMENTS = 0
+
+        op_func = getattr(df, arith)
+
+        expr.set_use_numexpr(False)
+        expected = op_func(other, axis=axis)
+        expr.set_use_numexpr(True)
+
+        result = op_func(other, axis=axis)
+        assert_frame_equal(expected, result)

From a4b0132d8a01971f7ddfce5497950ad10ef8ff5d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 19 Aug 2019 11:29:06 -0700
Subject: [PATCH 116/191] TST: fix flaky xfail (#28016)

* TST: fix flaky xfail
---
 pandas/tests/series/test_analytics.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 3a5a387b919be..1ddaa4692d741 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1482,16 +1482,7 @@ def test_value_counts_with_nan(self):

     @pytest.mark.parametrize(
         "dtype",
-        [
-            "int_",
-            "uint",
-            "float_",
-            "unicode_",
-            "timedelta64[h]",
-            pytest.param(
-                "datetime64[D]",
-                marks=pytest.mark.xfail(reason="GH#7996", strict=True)
-            ),
-        ],
+        ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"],
     )
     def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture):
         cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
@@ -1499,6 +1490,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture):
         # Test case 1
         input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
         tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture))
+        if dtype == "datetime64[D]":
+            # pre-empt flaky xfail, tc1 values are seemingly-random
+            if not (np.array(tc1) == input1).all():
+                pytest.xfail(reason="GH#7996")

         expected = Series([False, False, False, True])
         tm.assert_series_equal(tc1.duplicated(), expected)
@@ -1524,6 +1519,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture):
         # Test case 2
         input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
         tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture))
+        if dtype == "datetime64[D]":
+            # pre-empt flaky xfail, tc2 values are seemingly-random
+            if not (np.array(tc2) == input2).all():
+                pytest.xfail(reason="GH#7996")

         expected = Series([False, False, False, False, True, True, False])
         tm.assert_series_equal(tc2.duplicated(), expected)

From 325dd686de1589c17731cf93b649ed5ccb5a99b4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 19 Aug 2019 21:26:20 +0200
Subject: [PATCH 117/191] Fix regression in .ix fallback with IntervalIndex (#27926)
---
 doc/source/whatsnew/v0.25.1.rst  |  1 +
 pandas/core/indexing.py          |  5 ++++-
 pandas/tests/indexing/test_ix.py | 10 ++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index bb28ce9b67a3e..a5a2ef118b987 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -84,6 +84,7 @@ Indexing
 - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
 - Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
 - Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
+- Fix regression in ``.ix`` fallback with an ``IntervalIndex`` (:issue:`27865`).


 Missing

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index ea00737f776ee..7bb5e2fa3018d 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -124,7 +124,7 @@ def __getitem__(self, key):
             key = tuple(com.apply_if_callable(x, self.obj) for x in key)
             try:
                 values = self.obj._get_value(*key)
-            except (KeyError, TypeError, InvalidIndexError):
+            except (KeyError, TypeError, InvalidIndexError, AttributeError):
                 # TypeError occurs here if the key has non-hashable entries,
                 # generally slice or list.
                 # TODO(ix): most/all of the TypeError cases here are for ix,
@@ -132,6 +132,9 @@ def __getitem__(self, key):
                 # The InvalidIndexError is only caught for compatibility
                 # with geopandas, see
                 # https://github.com/pandas-dev/pandas/issues/27258
+                # TODO: The AttributeError is for IntervalIndex which
+                # incorrectly implements get_value, see
+                # https://github.com/pandas-dev/pandas/issues/27865
                 pass
             else:
                 if is_scalar(values):

diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py
index 45ccd8d1b8fb3..6029db8ed66f6 100644
--- a/pandas/tests/indexing/test_ix.py
+++ b/pandas/tests/indexing/test_ix.py
@@ -343,3 +343,13 @@ def test_ix_duplicate_returns_series(self):
         r = df.ix[0.2, "a"]
         e = df.loc[0.2, "a"]
         tm.assert_series_equal(r, e)
+
+    def test_ix_intervalindex(self):
+        # https://github.com/pandas-dev/pandas/issues/27865
+        df = DataFrame(
+            np.random.randn(5, 2),
+            index=pd.IntervalIndex.from_breaks([-np.inf, 0, 1, 2, 3, np.inf]),
+        )
+        result = df.ix[0:2, 0]
+        expected = df.iloc[0:2, 0]
+        tm.assert_series_equal(result, expected)

From be08902a74593b5ac7491cf691a21bb083aa8824 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 19 Aug 2019 14:01:50 -0700
Subject: [PATCH 118/191] BUG: TimedeltaArray - Index result.name (#27962)

* BUG: TimedeltaArray - Index result.name

* change assert to TypeError
---
 pandas/core/indexes/base.py                 |  5 ++++-
 pandas/core/ops/array_ops.py                |  5 +++--
 pandas/tests/arithmetic/test_timedelta64.py | 10 +++++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index b983117478c61..415255cdbad06 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2325,7 +2325,10 @@ def __sub__(self, other):
         return Index(np.array(self) - other)

     def __rsub__(self, other):
-        return Index(other - np.array(self))
+        # wrap Series to ensure we pin name correctly
+        from pandas import Series
+
+        return Index(other - Series(self))

     def __and__(self, other):
         return self.intersection(other)

diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index a3bfb2e10bb66..523ba5d42a69c 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -74,8 +74,9 @@ def masked_arith_op(x, y, op):
             result[mask] = op(xrav[mask], yrav[mask])

     else:
-        assert is_scalar(y), type(y)
-        assert isinstance(x, np.ndarray), type(x)
+        if not is_scalar(y):
+            raise TypeError(type(y))
+
         # mask is only meaningful for x
         result = np.empty(x.size, dtype=x.dtype)
         mask = notna(xrav)

diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index 33a5d45df3885..6d6b85a1e81e1 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -1378,8 +1378,12 @@ def test_td64arr_add_offset_array(self, box):
     @pytest.mark.parametrize(
         "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")]
     )
-    def test_td64arr_sub_offset_index(self, names, box):
+    def test_td64arr_sub_offset_index(self, names, box_with_array):
         # GH#18824, GH#19744
+        box = box_with_array
+        xbox = box if box is not tm.to_array else pd.Index
+        exname = names[2] if box is not tm.to_array else names[1]
+
         if box is pd.DataFrame and names[1] == "bar":
             pytest.skip(
                 "Name propagation for DataFrame does not behave like "
@@ -1390,11 +1394,11 @@ def test_td64arr_sub_offset_index(self, names, box):
         other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1])
         expected = TimedeltaIndex(
-            [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=names[2]
+            [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=exname
         )

         tdi = tm.box_expected(tdi, box)
-        expected = tm.box_expected(expected, box)
+        expected = tm.box_expected(expected, xbox)

         # The DataFrame operation is transposed and so operates as separate
         # scalar operations, which do not issue a PerformanceWarning

From 9a244b5c843c426f1bafacaec5bfb0f54a9137a0 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 19 Aug 2019 14:48:38 -0700
Subject: [PATCH 119/191] REF: use should_extension_dispatch for comparison method (#27912)

* REF: implement should_extension_dispatch
---
 pandas/core/ops/__init__.py | 36 +++++-------------------------------
 1 file changed, 5 insertions(+), 31 deletions(-)

diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
index dbcf09a401f27..0de28f0a4a8b3 100644
--- a/pandas/core/ops/__init__.py
+++ b/pandas/core/ops/__init__.py
@@ -17,9 +17,7 @@
 from pandas.core.dtypes.common import (
     ensure_object,
     is_bool_dtype,
-    is_categorical_dtype,
     is_datetime64_dtype,
-    is_datetime64tz_dtype,
     is_datetimelike_v_numeric,
     is_extension_array_dtype,
     is_integer_dtype,
@@ -32,6 +30,7 @@
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCDatetimeArray,
     ABCDatetimeIndex,
+    ABCExtensionArray,
     ABCIndexClass,
     ABCSeries,
     ABCSparseSeries,
@@ -699,42 +698,17 @@ def wrapper(self, other, axis=None):

         if isinstance(other, ABCSeries) and not self._indexed_same(other):
             raise ValueError("Can only compare identically-labeled Series objects")
-        elif (
-            is_list_like(other)
-            and len(other) != len(self)
-            and not isinstance(other, (set, frozenset))
-        ):
-            raise ValueError("Lengths must match")
-        elif isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
+        elif isinstance(
+            other, (np.ndarray, ABCExtensionArray, ABCIndexClass, ABCSeries)
+        ):
             # TODO: make this treatment consistent across ops and classes.
             #  We are not catching all listlikes here (e.g. frozenset, tuple)
             #  The ambiguous case is object-dtype.  See GH#27803
             if len(self) != len(other):
                 raise ValueError("Lengths must match to compare")

-        if is_categorical_dtype(self):
-            # Dispatch to Categorical implementation; CategoricalIndex
-            # behavior is non-canonical GH#19513
-            res_values = dispatch_to_extension_op(op, self, other)
-
-        elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self):
-            # Dispatch to DatetimeIndex to ensure identical
-            # Series/Index behavior
-            from pandas.core.arrays import DatetimeArray
-
-            res_values = dispatch_to_extension_op(op, DatetimeArray(self), other)
-
-        elif is_timedelta64_dtype(self):
-            from pandas.core.arrays import TimedeltaArray
-
-            res_values = dispatch_to_extension_op(op, TimedeltaArray(self), other)
-
-        elif is_extension_array_dtype(self) or (
-            is_extension_array_dtype(other) and not is_scalar(other)
-        ):
-            # Note: the `not is_scalar(other)` condition rules out
other == "category" + if should_extension_dispatch(self, other): res_values = dispatch_to_extension_op(op, self, other) elif is_scalar(other) and isna(other): From 9f71625851a31768e585420ae7abff8889143d01 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 19 Aug 2019 23:35:42 -0700 Subject: [PATCH 120/191] CLN: Remove incorrect check, comment, rename (#27922) --- pandas/core/ops/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 0de28f0a4a8b3..7e03b9544ee72 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -730,9 +730,12 @@ def wrapper(self, other, axis=None): ) result = self._constructor(res_values, index=self.index) - # rename is needed in case res_name is None and result.name - # is not. - return finalizer(result).rename(res_name) + result = finalizer(result) + + # Set the result's name after finalizer is called because finalizer + # would set it back to self.name + result.name = res_name + return result wrapper.__name__ = op_name return wrapper From 0a7bb2a4f011afbf058fd7c92bdeb0b2efa49157 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 20 Aug 2019 00:00:44 -0700 Subject: [PATCH 121/191] BUG: fix to_timestamp out_of_bounds (#27916) --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/_libs/tslibs/period.pyx | 13 +++++++------ pandas/tests/arrays/test_datetimelike.py | 11 +++++++++++ pandas/tests/scalar/period/test_asfreq.py | 5 +---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index a5a2ef118b987..cccabbc3f1cab 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -31,7 +31,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`) -- +- Bug in :meth:`Period.to_timestamp` where a :class:`Period` outside the :class:`Timestamp` implementation bounds (roughly 1677-09-21 to 2262-04-11) would return an incorrect :class:`Timestamp` instead of raising ``OutOfBoundsDatetime`` (:issue:`19643`) - - diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c68d686ff2bf2..98e55f50062a2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -21,7 +21,8 @@ PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, NPY_DATETIMEUNIT, NPY_FR_D) + pandas_datetime_to_datetimestruct, check_dts_bounds, + NPY_DATETIMEUNIT, NPY_FR_D) cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, @@ -1011,7 +1012,7 @@ def dt64arr_to_periodarr(int64_t[:] dtarr, int freq, tz=None): @cython.wraparound(False) @cython.boundscheck(False) -def periodarr_to_dt64arr(int64_t[:] periodarr, int freq): +def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq): """ Convert array to datetime64 values from a set of ordinals corresponding to periods per period convention. 
@@ -1024,9 +1025,8 @@ def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq):

     out = np.empty(l, dtype='i8')

-    with nogil:
-        for i in range(l):
-            out[i] = period_ordinal_to_dt64(periodarr[i], freq)
+    for i in range(l):
+        out[i] = period_ordinal_to_dt64(periodarr[i], freq)

     return out.base  # .base to access underlying np.ndarray
@@ -1179,7 +1179,7 @@ cpdef int64_t period_ordinal(int y, int m, int d, int h, int min,
     return get_period_ordinal(&dts, freq)


-cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil:
+cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1:
     cdef:
         npy_datetimestruct dts

@@ -1187,6 +1187,7 @@ cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil:
         return NPY_NAT

     get_date_info(ordinal, freq, &dts)
+    check_dts_bounds(&dts)
     return dtstruct_to_dt64(&dts)

diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 0b3ccc0ae0e2d..7c482664bca48 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest

+from pandas._libs import OutOfBoundsDatetime
+
 import pandas as pd
 from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
 import pandas.util.testing as tm
@@ -615,6 +617,15 @@ def test_to_timestamp(self, how, period_index):
         #  an EA-specific tm.assert_ function
         tm.assert_index_equal(pd.Index(result), pd.Index(expected))

+    def test_to_timestamp_out_of_bounds(self):
+        # GH#19643 previously overflowed silently
+        pi = pd.period_range("1500", freq="Y", periods=3)
+        with pytest.raises(OutOfBoundsDatetime):
+            pi.to_timestamp()
+
+        with pytest.raises(OutOfBoundsDatetime):
+            pi._data.to_timestamp()
+
     @pytest.mark.parametrize("propname", PeriodArray._bool_ops)
     def test_bool_properties(self, period_index, propname):
         # in this case _bool_ops is just `is_leap_year`

diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py
index 4cff061cabc40..357274e724c68 100644
--- a/pandas/tests/scalar/period/test_asfreq.py
+++ b/pandas/tests/scalar/period/test_asfreq.py
@@ -30,11 +30,8 @@ def test_asfreq_near_zero_weekly(self):
         assert week1.asfreq("D", "E") >= per1
         assert week2.asfreq("D", "S") <= per2

-    @pytest.mark.xfail(
-        reason="GH#19643 period_helper asfreq functions fail to check for overflows"
-    )
     def test_to_timestamp_out_of_bounds(self):
-        # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848')
+        # GH#19643, used to incorrectly give Timestamp in 1754
         per = Period("0001-01-01", freq="B")
         with pytest.raises(OutOfBoundsDatetime):
             per.to_timestamp()

From e118b1d6b23d1d829c3db05505ef4873f0314e99 Mon Sep 17 00:00:00 2001
From: joy-rosie
Date: Tue, 20 Aug 2019 15:00:08 +0100
Subject: [PATCH 122/191] BUG: added a check for if obj is an instance of type in _isna_new (#27664)

* added a check for if obj is an instance of type in _isna_new
---
 doc/source/whatsnew/v0.25.1.rst     | 2 +-
 pandas/core/dtypes/missing.py       | 4 ++++
 pandas/tests/dtypes/test_missing.py | 4 ++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index cccabbc3f1cab..94770a7aae676 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -90,7 +90,7 @@ Indexing

 Missing
 ^^^^^^^
--
+- Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type, e.g. `type(pandas.Series())` (:issue:`27482`)
 -
 -

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 6f599a6be6021..056cd2222af3c 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -133,6 +133,8 @@ def _isna_new(obj):
     # hack (for now) because MI registers as ndarray
     elif isinstance(obj, ABCMultiIndex):
         raise NotImplementedError("isna is not defined for MultiIndex")
+    elif isinstance(obj, type):
+        return False
     elif isinstance(
         obj,
         (
@@ -171,6 +173,8 @@ def _isna_old(obj):
     # hack (for now) because MI registers as ndarray
     elif isinstance(obj, ABCMultiIndex):
         raise NotImplementedError("isna is not defined for MultiIndex")
+    elif isinstance(obj, type):
+        return False
     elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
         return _isna_ndarraylike_old(obj)
     elif isinstance(obj, ABCGeneric):

diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index a688dec50bc95..bbc485ecf94f2 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -86,6 +86,10 @@ def test_isna_isnull(self, isna_f):
         assert not isna_f(np.inf)
         assert not isna_f(-np.inf)

+        # type
+        assert not isna_f(type(pd.Series()))
+        assert not isna_f(type(pd.DataFrame()))
+
         # series
         for s in [
             tm.makeFloatSeries(),

From 69c58da27cb61a81a94cc3a5da3a2c1870b4e693 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Tue, 20 Aug 2019 16:18:54 +0200
Subject: [PATCH 123/191] PLT: plot('line') or plot('area') produces wrong xlim in xaxis in 0.25.0 (#27993)

* Fix issue 27686
---
 doc/source/whatsnew/v1.0.0.rst             |  1 +
 pandas/plotting/_matplotlib/core.py        |  7 +--
 pandas/plotting/_matplotlib/tools.py       | 21 ---------
 pandas/tests/plotting/test_datetimelike.py | 24 +++++++---
 pandas/tests/plotting/test_frame.py        | 52 ++++++++++++++++++++++
 pandas/tests/plotting/test_series.py       | 12 +++++
 6 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 3ce558ca336c3..4decc99087a9e 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -168,6 +168,7 @@ Plotting
 -
 - Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
 - Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
These types are now automatically dropped (:issue:`22799`) +- Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 287cc2f4130f4..fbca57206e163 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -33,8 +33,6 @@ from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import ( _flatten, - _get_all_lines, - _get_xlim, _handle_shared_axes, _subplots, format_date_labels, @@ -1101,9 +1099,8 @@ def _make_plot(self): ) self._add_legend_handle(newlines[0], label, index=i) - lines = _get_all_lines(ax) - left, right = _get_xlim(lines) - ax.set_xlim(left, right) + # GH27686 set_xlim will truncate xaxis to fixed space + ax.relim() @classmethod def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 8472eb3a3d887..fd2913ca51ac3 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -343,27 +343,6 @@ def _flatten(axes): return np.array(axes) -def _get_all_lines(ax): - lines = ax.get_lines() - - if hasattr(ax, "right_ax"): - lines += ax.right_ax.get_lines() - - if hasattr(ax, "left_ax"): - lines += ax.left_ax.get_lines() - - return lines - - -def _get_xlim(lines): - left, right = np.inf, -np.inf - for l in lines: - x = l.get_xdata(orig=False) - left = min(np.nanmin(x), left) - right = max(np.nanmax(x), right) - return left, right - - def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 69070ea11e478..be87929b4545a 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -419,6 +419,8 @@ def test_get_finder(self): assert conv.get_finder("A") == conv._annual_finder assert conv.get_finder("W") == conv._daily_finder + # TODO: The finder should be retested due to wrong xlim values on x-axis + @pytest.mark.xfail(reason="TODO: check details in GH28021") @pytest.mark.slow def test_finder_daily(self): day_lst = [10, 40, 252, 400, 950, 2750, 10000] @@ -442,6 +444,8 @@ def test_finder_daily(self): assert rs1 == xpl1 assert rs2 == xpl2 + # TODO: The finder should be retested due to wrong xlim values on x-axis + @pytest.mark.xfail(reason="TODO: check details in GH28021") @pytest.mark.slow def test_finder_quarterly(self): yrs = [3.5, 11] @@ -465,6 +469,8 @@ def test_finder_quarterly(self): assert rs1 == xpl1 assert rs2 == xpl2 + # TODO: The finder should be retested due to wrong xlim values on x-axis + @pytest.mark.xfail(reason="TODO: check details in GH28021") @pytest.mark.slow def test_finder_monthly(self): yrs = [1.15, 2.5, 4, 11] @@ -498,6 +504,8 @@ def test_finder_monthly_long(self): xp = Period("1989Q1", "M").ordinal assert rs == xp + # TODO: The finder should be retested due to wrong xlim values on x-axis + @pytest.mark.xfail(reason="TODO: check details in GH28021") @pytest.mark.slow def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] @@ -522,7 +530,7 @@ def test_finder_minutely(self): _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() - rs = 
+        rs = xaxis.get_majorticklocs()[1]
         xp = Period("1/1/1999", freq="Min").ordinal
         assert rs == xp
@@ -534,7 +542,7 @@ def test_finder_hourly(self):
         _, ax = self.plt.subplots()
         ser.plot(ax=ax)
         xaxis = ax.get_xaxis()
-        rs = xaxis.get_majorticklocs()[0]
+        rs = xaxis.get_majorticklocs()[1]
         xp = Period("1/1/1999", freq="H").ordinal
         assert rs == xp
@@ -1410,7 +1418,9 @@ def test_plot_outofbounds_datetime(self):

     def test_format_timedelta_ticks_narrow(self):

-        expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in range(10)]
+        expected_labels = [
+            "00:00:00.0000000{:0>2d}".format(i) for i in np.arange(0, 10, 2)
+        ]

         rng = timedelta_range("0", periods=10, freq="ns")
         df = DataFrame(np.random.randn(len(rng), 3), rng)
@@ -1420,8 +1430,8 @@ def test_format_timedelta_ticks_narrow(self):
         labels = ax.get_xticklabels()
         result_labels = [x.get_text() for x in labels]

-        assert len(result_labels) == len(expected_labels)
-        assert result_labels == expected_labels
+        assert (len(result_labels) - 2) == len(expected_labels)
+        assert result_labels[1:-1] == expected_labels

     def test_format_timedelta_ticks_wide(self):
         expected_labels = [
@@ -1444,8 +1454,8 @@ def test_format_timedelta_ticks_wide(self):
         labels = ax.get_xticklabels()
         result_labels = [x.get_text() for x in labels]

-        assert len(result_labels) == len(expected_labels)
-        assert result_labels == expected_labels
+        assert (len(result_labels) - 2) == len(expected_labels)
+        assert result_labels[1:-1] == expected_labels

     def test_timedelta_plot(self):
         # test issue #8711

diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index 7fdc0252b71e3..f672cd3a6aa58 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -3177,6 +3177,58 @@ def test_x_multiindex_values_ticks(self):
         assert labels_position["(2013, 1)"] == 2.0
         assert labels_position["(2013, 2)"] == 3.0

+    @pytest.mark.parametrize("kind", ["line", "area"])
+    def test_xlim_plot_line(self, kind):
+        # test if xlim is set correctly in plot.line and plot.area
+        # GH 27686
+        df = pd.DataFrame([2, 4], index=[1, 2])
+        ax = df.plot(kind=kind)
+        xlims = ax.get_xlim()
+        assert xlims[0] < 1
+        assert xlims[1] > 2
+
+    def test_xlim_plot_line_correctly_in_mixed_plot_type(self):
+        # test if xlim is set correctly when ax contains multiple different kinds
+        # of plots, GH 27686
+        fig, ax = self.plt.subplots()
+
+        indexes = ["k1", "k2", "k3", "k4"]
+        df = pd.DataFrame(
+            {
+                "s1": [1000, 2000, 1500, 2000],
+                "s2": [900, 1400, 2000, 3000],
+                "s3": [1500, 1500, 1600, 1200],
+                "secondary_y": [1, 3, 4, 3],
+            },
+            index=indexes,
+        )
+        df[["s1", "s2", "s3"]].plot.bar(ax=ax, stacked=False)
+        df[["secondary_y"]].plot(ax=ax, secondary_y=True)
+
+        xlims = ax.get_xlim()
+        assert xlims[0] < 0
+        assert xlims[1] > 3
+
+        # make sure axis labels are plotted correctly as well
+        xticklabels = [t.get_text() for t in ax.get_xticklabels()]
+        assert xticklabels == indexes
+
+    def test_subplots_sharex_false(self):
+        # test when sharex is set to False, two plots should have different
+        # labels, GH 25160
+        df = pd.DataFrame(np.random.rand(10, 2))
+        df.iloc[5:, 1] = np.nan
+        df.iloc[:5, 0] = np.nan
+
+        figs, axs = self.plt.subplots(2, 1)
+        df.plot.line(ax=axs, subplots=True, sharex=False)
+
+        expected_ax1 = np.arange(4.5, 10, 0.5)
+        expected_ax2 = np.arange(-0.5, 5, 0.5)
+
+        tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1)
+        tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2)
+

 def _generate_4_axes_via_gridspec():
     import matplotlib.pyplot as plt

diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 111c3a70fc09c..2c4c8aa7461a3 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -897,3 +897,15 @@ def test_plot_accessor_updates_on_inplace(self):
         _, ax = self.plt.subplots()
         after = ax.xaxis.get_ticklocs()
         tm.assert_numpy_array_equal(before, after)
+
+    @pytest.mark.parametrize("kind", ["line", "area"])
+    def test_plot_xlim_for_series(self, kind):
+        # test if xlim is also correctly plotted in Series for line and area
+        # GH 27686
+        s = Series([2, 3])
+        _, ax = self.plt.subplots()
+        s.plot(kind=kind, ax=ax)
+        xlims = ax.get_xlim()
+
+        assert xlims[0] < 0
+        assert xlims[1] > 1

From e55b6980fd5c99087d0959952b22e74434e9f5e2 Mon Sep 17 00:00:00 2001
From: David Stansby
Date: Tue, 20 Aug 2019 15:26:49 +0100
Subject: [PATCH 124/191] BUG: Correctly reinstate Matplotlib converters (#27481)
---
 doc/source/whatsnew/v0.25.1.rst          |  3 +++
 pandas/plotting/_matplotlib/converter.py |  7 ++++---
 pandas/tests/plotting/test_converter.py  | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 94770a7aae676..463dcef9feab6 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -111,6 +111,9 @@ Plotting
 ^^^^^^^^

 - Added a pandas_plotting_backends entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`).
+- Fixed the reinstatement of Matplotlib datetime converters after calling
+  `pandas.plotting.deregister_matplotlib_converters()` (:issue:`27481`).
+-
 - Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`).
 -

diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py
index 15648d59c8f98..893854ab26e37 100644
--- a/pandas/plotting/_matplotlib/converter.py
+++ b/pandas/plotting/_matplotlib/converter.py
@@ -64,11 +64,12 @@ def register(explicit=True):
     pairs = get_pairs()
     for type_, cls in pairs:
-        converter = cls()
-        if type_ in units.registry:
+        # Cache previous converter if present
+        if type_ in units.registry and not isinstance(units.registry[type_], cls):
             previous = units.registry[type_]
             _mpl_units[type_] = previous
-        units.registry[type_] = converter
+        # Replace with pandas converter
+        units.registry[type_] = cls()


 def deregister():

diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index 35d12706f0590..7001264c41c05 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -40,6 +40,21 @@ def test_initial_warning():
     assert "Using an implicitly" in out


+def test_registry_mpl_resets():
+    # Check that Matplotlib converters are properly reset (see issue #27481)
+    code = (
+        "import matplotlib.units as units; "
+        "import matplotlib.dates as mdates; "
+        "n_conv = len(units.registry); "
+        "import pandas as pd; "
+        "pd.plotting.register_matplotlib_converters(); "
+        "pd.plotting.deregister_matplotlib_converters(); "
+        "assert len(units.registry) == n_conv"
+    )
+    call = [sys.executable, "-c", code]
+    subprocess.check_output(call)
+
+
 def test_timtetonum_accepts_unicode():
     assert converter.time2num("00:01") == converter.time2num("00:01")

From 62ab439b168d972546e06d329916c6be7ddd1288 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 20 Aug 2019 10:45:05 -0700
Subject: [PATCH 125/191] BUG: rfloordiv with fill_value, closes #27464 (#28024)

* BUG: rfloordiv with fill_value, #27464, #26793
---
 doc/source/whatsnew/v0.25.1.rst         |  2 +-
 pandas/core/frame.py                    |  5 +++-
 pandas/tests/arithmetic/test_numeric.py | 33 +++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 463dcef9feab6..108ddb1cdeab5 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -54,7 +54,7 @@ Numeric
 - Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`)
 - Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`)
 - Bug where :class:`DataFrame` arithmetic operators such as :meth:`DataFrame.mul` with a :class:`Series` with axis=1 would raise an ``AttributeError`` on a :class:`DataFrame` larger than the minimum threshold to invoke numexpr (:issue:`27636`)
--
+- Bug in :class:`DataFrame` arithmetic where missing values in results were incorrectly masked with ``NaN`` instead of ``Inf`` (:issue:`27464`)

 Conversion

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 603a615c1f8cb..1be7e0736f9fe 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -108,6 +108,7 @@
     sanitize_index,
     to_arrays,
 )
+from pandas.core.ops.missing import dispatch_fill_zeros
 from pandas.core.series import Series

 from pandas.io.formats import console, format as fmt
@@ -5305,7 +5306,9 @@ def _arith_op(left, right):
             # iterate over columns
             return ops.dispatch_to_series(this, other, _arith_op)
         else:
-            result = _arith_op(this.values, other.values)
+            with np.errstate(all="ignore"):
+                result = _arith_op(this.values, other.values)
+            result = dispatch_fill_zeros(func, this.values, other.values, result)
             return self._constructor(
                 result, index=new_index, columns=new_columns, copy=False
             )

diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
index 2b23790e4ccd3..d686d9f90a5a4 100644
--- a/pandas/tests/arithmetic/test_numeric.py
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -1227,3 +1227,36 @@ def test_addsub_arithmetic(self, dtype, delta):
     tm.assert_index_equal(index + index, 2 * index)
     tm.assert_index_equal(index - index, 0 * index)
     assert not (index - index).empty
+
+
+def test_fill_value_inf_masking():
+    # GH #27464 make sure we mask 0/1 with Inf and not NaN
+    df = pd.DataFrame({"A": [0, 1, 2], "B": [1.1, None, 1.1]})
+
+    other = pd.DataFrame({"A": [1.1, 1.2, 1.3]}, index=[0, 2, 3])
+
+    result = df.rfloordiv(other, fill_value=1)
+
+    expected = pd.DataFrame(
+        {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]}
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dataframe_div_silenced():
+    # GH#26793
+    pdf1 = pd.DataFrame(
+        {
+            "A": np.arange(10),
+            "B": [np.nan, 1, 2, 3, 4] * 2,
+            "C": [np.nan] * 10,
+            "D": np.arange(10),
+        },
+        index=list("abcdefghij"),
+        columns=list("ABCD"),
+    )
+    pdf2 = pd.DataFrame(
+        np.random.randn(10, 4), index=list("abcdefghjk"), columns=list("ABCX")
+    )
+    with tm.assert_produces_warning(None):
+        pdf1.div(pdf2, fill_value=0)

From 8110b472513bd63ce2d583c1415b430be556e8a0 Mon Sep 17 00:00:00 2001
From: Ignacio Santolin
Date: Tue, 20 Aug 2019 15:40:21 -0300
Subject: [PATCH 126/191] DOC: Corrected file description in read_fwf() (#28009)

* DOC: Corrected file description in read_fwf()
---
 doc/source/user_guide/io.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 7ba103c5ff996..1d49dbdee9c03 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -28,7 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
    :delim: ;

    text;`CSV `__;:ref:`read_csv`;:ref:`to_csv`
-   text;`TXT `__;:ref:`read_fwf`
+   text;Fixed-Width Text File;:ref:`read_fwf`
    text;`JSON `__;:ref:`read_json`;:ref:`to_json`
    text;`HTML `__;:ref:`read_html`;:ref:`to_html`
    text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard`

From d2031d76f739b069c8df0a93944bab2a9d5c8abd Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Tue, 20 Aug 2019 20:02:09 +0100
Subject: [PATCH 127/191] Fix GroupBy nth Handling with Observed=False (#26419)

* Added test coverage for observed=False with ops

* Fixed issue with observed=False and nth
---
 doc/source/whatsnew/v0.25.1.rst          |  1 +
 pandas/core/groupby/groupby.py           |  6 +++++-
 pandas/tests/groupby/test_categorical.py | 15 +++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 108ddb1cdeab5..0d0c6f1c2a808 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -121,6 +121,7 @@ Groupby/resample/rolling

 - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
+- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`)
 - Bug in windowing over read-only arrays (:issue:`27766`)
 - Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`)
 -

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index c5e81e21e9fd5..d68557853db08 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1773,7 +1773,11 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
         if not self.as_index:
             return out

-        out.index = self.grouper.result_index[ids[mask]]
+        result_index = self.grouper.result_index
+        out.index = result_index[ids[mask]]
+
+        if not self.observed and isinstance(result_index, CategoricalIndex):
+            out = out.reindex(result_index)

         return out.sort_index() if self.sort else out

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index b5c2de267869d..e09af3fd48ee6 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -434,6 +434,21 @@ def test_observed_groups_with_nan(observed):
     tm.assert_dict_equal(result, expected)


+def test_observed_nth():
+    # GH 26385
+    cat = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"])
+    ser = pd.Series([1, 2, 3])
+    df = pd.DataFrame({"cat": cat, "ser": ser})
+
+    result = df.groupby("cat", observed=False)["ser"].nth(0)
+
+    index = pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"])
+    expected = pd.Series([1, np.nan, np.nan], index=index, name="ser")
+    expected.index.name = "cat"
+
+    tm.assert_series_equal(result, expected)
+
+
 def test_dataframe_categorical_with_nan(observed):
     # GH 21151
     s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])

From ec3d78618627c5374307ac88ee3adea5a603054d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 20 Aug 2019 21:24:47 +0200
Subject: [PATCH 128/191] DataFrame html repr: also follow min_rows setting (#27991)

* DataFrame html repr: also follow min_rows setting

* add whatsnew
---
 doc/source/whatsnew/v0.25.1.rst        | 1 +
 pandas/core/frame.py                   | 7 +++++--
 pandas/tests/io/formats/test_format.py | 7 +++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 0d0c6f1c2a808..86d6db01c10c2 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -105,6 +105,7 @@ I/O
 ^^^
 - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
 - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
+- Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the html repr in the notebook (:issue:`27991`).
 -

 Plotting

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1be7e0736f9fe..97567192aa17a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -670,15 +670,18 @@ def _repr_html_(self):

         if get_option("display.notebook_repr_html"):
             max_rows = get_option("display.max_rows")
+            min_rows = get_option("display.min_rows")
             max_cols = get_option("display.max_columns")
             show_dimensions = get_option("display.show_dimensions")

-            return self.to_html(
+            formatter = fmt.DataFrameFormatter(
+                self,
                 max_rows=max_rows,
+                min_rows=min_rows,
                 max_cols=max_cols,
                 show_dimensions=show_dimensions,
-                notebook=True,
             )
+            return formatter.to_html(notebook=True)
         else:
             return None

diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index a048e3bb867bd..c0451a0672c89 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -471,28 +471,35 @@ def test_repr_min_rows(self):

         # default setting no truncation even if above min_rows
         assert ".." not in repr(df)
+        assert ".." not in df._repr_html_()

         df = pd.DataFrame({"a": range(61)})

         # default of max_rows 60 triggers truncation if above
         assert ".." in repr(df)
+        assert ".." in df._repr_html_()

         with option_context("display.max_rows", 10, "display.min_rows", 4):
             # truncated after first two rows
             assert ".." in repr(df)
             assert "2 " not in repr(df)
+            assert "..." in df._repr_html_()
+            assert "2" not in df._repr_html_()

         with option_context("display.max_rows", 12, "display.min_rows", None):
             # when set to None, follow value of max_rows
             assert "5 5" in repr(df)
+            assert "5" in df._repr_html_()

         with option_context("display.max_rows", 10, "display.min_rows", 12):
             # when set value higher as max_rows, use the minimum
             assert "5 5" not in repr(df)
+            assert "5" not in df._repr_html_()

         with option_context("display.max_rows", None, "display.min_rows", 12):
             # max_rows of None -> never truncate
             assert ".." not in repr(df)
+            assert ".." not in df._repr_html_()

     def test_str_max_colwidth(self):
         # GH 7856

From b7aacb196e063136872e25045aad911c73947897 Mon Sep 17 00:00:00 2001
From: Sparkle Russell-Puleri
Date: Tue, 20 Aug 2019 23:28:32 -0700
Subject: [PATCH 129/191] DOC: Add punctuation to IntervalArray docstrings (#28043)
---
 pandas/core/arrays/interval.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 9cb2721b33634..7a14d6f1b619a 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -361,7 +361,7 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
     _interval_shared_docs[
         "from_tuples"
     ] = """
-    Construct an %(klass)s from an array-like of tuples
+    Construct an %(klass)s from an array-like of tuples.

     Parameters
     ----------
@@ -854,7 +854,7 @@ def _format_space(self):
     def left(self):
         """
         Return the left endpoints of each Interval in the IntervalArray as
-        an Index
+        an Index.
         """
         return self._left
@@ -862,7 +862,7 @@ def left(self):
     def right(self):
         """
         Return the right endpoints of each Interval in the IntervalArray as
-        an Index
+        an Index.
         """
         return self._right
@@ -870,7 +870,7 @@ def right(self):
     def closed(self):
         """
         Whether the intervals are closed on the left-side, right-side, both or
-        neither
+        neither.
         """
         return self._closed
@@ -878,7 +878,7 @@ def closed(self):
     _interval_shared_docs[
         "set_closed"
     ] = """
     Return an %(klass)s identical to the current one, but closed on the
-    specified side
+    specified side.

    .. versionadded:: 0.24.0
@@ -917,7 +917,7 @@ def set_closed(self, closed):
     def length(self):
         """
         Return an Index with entries denoting the length of each Interval in
-        the IntervalArray
+        the IntervalArray.
         """
         try:
             return self.right - self.left
@@ -945,7 +945,7 @@ def mid(self):
     ] = """
     Return True if the %(klass)s is non-overlapping (no Intervals share
     points) and is either monotonic increasing or monotonic decreasing,
-    else False
+    else False.
     """
     # https://github.com/python/mypy/issues/1362
     # Mypy does not support decorated properties
@@ -995,7 +995,7 @@ def __array__(self, dtype=None):
     _interval_shared_docs[
         "to_tuples"
     ] = """
-    Return an %(return_type)s of tuples of the form (left, right)
+    Return an %(return_type)s of tuples of the form (left, right).

     Parameters
     ----------

From a429aaf46c744f5b38e9ec613108e0d107c86625 Mon Sep 17 00:00:00 2001
From: Eunseop Jeong
Date: Wed, 21 Aug 2019 15:43:03 +0900
Subject: [PATCH 130/191] DOC: Change document code prun in a row (#28029)
---
 doc/source/user_guide/enhancingperf.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index b77bfb9778837..a4eefadd54d8c 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -243,9 +243,9 @@ We've gotten another big improvement. Let's check again where the time is
 spent:

 .. ipython:: python

-   %prun -l 4 apply_integrate_f(df['a'].to_numpy(),
-                                df['b'].to_numpy(),
-                                df['N'].to_numpy())
+   %%prun -l 4 apply_integrate_f(df['a'].to_numpy(),
+                                 df['b'].to_numpy(),
+                                 df['N'].to_numpy())

 As one might expect, the majority of the time is now spent in
 ``apply_integrate_f``, so if we wanted to make anymore efficiencies we must
 continue to concentrate our

From e4c4b78f1f0efd71b322172355a8bc30a127f77f Mon Sep 17 00:00:00 2001
From: Mak Sze Chun
Date: Wed, 21 Aug 2019 22:21:44 +0800
Subject: [PATCH 131/191] CI: Add pip dependence explicitly (#28008)
---
 ci/deps/azure-macos-35.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml
index cb2ac08cbf758..39315b15a018b 100644
--- a/ci/deps/azure-macos-35.yaml
+++ b/ci/deps/azure-macos-35.yaml
@@ -22,6 +22,7 @@ dependencies:
   - xlrd
   - xlsxwriter
   - xlwt
+  - pip
   - pip:
     - pyreadstat
   # universal

From 32b47102aa95da5a68ef9939c0e9429269ced016 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Wed, 21 Aug 2019 17:18:09 +0200
Subject: [PATCH 132/191] BUG: Correct the previous bug fixing on xlim for plotting (#28059)

* Restore change in datetime like
---
 pandas/plotting/_matplotlib/core.py        | 11 ++++++++--
 pandas/plotting/_matplotlib/tools.py       | 21 +++++++++++++++++++
 pandas/tests/plotting/test_datetimelike.py | 24 +++++++---------------
 3 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index fbca57206e163..6ff3f28440303 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -33,6 +33,8 @@
 from pandas.plotting._matplotlib.style import _get_standard_colors
 from pandas.plotting._matplotlib.tools import (
     _flatten,
+    _get_all_lines,
+    _get_xlim,
     _handle_shared_axes,
     _subplots,
     format_date_labels,
@@ -1099,8 +1101,13 @@ def _make_plot(self):
             )
             self._add_legend_handle(newlines[0], label, index=i)

-            # GH27686 set_xlim will truncate xaxis to fixed space
-            ax.relim()
+            if self._is_ts_plot():
+
+                # reset of xlim should be used for ts data
+                # TODO: GH28021, should find a way to change view limit on xaxis
+                lines = _get_all_lines(ax)
+                left, right = _get_xlim(lines)
+                ax.set_xlim(left, right)

     @classmethod
     def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds):

diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py
index fd2913ca51ac3..67fa79ad5da8c 100644
--- a/pandas/plotting/_matplotlib/tools.py
+++ b/pandas/plotting/_matplotlib/tools.py
@@ -356,3 +356,24 @@ def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None):
         if yrot is not None:
             plt.setp(ax.get_yticklabels(), rotation=yrot)
     return axes
+
+
+def _get_all_lines(ax):
+    lines = ax.get_lines()
+
+    if hasattr(ax, "right_ax"):
+        lines += ax.right_ax.get_lines()
+
+    if hasattr(ax, "left_ax"):
+        lines += ax.left_ax.get_lines()
+
+    return lines
+
+
+def _get_xlim(lines):
+    left, right = np.inf, -np.inf
+    for l in lines:
+        x = l.get_xdata(orig=False)
+        left = min(np.nanmin(x), left)
+        right = max(np.nanmax(x), right)
+    return left, right

diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index be87929b4545a..e2b7f2819f957 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -419,8 +419,6 @@ def test_get_finder(self):
         assert conv.get_finder("A") == conv._annual_finder
         assert conv.get_finder("W") == conv._daily_finder

-    # TODO: The finder should be retested due to wrong xlim values on x-axis
-    @pytest.mark.xfail(reason="TODO: check details in GH28021")
     @pytest.mark.slow
     def test_finder_daily(self):
         day_lst = [10, 40, 252, 400, 950, 2750, 10000]
@@ -444,8 +442,6 @@ def test_finder_daily(self):
         assert rs1 == xpl1
         assert rs2 == xpl2

-    # TODO: The finder should be retested due to wrong xlim values on x-axis
-    @pytest.mark.xfail(reason="TODO: check details in GH28021")
     @pytest.mark.slow
     def test_finder_quarterly(self):
         yrs = [3.5, 11]
@@ -469,8 +465,6 @@ def test_finder_quarterly(self):
         assert rs1 == xpl1
         assert rs2 == xpl2

-    # TODO: The finder should be retested due to wrong xlim values on x-axis
-    @pytest.mark.xfail(reason="TODO: check details in GH28021")
     @pytest.mark.slow
     def test_finder_monthly(self):
         yrs = [1.15, 2.5, 4, 11]
@@ -504,8 +498,6 @@ def test_finder_monthly_long(self):
         xp = Period("1989Q1", "M").ordinal
         assert rs == xp

-    # TODO: The finder should be retested due to wrong xlim values on x-axis
-    @pytest.mark.xfail(reason="TODO: check details in GH28021")
     @pytest.mark.slow
     def test_finder_annual(self):
         xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170]
@@ -530,7 +522,7 @@ def test_finder_minutely(self):
         _, ax = self.plt.subplots()
         ser.plot(ax=ax)
         xaxis = ax.get_xaxis()
-        rs = xaxis.get_majorticklocs()[1]
+        rs = xaxis.get_majorticklocs()[0]
         xp = Period("1/1/1999", freq="Min").ordinal
         assert rs == xp
@@ -542,7 +534,7 @@ def test_finder_hourly(self):
         _, ax = self.plt.subplots()
         ser.plot(ax=ax)
         xaxis = ax.get_xaxis()
-        rs = xaxis.get_majorticklocs()[1]
+        rs = xaxis.get_majorticklocs()[0]
         xp = Period("1/1/1999", freq="H").ordinal
         assert rs == xp
@@ -1418,9 +1410,7 @@ def test_plot_outofbounds_datetime(self):

     def test_format_timedelta_ticks_narrow(self):

-        expected_labels = [
-            "00:00:00.0000000{:0>2d}".format(i) for i in np.arange(0, 10, 2)
-        ]
+        expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in np.arange(10)]

         rng = timedelta_range("0", periods=10, freq="ns")
         df = DataFrame(np.random.randn(len(rng), 3), rng)
@@ -1430,8 +1420,8 @@ def test_format_timedelta_ticks_narrow(self):
ax.get_xticklabels() result_labels = [x.get_text() for x in labels] - assert (len(result_labels) - 2) == len(expected_labels) - assert result_labels[1:-1] == expected_labels + assert len(result_labels) == len(expected_labels) + assert result_labels == expected_labels def test_format_timedelta_ticks_wide(self): expected_labels = [ @@ -1454,8 +1444,8 @@ def test_format_timedelta_ticks_wide(self): labels = ax.get_xticklabels() result_labels = [x.get_text() for x in labels] - assert (len(result_labels) - 2) == len(expected_labels) - assert result_labels[1:-1] == expected_labels + assert len(result_labels) == len(expected_labels) + assert result_labels == expected_labels def test_timedelta_plot(self): # test issue #8711 From 8b3246f9f5770d60afb310883212e2788adc33a4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Aug 2019 13:32:28 -0500 Subject: [PATCH 133/191] CI: Set SHA for codecov upload (#28067) --- ci/run_tests.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index ee46da9f52eab..74c1cde325a02 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -51,8 +51,9 @@ do sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret" if [[ "$COVERAGE" && $? == 0 ]]; then + SHA=`git rev-parse HEAD` echo "uploading coverage for $TYPE tests" - echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" - bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME + echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME -C $SHA" + bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME -C `git rev-parse HEAD` fi done From cf6b09d94d648b4bc3783eb5fff8b3241600528a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Aug 2019 13:23:14 -0700 Subject: [PATCH 134/191] CI: disable codecov (#28065) * CI: disable codecov * disable --- ci/run_tests.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 74c1cde325a02..27d3fcb4cf563 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -50,10 +50,10 @@ do # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret" - if [[ "$COVERAGE" && $? == 0 ]]; then - SHA=`git rev-parse HEAD` - echo "uploading coverage for $TYPE tests" - echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME -C $SHA" - bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME -C `git rev-parse HEAD` - fi + # 2019-08-21 disabling because this is hitting HTTP 400 errors GH#27602 + # if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then + # echo "uploading coverage for $TYPE tests" + # echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" + # bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME + # fi done From 7f6ba6b353e63e004c293cfa4b7d3a23bda4cc5a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Aug 2019 15:52:02 -0500 Subject: [PATCH 135/191] DOC: Update whatsnew (#28073) --- doc/source/whatsnew/v0.25.1.rst | 107 +++++++------------------------- 1 file changed, 21 insertions(+), 86 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 86d6db01c10c2..b658a6efbd1a1 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -1,56 +1,43 @@ .. 
_whatsnew_0251: -What's new in 0.25.1 (July XX, 2019) ------------------------------------- +What's new in 0.25.1 (August 21, 2019) +-------------------------------------- -Enhancements -~~~~~~~~~~~~ - - -.. _whatsnew_0251.enhancements.other: +These are the changes in pandas 0.25.1. See :ref:`release` for a full changelog +including other versions of pandas. -Other enhancements -^^^^^^^^^^^^^^^^^^ +I/O and LZMA +~~~~~~~~~~~~ -- -- -- +Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue: `27575`). +Pandas will now warn, rather than raising an `ImportError` if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`. +A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python. +For example, on MacOS installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. .. _whatsnew_0251.bug_fixes: Bug fixes ~~~~~~~~~ - Categorical ^^^^^^^^^^^ -- Bug in :meth:`Categorical.fillna` would replace all values, not just those that are ``NaN`` (:issue:`26215`) -- +- Bug in :meth:`Categorical.fillna` that would replace all values, not just those that are ``NaN`` (:issue:`26215`) Datetimelike ^^^^^^^^^^^^ + - Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`) - Bug in :meth:`Period.to_timestamp` where a :class:`Period` outside the :class:`Timestamp` implementation bounds (roughly 1677-09-21 to 2262-04-11) would return an incorrect :class:`Timestamp` instead of raising ``OutOfBoundsDatetime`` (:issue:`19643`) -- -- - -Timedelta -^^^^^^^^^ - -- -- -- Timezones ^^^^^^^^^ - Bug in :class:`Index` where a numpy object array with a timezone aware :class:`Timestamp` and ``np.nan`` would not return a :class:`DatetimeIndex` (:issue:`27011`) -- -- Numeric ^^^^^^^ + - Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`) - Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`) - Bug where :class:`DataFrame` arithmetic operators such as :meth:`DataFrame.mul` with a :class:`Series` with axis=1 would raise an ``AttributeError`` on :class:`DataFrame` larger than the minimum threshold to invoke numexpr (:issue:`27636`) @@ -60,23 +47,11 @@ Conversion ^^^^^^^^^^ - Improved the warnings for the deprecated methods :meth:`Series.real` and :meth:`Series.imag` (:issue:`27610`) -- -- - -Strings -^^^^^^^ - -- -- -- - Interval ^^^^^^^^ + - Bug in :class:`IntervalIndex` where `dir(obj)` would raise ``ValueError`` (:issue:`27571`) -- -- -- Indexing ^^^^^^^^ @@ -85,38 +60,26 @@ Indexing - Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`) - Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`). - Fix regression in ``.ix`` fallback with an ``IntervalIndex`` (:issue:`27865`). 
-- Missing ^^^^^^^ -- Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type e.g. `type(pandas.Series())` (:issue:`27482`) -- -- - -MultiIndex -^^^^^^^^^^ - -- -- -- +- Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type e.g. ``type(pandas.Series())`` (:issue:`27482`) I/O ^^^ + - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) -- Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the html repr in the notebook (:issue:`27991`). -- +- Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the HTML repr in the notebook (:issue:`27991`). Plotting ^^^^^^^^ -- Added a pandas_plotting_backends entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`). +- Added a ``pandas_plotting_backends`` entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`). - Fixed the re-instatement of Matplotlib datetime converters after calling - `pandas.plotting.deregister_matplotlib_converters()` (:issue:`27481`). -- + :meth:`pandas.plotting.deregister_matplotlib_converters` (:issue:`27481`). - Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`). -- Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -125,7 +88,6 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Bug in windowing over read-only arrays (:issue:`27766`) - Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`) -- Reshaping ^^^^^^^^^ @@ -137,40 +99,13 @@ Reshaping Sparse ^^^^^^ -- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`) -- -- -- - - -Build Changes -^^^^^^^^^^^^^ - -- -- -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- -- +- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`) Other ^^^^^ -- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) -- -- -- - -I/O and LZMA -~~~~~~~~~~~~ -Some users may unknowingly have an incomplete Python installation, which lacks the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue: `27575`). -Pandas will now warn, rather than raising an `ImportError` if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`. -A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python. -For example, on MacOS installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) .. 
_whatsnew_0.251.contributors: From becb7744d2de31ba8647ab340e1d2d281a5e6f49 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Aug 2019 15:53:09 -0500 Subject: [PATCH 136/191] TST: non-strict xfail for period test (#28072) This was XPASSing on the MacPython builds: https://travis-ci.org/MacPython/pandas-wheels/jobs/574706922 --- pandas/tests/scalar/period/test_period.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 6da4d556ea07e..a1de205afc0e2 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1549,7 +1549,11 @@ def test_period_immutable(): @pytest.mark.xfail( - PY35, reason="Parsing as Period('0007-01-01', 'D') for reasons unknown", strict=True + # xpassing on MacPython with strict=False + # https://travis-ci.org/MacPython/pandas-wheels/jobs/574706922 + PY35, + reason="Parsing as Period('0007-01-01', 'D') for reasons unknown", + strict=False, ) def test_small_year_parsing(): per1 = Period("0001-01-07", "D") From bdcab11e234a34be47e4395173b1893033629a5a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Aug 2019 13:54:41 -0700 Subject: [PATCH 137/191] BUG: iter with readonly values, closes #28055 (#28074) * BUG: iter with readonly values, closes #28055 * whatsnew --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/_libs/tslib.pyx | 2 +- pandas/tests/indexes/datetimes/test_misc.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index b658a6efbd1a1..eb54961309e8e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -29,6 +29,7 @@ Datetimelike - Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`) - Bug in :meth:`Period.to_timestamp` where a :class:`Period` outside the :class:`Timestamp` implementation bounds (roughly 1677-09-21 to 2262-04-11) would return an incorrect :class:`Timestamp` instead of raising ``OutOfBoundsDatetime`` (:issue:`19643`) +- Bug in iterating over :class:`DatetimeIndex` when the underlying data is read-only (:issue:`28055`) Timezones ^^^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4e49f660f5e19..01e500a80dcc4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -71,7 +71,7 @@ cdef inline object create_time_from_ts( @cython.wraparound(False) @cython.boundscheck(False) -def ints_to_pydatetime(int64_t[:] arr, object tz=None, object freq=None, +def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, str box="datetime"): """ Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 4ea32359b8d4a..ab3107a0798e5 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -377,3 +377,11 @@ def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64))) + + +def test_iter_readonly(): + # GH#28055 ints_to_pydatetime with readonly array + arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")]) + arr.setflags(write=False) + dti = pd.to_datetime(arr) + list(dti) From 8f6118c6a1547ffd39d9b89df1b8e52128b63aa0 Mon Sep 
17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Aug 2019 06:28:36 -0500 Subject: [PATCH 138/191] BUG: Fixed groupby quantile for listlike q (#27827) * BUG: Fixed groupby quantile for listlike q Closes #27526 --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/core/groupby/groupby.py | 64 ++++++++++++++++++++++----- pandas/tests/groupby/test_function.py | 51 +++++++++++++++++++++ 3 files changed, 104 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index eb54961309e8e..680d69a9862cd 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -85,6 +85,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Bug in windowing over read-only arrays (:issue:`27766`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d68557853db08..3e8d079e47326 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1874,6 +1874,7 @@ def quantile(self, q=0.5, interpolation="linear"): a 2.0 b 3.0 """ + from pandas import concat def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: if is_object_dtype(vals): @@ -1901,18 +1902,57 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return vals - return self._get_cythonized_result( - "group_quantile", - self.grouper, - aggregate=True, - needs_values=True, - needs_mask=True, - cython_dtype=np.float64, - pre_processing=pre_processor, - post_processing=post_processor, - q=q, - interpolation=interpolation, - ) + if is_scalar(q): + return self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, + interpolation=interpolation, + ) + else: + results = [ + self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=qi, + interpolation=interpolation, + ) + for qi in q + ] + result = concat(results, axis=0, keys=q) + # fix levels to place quantiles on the inside + # TODO(GH-10710): Ideally, we could write this as + # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] + # but this hits https://github.com/pandas-dev/pandas/issues/10710 + # which doesn't reorder the list-like `q` on the inner level. + order = np.roll(list(range(result.index.nlevels)), -1) + result = result.reorder_levels(order) + result = result.reindex(q, level=-1) + + # fix order. 
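+            # (illustration, assuming ngroups=2 and q=[0.25, 0.75]: rows of
+            # the concatenated frame arrive as [g0@q0, g1@q0, g0@q1, g1@q1];
+            # the indices built below, [0, 2, 1, 3], interleave them back
+            # into group-major order)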
+ hi = len(q) * self.ngroups + arr = np.arange(0, hi, self.ngroups) + arrays = [] + + for i in range(self.ngroups): + arr = arr + i + arrays.append(arr) + + indices = np.concatenate(arrays) + assert len(indices) == len(result) + return result.take(indices) @Substitution(name="groupby") def ngroup(self, ascending=True): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3794120281e1f..509d7c33b643b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1238,6 +1238,57 @@ def test_quantile(interpolation, a_vals, b_vals, q): tm.assert_frame_equal(result, expected) +def test_quantile_array(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) + result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_no_sort(): + df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) + expected = pd.DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) + expected = pd.DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = pd.DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] From a76df79498c452ace8ef21dfca8e5267169e92cb Mon Sep 17 00:00:00 2001 From: Sumanau Sareen Date: Thu, 22 Aug 2019 13:40:58 +0200 Subject: [PATCH 139/191] Preserve index when setting new column on empty dataframe. (#26471) --- pandas/core/frame.py | 2 +- pandas/tests/frame/test_indexing.py | 8 ++++++++ pandas/tests/indexing/test_partial.py | 11 +++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97567192aa17a..f2bb964f35dbd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3098,7 +3098,7 @@ def _ensure_valid_index(self, value): passed value. 
""" # GH5632, make sure that we are a Series convertible - if not len(self.index) and is_list_like(value): + if not len(self.index) and is_list_like(value) and len(value): try: value = Series(value) except (ValueError, NotImplementedError, TypeError): diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index ae14563e5952a..a78b2ab7d1c4c 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -821,6 +821,14 @@ def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): df[df > df2] = 47 assert_frame_equal(df, df2) + def test_setitem_with_empty_listlike(self): + # GH #17101 + index = pd.Index([], name="idx") + result = pd.DataFrame(columns=["A"], index=index) + result["A"] = [] + expected = pd.DataFrame(columns=["A"], index=index) + tm.assert_index_equal(result.index, expected.index) + def test_setitem_scalars_no_index(self): # GH16823 / 17894 df = DataFrame() diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 68e93f06e43dc..c4505231932c6 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -442,10 +442,10 @@ def test_partial_set_empty_frame(self): # these work as they don't really change # anything but the index # GH5632 - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) def f(): - df = DataFrame() + df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") return df @@ -469,22 +469,21 @@ def f(): expected["foo"] = expected["foo"].astype("float64") def f(): - df = DataFrame() + df = DataFrame(index=Index([], dtype="int64")) df["foo"] = [] return df tm.assert_frame_equal(f(), expected) def f(): - df = DataFrame() + df = DataFrame(index=Index([], dtype="int64")) df["foo"] = Series(np.arange(len(df)), dtype="float64") return df tm.assert_frame_equal(f(), expected) def f(): - df = DataFrame() - tm.assert_index_equal(df.index, Index([], dtype="object")) + df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) return df From b1c871ce4b5e76b3cffe1ebd4216d36379872352 Mon Sep 17 00:00:00 2001 From: Luiz Gustavo Date: Thu, 22 Aug 2019 10:07:47 -0300 Subject: [PATCH 140/191] =?UTF-8?q?BUG:=20Series.rename=20raises=20error?= =?UTF-8?q?=20on=20values=20accepted=20by=20Series=20construc=E2=80=A6=20(?= =?UTF-8?q?#27814)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * BUG: Series.rename raises error on values accepted by Series constructor. --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/core/series.py | 8 +++----- pandas/tests/series/test_alter_axes.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 680d69a9862cd..b307fae4fbdc1 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -108,6 +108,7 @@ Other ^^^^^ - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) +- Bug in :meth:`Series.rename` when using a custom type indexer. Now any value that isn't callable or dict-like is treated as a scalar. (:issue:`27814`) .. 
_whatsnew_0.251.contributors: diff --git a/pandas/core/series.py b/pandas/core/series.py index 3f04970ee4e58..8b6c963e40e9d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4165,12 +4165,10 @@ def rename(self, index=None, **kwargs): """ kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - non_mapping = is_scalar(index) or ( - is_list_like(index) and not is_dict_like(index) - ) - if non_mapping: + if callable(index) or is_dict_like(index): + return super().rename(index=index, **kwargs) + else: return self._set_name(index, inplace=kwargs.get("inplace")) - return super().rename(index=index, **kwargs) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 0a25d6ba203cb..5d74ad95be90d 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -267,6 +267,25 @@ def test_rename_axis_none(self, kwargs): expected = Series([1, 2, 3], index=expected_index) tm.assert_series_equal(result, expected) + def test_rename_with_custom_indexer(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + s = Series([1, 2, 3]).rename(ix) + assert s.name is ix + + def test_rename_with_custom_indexer_inplace(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + s = Series([1, 2, 3]) + s.rename(ix, inplace=True) + assert s.name is ix + def test_set_axis_inplace_axes(self, axis_series): # GH14636 ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") From def01cf7bbb5ef8c9bf2e19737ea918e6a76a143 Mon Sep 17 00:00:00 2001 From: ianzur <33916505+ianzur@users.noreply.github.com> Date: Thu, 22 Aug 2019 08:09:48 -0500 Subject: [PATCH 141/191] BUG: timedelta merge asof with tolerance (#27650) * issue #27642 - timedelta merge asof with tolerance --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/core/reshape/merge.py | 3 +- pandas/tests/reshape/merge/test_merge_asof.py | 55 +++++++++++++++++-- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index b307fae4fbdc1..63dd56f4a3793 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -95,6 +95,7 @@ Reshaping ^^^^^^^^^ - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`) +- Bug :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing `tolerance` kwarg (:issue:`27642`) - Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. 
(:issue:`27500`) - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`) - Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f45c7693bf6ed..225de3f11cf7d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -22,7 +22,6 @@ is_bool, is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, @@ -1635,7 +1634,7 @@ def _get_merge_keys(self): ) ) - if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): + if is_datetimelike(lt): if not isinstance(self.tolerance, Timedelta): raise MergeError(msg) if self.tolerance < Timedelta(0): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 6b66386bafc5e..7412b1de643a1 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np import pytest import pytz @@ -588,14 +590,23 @@ def test_non_sorted(self): # ok, though has dupes merge_asof(trades, self.quotes, on="time", by="ticker") - def test_tolerance(self): + @pytest.mark.parametrize( + "tolerance", + [ + Timedelta("1day"), + pytest.param( + datetime.timedelta(days=1), + marks=pytest.mark.xfail(reason="not implemented", strict=True), + ), + ], + ids=["pd.Timedelta", "datetime.timedelta"], + ) + def test_tolerance(self, tolerance): trades = self.trades quotes = self.quotes - result = merge_asof( - trades, quotes, on="time", by="ticker", tolerance=Timedelta("1day") - ) + result = merge_asof(trades, quotes, on="time", by="ticker", tolerance=tolerance) expected = self.tolerance assert_frame_equal(result, expected) @@ -1246,3 +1257,39 @@ def test_by_mixed_tz_aware(self): ) expected["value_y"] = np.array([np.nan], dtype=object) assert_frame_equal(result, expected) + + def test_timedelta_tolerance_nearest(self): + # GH 27642 + + left = pd.DataFrame( + list(zip([0, 5, 10, 15, 20, 25], [0, 1, 2, 3, 4, 5])), + columns=["time", "left"], + ) + + left["time"] = pd.to_timedelta(left["time"], "ms") + + right = pd.DataFrame( + list(zip([0, 3, 9, 12, 15, 18], [0, 1, 2, 3, 4, 5])), + columns=["time", "right"], + ) + + right["time"] = pd.to_timedelta(right["time"], "ms") + + expected = pd.DataFrame( + list( + zip( + [0, 5, 10, 15, 20, 25], + [0, 1, 2, 3, 4, 5], + [0, np.nan, 2, 4, np.nan, np.nan], + ) + ), + columns=["time", "left", "right"], + ) + + expected["time"] = pd.to_timedelta(expected["time"], "ms") + + result = pd.merge_asof( + left, right, on="time", tolerance=Timedelta("1ms"), direction="nearest" + ) + + assert_frame_equal(result, expected) From 888d1fae80a975147e3f99f9254bf1dbca3affd3 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 23 Aug 2019 01:06:03 -0700 Subject: [PATCH 142/191] DOC: update GroupBy.head()/tail() documentation (#27844) --- pandas/core/groupby/groupby.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3e8d079e47326..3eeecd9c149e1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2370,8 +2370,9 @@ def head(self, n=5): """ Return first n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.head(n))``, - except ignores as_index flag. 
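
A minimal doctest-style check of the "``as_index`` flag is ignored"
behaviour being documented here (a sketch, assuming pandas is imported
as ``pd``):

    >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    >>> df.groupby('A', as_index=False).head(1).equals(df.groupby('A').head(1))
    True

Both calls return rows 0 and 2 with their original index and order.
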
+ Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2382,10 +2383,6 @@ def head(self, n=5): >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) - >>> df.groupby('A', as_index=False).head(1) - A B - 0 1 2 - 2 5 6 >>> df.groupby('A').head(1) A B 0 1 2 @@ -2401,8 +2398,9 @@ def tail(self, n=5): """ Return last n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.tail(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2417,10 +2415,6 @@ def tail(self, n=5): A B 1 a 2 3 b 2 - >>> df.groupby('A').head(1) - A B - 0 a 1 - 2 b 1 """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n From d5ba4c14c62c1a23f53773c4e3ecb3bd9a792a91 Mon Sep 17 00:00:00 2001 From: Wuraola Oyewusi Date: Fri, 23 Aug 2019 10:01:28 +0100 Subject: [PATCH 143/191] DOC: Remove alias for numpy.random.randn from the docs (#28082) --- doc/source/conf.py | 1 - doc/source/whatsnew/v0.10.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 3ebc5d8b6333b..a4b7d97c2cf5e 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -315,7 +315,6 @@ import numpy as np import pandas as pd - randn = np.random.randn np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 59ea6b9776232..2e0442364b2f3 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -498,7 +498,7 @@ Here is a taste of what to expect. .. code-block:: ipython - In [58]: p4d = Panel4D(randn(2, 2, 5, 4), + In [58]: p4d = Panel4D(np.random.randn(2, 2, 5, 4), ....: labels=['Label1','Label2'], ....: items=['Item1', 'Item2'], ....: major_axis=date_range('1/1/2000', periods=5), From c7ceff98395b13aded759a6ac8d1fbe49fc9113c Mon Sep 17 00:00:00 2001 From: "Martina G. Vilas" Date: Fri, 23 Aug 2019 11:03:00 +0200 Subject: [PATCH 144/191] DOC: Fix docstrings lack of punctuation (#28031) --- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/datetimes.py | 4 ++-- pandas/core/arrays/period.py | 2 +- pandas/core/indexes/datetimes.py | 12 ++++++------ pandas/core/indexes/multi.py | 6 +++--- pandas/core/indexes/timedeltas.py | 22 +++++++++++----------- pandas/core/indexing.py | 2 +- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5c121172d0e4f..0778b6726d104 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -514,7 +514,7 @@ def fillna(self, value=None, method=None, limit=None): def dropna(self): """ - Return ExtensionArray without NA values + Return ExtensionArray without NA values. Returns ------- @@ -957,7 +957,7 @@ def _concat_same_type( cls, to_concat: Sequence[ABCExtensionArray] ) -> ABCExtensionArray: """ - Concatenate multiple array + Concatenate multiple array. 
Parameters ---------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 093334a815938..70df708d36b3b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1158,7 +1158,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): def to_pydatetime(self): """ Return Datetime Array/Index as object ndarray of datetime.datetime - objects + objects. Returns ------- @@ -1283,7 +1283,7 @@ def to_perioddelta(self, freq): """ Calculate TimedeltaArray of difference between index values and index converted to PeriodArray at specified - freq. Used for vectorized offsets + freq. Used for vectorized offsets. Parameters ---------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 20ce11c70c344..f2d74794eadf5 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -426,7 +426,7 @@ def __array__(self, dtype=None): @property def is_leap_year(self): """ - Logical indicating if the date belongs to a leap year + Logical indicating if the date belongs to a leap year. """ return isleapyear_arr(np.asarray(self.year)) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 51daad3b42649..272066d476ce3 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -661,7 +661,7 @@ def _get_time_micros(self): def to_series(self, keep_tz=None, index=None, name=None): """ Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index + useful with map for returning an indexer based on an index. Parameters ---------- @@ -687,10 +687,10 @@ def to_series(self, keep_tz=None, index=None, name=None): behaviour and silence the warning. index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. Returns ------- @@ -735,7 +735,7 @@ def to_series(self, keep_tz=None, index=None, name=None): def snap(self, freq="S"): """ - Snap time stamps to nearest occurring frequency + Snap time stamps to nearest occurring frequency. Returns ------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b614952ba1e04..761862b9f30e9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1250,7 +1250,7 @@ def _set_names(self, names, level=None, validate=True): self.levels[l].rename(name, inplace=True) names = property( - fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n""" + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" ) @Appender(_index_shared_docs["_get_grouper_for_level"]) @@ -1762,7 +1762,7 @@ def is_all_dates(self): def is_lexsorted(self): """ - Return True if the codes are lexicographically sorted + Return True if the codes are lexicographically sorted. Returns ------- @@ -2246,7 +2246,7 @@ def swaplevel(self, i=-2, j=-1): def reorder_levels(self, order): """ - Rearrange levels using input order. May not drop or duplicate levels + Rearrange levels using input order. May not drop or duplicate levels. 
Parameters ---------- diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index d06afa3daa792..8cf14e2ca777e 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -68,20 +68,20 @@ class TimedeltaIndex( ): """ Immutable ndarray of timedelta64 data, represented internally as int64, and - which can be boxed to timedelta objects + which can be boxed to timedelta objects. Parameters ---------- data : array-like (1-dimensional), optional - Optional timedelta-like data to construct index with + Optional timedelta-like data to construct index with. unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional - which is an integer/float number - freq : string or pandas offset object, optional + Which is an integer/float number. + freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the - inferred frequency upon creation + inferred frequency upon creation. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. start : starting value, timedelta-like, optional If data is None, start is used as the start point in generating regular timedelta data. @@ -90,24 +90,24 @@ class TimedeltaIndex( periods : int, optional, > 0 Number of periods to generate, if generating index. Takes precedence - over end argument + over end argument. .. deprecated:: 0.24.0 end : end time, timedelta-like, optional If periods is none, generated index will extend to first conforming - time on or just past end argument + time on or just past end argument. .. deprecated:: 0.24. 0 - closed : string or None, default None + closed : str or None, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). .. deprecated:: 0.24. 0 name : object - Name to be stored in the index + Name to be stored in the index. Attributes ---------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7bb5e2fa3018d..b8ca3419af4d7 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -49,7 +49,7 @@ def get_indexers_list(): # the public IndexSlicerMaker class _IndexSlice: """ - Create an object to more easily perform multi-index slicing + Create an object to more easily perform multi-index slicing. See Also -------- From 9dc4d718e093ccbb15e024da6d3bad80f4e99ba6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Aug 2019 08:36:19 -0500 Subject: [PATCH 145/191] DOC: Start 0.25.2 (#28111) * DOC: Start 0.25.2 --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v0.25.2.rst | 110 ++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 doc/source/whatsnew/v0.25.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index aeab2cf5809e7..fe80cc8bb959a 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 0.25 .. toctree:: :maxdepth: 2 + v0.25.2 v0.25.1 v0.25.0 diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst new file mode 100644 index 0000000000000..76473405374e8 --- /dev/null +++ b/doc/source/whatsnew/v0.25.2.rst @@ -0,0 +1,110 @@ +.. _whatsnew_0252: + +What's new in 0.25.2 (October XX, 2019) +--------------------------------------- + +These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog +including other versions of pandas. + +.. 
_whatsnew_0252.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- + +Datetimelike +^^^^^^^^^^^^ + +- +- +- + +Timezones +^^^^^^^^^ + +- + +Numeric +^^^^^^^ + +- +- +- +- + +Conversion +^^^^^^^^^^ + +- + +Interval +^^^^^^^^ + +- + +Indexing +^^^^^^^^ + +- +- +- +- + +Missing +^^^^^^^ + +- + +I/O +^^^ + +- +- +- + +Plotting +^^^^^^^^ + +- +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- +- +- +- +- + +Reshaping +^^^^^^^^^ + +- +- +- +- +- + +Sparse +^^^^^^ + +- + +Other +^^^^^ + +- +- + +.. _whatsnew_0.252.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.25.1..HEAD From 347ad8564ec7dbf679f61e88f6914ab20d7ae3da Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 23 Aug 2019 11:11:49 -0700 Subject: [PATCH 146/191] TST: fix compression tests when run without virtualenv/condaenv (#28051) --- pandas/tests/io/test_compression.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 16ca1109f266c..d68b6a1effaa0 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,6 +1,7 @@ import contextlib import os import subprocess +import sys import textwrap import warnings @@ -139,7 +140,7 @@ def test_with_missing_lzma(): import pandas """ ) - subprocess.check_output(["python", "-c", code]) + subprocess.check_output([sys.executable, "-c", code]) def test_with_missing_lzma_runtime(): @@ -156,4 +157,4 @@ def test_with_missing_lzma_runtime(): df.to_csv('foo.csv', compression='xz') """ ) - subprocess.check_output(["python", "-c", code]) + subprocess.check_output([sys.executable, "-c", code]) From e2483c022d58d0871cf2d961b9636bbf7d81917c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 23 Aug 2019 23:36:58 +0100 Subject: [PATCH 147/191] TYPING: more type hints for io.formats.printing (#27765) --- pandas/io/formats/printing.py | 40 ++++++++++++++--------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 4ec9094ce4abe..ead51693da791 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -3,12 +3,14 @@ """ import sys -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +EscapeChars = Union[Dict[str, str], Iterable[str]] + def adjoin(space: int, *lists: List[str], **kwargs) -> str: """ @@ -148,19 +150,16 @@ def _pprint_dict( def pprint_thing( - thing, + thing: Any, _nest_lvl: int = 0, - escape_chars: Optional[Union[Dict[str, str], Iterable[str]]] = None, + escape_chars: Optional[EscapeChars] = None, default_escapes: bool = False, quote_strings: bool = False, max_seq_items: Optional[int] = None, ) -> str: """ This function is the sanctioned way of converting objects - to a unicode representation. - - properly handles nested sequences containing unicode strings - (unicode(object) does not) + to a string representation and properly handles nested sequences. Parameters ---------- @@ -178,21 +177,13 @@ def pprint_thing( Returns ------- - result - unicode str + str """ - def as_escaped_unicode(thing, escape_chars=escape_chars): - # Unicode is fine, else we try to decode using utf-8 and 'replace' - # if that's not it either, we have no way of knowing and the user - # should deal with it himself. 
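
(On Python 3 this except branch was never useful: ``str()`` does not
decode, so a ``UnicodeDecodeError`` could only come from a custom
``__str__``, and the handler would then fail anyway because ``str``
objects have no ``.decode`` method. A quick sketch showing that
``str()`` of bytes simply returns the repr rather than raising:

    >>> str(b"caf\xc3\xa9")
    "b'caf\\xc3\\xa9'"

so folding everything into a single ``as_escaped_string`` is safe.)
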
- - try: - result = str(thing) # we should try this first - except UnicodeDecodeError: - # either utf-8 or we replace errors - result = str(thing).decode("utf-8", "replace") - + def as_escaped_string( + thing: Any, escape_chars: Optional[EscapeChars] = escape_chars + ) -> str: translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): if default_escapes: @@ -202,10 +193,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): escape_chars = list(escape_chars.keys()) else: escape_chars = escape_chars or tuple() + + result = str(thing) for c in escape_chars: result = result.replace(c, translate[c]) - - return str(result) + return result if hasattr(thing, "__next__"): return str(thing) @@ -224,11 +216,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): max_seq_items=max_seq_items, ) elif isinstance(thing, str) and quote_strings: - result = "'{thing}'".format(thing=as_escaped_unicode(thing)) + result = "'{thing}'".format(thing=as_escaped_string(thing)) else: - result = as_escaped_unicode(thing) + result = as_escaped_string(thing) - return str(result) # always unicode + return result def pprint_thing_encoded( From d75ee703efc0d201af2f05bd166b0f58ec5977b5 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 24 Aug 2019 00:38:17 +0200 Subject: [PATCH 148/191] Remove Encoding of values in char** For Labels (#27618) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/_libs/src/ujson/lib/ultrajson.h | 7 - pandas/_libs/src/ujson/lib/ultrajsonenc.c | 6 + pandas/_libs/src/ujson/python/objToJSON.c | 234 +++++++++++++++------- pandas/tests/io/json/test_pandas.py | 134 ++++++++----- 5 files changed, 250 insertions(+), 132 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4decc99087a9e..8e25857e5ad69 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -159,6 +159,7 @@ I/O ^^^ - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) +- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) - Plotting diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 0470fef450dde..ee6e7081bf00e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,11 +307,4 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - -void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); - #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 2d6c823a45515..d5b379bee585b 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,6 +714,12 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } +#define Buffer_Reserve(__enc, __len) \ + if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, diff 
--git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 926440218b5d9..de336fb3aa1dc 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -48,13 +48,13 @@ Numeric decoder derived from from TCL library #include <../../../tslibs/src/datetime/np_datetime_strings.h> #include "datetime.h" -#define NPY_JSON_BUFSIZE 32768 - static PyTypeObject *type_decimal; static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; +PyObject *cls_timestamp; +PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -166,6 +166,8 @@ void *initObjToJSON(void) cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); + cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); + cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -787,30 +789,23 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, - npy_intp idx, char **labels) { - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - PRINTMARK(); - *outLen = strlen(labels[idx]); - Buffer_Reserve(enc, *outLen); - memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); - enc->offset += *outLen; - *outLen = 0; -} - char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); + char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + + return cStr; } //============================================================================= @@ -852,19 +847,22 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, @@ -872,16 +870,19 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1578,16 +1579,30 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } -char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, +/* + * Function: NpyArr_encodeLabels + * ----------------------------- + * + * Builds an array of "encoded" labels. + * + * labels: PyArrayObject pointer for labels to be "encoded" + * num : number of labels + * + * "encode" is quoted above because we aren't really doing encoding + * For historical reasons this function would actually encode the entire + * array into a separate buffer with a separate call to JSON_Encode + * and would leave it to complex pointer manipulation from there to + * unpack values as needed. To make things simpler and more idiomatic + * this has instead just stringified any input save for datetime values, + * which may need to be represented in various formats. + */ +char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. 
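     // (it ends with Py_DECREF(labels), so callers must not release the
     // labels array again themselves)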
- PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; - npy_intp i, stride, len, need_quotes; + npy_intp i, stride, len; char **ret; - char *dataptr, *cLabel, *origend, *origst, *origoffset; - char labelBuffer[NPY_JSON_BUFSIZE]; - PyArray_GetItemFunc *getitem; + char *dataptr, *cLabel; int type_num; PRINTMARK(); @@ -1614,68 +1629,136 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret[i] = NULL; } - origst = enc->start; - origend = enc->end; - origoffset = enc->offset; - stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) - { - item = (PyObject *)labels; - pyenc->npyType = type_num; - pyenc->npyValue = dataptr; - } else { - item = getitem(dataptr, labels); - if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } - - cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); - - if (item != (PyObject *)labels) { - Py_DECREF(item); - } - - if (PyErr_Occurred() || enc->errorMsg) { + item = PyArray_GETITEM(labels, dataptr); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats + if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } + else if (PyTypeNum_ISDATETIME(type_num) || + PyDateTime_Check(item) || PyDate_Check(item)) { + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = + total_seconds(ts) * 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%" NPY_INT64_FMT, value); + len = strlen(cLabel); + } + } else { // Fallack to string representation + PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); + } + + Py_DECREF(item); + // Add 1 to include NULL 
terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); + + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; break; } - need_quotes = ((*cLabel) != '"'); - len = enc->offset - cLabel + 1 + 2 * need_quotes; - ret[i] = PyObject_Malloc(sizeof(char) * len); - if (!ret[i]) { PyErr_NoMemory(); ret = 0; break; } - if (need_quotes) { - ret[i][0] = '"'; - memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4)); - ret[i][len - 3] = '"'; - } else { - memcpy(ret[i], cLabel, sizeof(char) * (len - 2)); - } - ret[i][len - 2] = ':'; - ret[i][len - 1] = '\0'; dataptr += stride; } - enc->start = origst; - enc->end = origend; - enc->offset = origoffset; - Py_DECREF(labels); return ret; } @@ -1972,7 +2055,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2075,7 +2158,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyObject_Size(tmpObj); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2098,7 +2181,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->rowLabelsLen = PyObject_Size(tmpObj); pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); + enc, pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -2117,7 +2200,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyObject_Size(tmpObj); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2429,7 +2512,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PRINTMARK(); ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); PRINTMARK(); - if (PyErr_Occurred()) { PRINTMARK(); return NULL; diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa68..9842a706f43d7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1012,60 +1012,70 @@ def test_convert_dates_infer(self): result = read_json(dumps(data))[["id", infer_word]] assert_frame_equal(result, expected) - def test_date_format_frame(self): + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_frame(self, date, date_unit): df = self.tsframe.copy() - def test_w_date(date, date_unit=None): - df["date"] = Timestamp(date) - df.iloc[1, df.columns.get_loc("date")] = pd.NaT - df.iloc[5, df.columns.get_loc("date")] = pd.NaT - if date_unit: - json = df.to_json(date_format="iso", date_unit=date_unit) - else: - json = df.to_json(date_format="iso") - result = read_json(json) - expected = df.copy() - expected.index = expected.index.tz_localize("UTC") - expected["date"] = expected["date"].dt.tz_localize("UTC") - assert_frame_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", 
date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + df["date"] = Timestamp(date) + df.iloc[1, df.columns.get_loc("date")] = pd.NaT + df.iloc[5, df.columns.get_loc("date")] = pd.NaT + if date_unit: + json = df.to_json(date_format="iso", date_unit=date_unit) + else: + json = df.to_json(date_format="iso") + result = read_json(json) + expected = df.copy() + # expected.index = expected.index.tz_localize("UTC") + expected["date"] = expected["date"].dt.tz_localize("UTC") + assert_frame_equal(result, expected) + def test_date_format_frame_raises(self): + df = self.tsframe.copy() msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") - def test_date_format_series(self): - def test_w_date(date, date_unit=None): - ts = Series(Timestamp(date), index=self.ts.index) - ts.iloc[1] = pd.NaT - ts.iloc[5] = pd.NaT - if date_unit: - json = ts.to_json(date_format="iso", date_unit=date_unit) - else: - json = ts.to_json(date_format="iso") - result = read_json(json, typ="series") - expected = ts.copy() - expected.index = expected.index.tz_localize("UTC") - expected = expected.dt.tz_localize("UTC") - assert_series_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_series(self, date, date_unit): + ts = Series(Timestamp(date), index=self.ts.index) + ts.iloc[1] = pd.NaT + ts.iloc[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format="iso", date_unit=date_unit) + else: + json = ts.to_json(date_format="iso") + result = read_json(json, typ="series") + expected = ts.copy() + # expected.index = expected.index.tz_localize("UTC") + expected = expected.dt.tz_localize("UTC") + assert_series_equal(result, expected) + def test_date_format_series_raises(self): ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - def test_date_unit(self): + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_date_unit(self, unit): df = self.tsframe.copy() df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") @@ -1073,16 +1083,15 @@ def test_date_unit(self): df.iloc[2, dl] = Timestamp("21460101 20:43:42") df.iloc[4, dl] = pd.NaT - for unit in ("s", "ms", "us", "ns"): - json = df.to_json(date_format="epoch", date_unit=unit) + json = df.to_json(date_format="epoch", date_unit=unit) - # force date unit - result = read_json(json, date_unit=unit) - assert_frame_equal(result, df) + # force date unit + result = read_json(json, date_unit=unit) + assert_frame_equal(result, df) - # detect date unit - result = read_json(json, date_unit=None) - assert_frame_equal(result, df) + # detect date unit + result = read_json(json, date_unit=None) + assert_frame_equal(result, df) def test_weird_nested_json(self): # this used to core dump the parser @@ -1611,3 +1620,30 @@ def 
test_read_timezone_information(self): ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] + ) + def test_timedelta_as_label(self, date_format, key): + df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) + expected = '{{"{key}":{{"0":1}}}}'.format(key=key) + result = df.to_json(date_format=date_format) + + assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), + ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), + # TODO: the below have separate encoding procedures + # They produce JSON but not in a consistent manner + pytest.param("split", "", marks=pytest.mark.skip), + pytest.param("table", "", marks=pytest.mark.skip), + ], + ) + def test_tuple_labels(self, orient, expected): + # GH 20500 + df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + result = df.to_json(orient=orient) + assert result == expected From 5c0da7dd4034427745038381e8e2b77ac8c59d08 Mon Sep 17 00:00:00 2001 From: steveayers124 <46000954+steveayers124@users.noreply.github.com> Date: Sat, 24 Aug 2019 04:32:54 -0500 Subject: [PATCH 149/191] DOC: Fix GL01 and GL02 errors in the docstrings (#27988) --- pandas/conftest.py | 37 +++++++++++++++++++++++++------------ pandas/io/html.py | 24 ++++++++++++++++-------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2cf7bf6a6df41..b032e14d8f7e1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -123,18 +123,22 @@ def ip(): @pytest.fixture(params=[True, False, None]) def observed(request): - """ pass in the observed keyword to groupby for [True, False] + """ + Pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which appear in the grouper [True]. [None] is supported for future compatibility if we decide to change the default (and would need to warn if this - parameter is not passed)""" + parameter is not passed). + """ return request.param @pytest.fixture(params=[True, False, None]) def ordered_fixture(request): - """Boolean 'ordered' parameter for Categorical.""" + """ + Boolean 'ordered' parameter for Categorical. + """ return request.param @@ -234,7 +238,8 @@ def cython_table_items(request): def _get_cython_table_params(ndframe, func_names_and_expected): - """combine frame, functions from SelectionMixin._cython_table + """ + Combine frame, functions from SelectionMixin._cython_table keys and expected result. Parameters @@ -242,7 +247,7 @@ def _get_cython_table_params(ndframe, func_names_and_expected): ndframe : DataFrame or Series func_names_and_expected : Sequence of two items The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value + The second item is the expected return value. Returns ------- @@ -341,7 +346,8 @@ def strict_data_files(pytestconfig): @pytest.fixture def datapath(strict_data_files): - """Get the path to a data file. + """ + Get the path to a data file. Parameters ---------- @@ -375,7 +381,9 @@ def deco(*args): @pytest.fixture def iris(datapath): - """The iris dataset as a DataFrame.""" + """ + The iris dataset as a DataFrame. 
+ """ return pd.read_csv(datapath("data", "iris.csv")) @@ -504,7 +512,8 @@ def tz_aware_fixture(request): @pytest.fixture(params=STRING_DTYPES) def string_dtype(request): - """Parametrized fixture for string dtypes. + """ + Parametrized fixture for string dtypes. * str * 'str' @@ -515,7 +524,8 @@ def string_dtype(request): @pytest.fixture(params=BYTES_DTYPES) def bytes_dtype(request): - """Parametrized fixture for bytes dtypes. + """ + Parametrized fixture for bytes dtypes. * bytes * 'bytes' @@ -525,7 +535,8 @@ def bytes_dtype(request): @pytest.fixture(params=OBJECT_DTYPES) def object_dtype(request): - """Parametrized fixture for object dtypes. + """ + Parametrized fixture for object dtypes. * object * 'object' @@ -535,7 +546,8 @@ def object_dtype(request): @pytest.fixture(params=DATETIME64_DTYPES) def datetime64_dtype(request): - """Parametrized fixture for datetime64 dtypes. + """ + Parametrized fixture for datetime64 dtypes. * 'datetime64[ns]' * 'M8[ns]' @@ -545,7 +557,8 @@ def datetime64_dtype(request): @pytest.fixture(params=TIMEDELTA64_DTYPES) def timedelta64_dtype(request): - """Parametrized fixture for timedelta64 dtypes. + """ + Parametrized fixture for timedelta64 dtypes. * 'timedelta64[ns]' * 'm8[ns]' diff --git a/pandas/io/html.py b/pandas/io/html.py index 9d2647f226f00..490c574463b9b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1,4 +1,5 @@ -""":mod:`pandas.io.html` is a module containing functionality for dealing with +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. """ @@ -58,7 +59,8 @@ def _importers(): def _remove_whitespace(s, regex=_RE_WHITESPACE): - """Replace extra whitespace inside of a string with a single space. + """ + Replace extra whitespace inside of a string with a single space. Parameters ---------- @@ -77,7 +79,8 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): def _get_skiprows(skiprows): - """Get an iterator given an integer, slice or container. + """ + Get an iterator given an integer, slice or container. Parameters ---------- @@ -107,7 +110,8 @@ def _get_skiprows(skiprows): def _read(obj): - """Try to read from a url, file or string. + """ + Try to read from a url, file or string. Parameters ---------- @@ -136,7 +140,8 @@ def _read(obj): class _HtmlFrameParser: - """Base class for parsers that parse HTML into DataFrames. + """ + Base class for parsers that parse HTML into DataFrames. Parameters ---------- @@ -515,7 +520,8 @@ def _handle_hidden_tables(self, tbl_list, attr_name): class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses BeautifulSoup under the hood. + """ + HTML to DataFrame parser that uses BeautifulSoup under the hood. See Also -------- @@ -622,7 +628,8 @@ def _build_xpath_expr(attrs): class _LxmlFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses lxml under the hood. + """ + HTML to DataFrame parser that uses lxml under the hood. Warning ------- @@ -937,7 +944,8 @@ def read_html( keep_default_na=True, displayed_only=True, ): - r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters ---------- From 518d8aea8f1a7053b541fc6491a50fca30e6fb08 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 25 Aug 2019 08:54:40 -0700 Subject: [PATCH 150/191] Change trys to checks (#28121) --- pandas/core/internals/blocks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e24e6e088b92a..f0ee56f403325 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2830,9 +2830,9 @@ def _replace_single( regex = regex_re or to_rep_re # try to get the pattern attribute (compiled re) or it's a string - try: + if is_re(to_replace): pattern = to_replace.pattern - except AttributeError: + else: pattern = to_replace # if the pattern is not empty and to_replace is either a string or a @@ -2853,18 +2853,18 @@ def _replace_single( if isna(value) or not isinstance(value, str): def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return value if rx.search(s) is not None else s - except TypeError: + else: return s else: # value is guaranteed to be a string here, s can be either a string # or null if it's null it gets returned def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return rx.sub(value, s) - except TypeError: + else: return s f = np.vectorize(re_replacer, otypes=[self.dtype]) From 2165a6a64d4064af2bf79d7e6889bda2b6adb86f Mon Sep 17 00:00:00 2001 From: Bryant Moscon Date: Sun, 25 Aug 2019 11:56:15 -0400 Subject: [PATCH 151/191] Remove outdated docstring that no longer applies (#28137) --- pandas/compat/pickle_compat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index bca33513b0069..87240a9f986c3 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -196,10 +196,6 @@ def load_newobj_ex(self): def load(fh, encoding=None, is_verbose=False): """load a pickle, with a provided encoding - if compat is True: - fake the old class hierarchy - if it works, then return the new type objects - Parameters ---------- fh : a filelike object From 09ab18f6dca48d4dde677ce9ed86444f8a937e32 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 25 Aug 2019 16:57:58 +0100 Subject: [PATCH 152/191] TYPING: _pytest.mark.structures.MarkDecorator -> Callable (#28134) --- pandas/util/_test_decorators.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 3de4e5d66d577..627757aaa3741 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -25,9 +25,8 @@ def test_foo(): """ from distutils.version import LooseVersion import locale -from typing import Optional +from typing import Callable, Optional -from _pytest.mark.structures import MarkDecorator import pytest from pandas.compat import is_platform_32bit, is_platform_windows @@ -103,7 +102,7 @@ def _skip_if_no_scipy(): ) -def skip_if_installed(package: str,) -> MarkDecorator: +def skip_if_installed(package: str,) -> Callable: """ Skip a test if a package is installed. @@ -117,7 +116,7 @@ def skip_if_installed(package: str,) -> MarkDecorator: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> MarkDecorator: +def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: """ Generic function to help skip tests when required packages are not present on the testing system. 
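
A minimal usage sketch for the two decorators patched above (illustrative only: the test
function names and the scipy version string are assumptions, not part of the patch). Both
helpers return a ``pytest.mark.skipif`` marker, which is why the loosened ``Callable``
return annotation is sufficient:

.. code-block:: python

    import pandas.util._test_decorators as td

    # Runs only when scipy (at least the given version) is importable;
    # otherwise pytest reports the test as skipped.
    @td.skip_if_no("scipy", min_version="0.19.0")
    def test_needs_scipy():
        ...

    # The inverse: skip when the package *is* installed, e.g. to exercise
    # code paths that must keep working without the optional dependency.
    @td.skip_if_installed("tables")
    def test_works_without_pytables():
        ...
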
From 97f9bbf6d4b8af8691fabb7014b7e5aa006e1cf2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 25 Aug 2019 09:04:59 -0700 Subject: [PATCH 153/191] Contributing Guide for Type Hints (#27050) --- doc/source/development/contributing.rst | 130 ++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b38f7767ae073..be6555b2ab936 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -699,6 +699,136 @@ You'll also need to See :ref:`contributing.warnings` for more. +.. _contributing.type_hints: + +Type Hints +---------- + +*pandas* strongly encourages the use of :pep:`484` style type hints. New development should contain type hints and pull requests to annotate existing code are accepted as well! + +Style Guidelines +~~~~~~~~~~~~~~~~ + +Types imports should follow the ``from typing import ...`` convention. So rather than + +.. code-block:: python + + import typing + + primes = [] # type: typing.List[int] + +You should write + +.. code-block:: python + + from typing import List, Optional, Union + + primes = [] # type: List[int] + +``Optional`` should be used where applicable, so instead of + +.. code-block:: python + + maybe_primes = [] # type: List[Union[int, None]] + +You should write + +.. code-block:: python + + maybe_primes = [] # type: List[Optional[int]] + +In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like + +.. code-block:: python + + class SomeClass1: + str = None + +The appropriate way to annotate this would be as follows + +.. code-block:: python + + str_type = str + + class SomeClass2: + str = None # type: str_type + +In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example + +.. code-block:: python + + from typing import cast + + from pandas.core.dtypes.common import is_number + + def cannot_infer_bad(obj: Union[str, int, float]): + + if is_number(obj): + ... + else: # Reasonably only str objects would reach this but... + obj = cast(str, obj) # Mypy complains without this! + return obj.upper() + +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable + +.. code-block:: python + + def cannot_infer_good(obj: Union[str, int, float]): + + if isinstance(obj, str): + return obj.upper() + else: + ... + +With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. + +Syntax Requirements +~~~~~~~~~~~~~~~~~~~ + +Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas: + +.. code-block:: python + + primes = [] # type: List[int] + +Whereas this is **NOT** allowed: + +.. code-block:: python + + primes: List[int] = [] # not supported in Python 3.5! 
+ +Note that function signatures can always be annotated per :pep:`3107`: + +.. code-block:: python + + def sum_of_primes(primes: List[int] = []) -> int: + ... + + +Pandas-specific Types +~~~~~~~~~~~~~~~~~~~~~ + +Commonly used types specific to *pandas* will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. + +For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module + +.. code-block:: python + + from pandas._typing import Dtype + + def as_type(dtype: Dtype) -> ...: + ... + +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. + +Validating Type Hints +~~~~~~~~~~~~~~~~~~~~~ + +*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running + +.. code-block:: shell + + mypy pandas .. _contributing.ci: From 5d9fd7e3b226b68e695d87121f584202aa6d4abc Mon Sep 17 00:00:00 2001 From: John Ward Date: Sun, 25 Aug 2019 15:11:00 -0500 Subject: [PATCH 154/191] DOC: Fixes to docstrings formatting (#28096) --- pandas/core/generic.py | 2 +- pandas/io/clipboards.py | 9 ++-- pandas/io/excel/_base.py | 4 +- pandas/io/pytables.py | 93 ++++++++++++++++++++++----------------- pandas/tseries/offsets.py | 10 +++-- 5 files changed, 68 insertions(+), 50 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba1c516b9b444..90779baea32cb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1875,7 +1875,7 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): """ - Get the 'info axis' (see Indexing for more) + Get the 'info axis' (see Indexing for more). This is index for Series, columns for DataFrame. diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index d38221d784273..76c01535a26e7 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -9,8 +9,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" - Read text from clipboard and pass to read_csv. See read_csv for the - full argument list + Read text from clipboard and pass to read_csv. Parameters ---------- @@ -18,9 +17,13 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. + **kwargs + See read_csv for the full argument list. + Returns ------- - parsed : DataFrame + DataFrame + A parsed DataFrame object. """ encoding = kwargs.pop("encoding", "utf-8") diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 154656fbb250b..997edf49d9e8f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -837,10 +837,10 @@ def parse( **kwds ): """ - Parse specified sheet(s) into a DataFrame + Parse specified sheet(s) into a DataFrame. Equivalent to read_excel(ExcelFile, ...) 
See the read_excel - docstring for more info on accepted parameters + docstring for more info on accepted parameters. Returns ------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6af5dd6f1bf37..576c45a2f8097 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -431,8 +431,9 @@ def _is_metadata_of(group, parent_group): class HDFStore: """ - Dict-like IO interface for storing pandas objects in PyTables - either Fixed or Table format. + Dict-like IO interface for storing pandas objects in PyTables. + + Either Fixed or Table format. Parameters ---------- @@ -564,13 +565,12 @@ def __exit__(self, exc_type, exc_value, traceback): def keys(self): """ - Return a (potentially unordered) list of the keys corresponding to the - objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. - have the leading '/' + Return a list of keys corresponding to objects stored in HDFStore. Returns ------- list + List of ABSOLUTE path-names (e.g. have the leading '/'). """ return [n._v_pathname for n in self.groups()] @@ -703,7 +703,7 @@ def flush(self, fsync=False): def get(self, key): """ - Retrieve pandas object stored in file + Retrieve pandas object stored in file. Parameters ---------- @@ -711,7 +711,8 @@ def get(self, key): Returns ------- - obj : same type as object stored in file + object + Same type as object stored in file. """ group = self.get_node(key) if group is None: @@ -731,25 +732,31 @@ def select( **kwargs ): """ - Retrieve pandas object stored in file, optionally based on where - criteria + Retrieve pandas object stored in file, optionally based on where criteria. Parameters ---------- key : object - where : list of Term (or convertible) objects, optional - start : integer (defaults to None), row number to start selection - stop : integer (defaults to None), row number to stop selection - columns : a list of columns that if not None, will limit the return - columns - iterator : boolean, return an iterator, default False - chunksize : nrows to include in iteration, return an iterator - auto_close : boolean, should automatically close the store when - finished, default is False + Object being retrieved from file. + where : list, default None + List of Term (or convertible) objects, optional. + start : int, default None + Row number to start selection. + stop : int, default None + Row number to stop selection. + columns : list, default None + A list of columns that if not None, will limit the return columns. + iterator : bool, default False + Returns an iterator. + chunksize : int, default None + Number or rows to include in iteration, return an iterator. + auto_close : bool, default False + Should automatically close the store when finished. Returns ------- - The selected object + object + Retrieved object from file. """ group = self.get_node(key) if group is None: @@ -929,28 +936,30 @@ def func(_start, _stop, _where): def put(self, key, value, format=None, append=False, **kwargs): """ - Store object in HDFStore + Store object in HDFStore. Parameters ---------- - key : object - value : {Series, DataFrame} - format : 'fixed(f)|table(t)', default is 'fixed' + key : object + value : {Series, DataFrame} + format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format - Fast writing/reading. Not-appendable, nor searchable + Fast writing/reading. Not-appendable, nor searchable. 
table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching - / selecting subsets of the data - append : boolean, default False + / selecting subsets of the data. + append : bool, default False This will force Table format, append the input data to the existing. - data_columns : list of columns to create as data columns, or True to + data_columns : list, default None + List of columns to create as data columns, or True to use all columns. See `here `__. - encoding : default None, provide an encoding for strings - dropna : boolean, default False, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + encoding : str, default None + Provide an encoding for strings. + dropna : bool, default False, do not write an ALL nan row to + The store settable by the option 'io.hdf.dropna_table'. """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1165,12 +1174,15 @@ def create_table_index(self, key, **kwargs): s.create_index(**kwargs) def groups(self): - """return a list of all the top-level nodes (that are not themselves a - pandas storage object) + """ + Return a list of all the top-level nodes. + + Each node returned is not a pandas storage object. Returns ------- list + List of objects. """ _tables() self._check_if_open() @@ -1188,10 +1200,12 @@ def groups(self): ] def walk(self, where="/"): - """ Walk the pytables group hierarchy for pandas objects + """ + Walk the pytables group hierarchy for pandas objects. This generator will yield the group path, subgroups and pandas object names for each group. + Any non-pandas PyTables objects that are not a group will be ignored. The `where` group itself is listed first (preorder), then each of its @@ -1202,18 +1216,17 @@ def walk(self, where="/"): Parameters ---------- - where : str, optional + where : str, default "/" Group where to start walking. - If not supplied, the root group is used. Yields ------ path : str - Full path to a group (without trailing '/') - groups : list of str - names of the groups contained in `path` - leaves : list of str - names of the pandas objects contained in `path` + Full path to a group (without trailing '/'). + groups : list + Names (strings) of the groups contained in `path`. + leaves : list + Names (strings) of the pandas objects contained in `path`. """ _tables() self._check_if_open() diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a208d5ad2fea9..edf58ba3850a1 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -204,8 +204,7 @@ def __add__(date): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. - **kwds - Temporal parameter that add to or replace the offset value. + **kwds : Temporal parameter that add to or replace the offset value. Parameters that **add** to the offset (like Timedelta): @@ -233,16 +232,19 @@ def __add__(date): See Also -------- - dateutil.relativedelta.relativedelta + dateutil.relativedelta.relativedelta : The relativedelta type is designed + to be applied to an existing datetime an can replace specific components of + that datetime, or represents an interval of time. 
Examples -------- + >>> from pandas.tseries.offsets import DateOffset >>> ts = pd.Timestamp('2017-01-01 09:10:11') >>> ts + DateOffset(months=3) Timestamp('2017-04-01 09:10:11') >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(month=3) + >>> ts + DateOffset(months=2) Timestamp('2017-03-01 09:10:11') """ From 2c9c4223442cd555a1fbc894eb5e89792c09ea63 Mon Sep 17 00:00:00 2001 From: Bhuvana KA Date: Mon, 26 Aug 2019 07:49:37 +0530 Subject: [PATCH 155/191] DOC: Fix RangeIndex and other docstrings for missing period in summary (#28123) --- pandas/core/arrays/categorical.py | 2 +- pandas/core/base.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/range.py | 12 ++++++------ pandas/core/indexes/timedeltas.py | 2 +- pandas/core/reshape/merge.py | 2 +- pandas/core/util/hashing.py | 2 +- 11 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a895da6184eeb..5929a8d51fe43 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -471,7 +471,7 @@ def ordered(self) -> Ordered: @property def dtype(self) -> CategoricalDtype: """ - The :class:`~pandas.api.types.CategoricalDtype` for this instance + The :class:`~pandas.api.types.CategoricalDtype` for this instance. """ return self._dtype diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d2a62318232c..767b559445038 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1462,7 +1462,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of the values + Memory usage of the values. Parameters ---------- diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 143755a47b97b..3415c0e056a1c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -37,7 +37,7 @@ class Grouper: """ A Grouper allows the user to specify a groupby instruction for a target - object + object. This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 415255cdbad06..38c5e136d0e60 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2020,7 +2020,7 @@ def notna(self): _index_shared_docs[ "fillna" ] = """ - Fill NA/NaN values with the specified value + Fill NA/NaN values with the specified value. Parameters ---------- @@ -2051,7 +2051,7 @@ def fillna(self, value=None, downcast=None): _index_shared_docs[ "dropna" ] = """ - Return Index without NA/NaN values + Return Index without NA/NaN values. Parameters ---------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 272066d476ce3..cce390d98c037 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1594,7 +1594,7 @@ def bdate_range( ): """ Return a fixed frequency DatetimeIndex, with business day as the default - frequency + frequency. 
Parameters ---------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9361408290bb1..3874c6404565c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1310,7 +1310,7 @@ def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right" ): """ - Return a fixed frequency IntervalIndex + Return a fixed frequency IntervalIndex. Parameters ---------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5a2ca109597e8..f7bf77928bdc7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -994,7 +994,7 @@ def memory_usage(self, deep=False): def period_range(start=None, end=None, periods=None, freq=None, name=None): """ Return a fixed frequency PeriodIndex, with day (calendar) as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 43ed6e7b122ea..8783351cc74d1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -236,7 +236,7 @@ def _format_with_header(self, header, na_rep="NaN", **kwargs): @cache_readonly def start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). """ # GH 25710 return self._range.start @@ -244,7 +244,7 @@ def start(self): @property def _start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). .. deprecated:: 0.25.0 Use ``start`` instead. @@ -259,14 +259,14 @@ def _start(self): @cache_readonly def stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. """ return self._range.stop @property def _stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. .. deprecated:: 0.25.0 Use ``stop`` instead. @@ -282,7 +282,7 @@ def _stop(self): @cache_readonly def step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). """ # GH 25710 return self._range.step @@ -290,7 +290,7 @@ def step(self): @property def _step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). .. deprecated:: 0.25.0 Use ``step`` instead. diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8cf14e2ca777e..b03d60c7b5b37 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -713,7 +713,7 @@ def timedelta_range( ): """ Return a fixed frequency TimedeltaIndex, with day as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 225de3f11cf7d..d7fbe464cb1e5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -178,7 +178,7 @@ def merge_ordered( """ Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see - examples) + examples). 
Parameters ---------- diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 73e126cf230a5..bcdbf0855cbb4 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -58,7 +58,7 @@ def hash_pandas_object( obj, index=True, encoding="utf8", hash_key=None, categorize=True ): """ - Return a data hash of the Index/Series/DataFrame + Return a data hash of the Index/Series/DataFrame. Parameters ---------- From ea60c1966bf7291829a1479512d7aa89d08bd6dd Mon Sep 17 00:00:00 2001 From: jalbritt Date: Sun, 25 Aug 2019 21:21:36 -0500 Subject: [PATCH 156/191] DOC: Added periods to end of docstrings in explode function (#27973) --- pandas/core/frame.py | 8 ++++---- pandas/core/series.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f2bb964f35dbd..9da7999724a18 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6183,14 +6183,14 @@ def stack(self, level=-1, dropna=True): def explode(self, column: Union[str, Tuple]) -> "DataFrame": """ - Transform each element of a list-like to a row, replicating the - index values. + Transform each element of a list-like to a row, replicating index values. .. versionadded:: 0.25.0 Parameters ---------- column : str or tuple + Column to explode. Returns ------- @@ -6206,8 +6206,8 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": See Also -------- DataFrame.unstack : Pivot a level of the (necessarily hierarchical) - index labels - DataFrame.melt : Unpivot a DataFrame from wide format to long format + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. Series.explode : Explode a DataFrame from list-like columns to long format. Notes diff --git a/pandas/core/series.py b/pandas/core/series.py index 8b6c963e40e9d..6fb39c422de93 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3620,7 +3620,7 @@ def explode(self) -> "Series": Series.str.split : Split string values on specified separator. Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. - DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.melt : Unpivot a DataFrame from wide format to long format. DataFrame.explode : Explode a DataFrame from list-like columns to long format. From 765eb8d8a02aed564bb9d3be93cf36e355ba0d64 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 09:22:30 -0500 Subject: [PATCH 157/191] COMPAT: 3.8 compat for tests and DataFrame.query (#28101) * COMPAT: implement visit_Constant for 3.8 compat * Updated tests for new error messages. --- doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/compat/__init__.py | 1 + pandas/core/computation/expr.py | 3 +++ pandas/tests/computation/test_eval.py | 27 +++++++++++++++++++++++++-- pandas/tests/io/parser/test_common.py | 5 ++++- pandas/tests/scalar/test_nat.py | 3 +++ 6 files changed, 37 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 76473405374e8..403c02c3ff129 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -99,7 +99,7 @@ Sparse Other ^^^^^ -- +- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) - .. 
_whatsnew_0.252.contributors:

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index b32da8da3a1fb..9c778f68727c6 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -15,6 +15,7 @@
 PY35 = sys.version_info[:2] == (3, 5)
 PY36 = sys.version_info >= (3, 6)
 PY37 = sys.version_info >= (3, 7)
+PY38 = sys.version_info >= (3, 8)
 
 PYPY = platform.python_implementation() == "PyPy"
 
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
index a58f256cf61d4..4c164968575a1 100644
--- a/pandas/core/computation/expr.py
+++ b/pandas/core/computation/expr.py
@@ -582,6 +582,9 @@ def visit_NameConstant(self, node, **kwargs):
     def visit_Num(self, node, **kwargs):
         return self.const_type(node.n, self.env)
 
+    def visit_Constant(self, node, **kwargs):
+        return self.const_type(node.n, self.env)
+
     def visit_Str(self, node, **kwargs):
         name = self.env.add_tmp(node.s)
         return self.term_type(name, self.env)
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index c500760fa1390..b6ffd8a83e409 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -14,7 +14,7 @@
 from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar
 
 import pandas as pd
-from pandas import DataFrame, Series, date_range
+from pandas import DataFrame, Series, compat, date_range
 from pandas.core.computation import pytables
 from pandas.core.computation.check import _NUMEXPR_VERSION
 from pandas.core.computation.engines import NumExprClobberingError, _engines
@@ -1267,7 +1267,10 @@ def test_assignment_column(self):
         msg = "left hand side of an assignment must be a single name"
         with pytest.raises(SyntaxError, match=msg):
             df.eval("d,c = a + b")
-        msg = "can't assign to function call"
+        if compat.PY38:
+            msg = "cannot assign to function call"
+        else:
+            msg = "can't assign to function call"
         with pytest.raises(SyntaxError, match=msg):
             df.eval('Timestamp("20131001") = a + b')
 
@@ -1967,6 +1970,26 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser):
         pd.eval(ex, engine=engine, parser=parser)
 
 
+@pytest.mark.parametrize(
+    "other",
+    [
+        "'x'",
+        pytest.param(
+            "...", marks=pytest.mark.xfail(not compat.PY38, reason="GH-28116")
+        ),
+    ],
+)
+def test_equals_various(other):
+    df = DataFrame({"A": ["a", "b", "c"]})
+    result = df.eval("A == {}".format(other))
+    expected = Series([False, False, False], name="A")
+    if _USE_NUMEXPR:
+        # https://github.com/pandas-dev/pandas/issues/10239
+        # lose name with numexpr engine. Remove when that's fixed. 
+ expected.name = None + tm.assert_series_equal(result, expected) + + def test_inf(engine, parser): s = "inf + 1" expected = np.inf diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e5366a8357adb..e04535df56663 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1898,7 +1898,10 @@ def test_null_byte_char(all_parsers): out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - msg = "NULL byte detected" + if compat.PY38: + msg = "line contains NUL" + else: + msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 5b1c4f92bf341..5eb69fb2952dc 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -252,6 +252,7 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): "day_name", "dst", "floor", + "fromisocalendar", "fromisoformat", "fromordinal", "fromtimestamp", @@ -296,6 +297,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + if klass is Timestamp and not compat.PY38: + expected.remove("fromisocalendar") assert _get_overlap_public_nat_methods(klass) == expected From cebc34327c74fed38ad8ee4cffb7b63999c83b9a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 26 Aug 2019 15:26:25 +0100 Subject: [PATCH 158/191] TYPING: --check-untyped-defs for Index.__new__ (#28141) --- pandas/core/indexes/base.py | 56 ++++++++++--------------------------- 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 38c5e136d0e60..2dbd592fc6787 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -10,6 +10,7 @@ import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import tz_compare from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -262,7 +263,13 @@ def __new__( fastpath=None, tupleize_cols=True, **kwargs - ): + ) -> "Index": + + from .range import RangeIndex + from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex + from .numeric import Float64Index, Int64Index, UInt64Index + from .interval import IntervalIndex + from .category import CategoricalIndex if name is None and hasattr(data, "name"): name = data.name @@ -277,8 +284,6 @@ def __new__( if fastpath: return cls._simple_new(data, name) - from .range import RangeIndex - if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
data = data.to_numpy() @@ -291,16 +296,12 @@ def __new__( # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval elif ( is_interval_dtype(data) or is_interval_dtype(dtype) ) and not is_object_dtype(dtype): - from .interval import IntervalIndex - closed = kwargs.get("closed", None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) @@ -309,8 +310,6 @@ def __new__( or is_datetime64_any_dtype(dtype) or "tz" in kwargs ): - from pandas import DatetimeIndex - if is_dtype_equal(_o_dtype, dtype): # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, # will raise in the where `data` is already tz-aware. So @@ -318,33 +317,24 @@ def __new__( # the DatetimeIndex construction. # Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex(data, copy=False, name=name, **kwargs) + result = DatetimeIndex( + data, copy=False, name=name, **kwargs + ) # type: "Index" return result.astype(object) else: - result = DatetimeIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): - from pandas import TimedeltaIndex - if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = TimedeltaIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_period_dtype(data) and not is_object_dtype(dtype): - from pandas import PeriodIndex - - result = PeriodIndex(data, copy=copy, name=name, **kwargs) - return result + return PeriodIndex(data, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): @@ -387,8 +377,6 @@ def __new__( pass # Return an actual float index. 
- from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif inferred == "string": @@ -405,19 +393,11 @@ def __new__( data = np.array(data, dtype=dtype, copy=copy) # maybe coerce to a sub-class - from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency - if is_signed_integer_dtype(data.dtype): - from .numeric import Int64Index - return Int64Index(data, copy=copy, dtype=dtype, name=name) elif is_unsigned_integer_dtype(data.dtype): - from .numeric import UInt64Index - return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): - from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype("object") @@ -440,12 +420,8 @@ def __new__( return Index(subarr, copy=copy, dtype=object, name=name) elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - from .numeric import Float64Index - return Float64Index(subarr, copy=copy, name=name) elif inferred == "interval": - from .interval import IntervalIndex - try: return IntervalIndex(subarr, name=name, copy=copy) except ValueError: @@ -456,8 +432,6 @@ def __new__( pass elif inferred != "string": if inferred.startswith("datetime"): - from pandas import DatetimeIndex - try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) except (ValueError, OutOfBoundsDatetime): @@ -467,8 +441,6 @@ def __new__( pass elif inferred.startswith("timedelta"): - from pandas import TimedeltaIndex - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == "period": try: From 0d0daa8466d257c3329c54633a9a98867c86d009 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Mon, 26 Aug 2019 07:27:40 -0700 Subject: [PATCH 159/191] ENH: Allow compression in NDFrame.to_csv to be a dict with optional arguments (#26023) (#26024) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/generic.py | 91 ++++++++++++------- pandas/io/common.py | 115 +++++++++++++++++++------ pandas/io/formats/csvs.py | 10 ++- pandas/tests/io/formats/test_to_csv.py | 41 +++++++++ 5 files changed, 200 insertions(+), 58 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8e25857e5ad69..2bfc09e52c68b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -206,6 +206,7 @@ ExtensionArray Other ^^^^^ - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) .. 
_whatsnew_1000.contributors: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 90779baea32cb..fac5e0f085fc6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7,7 +7,17 @@ import pickle import re from textwrap import dedent -from typing import Callable, Dict, FrozenSet, List, Optional, Set +from typing import ( + Callable, + Dict, + FrozenSet, + Hashable, + List, + Optional, + Sequence, + Set, + Union, +) import warnings import weakref @@ -50,7 +60,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas._typing import Dtype +from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin @@ -122,6 +132,9 @@ def _single_replace(self, to_replace, method, inplace, limit): return result +bool_t = bool # Need alias because NDFrame has def bool: + + class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -3051,26 +3064,26 @@ def to_latex( def to_csv( self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, - quotechar='"', - line_terminator=None, - chunksize=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal=".", - ): + path_or_buf: Optional[FilePathOrBuffer] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Hashable]] = None, + header: Union[bool_t, List[str]] = True, + index: bool_t = True, + index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Optional[Union[str, Dict[str, str]]] = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool_t = True, + escapechar: Optional[str] = None, + decimal: Optional[str] = ".", + ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3117,16 +3130,21 @@ def to_csv( encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. - compression : str, default 'infer' - Compression mode among the following possible values: {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` - is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no - compression). - - .. versionchanged:: 0.24.0 - - 'infer' option added and set to default. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', other entries passed as + additional compression options. + + .. versionchanged:: 0.25.0 + + May now be a dict with key 'method' as compression mode + and other entries as additional compression options if + compression mode is 'zip'. 
quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` @@ -3171,6 +3189,13 @@ def to_csv( ... 'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + + # create 'out.zip' containing 'out.csv' + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + + >>> df.to_csv('out.zip', index=False, + ... compression=compression_opts) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() @@ -3204,6 +3229,8 @@ def to_csv( if path_or_buf is None: return formatter.path_or_buf.getvalue() + return None + # ---------------------------------------------------------------------- # Fancy Indexing diff --git a/pandas/io/common.py b/pandas/io/common.py index 26b68dda7b464..290022167e520 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,19 @@ import mmap import os import pathlib -from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type +from typing import ( + IO, + Any, + AnyStr, + BinaryIO, + Dict, + List, + Optional, + TextIO, + Tuple, + Type, + Union, +) from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, @@ -255,6 +267,40 @@ def file_path_to_url(path: str) -> str: _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +def _get_compression_method( + compression: Optional[Union[str, Dict[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: + """ + Simplifies a compression argument to a compression method string and + a dict containing additional arguments. + + Parameters + ---------- + compression : str or dict + If string, specifies the compression method. If dict, value at key + 'method' specifies compression method. + + Returns + ------- + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, str]) + + Raises + ------ + ValueError on dict missing 'method' key + """ + # Handle dict + if isinstance(compression, dict): + compression_args = compression.copy() + try: + compression = compression_args.pop("method") + except KeyError: + raise ValueError("If dict, compression must have key 'method'") + else: + compression_args = {} + return compression, compression_args + + def _infer_compression( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] ) -> Optional[str]: @@ -266,8 +312,8 @@ def _infer_compression( Parameters ---------- - filepath_or_buffer : - a path (str) or buffer + filepath_or_buffer : str or file handle + File path or object. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', @@ -275,12 +321,11 @@ def _infer_compression( Returns ------- - string or None : - compression method + string or None Raises ------ - ValueError on invalid compression specified + ValueError on invalid compression specified. """ # No compression has been explicitly specified @@ -312,32 +357,49 @@ def _infer_compression( def _get_handle( - path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True + path_or_buf, + mode: str, + encoding=None, + compression: Optional[Union[str, Dict[str, Any]]] = None, + memory_map: bool = False, + is_text: bool = True, ): """ Get file handle for given path/buffer and mode. Parameters ---------- - path_or_buf : - a path (str) or buffer + path_or_buf : str or file handle + File path or object. 
mode : str - mode to open path_or_buf with + Mode to open path_or_buf with. encoding : str or None - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + Encoding to use. + compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is 'zip' or inferred as + 'zip', other entries passed as additional compression options. + + .. versionchanged:: 1.0.0 + + May now be a dict with key 'method' as compression mode + and other keys as compression options if compression + mode is 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.) + mode (pickle, etc.). Returns ------- f : file-like - A file-like object + A file-like object. handles : list of file-like objects A list of file-like object that were opened in this function. """ @@ -346,15 +408,16 @@ def _get_handle( need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = BufferedIOBase + need_text_wrapping = BufferedIOBase # type: ignore - handles = list() + handles = list() # type: List[IO] f = path_or_buf # Convert pathlib.Path/py.path.local or string path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + compression, compression_args = _get_compression_method(compression) if is_path: compression = _infer_compression(path_or_buf, compression) @@ -376,7 +439,7 @@ def _get_handle( # ZIP Compression elif compression == "zip": - zf = BytesZipFile(path_or_buf, mode) + zf = BytesZipFile(path_or_buf, mode, **compression_args) # Ensure the container is closed as well. 
handles.append(zf) if zf.mode == "w": @@ -429,9 +492,9 @@ def _get_handle( if memory_map and hasattr(f, "fileno"): try: - g = MMapWrapper(f) + wrapped = MMapWrapper(f) f.close() - f = g + f = wrapped except Exception: # we catch any errors that may have occurred # because that is consistent with the lower-level @@ -456,15 +519,19 @@ def __init__( self, file: FilePathOrBuffer, mode: str, - compression: int = zipfile.ZIP_DEFLATED, + archive_name: Optional[str] = None, **kwargs ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") - super().__init__(file, mode, compression, **kwargs) + self.archive_name = archive_name + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) def write(self, data): - super().writestr(self.filename, data) + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name + super().writestr(archive_name, data) @property def closed(self): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 60daf311397e8..e25862537cbfc 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -22,6 +22,7 @@ from pandas.io.common import ( UnicodeWriter, + _get_compression_method, _get_handle, _infer_compression, get_filepath_or_buffer, @@ -58,6 +59,9 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() + # Extract compression mode as given, if dict + compression, self.compression_args = _get_compression_method(compression) + self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, mode=mode ) @@ -178,7 +182,7 @@ def save(self): self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=dict(self.compression_args, method=self.compression), ) close = True @@ -206,11 +210,13 @@ def save(self): if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: + compression = dict(self.compression_args, method=self.compression) + f, handles = _get_handle( self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=compression, ) f.write(buf) close = True diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index ee236a8253b01..ab44b8b8059eb 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -514,3 +514,44 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): df.to_csv(path, compression=to_compression) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) + + def test_to_csv_compression_dict(self, compression_only): + # GH 26023 + method = compression_only + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." 
+        filename += "gz" if method == "gzip" else method
+        with tm.ensure_clean(filename) as path:
+            df.to_csv(path, compression={"method": method})
+            read_df = pd.read_csv(path, index_col=0)
+            tm.assert_frame_equal(read_df, df)
+
+    def test_to_csv_compression_dict_no_method_raises(self):
+        # GH 26023
+        df = DataFrame({"ABC": [1]})
+        compression = {"some_option": True}
+        msg = "must have key 'method'"
+
+        with tm.ensure_clean("out.zip") as path:
+            with pytest.raises(ValueError, match=msg):
+                df.to_csv(path, compression=compression)
+
+    @pytest.mark.parametrize("compression", ["zip", "infer"])
+    @pytest.mark.parametrize(
+        "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]
+    )
+    def test_to_csv_zip_arguments(self, compression, archive_name):
+        # GH 26023
+        from zipfile import ZipFile
+
+        df = DataFrame({"ABC": [1]})
+        with tm.ensure_clean("to_csv_archive_name.zip") as path:
+            df.to_csv(
+                path, compression={"method": compression, "archive_name": archive_name}
+            )
+            zp = ZipFile(path)
+            expected_arcname = path if archive_name is None else archive_name
+            expected_arcname = os.path.basename(expected_arcname)
+            assert len(zp.filelist) == 1
+            archived_file = os.path.basename(zp.filelist[0].filename)
+            assert archived_file == expected_arcname

From a1bdacfaf0693336b957b1bd3821f15c05120aff Mon Sep 17 00:00:00 2001
From: Katrin Leinweber <9948149+katrinleinweber@users.noreply.github.com>
Date: Mon, 26 Aug 2019 18:37:14 +0200
Subject: [PATCH 160/191] DOC: Harmonize column selection to bracket notation
 (#27562)

* Harmonize column selection to bracket notation

As suggested by
https://medium.com/dunder-data/minimally-sufficient-pandas-a8e67f2a2428#46f9
---
 doc/source/getting_started/10min.rst | 2 +-
 doc/source/getting_started/basics.rst | 12 +++---
 .../comparison/comparison_with_r.rst | 8 ++--
 doc/source/user_guide/advanced.rst | 2 +-
 doc/source/user_guide/cookbook.rst | 6 +--
 doc/source/user_guide/enhancingperf.rst | 12 +++---
 doc/source/user_guide/indexing.rst | 39 ++++++++++---------
 doc/source/user_guide/reshaping.rst | 10 ++---
 doc/source/user_guide/visualization.rst | 14 +++----
 9 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst
index 9045e5b32c29f..41520795bde62 100644
--- a/doc/source/getting_started/10min.rst
+++ b/doc/source/getting_started/10min.rst
@@ -278,7 +278,7 @@ Using a single column's values to select data.

 .. ipython:: python

-   df[df.A > 0]
+   df[df['A'] > 0]

 Selecting values from a DataFrame where a boolean condition is met.

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 3f6f56376861f..802ffadf2a81e 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -926,7 +926,7 @@ Single aggregations on a ``Series`` this will return a scalar value:

 .. ipython:: python

-   tsdf.A.agg('sum')
+   tsdf['A'].agg('sum')


 Aggregating with multiple functions
@@ -950,13 +950,13 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function

 .. ipython:: python

-   tsdf.A.agg(['sum', 'mean'])
+   tsdf['A'].agg(['sum', 'mean'])

 Passing a ``lambda`` function will yield a ``<lambda>`` named row:

 .. 
ipython:: python - tsdf.A.agg(['sum', lambda x: x.mean()]) + tsdf['A'].agg(['sum', lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -965,7 +965,7 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf.A.agg(['sum', mymean]) + tsdf['A'].agg(['sum', mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1065,7 +1065,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf.A.transform(np.abs) + tsdf['A'].transform(np.abs) Transform with multiple functions @@ -1084,7 +1084,7 @@ resulting column names will be the transforming functions. .. ipython:: python - tsdf.A.transform([np.abs, lambda x: x + 1]) + tsdf['A'].transform([np.abs, lambda x: x + 1]) Transforming with a dict diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 444e886bc951d..f67f46fc2b29b 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -81,7 +81,7 @@ R pandas =========================================== =========================================== ``select(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})['col_one']`` ``rename(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})`` -``mutate(df, c=a-b)`` ``df.assign(c=df.a-df.b)`` +``mutate(df, c=a-b)`` ``df.assign(c=df['a']-df['b'])`` =========================================== =========================================== @@ -258,8 +258,8 @@ index/slice as well as standard boolean indexing: df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.query('a <= b') - df[df.a <= df.b] - df.loc[df.a <= df.b] + df[df['a'] <= df['b']] + df.loc[df['a'] <= df['b']] For more details and examples see :ref:`the query documentation `. @@ -284,7 +284,7 @@ In ``pandas`` the equivalent expression, using the df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.eval('a + b') - df.a + df.b # same as the previous expression + df['a'] + df['b'] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. For more details and examples see :ref:`the eval diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 22a9791ffde30..62a9b6396404a 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -738,7 +738,7 @@ and allows efficient indexing and storage of an index with a large number of dup df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes - df.B.cat.categories + df['B'].cat.categories Setting the index will create a ``CategoricalIndex``. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 15af5208a4f1f..c9d3bc3a28c70 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -592,8 +592,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python

    df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])
-   df.A.groupby((df.A != df.A.shift()).cumsum()).groups
-   df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum()
+   df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups
+   df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum()

 Expanding data
 **************

@@ -719,7 +719,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc

    df

    def gm(df, const):
-       v = ((((df.A + df.B) + 1).cumprod()) - 1) * const
+       v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const
        return v.iloc[-1]

    s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5)
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index a4eefadd54d8c..2df5b9d82dcc3 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -393,15 +393,15 @@ Consider the following toy example of doubling each observation:

 .. code-block:: ipython

     # Custom function without numba
-    In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba)  # noqa E501
+    In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba)  # noqa E501
     1000 loops, best of 3: 797 us per loop

     # Standard implementation (faster than a custom function)
-    In [6]: %timeit df['col1_doubled'] = df.a * 2
+    In [6]: %timeit df['col1_doubled'] = df['a'] * 2
     1000 loops, best of 3: 233 us per loop

     # Custom function with numba
-    In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy())
+    In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
     1000 loops, best of 3: 145 us per loop

 Caveats
@@ -643,8 +643,8 @@ The equivalent in standard Python would be

 .. ipython:: python

    df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
-   df['c'] = df.a + df.b
-   df['d'] = df.a + df.b + df.c
+   df['c'] = df['a'] + df['b']
+   df['d'] = df['a'] + df['b'] + df['c']
    df['a'] = 1
    df

@@ -688,7 +688,7 @@ name in an expression.

    a = np.random.randn()
    df.query('@a < a')
-   df.loc[a < df.a]  # same as the previous expression
+   df.loc[a < df['a']]  # same as the previous expression

 With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it
 isn't defined in that context. ``pandas`` will let you know this if you try to
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index e3b75afcf945e..cf55ce0c9a6d4 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -210,7 +210,7 @@ as an attribute:

    See `here for an explanation of valid identifiers
    <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>`__.

-   - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed.
+   - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed, but ``s['min']`` is possible.

    - Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``,
    ``major_axis``, ``minor_axis``, ``items``.

@@ -540,7 +540,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat
                        columns=list('ABCD'))
    df1

-   df1.loc[lambda df: df.A > 0, :]
+   df1.loc[lambda df: df['A'] > 0, :]
    df1.loc[:, lambda df: ['A', 'B']]

    df1.iloc[:, lambda df: [0, 1]]

@@ -552,7 +552,7 @@ You can use callable indexing in ``Series``.

 .. 
ipython:: python

-   df1.A.loc[lambda s: s > 0]
+   df1['A'].loc[lambda s: s > 0]

 Using these methods / indexers, you can chain data selection operations
 without using a temporary variable.

 .. ipython:: python

    bb = pd.read_csv('data/baseball.csv', index_col='id')
    (bb.groupby(['year', 'team']).sum()
-      .loc[lambda df: df.r > 100])
+      .loc[lambda df: df['r'] > 100])

 .. _indexing.deprecate_ix:

@@ -871,9 +871,9 @@ Boolean indexing

 Another common operation is the use of boolean vectors to filter the data.
 The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``.
 These **must** be grouped by using parentheses, since by default Python will
-evaluate an expression such as ``df.A > 2 & df.B < 3`` as
-``df.A > (2 & df.B) < 3``, while the desired evaluation order is
-``(df.A > 2) & (df.B < 3)``.
+evaluate an expression such as ``df['A'] > 2 & df['B'] < 3`` as
+``df['A'] > (2 & df['B']) < 3``, while the desired evaluation order is
+``(df['A'] > 2) & (df['B'] < 3)``.

 Using a boolean vector to index a Series works exactly as in a NumPy ndarray:

@@ -1134,7 +1134,7 @@ between the values of columns ``a`` and ``c``. For example:

    df

    # pure python
-   df[(df.a < df.b) & (df.b < df.c)]
+   df[(df['a'] < df['b']) & (df['b'] < df['c'])]

    # query
    df.query('(a < b) & (b < c)')

@@ -1241,7 +1241,7 @@ Full numpy-like syntax:

    df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
    df
    df.query('(a < b) & (b < c)')
-   df[(df.a < df.b) & (df.b < df.c)]
+   df[(df['a'] < df['b']) & (df['b'] < df['c'])]

 Slightly nicer by removing the parentheses (comparison operators bind tighter
 than ``&`` and ``|``).

@@ -1279,12 +1279,12 @@ The ``in`` and ``not in`` operators

    df.query('a in b')

    # How you'd do it in pure Python
-   df[df.a.isin(df.b)]
+   df[df['a'].isin(df['b'])]

    df.query('a not in b')

    # pure Python
-   df[~df.a.isin(df.b)]
+   df[~df['a'].isin(df['b'])]

 You can combine this with other expressions for very succinct queries:

@@ -1297,7 +1297,7 @@ You can combine this with other expressions for very succinct queries:

    df.query('a in b and c < d')

    # pure Python
-   df[df.b.isin(df.a) & (df.c < df.d)]
+   df[df['b'].isin(df['a']) & (df['c'] < df['d'])]

 .. note::

@@ -1326,7 +1326,7 @@ to ``in``/``not in``.

    df.query('b == ["a", "b", "c"]')

    # pure Python
-   df[df.b.isin(["a", "b", "c"])]
+   df[df['b'].isin(["a", "b", "c"])]

    df.query('c == [1, 2]')

@@ -1338,7 +1338,7 @@ to ``in``/``not in``.

    df.query('[1, 2] not in c')

    # pure Python
-   df[df.c.isin([1, 2])]
+   df[df['c'].isin([1, 2])]

 Boolean operators

@@ -1352,7 +1352,7 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator.
df['bools'] = np.random.rand(len(df)) > 0.5 df.query('~bools') df.query('not bools') - df.query('not bools') == df[~df.bools] + df.query('not bools') == df[~df['bools']] Of course, expressions can be arbitrarily complex too: @@ -1362,7 +1362,10 @@ Of course, expressions can be arbitrarily complex too: shorter = df.query('a < b < c and (not bools) or bools > 2') # equivalent in pure Python - longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + longer = df[(df['a'] < df['b']) + & (df['b'] < df['c']) + & (~df['bools']) + | (df['bools'] > 2)] shorter longer @@ -1835,14 +1838,14 @@ chained indexing expression, you can set the :ref:`option ` # This will show the SettingWithCopyWarning # but the frame values will be set - dfb['c'][dfb.a.str.startswith('o')] = 42 + dfb['c'][dfb['a'].str.startswith('o')] = 42 This however is operating on a copy and will not work. :: >>> pd.set_option('mode.chained_assignment','warn') - >>> dfb[dfb.a.str.startswith('o')]['c'] = 42 + >>> dfb[dfb['a'].str.startswith('o')]['c'] = 42 Traceback (most recent call last) ... SettingWithCopyWarning: diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index f118fe84d523a..dd6d3062a8f0a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -469,7 +469,7 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. 'C': [1, 1, np.nan, 1, 1]}) df - pd.crosstab(df.A, df.B) + pd.crosstab(df['A'], df['B']) Any input passed containing ``Categorical`` data will have **all** of its categories included in the cross-tabulation, even if the actual data does @@ -489,13 +489,13 @@ using the ``normalize`` argument: .. ipython:: python - pd.crosstab(df.A, df.B, normalize=True) + pd.crosstab(df['A'], df['B'], normalize=True) ``normalize`` can also normalize values within each row or within each column: .. ipython:: python - pd.crosstab(df.A, df.B, normalize='columns') + pd.crosstab(df['A'], df['B'], normalize='columns') ``crosstab`` can also be passed a third ``Series`` and an aggregation function (``aggfunc``) that will be applied to the values of the third ``Series`` within @@ -503,7 +503,7 @@ each group defined by the first two ``Series``: .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum) + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum) Adding margins ~~~~~~~~~~~~~~ @@ -512,7 +512,7 @@ Finally, one can also add margins or normalize this output. .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum, normalize=True, + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True, margins=True) .. _reshaping.tile: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index fdceaa5868cec..fa16b2f216610 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1148,10 +1148,10 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: .. ipython:: python - df.A.plot() + df['A'].plot() @savefig series_plot_secondary_y.png - df.B.plot(secondary_y=True, style='g') + df['B'].plot(secondary_y=True, style='g') .. ipython:: python :suppress: @@ -1205,7 +1205,7 @@ Here is the default behavior, notice how the x-axis tick labeling is performed: plt.figure() @savefig ser_plot_suppress.png - df.A.plot() + df['A'].plot() .. 
ipython:: python :suppress: @@ -1219,7 +1219,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior: plt.figure() @savefig ser_plot_suppress_parm.png - df.A.plot(x_compat=True) + df['A'].plot(x_compat=True) .. ipython:: python :suppress: @@ -1235,9 +1235,9 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`: @savefig ser_plot_suppress_context.png with pd.plotting.plot_params.use('x_compat', True): - df.A.plot(color='r') - df.B.plot(color='g') - df.C.plot(color='b') + df['A'].plot(color='r') + df['B'].plot(color='g') + df['C'].plot(color='b') .. ipython:: python :suppress: From 7528d088c9aa597174fbccbc1bddb9290ba2556e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 26 Aug 2019 18:10:26 +0100 Subject: [PATCH 161/191] TYPING: add stubs for _packer and _unpacker (#28135) --- pandas/io/msgpack/_packer.pyi | 22 ++++++++++++ pandas/io/msgpack/_unpacker.pyi | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 pandas/io/msgpack/_packer.pyi create mode 100644 pandas/io/msgpack/_unpacker.pyi diff --git a/pandas/io/msgpack/_packer.pyi b/pandas/io/msgpack/_packer.pyi new file mode 100644 index 0000000000000..e95a1622c5615 --- /dev/null +++ b/pandas/io/msgpack/_packer.pyi @@ -0,0 +1,22 @@ +# flake8: noqa + +class Packer: + def __cinit__(self): ... + def __init__( + self, + default=..., + encoding=..., + unicode_errors=..., + use_single_float=..., + autoreset: int = ..., + use_bin_type: int = ..., + ): ... + def __dealloc__(self): ... + def _pack(self, o, nest_limit: int = ...) -> int: ... + def pack(self, obj): ... + def pack_ext_type(self, typecode, data): ... + def pack_array_header(self, size): ... + def pack_map_header(self, size): ... + def pack_map_pairs(self, pairs): ... + def reset(self) -> None: ... + def bytes(self): ... diff --git a/pandas/io/msgpack/_unpacker.pyi b/pandas/io/msgpack/_unpacker.pyi new file mode 100644 index 0000000000000..9910895947fb6 --- /dev/null +++ b/pandas/io/msgpack/_unpacker.pyi @@ -0,0 +1,59 @@ +# flake8: noqa + +def unpackb( + packed, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., +): ... +def unpack( + stream, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., +): ... + +class Unpacker: + def __cinit__(self): ... + def __dealloc__(self): ... + def __init__( + self, + file_like=..., + read_size=..., + use_list=..., + object_hook=..., + object_pairs_hook=..., + list_hook=..., + encoding=..., + unicode_errors=..., + max_buffer_size: int = ..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., + ): ... + def feed(self, next_bytes): ... + def append_buffer(self, _buf, _buf_len): ... + def read_from_file(self): ... + def _unpack(self, execute, write_bytes, iter=...): ... + def read_bytes(self, nbytes): ... + def unpack(self, write_bytes=...): ... + def skip(self, write_bytes=...): ... + def read_array_header(self, write_bytes=...): ... + def read_map_header(self, write_bytes=...): ... + def __iter__(self): ... + def __next__(self): ... 
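The two stub files above only declare signatures. For orientation, here is a
minimal round-trip sketch of the API they describe, using only names and
keywords that appear in the stubs; it assumes ``Packer`` and ``unpackb`` are
re-exported from ``pandas.io.msgpack`` (as the package ``__init__`` does at
this point in the codebase), so treat it as illustrative rather than as part
of the patch:

.. code-block:: python

   # Pack a dict to msgpack bytes and unpack it again. Passing
   # encoding='utf-8' on both sides keeps str keys intact through the
   # round trip (an assumption based on upstream msgpack-python behavior).
   from pandas.io.msgpack import Packer, unpackb

   payload = Packer(encoding="utf-8").pack({"a": 1, "b": [1, 2, 3]})
   result = unpackb(payload, encoding="utf-8", use_list=True)
   assert result == {"a": 1, "b": [1, 2, 3]}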
From bca39a72b073758d3cfa7afa470462255f1bc066 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 26 Aug 2019 10:53:59 -0700 Subject: [PATCH 162/191] Run clang-format on objToJSON (#28144) --- pandas/_libs/src/ujson/python/objToJSON.c | 381 +++++++++++----------- 1 file changed, 188 insertions(+), 193 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index de336fb3aa1dc..4b612bb033761 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -16,18 +16,19 @@ derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. 
Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -64,9 +65,9 @@ typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, typedef struct __NpyArrContext { PyObject *array; char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) npy_intp dim; npy_intp stride; npy_intp ndim; @@ -83,8 +84,8 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map - NpyArrContext **npyCtxts; // NpyArrContext for each column + int *cindices; // frame column -> block column map + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; typedef struct __TypeContext { @@ -148,13 +149,12 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; int PdBlock_iterNext(JSOBJ, JSONTypeContext *); -void *initObjToJSON(void) -{ +void *initObjToJSON(void) { PyObject *mod_pandas; PyObject *mod_nattype; PyObject *mod_decimal = PyImport_ImportModule("decimal"); type_decimal = - (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); + (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); Py_DECREF(mod_decimal); PyDateTime_IMPORT; @@ -167,14 +167,14 @@ void *initObjToJSON(void) cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); - cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); + cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); if (mod_nattype) { - cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_nattype, - "NaTType"); + cls_nat = + (PyTypeObject *)PyObject_GetAttrString(mod_nattype, "NaTType"); Py_DECREF(mod_nattype); } @@ -212,7 +212,6 @@ static TypeContext *createTypeContext(void) { return pc; } - static int is_sparse_array(PyObject *obj) { // TODO can be removed again once SparseArray.values is removed (GH26421) if (PyObject_HasAttrString(obj, "_subtyp")) { @@ -227,7 +226,6 @@ static int is_sparse_array(PyObject *obj) { return 0; } - static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; @@ -242,7 +240,8 @@ static PyObject *get_values(PyObject *obj) { values = PyObject_CallMethod(values, "to_numpy", NULL); } - if (!is_sparse_array(values) && PyObject_HasAttrString(values, "values")) { + if (!is_sparse_array(values) && + PyObject_HasAttrString(values, "values")) { PyObject *subvals = get_values(values); PyErr_Clear(); PRINTMARK(); @@ -357,20 +356,20 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { } static npy_int64 get_long_attr(PyObject *o, const char *attr) { - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = (PyLong_Check(value) ? - PyLong_AsLongLong(value) : PyLong_AsLong(value)); - Py_DECREF(value); - return long_val; + npy_int64 long_val; + PyObject *value = PyObject_GetAttrString(o, attr); + long_val = + (PyLong_Check(value) ? 
PyLong_AsLongLong(value) : PyLong_AsLong(value)); + Py_DECREF(value); + return long_val; } static npy_float64 total_seconds(PyObject *td) { - npy_float64 double_val; - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; + npy_float64 double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static PyObject *get_item(PyObject *obj, Py_ssize_t i) { @@ -450,7 +449,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, if (PyUnicode_IS_COMPACT_ASCII(obj)) { Py_ssize_t len; - char *data = (char*)PyUnicode_AsUTF8AndSize(obj, &len); + char *data = (char *)PyUnicode_AsUTF8AndSize(obj, &len); *_outLen = len; return data; } @@ -505,7 +504,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, // TODO(anyone): Does not appear to be reached in tests. pandas_datetime_to_datetimestruct(obj->obval, - (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); + (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } @@ -664,9 +663,9 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->npyarr = npyarr; if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; } npyarr->array = (PyObject *)obj; @@ -677,17 +676,17 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; } npyarr->columnLabels = GET_TC(tc)->columnLabels; @@ -735,8 +734,7 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) - { + if (PyArray_ISDATETIME(npyarr->array)) { PRINTMARK(); GET_TC(tc)->itemValue = obj; Py_INCREF(obj); @@ -797,10 +795,10 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - cStr = npyarr->rowLabels[idx]; + cStr = npyarr->rowLabels[idx]; } *outLen = strlen(cStr); @@ -852,13 +850,13 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - cStr = npyarr->columnLabels[idx]; + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - cStr = npyarr->rowLabels[idx]; + cStr = npyarr->rowLabels[idx]; } *outLen = strlen(cStr); @@ -875,10 +873,10 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - cStr = npyarr->rowLabels[idx]; + cStr = npyarr->rowLabels[idx]; } *outLen = strlen(cStr); @@ -943,9 +941,9 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { dtype = PyArray_DescrFromType(NPY_INT64); obj = (PyObject *)_obj; - GET_TC(tc) - ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; + GET_TC(tc)->iterGetName = GET_TC(tc)->transpose + ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); if (!blkCtxt) { @@ -1396,7 +1394,7 @@ void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series + enc->outputFormat = VALUES; // for contained series if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1455,7 +1453,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index + enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1634,115 +1632,116 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - item = PyArray_GETITEM(labels, dataptr); + item = PyArray_GETITEM(labels, dataptr); if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - // TODO: for any matches on type_num (date and timedeltas) should use a - // vectorized solution to convert to epoch or iso formats - if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; - } - - PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } - else if (PyTypeNum_ISDATETIME(type_num) || - PyDateTime_Check(item) || PyDate_Check(item)) { - PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); - if (ts == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - if (enc->datetimeIso) { - PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); - Py_DECREF(ts); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } else { - npy_int64 value; - // TODO: refactor to not duplicate what goes on in beginTypeContext - if (PyObject_HasAttrString(ts, "value")) { - PRINTMARK(); - value = get_long_attr(ts, "value"); - } else { - PRINTMARK(); - value = - total_seconds(ts) * 1000000000LL; // nanoseconds per second - } - Py_DECREF(ts); - - 
switch (enc->datetimeUnit) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; - default: - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - char buf[21] = {0}; // 21 chars for 2**63 as string - cLabel = buf; - sprintf(buf, "%" NPY_INT64_FMT, value); - len = strlen(cLabel); - } - } else { // Fallack to string representation - PyObject *str = PyObject_Str(item); - if (str == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(str); - Py_DECREF(str); - len = strlen(cLabel); - } - - Py_DECREF(item); - // Add 1 to include NULL terminator - ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); + } + + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats + if (enc->datetimeIso && + (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || + PyDate_Check(item)) { + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in + // beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = total_seconds(ts) * + 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%" NPY_INT64_FMT, value); + len = strlen(cLabel); + } + } else { // Fallack to string representation + PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); + } + + Py_DECREF(item); + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); @@ -1923,23 +1922,22 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = get_long_attr(obj, "value"); } else { PRINTMARK(); - value = - total_seconds(obj) * 1000000000LL; // nanoseconds per second + value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } base = ((PyObjectEncoder 
*)tc->encoder)->datetimeUnit; switch (base) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; } exc = PyErr_Occurred(); @@ -2054,8 +2052,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2157,8 +2154,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2179,9 +2175,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, - enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -2199,8 +2194,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2325,7 +2319,8 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; - if (tc->prv != &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT + if (tc->prv != + &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT PyObject_Free(tc->prv); } tc->prv = NULL; @@ -2388,7 +2383,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *newobj; PyObject *oinput = NULL; PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting + int idoublePrecision = 10; // default double precision setting PyObject *oencodeHTMLChars = NULL; char *sOrient = NULL; char *sdateFormat = NULL; @@ -2411,10 +2406,10 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject_Malloc, PyObject_Realloc, PyObject_Free, - -1, // recursionMax + -1, // recursionMax idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars + 1, // forceAscii + 0, // encodeHTMLChars }}; JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; From 87d26bafcdb2495f8a9e76489d3438b1571beb05 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Aug 2019 13:14:06 -0700 Subject: [PATCH 163/191] PERF: replace with list, closes #28084 (#28099) --- asv_bench/benchmarks/replace.py | 17 +++++++++++++++++ pandas/core/internals/blocks.py | 22 +++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 6137e944e6b9e..f69ae15028525 100644 --- a/asv_bench/benchmarks/replace.py +++ 
b/asv_bench/benchmarks/replace.py @@ -36,6 +36,23 @@ def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) +class ReplaceList: + # GH#28099 + + params = [(True, False)] + param_names = ["inplace"] + + def setup(self, inplace): + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7)) + + def time_replace_list(self, inplace): + self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) + + def time_replace_list_one_match(self, inplace): + # the 1 can be held in self._df.blocks[0], while the inf and -inf cant + self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) + + class Convert: params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0ee56f403325..a2a51881016a3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -743,6 +743,26 @@ def replace( return [self] return [self.copy()] + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace( + to_replace[0], + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): @@ -751,7 +771,7 @@ def replace( # try again with a compatible block block = self.astype(object) return block.replace( - to_replace=original_to_replace, + to_replace=to_replace, value=value, inplace=inplace, filter=filter, From 7deda218435e787275e5899162b482001df85684 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 15:56:57 -0500 Subject: [PATCH 164/191] DOC: whatsnew for 28099 (#28154) --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2bfc09e52c68b..7fe358d3820f2 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -76,6 +76,7 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) +- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) .. _whatsnew_1000.bug_fixes: From 9f48098a021c7b744ff4604b605de7b99c7e62f4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 17:49:08 -0500 Subject: [PATCH 165/191] DOC: Set 1.0.0 in index.rst (#28149) --- doc/source/index.rst.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index b57ce83cfc33c..f5669626aa2b3 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. 
:hidden: {% endif %} {% if not single_doc %} - What's New in 0.25.0 + What's New in 1.0.0 install getting_started/index user_guide/index @@ -53,7 +53,7 @@ See the :ref:`overview` for more detail about what's in the library. whatsnew/index {% endif %} -* :doc:`whatsnew/v0.25.0` +* :doc:`whatsnew/v1.0.0` * :doc:`install` * :doc:`getting_started/index` From 294a22c0baa2e024d12f70705c4ec85f4c82b2b0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 18:11:05 -0500 Subject: [PATCH 166/191] BUG: Fix groupby quantile array (#28113) --- doc/source/whatsnew/v0.25.2.rst | 3 +-- pandas/core/groupby/groupby.py | 4 ++-- pandas/tests/groupby/test_function.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 403c02c3ff129..6974c7521a237 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -76,8 +76,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- -- +- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). - - - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3eeecd9c149e1..87047d2170992 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1947,8 +1947,8 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: arrays = [] for i in range(self.ngroups): - arr = arr + i - arrays.append(arr) + arr2 = arr + i + arrays.append(arr2) indices = np.concatenate(arrays) assert len(indices) == len(result) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 509d7c33b643b..d89233f2fd603 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1257,6 +1257,24 @@ def test_quantile_array(): tm.assert_frame_equal(result, expected) +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + df = pd.DataFrame( + np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") + ) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = pd.DataFrame( + { + "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], + "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_array_no_sort(): df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) From ddfc9a232f605e935c06efebdc0830d2b14dfdd5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Aug 2019 00:16:13 +0100 Subject: [PATCH 167/191] TYPING: --disallow-any-expr for HTMLFormatter.__init__ (#28140) --- pandas/io/formats/format.py | 6 ++++-- pandas/io/formats/html.py | 8 ++++---- pandas/io/formats/latex.py | 5 +++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 61af935bd8227..8ff4b9bda0430 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -549,7 +549,8 @@ def __init__( decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, - **kwds + bold_rows: bool = False, + escape: bool = True, ): self.frame = frame self.show_index_names = index_names @@ -580,7 +581,8 @@ def __init__( else: self.justify = justify - 
self.kwds = kwds + self.bold_rows = bold_rows + self.escape = escape if columns is not None: self.columns = ensure_index(columns) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 4b44893df70ed..8c4a7f4a1213d 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -37,7 +37,7 @@ class HTMLFormatter(TableFormatter): def __init__( self, formatter: DataFrameFormatter, - classes: Optional[Union[str, List, Tuple]] = None, + classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, ) -> None: self.fmt = formatter @@ -46,11 +46,11 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns self.elements = [] # type: List[str] - self.bold_rows = self.fmt.kwds.get("bold_rows", False) - self.escape = self.fmt.kwds.get("escape", True) + self.bold_rows = self.fmt.bold_rows + self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions if border is None: - border = get_option("display.html.border") + border = cast(int, get_option("display.html.border")) self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index c60e15b733f0a..4c4d5ec73269a 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -39,12 +39,13 @@ def __init__( ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.kwds.get("bold_rows", False) + self.bold_rows = self.fmt.bold_rows self.column_format = column_format self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow + self.escape = self.fmt.escape def write_result(self, buf: IO[str]) -> None: """ @@ -142,7 +143,7 @@ def pad_empties(x): buf.write("\\endfoot\n\n") buf.write("\\bottomrule\n") buf.write("\\endlastfoot\n") - if self.fmt.kwds.get("escape", True): + if self.escape: # escape backslashes first crow = [ ( From 357774695a4caf7b83506686f4c29cc38d2b9726 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Aug 2019 16:39:12 -0700 Subject: [PATCH 168/191] CLN: small ops optimizations (#28036) --- pandas/core/frame.py | 28 +++++++++++++++++----------- pandas/core/ops/__init__.py | 12 ++++++------ pandas/core/ops/array_ops.py | 12 ++++++------ pandas/core/ops/missing.py | 4 ++-- pandas/core/sparse/frame.py | 2 +- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9da7999724a18..f636bb6db7430 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5298,12 +5298,19 @@ def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns - def _arith_op(left, right): - # for the mixed_type case where we iterate over columns, - # _arith_op(left, right) is equivalent to - # left._binop(right, func, fill_value=fill_value) - left, right = ops.fill_binop(left, right, fill_value) - return func(left, right) + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) if 
ops.should_series_dispatch(this, other, func): # iterate over columns @@ -5318,7 +5325,7 @@ def _arith_op(left, right): def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join="outer", axis=0, level=level, copy=False) - assert left.index.equals(right.index) + # at this point we have `left.index.equals(right.index)` if left._is_mixed_type or right._is_mixed_type: # operate column-wise; avoid costly object-casting in `.values` @@ -5331,14 +5338,13 @@ def _combine_match_index(self, other, func, level=None): new_data, index=left.index, columns=self.columns, copy=False ) - def _combine_match_columns(self, other, func, level=None): - assert isinstance(other, Series) + def _combine_match_columns(self, other: Series, func, level=None): left, right = self.align(other, join="outer", axis=1, level=level, copy=False) - assert left.columns.equals(right.index) + # at this point we have `left.columns.equals(right.index)` return ops.dispatch_to_series(left, right, func, axis="columns") def _combine_const(self, other, func): - assert lib.is_scalar(other) or np.ndim(other) == 0 + # scalar other or np.ndim(other) == 0 return ops.dispatch_to_series(self, other, func) def combine(self, other, func, fill_value=None, overwrite=True): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 7e03b9544ee72..86cd6e878cde6 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -169,7 +169,7 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') return Timedelta(obj) - elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj): + elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to # timedelta64 when operating with timedelta64 @@ -415,7 +415,7 @@ def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: ): return True - if is_extension_array_dtype(right) and not is_scalar(right): + if not is_scalar(right) and is_extension_array_dtype(right): # GH#22378 disallow scalar to exclude e.g. "category", "Int64" return True @@ -755,7 +755,7 @@ def na_op(x, y): assert not isinstance(y, (list, ABCSeries, ABCIndexClass)) if isinstance(y, np.ndarray): # bool-bool dtype operations should be OK, should not get here - assert not (is_bool_dtype(x) and is_bool_dtype(y)) + assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)) x = ensure_object(x) y = ensure_object(y) result = libops.vec_binop(x, y, op) @@ -804,7 +804,7 @@ def wrapper(self, other): else: # scalars, list, tuple, np.array - is_other_int_dtype = is_integer_dtype(np.asarray(other)) + is_other_int_dtype = is_integer_dtype(np.asarray(other).dtype) if is_list_like(other) and not isinstance(other, np.ndarray): # TODO: Can we do this before the is_integer_dtype check? 
# could the is_integer_dtype check be checking the wrong @@ -988,10 +988,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): self, other, pass_op, fill_value=fill_value, axis=axis, level=level ) else: + # in this case we always have `np.ndim(other) == 0` if fill_value is not None: self = self.fillna(fill_value) - assert np.ndim(other) == 0 return self._combine_const(other, op) f.__name__ = op_name @@ -1032,7 +1032,7 @@ def f(self, other, axis=default_axis, level=None): self, other, na_op, fill_value=None, axis=axis, level=level ) else: - assert np.ndim(other) == 0, other + # in this case we always have `np.ndim(other) == 0` return self._combine_const(other, na_op) f.__name__ = op_name diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 523ba5d42a69c..f5f6d77676f1f 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -11,7 +11,7 @@ find_common_type, maybe_upcast_putmask, ) -from pandas.core.dtypes.common import is_object_dtype, is_period_dtype, is_scalar +from pandas.core.dtypes.common import is_object_dtype, is_scalar from pandas.core.dtypes.generic import ABCIndex, ABCSeries from pandas.core.dtypes.missing import notna @@ -57,9 +57,9 @@ def masked_arith_op(x, y, op): dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) - # PeriodIndex.ravel() returns int64 dtype, so we have - # to work around that case. See GH#19956 - yrav = y if is_period_dtype(y) else y.ravel() + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex + # we would get int64 dtype, see GH#19956 + yrav = y.ravel() mask = notna(xrav) & notna(yrav) if yrav.shape != mask.shape: @@ -82,9 +82,9 @@ def masked_arith_op(x, y, op): mask = notna(xrav) # 1 ** np.nan is 1. So we have to unmask those. - if op == pow: + if op is pow: mask = np.where(x == 1, False, mask) - elif op == rpow: + elif op is rpow: mask = np.where(y == 1, False, mask) if mask.any(): diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 01bc345a40b83..45fa6a2830af6 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -40,7 +40,7 @@ def fill_zeros(result, x, y, name, fill): Mask the nan's from x. 
""" - if fill is None or is_float_dtype(result): + if fill is None or is_float_dtype(result.dtype): return result if name.startswith(("r", "__r")): @@ -55,7 +55,7 @@ def fill_zeros(result, x, y, name, fill): if is_scalar_type: y = np.array(y) - if is_integer_dtype(y): + if is_integer_dtype(y.dtype): if (y == 0).any(): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f5add426297a7..8fe6850c84b8b 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -569,13 +569,13 @@ def _combine_frame(self, other, func, fill_value=None, level=None): ).__finalize__(self) def _combine_match_index(self, other, func, level=None): - new_data = {} if level is not None: raise NotImplementedError("'level' argument is not supported") this, other = self.align(other, join="outer", axis=0, level=level, copy=False) + new_data = {} for col, series in this.items(): new_data[col] = func(series.values, other.values) From 49d2019723b0089bd357adf6c936c5a82e0cc775 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Aug 2019 16:52:44 -0700 Subject: [PATCH 169/191] CLN: internals.blocks cleanup, typing (#27941) --- pandas/core/internals/blocks.py | 90 ++++++++++----------------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a2a51881016a3..33698d245e9ff 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import NaT, Timestamp, lib, tslib, tslibs +from pandas._libs import NaT, Timestamp, lib, tslib import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -407,7 +407,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self.copy() if self._can_hold_element(value): - # equivalent: self._try_coerce_args(value) would not raise + # equivalent: _try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) return self._maybe_downcast(blocks, downcast) @@ -669,7 +669,7 @@ def convert( return self.copy() if copy else self - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ dtype = self.values.dtype.type tipo = maybe_infer_dtype_type(element) @@ -857,12 +857,6 @@ def setitem(self, indexer, value): if self._can_hold_element(value): value = self._try_coerce_args(value) - # can keep its own dtype - if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): - dtype = self.dtype - else: - dtype = "infer" - else: # current dtype cannot store value, coerce to common dtype find_dtype = False @@ -871,15 +865,9 @@ def setitem(self, indexer, value): dtype = value.dtype find_dtype = True - elif lib.is_scalar(value): - if isna(value): - # NaN promotion is handled in latter path - dtype = False - else: - dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - find_dtype = True - else: - dtype = "infer" + elif lib.is_scalar(value) and not isna(value): + dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) + find_dtype = True if find_dtype: dtype = find_common_type([values.dtype, dtype]) @@ -1088,7 +1076,7 @@ def coerce_to_target_dtype(self, other): mytz = getattr(self.dtype, "tz", None) othertz = getattr(dtype, "tz", None) - if str(mytz) != str(othertz): + if not tz_compare(mytz, othertz): return self.astype(object) raise AssertionError( @@ -1308,7 +1296,7 
@@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: return self.make_block_same_class(new_values, new_mgr_locs) - def diff(self, n, axis=1): + def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] @@ -1397,7 +1385,7 @@ def func(cond, values, other): if not ( (self.is_integer or self.is_bool) - and lib.is_scalar(other) + and lib.is_float(other) and np.isnan(other) ): # np.where will cast integer array to floats in this case @@ -1450,7 +1438,7 @@ def func(cond, values, other): return result_blocks - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False return array_equivalent(self.values, other.values) @@ -1830,7 +1818,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: # XXX: We may need to think about pushing this onto the array. # We're doing the same as CategoricalBlock here. return True @@ -2000,7 +1988,7 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): __slots__ = () - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False left, right = self.values, other.values @@ -2011,7 +1999,7 @@ class FloatBlock(FloatOrComplexBlock): __slots__ = () is_float = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( @@ -2075,7 +2063,7 @@ class ComplexBlock(FloatOrComplexBlock): __slots__ = () is_complex = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) @@ -2092,7 +2080,7 @@ class IntBlock(NumericBlock): is_integer = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return ( @@ -2182,7 +2170,7 @@ def _astype(self, dtype, **kwargs): # delegate return super()._astype(dtype=dtype, **kwargs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: if self.is_datetimetz: @@ -2372,41 +2360,19 @@ def _slice(self, slicer): return self.values[slicer] def _try_coerce_args(self, other): - """ - localize and return i8 for the values - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, self._holder): - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - - # test we can have an equal time zone - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - else: - raise TypeError(other) - + # DatetimeArray handles this for us return other - def diff(self, n, axis=0): - 
"""1st discrete difference + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. Parameters ---------- - n : int, number of periods to diff - axis : int, axis to diff upon. default 0 + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. Returns ------- @@ -2468,7 +2434,7 @@ def setitem(self, indexer, value): ) return newb.setitem(indexer, value) - def equals(self, other): + def equals(self, other) -> bool: # override for significant performance improvement if self.dtype != other.dtype or self.shape != other.shape: return False @@ -2507,7 +2473,7 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return TimedeltaArray - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) @@ -2600,7 +2566,7 @@ class BoolBlock(NumericBlock): is_bool = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.bool_) @@ -2694,7 +2660,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] # split and convert the blocks return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: return True def _try_coerce_args(self, other): From 041b6b180f8175b642977852f01e9211983b46ce Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Tue, 27 Aug 2019 16:09:41 +0200 Subject: [PATCH 170/191] Replace with nested dict raises for overlapping keys (#27696) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/generic.py | 6 +----- pandas/tests/frame/test_replace.py | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7fe358d3820f2..7a10447e3ad40 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -207,6 +207,7 @@ ExtensionArray Other ^^^^^ - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) +- Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. 
(:issue:`26023`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fac5e0f085fc6..6ade69fb4ca9d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6669,11 +6669,7 @@ def replace( for k, v in items: keys, values = list(zip(*v.items())) or ([], []) - if set(keys) & set(values): - raise ValueError( - "Replacement not allowed with " - "overlapping keys and values" - ) + to_rep_dict[k] = list(keys) value_dict[k] = list(values) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2862615ef8585..b341ed6a52ca5 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1069,18 +1069,24 @@ def test_replace_truthy(self): e = df assert_frame_equal(r, e) - def test_replace_int_to_int_chain(self): + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement df = DataFrame({"a": list(range(1, 5))}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) - def test_replace_str_to_str_chain(self): + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) df = DataFrame({"a": astr}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(astr, bstr))}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + assert_frame_equal(result, expected) def test_replace_swapping_bug(self): df = pd.DataFrame({"a": [True, False, True]}) From bd8dbf906e4352567094637c9c824c350dae3ad2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Aug 2019 22:32:40 +0100 Subject: [PATCH 171/191] TYPING: --check-untyped-defs util._decorators (#28128) --- pandas/core/groupby/generic.py | 30 ++++++------ pandas/core/indexes/interval.py | 4 +- pandas/core/window/ewm.py | 4 +- pandas/core/window/expanding.py | 4 +- pandas/core/window/rolling.py | 10 ++-- pandas/util/_decorators.py | 82 +++++++++++++++++++-------------- 6 files changed, 74 insertions(+), 60 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ea2bd22cccc3d..7d6690a0dfa5a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -833,45 +833,45 @@ def apply(self, func, *args, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, func_or_funcs=None, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): _level = kwargs.pop("_level", None) - relabeling = func_or_funcs is None + relabeling = func is None columns = None - no_arg_message = "Must provide 'func_or_funcs' or named aggregation **kwargs." + no_arg_message = "Must provide 'func' or named aggregation **kwargs." 
if relabeling: columns = list(kwargs) if not PY36: # sort for 3.5 and earlier columns = list(sorted(columns)) - func_or_funcs = [kwargs[col] for col in columns] + func = [kwargs[col] for col in columns] kwargs = {} if not columns: raise TypeError(no_arg_message) - if isinstance(func_or_funcs, str): - return getattr(self, func_or_funcs)(*args, **kwargs) + if isinstance(func, str): + return getattr(self, func)(*args, **kwargs) - if isinstance(func_or_funcs, abc.Iterable): + if isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. - func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) - ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) + func = _maybe_mangle_lambdas(func) + ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns else: - cyfunc = self._get_cython_func(func_or_funcs) + cyfunc = self._get_cython_func(func) if cyfunc and not args and not kwargs: return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) except Exception: - result = self._aggregate_named(func_or_funcs, *args, **kwargs) + result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) ret = Series(result, index=index) @@ -1464,8 +1464,8 @@ class DataFrameGroupBy(NDFrameGroupBy): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg=None, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func=None, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3874c6404565c..021ff5fb46276 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -788,7 +788,7 @@ def _find_non_overlapping_monotonic_bounds(self, key): return start, stop def get_loc( - self, key: Any, method: Optional[str] = None + self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -982,7 +982,7 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: List of indices. 
""" if self.is_overlapping: - return self.get_indexer_non_unique(target, **kwargs)[0] + return self.get_indexer_non_unique(target)[0] return self.get_indexer(target, **kwargs) @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 0ce6d5ddec2ad..40e6c679ba72d 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -206,8 +206,8 @@ def _constructor(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index c43ca6b0565f3..47bd8f2ec593b 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -136,8 +136,8 @@ def _get_window(self, other=None, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 323089b3fdf6b..a7e122fa3528f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -901,12 +901,12 @@ def func(arg, window, min_periods=None, closed=None): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + result, how = self._aggregate(func, *args, **kwargs) if result is None: # these must apply directly - result = arg(self) + result = func(self) return result @@ -1788,8 +1788,8 @@ def _validate_freq(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 5c7d481ff2586..8a25e511b5fc4 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,21 +1,35 @@ from functools import wraps import inspect from textwrap import dedent -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) import warnings from pandas._libs.properties import cache_readonly # noqa +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) + def deprecate( name: str, - alternative: Callable, + alternative: Callable[..., Any], version: str, alt_name: Optional[str] = None, klass: Optional[Type[Warning]] = None, stacklevel: int = 2, msg: Optional[str] = None, -) -> Callable: +) -> Callable[..., Any]: """ Return a new function that emits a deprecation warning on use. 
@@ -47,7 +61,7 @@ def deprecate( warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) @wraps(alternative) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: warnings.warn(warning_msg, klass, stacklevel=stacklevel) return alternative(*args, **kwargs) @@ -90,9 +104,9 @@ def wrapper(*args, **kwargs): def deprecate_kwarg( old_arg_name: str, new_arg_name: Optional[str], - mapping: Optional[Union[Dict, Callable[[Any], Any]]] = None, + mapping: Optional[Union[Dict[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, -) -> Callable: +) -> Callable[..., Any]: """ Decorator to deprecate a keyword argument of a function. @@ -160,27 +174,27 @@ def deprecate_kwarg( "mapping from old to new argument values " "must be dict or callable!" ) - def _deprecate_kwarg(func): + def _deprecate_kwarg(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: old_arg_value = kwargs.pop(old_arg_name, None) - if new_arg_name is None and old_arg_value is not None: - msg = ( - "the '{old_name}' keyword is deprecated and will be " - "removed in a future version. " - "Please take steps to stop the use of '{old_name}'" - ).format(old_name=old_arg_name) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - kwargs[old_arg_name] = old_arg_value - return func(*args, **kwargs) - if old_arg_value is not None: - if mapping is not None: - if hasattr(mapping, "get"): - new_arg_value = mapping.get(old_arg_value, old_arg_value) - else: + if new_arg_name is None: + msg = ( + "the '{old_name}' keyword is deprecated and will be " + "removed in a future version. " + "Please take steps to stop the use of '{old_name}'" + ).format(old_name=old_arg_name) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + kwargs[old_arg_name] = old_arg_value + return func(*args, **kwargs) + + elif mapping is not None: + if callable(mapping): new_arg_value = mapping(old_arg_value) + else: + new_arg_value = mapping.get(old_arg_value, old_arg_value) msg = ( "the {old_name}={old_val!r} keyword is deprecated, " "use {new_name}={new_val!r} instead" @@ -198,7 +212,7 @@ def wrapper(*args, **kwargs): ).format(old_name=old_arg_name, new_name=new_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - if kwargs.get(new_arg_name, None) is not None: + if kwargs.get(new_arg_name) is not None: msg = ( "Can only specify '{old_name}' or '{new_name}', " "not both" ).format(old_name=old_arg_name, new_name=new_arg_name) @@ -207,17 +221,17 @@ def wrapper(*args, **kwargs): kwargs[new_arg_name] = new_arg_value return func(*args, **kwargs) - return wrapper + return cast(F, wrapper) return _deprecate_kwarg def rewrite_axis_style_signature( name: str, extra_params: List[Tuple[str, Any]] -) -> Callable: - def decorate(func): +) -> Callable[..., Any]: + def decorate(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: return func(*args, **kwargs) kind = inspect.Parameter.POSITIONAL_OR_KEYWORD @@ -234,8 +248,9 @@ def wrapper(*args, **kwargs): sig = inspect.Signature(params) - func.__signature__ = sig - return wrapper + # https://github.com/python/typing/issues/598 + func.__signature__ = sig # type: ignore + return cast(F, wrapper) return decorate @@ -279,18 +294,17 @@ def __init__(self, *args, **kwargs): self.params = args or kwargs - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ and func.__doc__ % 
self.params return func def update(self, *args, **kwargs) -> None: """ Update self.params with supplied args. - - If called, we assume self.params is a dict. """ - self.params.update(*args, **kwargs) + if isinstance(self.params, dict): + self.params.update(*args, **kwargs) class Appender: @@ -320,7 +334,7 @@ def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): self.addendum = addendum self.join = join - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ if func.__doc__ else "" self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] From 080d57ee9fef9275518908cb7665ea062684c29b Mon Sep 17 00:00:00 2001 From: Addison Lynch Date: Tue, 27 Aug 2019 17:39:03 -0400 Subject: [PATCH 172/191] CLN: Use ABC classes for isinstance checks, remove unnecessary imports (#28158) * CLN: Use ABC classes for isinstance checks, remove unnecessary imports * Formatting repairs --- pandas/core/frame.py | 27 +++++++++++---------------- pandas/core/indexing.py | 40 +++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f636bb6db7430..3d1a39a86c784 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -86,12 +86,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import ( - Index, - MultiIndex, - ensure_index, - ensure_index_from_sequences, -) +from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import maybe_droplevels @@ -1734,7 +1729,7 @@ def to_records( if is_datetime64_any_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): # array of tuples to numpy cols. copy copy copy ix_vals = list(map(np.array, zip(*self.index.values))) else: @@ -1745,7 +1740,7 @@ def to_records( count = 0 index_names = list(self.index.names) - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): for i, n in enumerate(index_names): if n is None: index_names[i] = "level_%d" % count @@ -2868,7 +2863,7 @@ def __getitem__(self, key): # The behavior is inconsistent. 
It returns a Series, except when # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) - if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex): data = data[key] return data @@ -3657,7 +3652,7 @@ def reindexer(value): elif isinstance(value, DataFrame): # align right-hand-side columns if self.columns # is multi-index and self[key] is a sub-frame - if isinstance(self.columns, MultiIndex) and key in self.columns: + if isinstance(self.columns, ABCMultiIndex) and key in self.columns: loc = self.columns.get_loc(key) if isinstance(loc, (slice, Series, np.ndarray, Index)): cols = maybe_droplevels(self.columns[loc], key) @@ -3706,7 +3701,7 @@ def reindexer(value): # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: - if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -4601,7 +4596,7 @@ def _maybe_casted_values(index, labels=None): new_index = self.index.droplevel(level) if not drop: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): names = [ n if n is not None else ("level_%d" % i) for (i, n) in enumerate(self.index.names) @@ -4612,7 +4607,7 @@ def _maybe_casted_values(index, labels=None): names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) - multi_col = isinstance(self.columns, MultiIndex) + multi_col = isinstance(self.columns, ABCMultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): if not (level is None or i in level): continue @@ -4994,7 +4989,7 @@ def sort_index( level, ascending=ascending, sort_remaining=sort_remaining ) - elif isinstance(labels, MultiIndex): + elif isinstance(labels, ABCMultiIndex): from pandas.core.sorting import lexsort_indexer indexer = lexsort_indexer( @@ -5280,7 +5275,7 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -7784,7 +7779,7 @@ def _count_level(self, level, axis=0, numeric_only=False): count_axis = frame._get_axis(axis) agg_axis = frame._get_agg_axis(axis) - if not isinstance(count_axis, MultiIndex): + if not isinstance(count_axis, ABCMultiIndex): raise TypeError( "Can only count levels on hierarchical " "{ax}.".format(ax=self._get_axis_name(axis)) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b8ca3419af4d7..3d495eeb8c885 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,11 +22,11 @@ is_sparse, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.index import Index, InvalidIndexError, MultiIndex +from pandas.core.index import Index, InvalidIndexError from pandas.core.indexers import is_list_like_indexer, length_of_indexer @@ -172,7 
+172,7 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) - if isinstance(ax, MultiIndex) and self.name != "iloc": + if isinstance(ax, ABCMultiIndex) and self.name != "iloc": try: return ax.get_loc(key) except Exception: @@ -241,7 +241,7 @@ def _has_valid_tuple(self, key: Tuple): ) def _is_nested_tuple_indexer(self, tup: Tuple): - if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): + if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False @@ -329,7 +329,7 @@ def _setitem_with_indexer(self, indexer, value): # GH 10360, GH 27841 if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): for i, ax in zip(indexer, self.obj.axes): - if isinstance(ax, MultiIndex) and not ( + if isinstance(ax, ABCMultiIndex) and not ( is_integer(i) or com.is_null_slice(i) ): take_split_path = True @@ -422,7 +422,9 @@ def _setitem_with_indexer(self, indexer, value): # if we have a partial multiindex, then need to adjust the plane # indexer here - if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex): + if len(labels) == 1 and isinstance( + self.obj[labels[0]].axes[0], ABCMultiIndex + ): item = labels[0] obj = self.obj[item] index = obj.index @@ -495,7 +497,7 @@ def setter(item, v): # we have an equal len Frame if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) - multiindex_indexer = isinstance(labels, MultiIndex) + multiindex_indexer = isinstance(labels, ABCMultiIndex) for item in labels: if item in value: @@ -777,8 +779,8 @@ def _align_frame(self, indexer, df: ABCDataFrame): # we have a multi-index and are trying to align # with a particular, level GH3738 if ( - isinstance(ax, MultiIndex) - and isinstance(df.index, MultiIndex) + isinstance(ax, ABCMultiIndex) + and isinstance(df.index, ABCMultiIndex) and ax.nlevels != df.index.nlevels ): raise TypeError( @@ -904,7 +906,7 @@ def _getitem_lowerdim(self, tup: Tuple): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, MultiIndex) and self.name != "iloc": + if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": result = self._handle_lowerdim_multi_index_axis0(tup) if result is not None: return result @@ -1004,7 +1006,7 @@ def _getitem_axis(self, key, axis: int): if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) elif is_list_like_indexer(key) and not ( - isinstance(key, tuple) and isinstance(labels, MultiIndex) + isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) ): if hasattr(key, "ndim") and key.ndim > 1: @@ -1017,7 +1019,7 @@ def _getitem_axis(self, key, axis: int): key = labels._maybe_cast_indexer(key) if is_integer(key): - if axis == 0 and isinstance(labels, MultiIndex): + if axis == 0 and isinstance(labels, ABCMultiIndex): try: return self._get_label(key, axis=axis) except (KeyError, TypeError): @@ -1228,7 +1230,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): try: return labels.get_loc(obj) except LookupError: - if isinstance(obj, tuple) and isinstance(labels, MultiIndex): + if isinstance(obj, tuple) and isinstance(labels, ABCMultiIndex): if len(obj) == labels.nlevels: return {"key": obj} raise @@ -1248,7 +1250,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): # always valid return {"key": obj} - if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): + if obj >= 
self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex): # a positional raise ValueError("cannot set by positional indexing with enlargement") @@ -1715,7 +1717,7 @@ def _is_scalar_access(self, key: Tuple): return False ax = self.obj.axes[i] - if isinstance(ax, MultiIndex): + if isinstance(ax, ABCMultiIndex): return False if isinstance(k, str) and ax._supports_partial_string_indexing: @@ -1737,7 +1739,7 @@ def _getitem_scalar(self, key): def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if ( isinstance(key, str) and labels.levels[0]._supports_partial_string_indexing @@ -1781,7 +1783,7 @@ def _getitem_axis(self, key, axis: int): # to a list of keys # we will use the *values* of the object # and NOT the index if its a PandasObject - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1: # Series, or 0,1 ndim ndarray @@ -1809,7 +1811,7 @@ def _getitem_axis(self, key, axis: int): key = tuple([key]) # an iterable multi-selection - if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): + if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)): if hasattr(key, "ndim") and key.ndim > 1: raise ValueError("Cannot index with multidimensional key") @@ -2474,7 +2476,7 @@ def is_nested_tuple(tup, labels): for i, k in enumerate(tup): if is_list_like(k) or isinstance(k, slice): - return isinstance(labels, MultiIndex) + return isinstance(labels, ABCMultiIndex) return False From d91ffa6407c1baf6afe7d0a1b9655f44da77ac24 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Aug 2019 22:50:22 +0100 Subject: [PATCH 173/191] TYPING: change to FrameOrSeries Alias in pandas._typing (#28173) --- pandas/_typing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 837a7a89e0b83..37a5d7945955d 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -11,9 +11,9 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.frame import DataFrame # noqa: F401 from pandas.core.series import Series # noqa: F401 from pandas.core.sparse.series import SparseSeries # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 AnyArrayLike = TypeVar( @@ -24,7 +24,10 @@ Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] -FrameOrSeries = TypeVar("FrameOrSeries", "Series", "DataFrame") +FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") Scalar = Union[str, int, float] Axis = Union[str, int] Ordered = Optional[bool] + +# to maintain type information across generic functions and parametrization +_T = TypeVar("_T") From 612d3b23da5b99f6c5642be574fb08713a45d7d1 Mon Sep 17 00:00:00 2001 From: killerontherun1 Date: Thu, 29 Aug 2019 02:04:56 +0530 Subject: [PATCH 174/191] Solving GL01,GL02 in pandas.Interval and a few mentioned in the comments (#28197) --- pandas/core/indexes/interval.py | 3 ++- pandas/io/sql.py | 3 ++- pandas/io/stata.py | 2 +- pandas/plotting/_misc.py | 6 ++++-- pandas/util/testing.py | 6 ++++-- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 
021ff5fb46276..6b0081c6a2ff5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -331,7 +331,8 @@ def __contains__(self, key): >>> idx.to_tuples() Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') + """, ) ) def to_tuples(self, na_tuple=True): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f1f52a9198d29..72df00fd4c5a1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -269,7 +269,8 @@ def read_sql_query( parse_dates=None, chunksize=None, ): - """Read SQL query into a DataFrame. + """ + Read SQL query into a DataFrame. Returns a DataFrame corresponding to the result set of the query string. Optionally provide an `index_col` parameter to use one of the diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 69bafc7749258..31fdaa5cc6735 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -138,7 +138,7 @@ _iterator_params, ) -_data_method_doc = """\ +_data_method_doc = """ Read observations from Stata file, converting them into a dataframe .. deprecated:: diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 1cba0e7354182..7ed0ffc6d0115 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -329,7 +329,8 @@ def parallel_coordinates( sort_labels=False, **kwds ): - """Parallel coordinates plotting. + """ + Parallel coordinates plotting. Parameters ---------- @@ -392,7 +393,8 @@ def parallel_coordinates( def lag_plot(series, lag=1, ax=None, **kwds): - """Lag plot for time series. + """ + Lag plot for time series. Parameters ---------- diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a8f0d0da52e1f..0d543f891a5f6 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -580,7 +580,8 @@ def assert_index_equal( check_categorical: bool = True, obj: str = "Index", ) -> None: - """Check that left and right Index are equal. + """ + Check that left and right Index are equal. Parameters ---------- @@ -1081,7 +1082,8 @@ def assert_series_equal( check_categorical=True, obj="Series", ): - """Check that left and right Series are equal. + """ + Check that left and right Series are equal. 
Parameters ---------- diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7..9845b1ac3a4b9 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1070,6 +1070,16 @@ def test_series_indexing_zerodim_np_array(self): result = s.loc[np.array(0)] assert result == 1 + def test_loc_reverse_assignment(self): + # GH26939 + data = [1, 2, 3, 4, 5, 6] + [None] * 4 + expected = Series(data, index=range(2010, 2020)) + + result = pd.Series(index=range(2010, 2020)) + result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] + + tm.assert_series_equal(result, expected) + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 From 2518040894ef00d9ce427539937a86b2328a9e50 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 29 Aug 2019 05:17:03 -0700 Subject: [PATCH 176/191] STY: whitespace before class docstrings (#28209) --- pandas/core/base.py | 1 - pandas/core/computation/expr.py | 8 ++++---- pandas/core/computation/pytables.py | 2 -- pandas/core/groupby/groupby.py | 1 - pandas/core/groupby/grouper.py | 1 - pandas/core/groupby/ops.py | 1 - pandas/core/indexes/frozen.py | 1 - pandas/core/sorting.py | 1 - pandas/io/common.py | 1 - pandas/io/packers.py | 1 - pandas/io/pytables.py | 14 -------------- pandas/tests/io/test_sql.py | 1 - pandas/tests/reshape/test_concat.py | 1 - 13 files changed, 4 insertions(+), 30 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 767b559445038..2d5ffb5e91392 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -47,7 +47,6 @@ class PandasObject(DirNamesMixin): - """baseclass for various pandas objects""" @property diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4c164968575a1..45319a4d63d94 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -367,8 +367,8 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): - - """Custom ast walker. Parsers of other engines should subclass this class + """ + Custom ast walker. Parsers of other engines should subclass this class if necessary. Parameters ---------- @@ -803,8 +803,8 @@ def __init__(self, env, engine, parser, preparser=lambda x: x): class Expr: - - """Object encapsulating an expression. + """ + Object encapsulating an expression.
Parameters ---------- diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 1523eb05ac41d..81658ab23ba46 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -478,7 +478,6 @@ def _validate_where(w): class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' Parameters @@ -573,7 +572,6 @@ def evaluate(self): class TermValue: - """ hold a term value the we use to construct a condition/filter """ def __init__(self, value, converted, kind): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 87047d2170992..4d21b5810470a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1011,7 +1011,6 @@ def _apply_filter(self, indices, dropna): class GroupBy(_GroupBy): - """ Class for grouping and aggregating relational data. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3415c0e056a1c..31623171e9e63 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -217,7 +217,6 @@ def __repr__(self): class Grouping: - """ Holds the grouping information for a single key diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b0c629f017dd3..5ad48fa675dd9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -706,7 +706,6 @@ def _aggregate_series_pure_python(self, obj, func): class BinGrouper(BaseGrouper): - """ This is an internal Grouper class diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 2e5b3ff8ef502..329456e25bded 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -22,7 +22,6 @@ class FrozenList(PandasObject, list): - """ Container that doesn't allow setting item *but* because it's technically non-hashable, will be used diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5db31fe6664ea..e6edad656d430 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -271,7 +271,6 @@ def nargsort(items, kind="quicksort", ascending=True, na_position="last"): class _KeyMapper: - """ Ease my suffering. Map compressed group id -> key tuple """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 290022167e520..30228d660e816 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -576,7 +576,6 @@ def __next__(self) -> str: class UTF8Recoder(BaseIterator): - """ Iterator that reads an encoded stream and re-encodes the input to UTF-8 """ diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 04e49708ff082..ad47ba23b9221 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -846,7 +846,6 @@ def __init__( class Iterator: - """ manage the unpacking iteration, close the file on completion """ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 576c45a2f8097..fbe413f820c90 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -429,7 +429,6 @@ def _is_metadata_of(group, parent_group): class HDFStore: - """ Dict-like IO interface for storing pandas objects in PyTables. 
@@ -1546,7 +1545,6 @@ def _read_group(self, group, **kwargs): class TableIterator: - """ define the iteration interface on a table Parameters @@ -1654,7 +1652,6 @@ def get_result(self, coordinates=False): class IndexCol: - """ an index column description class Parameters @@ -1968,7 +1965,6 @@ def write_metadata(self, handler): class GenericIndexCol(IndexCol): - """ an index which is not represented in the data of the table """ @property @@ -2006,7 +2002,6 @@ def set_attr(self): class DataCol(IndexCol): - """ a data holding column, by definition this is not indexable Parameters @@ -2456,7 +2451,6 @@ def set_attr(self): class DataIndexableCol(DataCol): - """ represent a data column that can be indexed """ is_data_indexable = True @@ -2479,7 +2473,6 @@ def get_atom_timedelta64(self, block): class GenericDataIndexableCol(DataIndexableCol): - """ represent a generic pytables data column """ def get_attr(self): @@ -2487,7 +2480,6 @@ def get_attr(self): class Fixed: - """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -2655,7 +2647,6 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class GenericFixed(Fixed): - """ a generified fixed version """ _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} @@ -3252,7 +3243,6 @@ class FrameFixed(BlockManagerFixed): class Table(Fixed): - """ represent a table: facilitate read/write of various types of tables @@ -4127,7 +4117,6 @@ def read_column(self, column, where=None, start=None, stop=None): class WORMTable(Table): - """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -4149,7 +4138,6 @@ def write(self, **kwargs): class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -4603,7 +4591,6 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): - """ a frame with a multi-index """ table_type = "appendable_multiframe" @@ -4962,7 +4949,6 @@ def _need_convert(kind): class Selection: - """ Carries out a selection operation on a tables.Table object. diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d8465a427eaea..25727447b4c6f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -565,7 +565,6 @@ def _transaction_test(self): class _TestSQLApi(PandasSQLTest): - """ Base class to test the public API. diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 6366bf0521fbc..13f0f14014a31 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -50,7 +50,6 @@ def sort_with_none(request): class TestConcatAppendCommon: - """ Test common dtype coercion rules between concat and append. 
""" From 5f34933848d7daa129651a53158cb94367bacbcd Mon Sep 17 00:00:00 2001 From: DavidRosen Date: Thu, 29 Aug 2019 08:31:31 -0400 Subject: [PATCH 177/191] DOC: Example for adding a calculated column in SQL and Pandas (#28182) * Add example for adding a calculated column --- .../comparison/comparison_with_sql.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 366fdd546f58b..6a03c06de3699 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -49,6 +49,20 @@ With pandas, column selection is done by passing a list of column names to your Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). +In SQL, you can add a calculated column: + +.. code-block:: sql + + SELECT *, tip/total_bill as tip_rate + FROM tips + LIMIT 5; + +With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to append a new column: + +.. ipython:: python + + tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + WHERE ----- Filtering in SQL is done via a WHERE clause. From 03b3c8fc82b3a18a3ddcad1b3b26d601467fc74c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Aug 2019 20:28:54 +0100 Subject: [PATCH 178/191] CLN: minor typos MutliIndex -> MultiIndex (#28223) --- doc/source/whatsnew/v0.20.0.rst | 2 +- pandas/tests/frame/test_reshape.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ef6108ae3ec90..62604dd3edd2d 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -495,7 +495,7 @@ Other enhancements - :func:`pandas.util.hash_pandas_object` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) -- ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`). +- ``pd.read_html()`` will parse multiple header rows, creating a MultiIndex header. (:issue:`13434`). - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) - :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`) - :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f3452e9a85fb3..84e343f07f990 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -984,7 +984,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() - # `MutliIndex.from_product` preserves categorical dtype - + # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. 
midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) From d9b3993cc3722ddd01367089d374652c0b5ce0ce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 07:28:37 -0700 Subject: [PATCH 179/191] reduction -> libreduction for grepability (#28184) --- pandas/core/apply.py | 6 +++--- pandas/core/groupby/ops.py | 10 +++++----- pandas/tests/groupby/test_bin_groupby.py | 16 ++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5c8599dbb054b..b96b3c7572031 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -3,7 +3,7 @@ import numpy as np -from pandas._libs import reduction +from pandas._libs import reduction as libreduction from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -221,7 +221,7 @@ def apply_raw(self): """ apply to the values as a numpy array """ try: - result = reduction.compute_reduction(self.values, self.f, axis=self.axis) + result = libreduction.compute_reduction(self.values, self.f, axis=self.axis) except Exception: result = np.apply_along_axis(self.f, self.axis, self.values) @@ -281,7 +281,7 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.compute_reduction( + result = libreduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) return self.obj._constructor_sliced(result, index=labels) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5ad48fa675dd9..7afb0a28f943e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,7 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby -import pandas._libs.reduction as reduction +import pandas._libs.reduction as libreduction from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -207,7 +207,7 @@ def apply(self, f, data, axis=0): if len(result_values) == len(group_keys): return group_keys, result_values, mutated - except reduction.InvalidApply: + except libreduction.InvalidApply: # Cannot fast apply on MultiIndex (_has_complex_internals). # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python.
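# Note: `fast_apply` (below) hands the sorted frame to
# libreduction.apply_frame_axis0; when that raises InvalidApply, the
# except branch above lets the caller fall back to applying `f`
# group-by-group in pure Python, so an error raised by `f` itself
# surfaces with an ordinary Python traceback.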
@@ -678,7 +678,7 @@ def _aggregate_series_fast(self, obj, func): indexer = get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer) group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) - grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) + grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() return result, counts @@ -851,7 +851,7 @@ def groupings(self): def agg_series(self, obj, func): dummy = obj[:0] - grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() @@ -939,7 +939,7 @@ def fast_apply(self, f, names): return [], True sdata = self._get_sorted_data() - return reduction.apply_frame_axis0(sdata, f, names, starts, ends) + return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) def _chop(self, sdata, slice_obj): if self.axis == 0: diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 2195686ee9c7f..b8f9ecd42bae3 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -2,7 +2,7 @@ from numpy import nan import pytest -from pandas._libs import groupby, lib, reduction +from pandas._libs import groupby, lib, reduction as libreduction from pandas.core.dtypes.common import ensure_int64 @@ -18,7 +18,7 @@ def test_series_grouper(): labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) + grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) result, counts = grouper.get_result() expected = np.array([obj[3:6].mean(), obj[6:].mean()]) @@ -34,7 +34,7 @@ def test_series_bin_grouper(): bins = np.array([3, 6]) - grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy) result, counts = grouper.get_result() expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) @@ -120,31 +120,31 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = reduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(100)) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) ) expected = arr.sum(0) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) assert_almost_equal(result, expected) From 82a7455f8a69b99e9508e6f69bae943072d12a1b Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 30 Aug 2019 08:32:27 -0600 Subject: [PATCH 180/191] REGR: Fix to_csv with IntervalIndex (#28229) * REGR: Fix to_csv with IntervalIndex --- 
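For reference, a minimal snippet that reproduces the regression, mirroring the test added below (the file name is illustrative):

    import pandas as pd

    s = pd.Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3))
    s.to_csv("tmp.csv", header=False)  # raised TypeError before this fix

As the new tests note, an IntervalIndex does not round-trip through read_csv; it is read back as its string repr (GH 23595).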
doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/core/indexes/interval.py | 8 +--- pandas/tests/frame/test_to_csv.py | 14 +++++++ .../tests/indexes/interval/test_interval.py | 40 +++++++++++++++++++ pandas/tests/series/test_io.py | 14 +++++++ 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 6974c7521a237..8d8a39139cf84 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -62,7 +62,7 @@ Missing I/O ^^^ -- +- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) - - diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6b0081c6a2ff5..7c581a12764b1 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1096,12 +1096,8 @@ def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): - """ actually format my specific types """ - from pandas.io.formats.format import ExtensionArrayFormatter - - return ExtensionArrayFormatter( - values=self, na_rep=na_rep, justify="all", leading_space=False - ).get_result() + # GH 28210: use base method but with different default na_rep + return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _format_data(self, name=None): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e2e4a82ff581c..8fb028a0f0326 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -695,6 +695,20 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 + def test_to_csv_interval_index(self): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + expected.index = expected.index.astype(str) + + assert_frame_equal(result, expected) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c1a21e6a7f152..eeb0f43f4b900 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -417,6 +417,46 @@ def test_repr_missing(self, constructor, expected): result = repr(obj) assert result == expected + @pytest.mark.parametrize( + "tuples, closed, expected_data", + [ + ([(0, 1), (1, 2), (2, 3)], "left", ["[0, 1)", "[1, 2)", "[2, 3)"]), + ( + [(0.5, 1.0), np.nan, (2.0, 3.0)], + "right", + ["(0.5, 1.0]", "NaN", "(2.0, 3.0]"], + ), + ( + [ + (Timestamp("20180101"), Timestamp("20180102")), + np.nan, + ((Timestamp("20180102"), Timestamp("20180103"))), + ], + "both", + ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + ), + ( + [ + (Timedelta("0 days"), Timedelta("1 days")), + (Timedelta("1 days"), Timedelta("2 days")), + np.nan, + ], + "neither", + [ + "(0 days 00:00:00, 1 days 00:00:00)", + "(1 days 00:00:00, 2 days 00:00:00)", + "NaN", + ], + ), + ], + ) + def test_to_native_types(self, tuples, closed, expected_data): + # GH 28210 + 
index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.to_native_types() + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0686b397cbd81..0ddf1dfcabb59 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -191,6 +191,20 @@ def test_to_csv_compression(self, s, encoding, compression): s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding) ) + def test_to_csv_interval_index(self): + # GH 28210 + s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + s.to_csv(path, header=False) + result = self.read_csv(path, index_col=0, squeeze=True) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s.copy() + expected.index = expected.index.astype(str) + + assert_series_equal(result, expected) + class TestSeriesIO: def test_to_frame(self, datetime_series): From 7b25463abeeea197f55ff2d5187938dd4cba08ce Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Fri, 30 Aug 2019 16:47:16 +0200 Subject: [PATCH 181/191] BUG: Multiple lambdas in named aggregation (#27921) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/generic.py | 42 ++++- .../tests/groupby/aggregate/test_aggregate.py | 149 +++++++++++++++++- 3 files changed, 187 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 050a26cc86d42..83beec5607986 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -178,6 +178,7 @@ Groupby/resample/rolling - - - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) +- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7d6690a0dfa5a..b0bcd1cc1e27c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -268,7 +268,9 @@ def aggregate(self, func, *args, **kwargs): result.index = np.arange(len(result)) if relabeling: - result = result[order] + + # used reordered index of columns + result = result.iloc[:, order] result.columns = columns return result._convert(datetime=True) @@ -1731,8 +1733,8 @@ def _normalize_keyword_aggregation(kwargs): The transformed kwargs. columns : List[str] The user-provided keys. - order : List[Tuple[str, str]] - Pairs of the input and output column names. + col_idx_order : List[int] + List of columns indices. 
Examples -------- @@ -1759,7 +1761,39 @@ def _normalize_keyword_aggregation(kwargs): else: aggspec[column] = [aggfunc] order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - return aggspec, columns, order + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')]) + [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] # TODO: Can't use, because mypy doesn't like us setting __name__ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 52d4fa76bf879..aa80c461a00e7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _maybe_mangle_lambdas +from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -560,3 +560,150 @@ def test_with_kwargs(self): result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]}) tm.assert_frame_equal(result, expected) + + def test_agg_with_one_lambda(self): + # GH 25719, write tests for DataFrameGroupby.agg with only one lambda + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + # sort for 35 and earlier + columns = ["height_sqr_min", "height_max", "weight_max"] + if compat.PY35: + columns = ["height_max", "height_sqr_min", "weight_max"] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check pd.NameAgg case + result1 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + ) + tm.assert_frame_equal(result1, expected) + + # check agg(key=(col, aggfunc)) case + result2 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + ) + tm.assert_frame_equal(result2, expected) + + def test_agg_multiple_lambda(self): + # GH25719, test for DataFrameGroupby.agg with multiple lambdas + # with mixed aggfunc + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0],
"weight": [7.9, 7.5, 9.9, 198.0], + } + ) + # sort for 35 and earlier + columns = [ + "height_sqr_min", + "height_max", + "weight_max", + "height_max_2", + "weight_min", + ] + if compat.PY35: + columns = [ + "height_max", + "height_max_2", + "height_sqr_min", + "weight_max", + "weight_min", + ] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + "height_max_2": [9.5, 34.0], + "weight_min": [7.9, 7.5], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check agg(key=(col, aggfunc)) case + result1 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + height_max_2=("height", lambda x: np.max(x)), + weight_min=("weight", lambda x: np.min(x)), + ) + tm.assert_frame_equal(result1, expected) + + # check pd.NamedAgg case + result2 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), + weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), + ) + tm.assert_frame_equal(result2, expected) + + @pytest.mark.parametrize( + "order, expected_reorder", + [ + ( + [ + ("height", ""), + ("height", "max"), + ("weight", "max"), + ("height", ""), + ("weight", ""), + ], + [ + ("height", "_0"), + ("height", "max"), + ("weight", "max"), + ("height", "_1"), + ("weight", ""), + ], + ), + ( + [ + ("col2", "min"), + ("col1", ""), + ("col1", ""), + ("col1", ""), + ], + [ + ("col2", "min"), + ("col1", "_0"), + ("col1", "_1"), + ("col1", "_2"), + ], + ), + ( + [("col", ""), ("col", ""), ("col", "")], + [("col", "_0"), ("col", "_1"), ("col", "_2")], + ), + ], + ) + def test_make_unique(self, order, expected_reorder): + # GH 27519, test if make_unique function reorders correctly + result = _make_unique(order) + + assert result == expected_reorder From 51db82d9cc1abcec6c912d83e714811005471379 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 09:38:39 -0700 Subject: [PATCH 182/191] PERF: lazify pytz seqToRE call, trims 35ms from import (#28228) --- pandas/_libs/tslibs/strptime.pyx | 44 +++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d93858cff5e05..fbda5f178e164 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -341,7 +341,8 @@ def array_strptime(object[:] values, object fmt, return result, result_timezone.base -"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored +""" +_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored from the standard library, see https://github.com/python/cpython/blob/master/Lib/_strptime.py The original module-level docstring follows. @@ -363,7 +364,8 @@ def _getlang(): class LocaleTime: - """Stores and handles locale-specific information related to time. + """ + Stores and handles locale-specific information related to time. ATTRIBUTES: f_weekday -- full weekday names (7-item list) @@ -382,7 +384,8 @@ class LocaleTime: """ def __init__(self): - """Set all attributes. + """ + Set all attributes. Order of methods called matters for dependency reasons. 
@@ -399,7 +402,6 @@ class LocaleTime: Only other possible issue is if someone changed the timezone and did not call tz.tzset . That is an issue for the programmer, though, since changing the timezone is worthless without that call. - """ self.lang = _getlang() self.__calc_weekday() @@ -518,15 +520,16 @@ class TimeRE(dict): """ def __init__(self, locale_time=None): - """Create keys/values. + """ + Create keys/values. Order of execution is important for dependency reasons. - """ if locale_time: self.locale_time = locale_time else: self.locale_time = LocaleTime() + self._Z = None base = super() base.__init__({ # The " \d" part of the regex is to make %c from ANSI C work @@ -555,21 +558,29 @@ class TimeRE(dict): 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE(pytz.all_timezones, 'Z'), + # 'Z' key is generated lazily via __getitem__ '%': '%'}) base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) base.__setitem__('x', self.pattern(self.locale_time.LC_date)) base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + def __getitem__(self, key): + if key == "Z": + # lazy computation + if self._Z is None: + self._Z = self.__seqToRE(pytz.all_timezones, 'Z') + return self._Z + return super().__getitem__(key) + def __seqToRE(self, to_convert, directive): - """Convert a list to a regex string for matching a directive. + """ + Convert a list to a regex string for matching a directive. Want possible matching values to be from longest to shortest. This prevents the possibility of a match occurring for a value that also a substring of a larger value that should have matched (e.g., 'abc' matching when 'abcdef' should have been the match). - """ to_convert = sorted(to_convert, key=len, reverse=True) for value in to_convert: @@ -582,11 +593,11 @@ class TimeRE(dict): return '%s)' % regex def pattern(self, format): - """Return regex pattern for the format string. + """ + Return regex pattern for the format string. Need to make sure that any characters that might be interpreted as regex syntax are escaped. - """ processed_format = '' # The sub() call escapes all characters that might be misconstrued @@ -619,7 +630,8 @@ _regex_cache = {} cdef int _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): - """Calculate the Julian day based on the year, week of the year, and day of + """ + Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0). @@ -660,8 +672,10 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, return 1 + days_to_week + day_of_week -cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): - """Calculate the Julian day based on the ISO 8601 year, week, and weekday. +cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): + """ + Calculate the Julian day based on the ISO 8601 year, week, and weekday. + ISO weeks start on Mondays, with week 01 being the week containing 4 Jan. ISO week days range from 1 (Monday) to 7 (Sunday). 
@@ -694,7 +708,7 @@ cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): return iso_year, ordinal -cdef parse_timezone_directive(object z): +cdef parse_timezone_directive(str z): """ Parse the '%z' directive and return a pytz.FixedOffset From 75c9783d4924c98d84e9722060686fc7b4643259 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 30 Aug 2019 19:05:31 +0200 Subject: [PATCH 183/191] STYLE: run pre-commit filters on the repo (#27915) * add isort:skip to "from .pandas_vb_common import setup" * add isort:skip to noqa: E402 marked lines * run black * add noqa: E402 isort:skip where needed * run pre-commit filters on asv_bench/benchmarks/ * parse the isort config when using pre-commit * run isort on pandas/core/api.py * run pre-commit filters and commit trivial import sorting changes * specify flake8 errors in pandas/io/msgpack/__init__.py * fix imports for doc/source/conf.py * fix the [isort] skip entry in setup.cfg Also I removed the files for which I have fixed the problems. --- .pre-commit-config.yaml | 36 ++++++----- asv_bench/benchmarks/attrs_caching.py | 3 +- asv_bench/benchmarks/binary_ops.py | 3 +- asv_bench/benchmarks/categoricals.py | 6 +- asv_bench/benchmarks/ctors.py | 5 +- asv_bench/benchmarks/dtypes.py | 10 +-- asv_bench/benchmarks/eval.py | 3 +- asv_bench/benchmarks/frame_ctor.py | 5 +- asv_bench/benchmarks/frame_methods.py | 4 +- asv_bench/benchmarks/gil.py | 9 +-- asv_bench/benchmarks/groupby.py | 3 +- asv_bench/benchmarks/index_object.py | 14 +++-- asv_bench/benchmarks/indexing.py | 17 ++--- asv_bench/benchmarks/inference.py | 7 ++- asv_bench/benchmarks/io/csv.py | 7 ++- asv_bench/benchmarks/io/excel.py | 6 +- asv_bench/benchmarks/io/hdf.py | 5 +- asv_bench/benchmarks/io/json.py | 5 +- asv_bench/benchmarks/io/msgpack.py | 4 +- asv_bench/benchmarks/io/pickle.py | 3 +- asv_bench/benchmarks/io/sql.py | 7 ++- asv_bench/benchmarks/io/stata.py | 3 +- asv_bench/benchmarks/join_merge.py | 5 +- asv_bench/benchmarks/multiindex_object.py | 5 +- asv_bench/benchmarks/offset.py | 3 +- asv_bench/benchmarks/pandas_vb_common.py | 3 +- asv_bench/benchmarks/period.py | 1 + asv_bench/benchmarks/plotting.py | 7 ++- asv_bench/benchmarks/reindex.py | 6 +- asv_bench/benchmarks/replace.py | 3 +- asv_bench/benchmarks/reshape.py | 7 ++- asv_bench/benchmarks/rolling.py | 5 +- asv_bench/benchmarks/series_methods.py | 5 +- asv_bench/benchmarks/sparse.py | 2 +- asv_bench/benchmarks/stat_ops.py | 4 +- asv_bench/benchmarks/strings.py | 3 +- asv_bench/benchmarks/timeseries.py | 6 +- ci/print_skipped.py | 2 +- doc/logo/pandas_logo.py | 3 +- doc/make.py | 8 +-- doc/source/conf.py | 24 +++++--- doc/source/user_guide/io.rst | 2 +- doc/sphinxext/contributors.py | 3 +- pandas/core/api.py | 61 +++++++++--------- pandas/io/msgpack/__init__.py | 14 +++-- pandas/tests/io/pytables/test_pytables.py | 5 +- pandas/tests/io/test_feather.py | 2 +- scripts/find_commits_touching_func.py | 6 +- scripts/generate_pip_deps_from_conda.py | 2 +- scripts/merge-pr.py | 7 ++- scripts/tests/test_validate_docstrings.py | 7 ++- scripts/validate_docstrings.py | 32 +++++----- setup.cfg | 75 +++++------------------ setup.py | 20 +++--- 54 files changed, 255 insertions(+), 248 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 32ffb3330564c..5cc22c638c9b1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,17 +1,21 @@ repos: - - repo: https://github.com/python/black - rev: stable - hooks: - - id: black - language_version: python3.7 - - repo: 
https://gitlab.com/pycqa/flake8 - rev: 3.7.7 - hooks: - - id: flake8 - language: python_venv - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 - hooks: - - id: isort - language: python_venv +- repo: https://github.com/python/black + rev: stable + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.7 + hooks: + - id: flake8 + language: python_venv + additional_dependencies: [flake8-comprehensions] +- repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.20 + hooks: + - id: isort + language: python_venv +- repo: https://github.com/asottile/seed-isort-config + rev: v1.9.2 + hooks: + - id: seed-isort-config diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index c43e5dfd729aa..501e27b9078ec 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame try: @@ -32,4 +33,4 @@ def time_cache_readonly(self): self.obj.prop -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index fd3324b78f1c3..58e0db67d6025 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, Series, date_range from pandas.core.algorithms import checked_add_with_arr @@ -155,4 +156,4 @@ def time_add_overflow_both_arg_nan(self): ) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8097118a79d20..559aa7050a640 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,7 +1,9 @@ +import warnings + import numpy as np + import pandas as pd import pandas.util.testing as tm -import warnings try: from pandas.api.types import union_categoricals @@ -280,4 +282,4 @@ def time_sort_values(self): self.index.sort_values(ascending=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 654075292cdf6..ec3dd7a48a89f 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp import pandas.util.testing as tm -from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex def no_change(arr): @@ -113,4 +114,4 @@ def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 60800b1f9cae7..24cc1c6f9fa70 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,14 +1,14 @@ +import numpy as np + from pandas.api.types import pandas_dtype -import numpy as np from .pandas_vb_common import ( - numeric_dtypes, datetime_dtypes, - string_dtypes, extension_dtypes, + numeric_dtypes, + string_dtypes, ) - _numpy_dtypes = [ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes) ] @@ -40,4 +40,4 
@@ def time_pandas_dtype_invalid(self, dtype): pass -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 84e94315cc28b..06a181875aaa8 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd try: @@ -62,4 +63,4 @@ def time_query_with_boolean_selection(self): self.df.query("(a >= @self.min_val) & (a <= @self.max_val)") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index acfb26bcf5d7c..3944e0bc523d8 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: from pandas.tseries.offsets import Nano, Hour @@ -104,4 +105,4 @@ def time_frame_from_lists(self): self.df = DataFrame(self.data) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e2f6764c76eef..05f98c66faa2b 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,5 +1,5 @@ -import warnings import string +import warnings import numpy as np @@ -609,4 +609,4 @@ def time_dataframe_describe(self): self.df.describe() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 0d0b75561d057..d57492dd37268 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,7 +1,8 @@ import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, Series, read_csv, factorize, date_range + +from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d +import pandas.util.testing as tm try: from pandas import ( @@ -36,7 +37,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO +from .pandas_vb_common import BaseIO # noqa: E402 isort:skip class ParallelGroupbyMethods: @@ -301,4 +302,4 @@ def time_loop(self, threads): self.loop() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 39b07d4734399..d51c53e2264f1 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -15,7 +15,6 @@ ) import pandas.util.testing as tm - method_blacklist = { "object": { "median", @@ -626,4 +625,4 @@ def time_first(self): self.df_nans.groupby("key").transform("first") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 49834ae94cc38..a94960d494707 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,15 +1,17 @@ import gc + import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, - date_range, DatetimeIndex, - Index, - RangeIndex, Float64Index, + 
Index, IntervalIndex, + RangeIndex, + Series, + date_range, ) +import pandas.util.testing as tm class SetOperations: @@ -243,4 +245,4 @@ def peakmem_gc_instances(self, N): gc.enable() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 84604b8196536..ac35139c1954a 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,22 +1,23 @@ import warnings import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, + CategoricalIndex, DataFrame, - MultiIndex, - Int64Index, - UInt64Index, Float64Index, - IntervalIndex, - CategoricalIndex, IndexSlice, + Int64Index, + IntervalIndex, + MultiIndex, + Series, + UInt64Index, concat, date_range, option_context, period_range, ) +import pandas.util.testing as tm class NumericSeriesIndexing: @@ -371,4 +372,4 @@ def time_chained_indexing(self, mode): df2["C"] = 1.0 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 66ef4f2aec380..e85b3bd2c7687 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,8 +1,9 @@ import numpy as np -import pandas.util.testing as tm + from pandas import DataFrame, Series, to_numeric +import pandas.util.testing as tm -from .pandas_vb_common import numeric_dtypes, lib +from .pandas_vb_common import lib, numeric_dtypes class NumericInferOps: @@ -120,4 +121,4 @@ def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 4525e504fc4dd..9b8599b0a1b64 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,10 +1,11 @@ +from io import StringIO import random import string import numpy as np + +from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime import pandas.util.testing as tm -from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime -from io import StringIO from ..pandas_vb_common import BaseIO @@ -406,4 +407,4 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 12e70f84e5203..9aa5cbd5b6f7c 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,6 +1,8 @@ from io import BytesIO + import numpy as np -from pandas import DataFrame, date_range, ExcelWriter, read_excel + +from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm @@ -35,4 +37,4 @@ def time_write_excel(self, engine): writer_write.save() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 2874a7889156b..8ec04a2087f1b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,5 +1,6 @@ import numpy as np -from pandas import DataFrame, date_range, HDFStore, read_hdf + +from pandas import 
DataFrame, HDFStore, date_range, read_hdf import pandas.util.testing as tm from ..pandas_vb_common import BaseIO @@ -127,4 +128,4 @@ def time_write_hdf(self, format): self.df.to_hdf(self.fname, "df", format=format) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index fc07f2a484102..b249c92b53e93 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, concat, date_range, read_json, timedelta_range import pandas.util.testing as tm -from pandas import DataFrame, date_range, timedelta_range, concat, read_json from ..pandas_vb_common import BaseIO @@ -214,4 +215,4 @@ def peakmem_float(self, frames): df.to_json() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index d97b4ae13f0bd..f5038602539ab 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -1,5 +1,7 @@ import warnings + import numpy as np + from pandas import DataFrame, date_range, read_msgpack import pandas.util.testing as tm @@ -27,4 +29,4 @@ def time_write_msgpack(self): self.df.to_msgpack(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 286ac767c02e7..647e9d27dec9d 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_pickle import pandas.util.testing as tm @@ -25,4 +26,4 @@ def time_write_pickle(self): self.df.to_pickle(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index b80872b17a9e4..fe84c869717e3 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -1,10 +1,11 @@ import sqlite3 import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, date_range, read_sql_query, read_sql_table from sqlalchemy import create_engine +from pandas import DataFrame, date_range, read_sql_query, read_sql_table +import pandas.util.testing as tm + class SQL: @@ -141,4 +142,4 @@ def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index b3ed71af47dc8..28829785d72e9 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_stata import pandas.util.testing as tm @@ -50,4 +51,4 @@ def setup(self, convert_dates): self.df.to_stata(self.fname, self.convert_dates) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7c899e3dc6ac8..6aa82a43a4d6a 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,8 +1,9 @@ import string 
import numpy as np + +from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof try: from pandas import merge_ordered @@ -348,4 +349,4 @@ def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join="left") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index eda059a68e8a5..3f4fd7ad911c1 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, date_range import pandas.util.testing as tm -from pandas import date_range, MultiIndex, DataFrame class GetLoc: @@ -146,4 +147,4 @@ def time_categorical_level(self): self.df.set_index(["a", "b"]) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 31c3b6fb6cb60..d822646e712ae 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -1,7 +1,8 @@ -import warnings from datetime import datetime +import warnings import numpy as np + import pandas as pd try: diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index fdc8207021c0f..1faf13329110d 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,7 +1,8 @@ -import os from importlib import import_module +import os import numpy as np + import pandas as pd # Compatibility import for lib diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 2f8ae0650ab75..7303240a25f29 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,5 @@ from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range + from pandas.tseries.frequencies import to_offset diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 4fb0876f05a0a..5c718516360ed 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,11 +1,12 @@ +import matplotlib import numpy as np -from pandas import DataFrame, Series, DatetimeIndex, date_range + +from pandas import DataFrame, DatetimeIndex, Series, date_range try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves -import matplotlib matplotlib.use("Agg") @@ -93,4 +94,4 @@ def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8d4c9ebaf3e89..cd450f801c805 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,6 +1,8 @@ import numpy as np + +from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range + from .pandas_vb_common import lib @@ -159,4 +161,4 @@ def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: 
F401 isort:skip diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index f69ae15028525..2a115fb0b4fe3 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd @@ -73,4 +74,4 @@ def time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index cc373f413fb88..441f4b380656e 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,9 +1,10 @@ -import string from itertools import product +import string import numpy as np -from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long + import pandas as pd +from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long class Melt: @@ -262,4 +263,4 @@ def time_explode(self, n_rows, max_list_length): self.series.explode() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index a70977fcf539f..3640513d31be2 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,6 +1,7 @@ -import pandas as pd import numpy as np +import pandas as pd + class Methods: @@ -121,4 +122,4 @@ def peakmem_fixed(self): self.roll.max() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 6038a2ab4bd9f..a3f1d92545c3f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -1,8 +1,9 @@ from datetime import datetime import numpy as np + +from pandas import NaT, Series, date_range import pandas.util.testing as tm -from pandas import Series, date_range, NaT class SeriesConstructor: @@ -275,4 +276,4 @@ def time_func(self, func, N, dtype): self.func() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 19d08c086a508..ac78ca53679fd 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -136,4 +136,4 @@ def time_division(self, fill_value): self.arr1 / self.arr2 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 620a6de0f5f34..6032bee41958e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,6 +1,6 @@ import numpy as np -import pandas as pd +import pandas as pd ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] @@ -148,4 +148,4 @@ def time_cov_series(self, use_bottleneck): self.s.cov(self.s2) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 6be2fa92d9eac..f30b2482615bd 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,8 @@ import warnings import numpy as np -from pandas import Series, DataFrame + +from pandas import DataFrame, Series import pandas.util.testing as tm diff --git 
a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 1020b773f8acb..498774034d642 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -2,7 +2,9 @@ import dateutil import numpy as np -from pandas import to_datetime, date_range, Series, DataFrame, period_range + +from pandas import DataFrame, Series, date_range, period_range, to_datetime + from pandas.tseries.frequencies import infer_freq try: @@ -426,4 +428,4 @@ def time_dt_accessor_year(self, tz): self.series.dt.year -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/ci/print_skipped.py b/ci/print_skipped.py index a44281044e11d..6bc1dcfcd320d 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -1,8 +1,8 @@ #!/usr/bin/env python +import math import os import sys -import math import xml.etree.ElementTree as et diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py index 5a07b094e6ad3..89410e3847bef 100644 --- a/doc/logo/pandas_logo.py +++ b/doc/logo/pandas_logo.py @@ -1,7 +1,6 @@ # script to generate the pandas logo -from matplotlib import pyplot as plt -from matplotlib import rcParams +from matplotlib import pyplot as plt, rcParams import numpy as np rcParams["mathtext.fontset"] = "cm" diff --git a/doc/make.py b/doc/make.py index 48febef20fbe6..cbb1fa6a5324a 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,18 +11,18 @@ $ python make.py html $ python make.py latex """ +import argparse +import csv import importlib -import sys import os import shutil -import csv import subprocess -import argparse +import sys import webbrowser + import docutils import docutils.parsers.rst - DOC_PATH = os.path.dirname(os.path.abspath(__file__)) SOURCE_PATH = os.path.join(DOC_PATH, "source") BUILD_PATH = os.path.join(DOC_PATH, "build") diff --git a/doc/source/conf.py b/doc/source/conf.py index a4b7d97c2cf5e..1da1948e45268 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -10,15 +10,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys -import os -import inspect import importlib +import inspect import logging +import os +import sys + import jinja2 -from sphinx.ext.autosummary import _import_by_name from numpydoc.docscrape import NumpyDocString - +from sphinx.ext.autosummary import _import_by_name logger = logging.getLogger(__name__) @@ -141,7 +141,7 @@ # built documents. # # The short X.Y version. 
-import pandas +import pandas # noqa: E402 isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -432,10 +432,14 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx -from sphinx.util import rpartition -from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter -from sphinx.ext.autosummary import Autosummary +import sphinx # noqa: E402 isort:skip +from sphinx.util import rpartition # noqa: E402 isort:skip +from sphinx.ext.autodoc import ( # noqa: E402 isort:skip + AttributeDocumenter, + Documenter, + MethodDocumenter, +) +from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip class AccessorDocumenter(MethodDocumenter): diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1d49dbdee9c03..338c890ce317c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3206,7 +3206,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. - from pandas import options # noqa: E402 + from pandas import options # noqa: E402 options.io.excel.xlsx.writer = 'xlsxwriter' df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 4256e4659715d..1a064f71792e9 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -8,12 +8,11 @@ code contributors and commits, and then list each contributor individually. """ +from announce import build_components from docutils import nodes from docutils.parsers.rst import Directive import git -from announce import build_components - class ContributorsDirective(Directive): required_arguments = 1 diff --git a/pandas/core/api.py b/pandas/core/api.py index 73323d93b8215..bd2a57a15bdd2 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -2,6 +2,16 @@ import numpy as np +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna, isnull, notna, notnull + +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -12,45 +22,38 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.algorithms import factorize, unique, value_counts -from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - PeriodDtype, - IntervalDtype, - DatetimeTZDtype, -) -from pandas.core.arrays import Categorical from pandas.core.construction import array + from pandas.core.groupby import Grouper, NamedAgg -from pandas.io.formats.format import set_eng_float_format + +# DataFrame needs to be imported after NamedAgg to avoid a circular import +from pandas.core.frame import DataFrame # isort:skip from pandas.core.index import ( - Index, CategoricalIndex, - Int64Index, - UInt64Index, - RangeIndex, + DatetimeIndex, Float64Index, - MultiIndex, + Index, + Int64Index, IntervalIndex, - TimedeltaIndex, - DatetimeIndex, - PeriodIndex, + MultiIndex, NaT, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, ) +from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range +from pandas.core.indexes.interval import Interval, interval_range from 
pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range -from pandas.core.indexes.interval import Interval, interval_range - -from pandas.core.series import Series -from pandas.core.frame import DataFrame - -# TODO: Remove import when statsmodels updates #18264 -from pandas.core.reshape.reshape import get_dummies - from pandas.core.indexing import IndexSlice -from pandas.core.tools.numeric import to_numeric -from pandas.tseries.offsets import DateOffset +from pandas.core.reshape.reshape import ( + get_dummies, +) # TODO: Remove get_dummies import when statsmodels updates #18264 +from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index 9b09cffd83f75..7107263c180cb 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -2,8 +2,8 @@ from collections import namedtuple -from pandas.io.msgpack.exceptions import * # noqa -from pandas.io.msgpack._version import version # noqa +from pandas.io.msgpack.exceptions import * # noqa: F401,F403 isort:skip +from pandas.io.msgpack._version import version # noqa: F401 isort:skip class ExtType(namedtuple("ExtType", "code data")): @@ -19,10 +19,14 @@ def __new__(cls, code, data): return super().__new__(cls, code, data) -import os # noqa +import os # noqa: F401,E402 isort:skip -from pandas.io.msgpack._packer import Packer # noqa -from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa +from pandas.io.msgpack._unpacker import ( # noqa: F401,E402 isort:skip + Unpacker, + unpack, + unpackb, +) +from pandas.io.msgpack._packer import Packer # noqa: E402 isort:skip def pack(o, stream, **kwargs): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index d67f2c3b7bd66..7306393a1339e 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -37,7 +37,6 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone -from pandas.io import pytables as pytables # noqa:E402 from pandas.io.formats.printing import pprint_thing from pandas.io.pytables import ( ClosedFileError, @@ -46,7 +45,9 @@ Term, read_hdf, ) -from pandas.io.pytables import TableIterator # noqa:E402 + +from pandas.io import pytables as pytables # noqa: E402 isort:skip +from pandas.io.pytables import TableIterator # noqa: E402 isort:skip tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 87a2405a10dd5..ee668d6890756 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -8,7 +8,7 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, ensure_clean -from pandas.io.feather_format import read_feather, to_feather # noqa:E402 +from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip pyarrow = pytest.importorskip("pyarrow") diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 1075a257d4270..95a892b822cff 100755 --- 
a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -10,11 +10,11 @@ Usage:: $ ./find_commits_touching_func.py (see arguments below) """ -import logging -import re -import os import argparse from collections import namedtuple +import logging +import os +import re from dateutil.parser import parse diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 6ae10c2cb07d2..29fe8bf84c12b 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -16,8 +16,8 @@ import os import re import sys -import yaml +import yaml EXCLUDE = {"python=3"} RENAME = {"pytables": "tables", "pyqt": "pyqt5"} diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py index 95352751a23c6..300cb149f387f 100755 --- a/scripts/merge-pr.py +++ b/scripts/merge-pr.py @@ -22,14 +22,15 @@ # usage: ./apache-pr-merge.py (see config env vars below) # # Lightly modified from version of this script in incubator-parquet-format -from subprocess import check_output -from requests.auth import HTTPBasicAuth -import requests import os +from subprocess import check_output import sys import textwrap +import requests +from requests.auth import HTTPBasicAuth + PANDAS_HOME = "." PROJECT_NAME = "pandas" print("PANDAS_HOME = " + PANDAS_HOME) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 35aaf10458f44..85e5bf239cbfa 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -2,12 +2,13 @@ import random import string import textwrap -import pytest -import numpy as np -import pandas as pd +import numpy as np +import pytest import validate_docstrings +import pandas as pd + validate_one = validate_docstrings.validate_one diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index bf5d861281a36..401eaf8ff5ed5 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -13,20 +13,20 @@ $ ./validate_docstrings.py $ ./validate_docstrings.py pandas.DataFrame.head """ -import os -import sys -import json -import re -import glob -import functools -import collections import argparse -import pydoc -import inspect -import importlib +import ast +import collections import doctest +import functools +import glob +import importlib +import inspect +import json +import os +import pydoc +import re +import sys import tempfile -import ast import textwrap import flake8.main.application @@ -41,20 +41,20 @@ # script. Setting here before matplotlib is loaded. 
# We don't warn for the number of open plots, as none is actually being opened os.environ["MPLBACKEND"] = "Template" -import matplotlib +import matplotlib # noqa: E402 isort:skip matplotlib.rc("figure", max_open_warning=10000) -import numpy +import numpy # noqa: E402 isort:skip BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas +import pandas # noqa: E402 isort:skip sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.docscrape import NumpyDocString -from pandas.io.formats.printing import pprint_thing +from numpydoc.docscrape import NumpyDocString # noqa: E402 isort:skip +from pandas.io.formats.printing import pprint_thing # noqa: E402 isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] diff --git a/setup.cfg b/setup.cfg index 716ff5d9d8853..43dbac15f5cfe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -110,68 +110,25 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] -known_pre_libs=pandas._config -known_pre_core=pandas._libs,pandas.util._*,pandas.compat,pandas.errors -known_dtypes=pandas.core.dtypes -known_post_core=pandas.tseries,pandas.io,pandas.plotting -sections=FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER - -known_first_party=pandas -known_third_party=Cython,numpy,dateutil,matplotlib,python-dateutil,pytz,pyarrow,pytest - -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -line_length=88 -force_sort_within_sections=True -skip_glob=env, -skip= - pandas/__init__.py - pandas/core/api.py, - pandas/io/msgpack/__init__.py - asv_bench/benchmarks/attrs_caching.py, - asv_bench/benchmarks/binary_ops.py, - asv_bench/benchmarks/categoricals.py, - asv_bench/benchmarks/ctors.py, - asv_bench/benchmarks/eval.py, - asv_bench/benchmarks/frame_ctor.py, - asv_bench/benchmarks/frame_methods.py, - asv_bench/benchmarks/gil.py, - asv_bench/benchmarks/groupby.py, - asv_bench/benchmarks/index_object.py, - asv_bench/benchmarks/indexing.py, - asv_bench/benchmarks/inference.py, - asv_bench/benchmarks/io/csv.py, - asv_bench/benchmarks/io/excel.py, - asv_bench/benchmarks/io/hdf.py, - asv_bench/benchmarks/io/json.py, - asv_bench/benchmarks/io/msgpack.py, - asv_bench/benchmarks/io/pickle.py, - asv_bench/benchmarks/io/sql.py, - asv_bench/benchmarks/io/stata.py, - asv_bench/benchmarks/join_merge.py, - asv_bench/benchmarks/multiindex_object.py, - asv_bench/benchmarks/panel_ctor.py, - asv_bench/benchmarks/panel_methods.py, - asv_bench/benchmarks/plotting.py, - asv_bench/benchmarks/reindex.py, - asv_bench/benchmarks/replace.py, - asv_bench/benchmarks/reshape.py, - asv_bench/benchmarks/rolling.py, - asv_bench/benchmarks/series_methods.py, - asv_bench/benchmarks/sparse.py, - asv_bench/benchmarks/stat_ops.py, - asv_bench/benchmarks/timeseries.py - asv_bench/benchmarks/pandas_vb_common.py - asv_bench/benchmarks/offset.py - asv_bench/benchmarks/dtypes.py - asv_bench/benchmarks/strings.py - asv_bench/benchmarks/period.py +known_pre_libs = pandas._config +known_pre_core = pandas._libs,pandas.util._*,pandas.compat,pandas.errors +known_dtypes = pandas.core.dtypes +known_post_core = pandas.tseries,pandas.io,pandas.plotting +sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER +known_first_party = pandas +known_third_party = 
_pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +combine_as_imports = True +line_length = 88 +force_sort_within_sections = True +skip_glob = env, +skip = pandas/__init__.py,pandas/core/api.py [mypy] ignore_missing_imports=True no_implicit_optional=True [mypy-pandas.conftest,pandas.tests.*] -ignore_errors=True \ No newline at end of file +ignore_errors=True diff --git a/setup.py b/setup.py index d2c6b18b892cd..a86527ace092b 100755 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ BSD license. Parts are from lxml (https://github.com/lxml/lxml) """ +from distutils.sysconfig import get_config_vars +from distutils.version import LooseVersion import os from os.path import join as pjoin - -import pkg_resources import platform -from distutils.sysconfig import get_config_vars -import sys import shutil -from distutils.version import LooseVersion -from setuptools import setup, Command, find_packages +import sys + +import pkg_resources +from setuptools import Command, find_packages, setup # versioning import versioneer @@ -58,8 +58,8 @@ def is_platform_mac(): # The import of Extension must be after the import of Cython, otherwise # we do not get the appropriately patched class. # See https://cython.readthedocs.io/en/latest/src/reference/compilation.html -from distutils.extension import Extension # noqa:E402 -from distutils.command.build import build # noqa:E402 +from distutils.extension import Extension # noqa: E402 isort:skip +from distutils.command.build import build # noqa: E402 isort:skip try: if not _CYTHON_INSTALLED: @@ -831,9 +831,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ] }, entry_points={ - "pandas_plotting_backends": [ - "matplotlib = pandas:plotting._matplotlib", - ], + "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, **setuptools_kwargs ) From fadb27138a97eb96b619111f906b8921d2290d26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 30 Aug 2019 18:06:49 +0100 Subject: [PATCH 184/191] REGR: <th> tags for notebook display closes #28204 (#28216) * REGR: <th> tags for notebook display closes #28204 --- doc/source/whatsnew/v0.25.2.rst | 1 + pandas/core/frame.py | 15 + .../html_repr_max_rows_10_min_rows_12.html | 70 +++++ .../html_repr_max_rows_10_min_rows_4.html | 46 +++ .../html_repr_max_rows_12_min_rows_None.html | 78 +++++ .../html_repr_max_rows_None_min_rows_12.html | 269 ++++++++++++++++++ ...l_repr_min_rows_default_no_truncation.html | 105 +++++++ .../html_repr_min_rows_default_truncated.html | 70 +++++ pandas/tests/io/formats/test_to_html.py | 39 +++ 9 files changed, 693 insertions(+) create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 8d8a39139cf84..1cdf213d81a74 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++
@@ -62,6 +62,7 @@ Missing
 I/O
 ^^^
+- Fix regression in notebook display where <th> tags were not used for :attr:`DataFrame.index` (:issue:`28204`).
 - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`)
 -
 -
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3d1a39a86c784..16fece1c7eb8b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -671,10 +671,25 @@ def _repr_html_(self):
             formatter = fmt.DataFrameFormatter(
                 self,
+                columns=None,
+                col_space=None,
+                na_rep="NaN",
+                formatters=None,
+                float_format=None,
+                sparsify=None,
+                justify=None,
+                index_names=True,
+                header=True,
+                index=True,
+                bold_rows=True,
+                escape=True,
                 max_rows=max_rows,
                 min_rows=min_rows,
                 max_cols=max_cols,
                 show_dimensions=show_dimensions,
+                decimal=".",
+                table_id=None,
+                render_links=False,
             )
             return formatter.to_html(notebook=True)
         else:
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html
new file mode 100644
index 0000000000000..4eb3f5319749d
--- /dev/null
+++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html
@@ -0,0 +1,70 @@
+[HTML fixture: notebook-repr markup (<div>, scoped <style>, <table>) for a 61-row frame with one column "a", truncated to rows 0-4 and 56-60 around an "..." row, ending with the footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html new file mode 100644 index 0000000000000..2b1d97aec517c --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html @@ -0,0 +1,46 @@ +
+[HTML fixture: as above, truncated to rows 0-1 and 59-60 around an "..." row, with the "61 rows × 1 columns" footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html new file mode 100644 index 0000000000000..a539e5a4884a1 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html @@ -0,0 +1,78 @@ +
+[HTML fixture: as above, truncated to rows 0-5 and 55-60 around an "..." row, with the "61 rows × 1 columns" footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html new file mode 100644 index 0000000000000..3e680a505c6d6 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html @@ -0,0 +1,269 @@ +
+[HTML fixture: as above but untruncated; all rows 0-60 are rendered, with no ellipsis row and no dimensions footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html new file mode 100644 index 0000000000000..10f6247e37def --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html @@ -0,0 +1,105 @@ +
+[HTML fixture: untruncated notebook-repr markup for a 20-row frame with one column "a"; rows 0-19, no ellipsis row, no dimensions footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html new file mode 100644 index 0000000000000..4eb3f5319749d --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html @@ -0,0 +1,70 @@ +
+[HTML fixture: byte-identical to html_repr_max_rows_10_min_rows_12.html above; rows 0-4 and 56-60 around an "..." row, with the "61 rows × 1 columns" footer]
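For reference, a minimal sketch (not part of the patch) of how fixtures like these are produced; the frame and option values mirror the tests that follow, everything else is assumed:

import pandas as pd

df = pd.DataFrame({"a": range(61)})  # the 61-row frame used by the fixtures

# min_rows only takes effect once max_rows forces truncation: with
# max_rows=10 and min_rows=4, the notebook repr keeps rows 0-1 and 59-60
# around an ellipsis row and appends the "61 rows × 1 columns" footer.
with pd.option_context("display.max_rows", 10, "display.min_rows", 4):
    truncated = df._repr_html_()

# max_rows=None disables truncation entirely: every row is rendered and
# no dimensions footer is emitted, regardless of min_rows.
with pd.option_context("display.max_rows", None, "display.min_rows", 12):
    full = df._repr_html_()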
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py
index 448e869df950d..52c7b89220f06 100644
--- a/pandas/tests/io/formats/test_to_html.py
+++ b/pandas/tests/io/formats/test_to_html.py
@@ -713,3 +713,42 @@ def test_to_html_with_col_space_units(unit):
     for h in hdrs:
         expected = '<th style="min-width: {unit};">'.format(unit=unit)
         assert expected in h
+
+
+def test_html_repr_min_rows_default(datapath):
+    # gh-27991
+
+    # default setting no truncation even if above min_rows
+    df = pd.DataFrame({"a": range(20)})
+    result = df._repr_html_()
+    expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation")
+    assert result == expected
+
+    # default of max_rows 60 triggers truncation if above
+    df = pd.DataFrame({"a": range(61)})
+    result = df._repr_html_()
+    expected = expected_html(datapath, "html_repr_min_rows_default_truncated")
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "max_rows,min_rows,expected",
+    [
+        # truncated after first two rows
+        (10, 4, "html_repr_max_rows_10_min_rows_4"),
+        # when set to None, follow value of max_rows
+        (12, None, "html_repr_max_rows_12_min_rows_None"),
+        # when set value higher as max_rows, use the minimum
+        (10, 12, "html_repr_max_rows_10_min_rows_12"),
+        # max_rows of None -> never truncate
+        (None, 12, "html_repr_max_rows_None_min_rows_12"),
+    ],
+)
+def test_html_repr_min_rows(datapath, max_rows, min_rows, expected):
+    # gh-27991
+
+    df = pd.DataFrame({"a": range(61)})
+    expected = expected_html(datapath, expected)
+    with option_context("display.max_rows", max_rows, "display.min_rows", min_rows):
+        result = df._repr_html_()
+    assert result == expected

From cad39188c64bb844d6e915a97d1b88c6b4337723 Mon Sep 17 00:00:00 2001
From: John G Evans
Date: Fri, 30 Aug 2019 13:08:33 -0400
Subject: [PATCH 185/191] Fix read of py27 pytables tz attribute, gh#26443
 (#28221)

---
 doc/source/whatsnew/v1.0.0.rst             |   1 +
 pandas/io/pytables.py                      |   7 ++++++-
 pandas/tests/io/data/legacy_hdf/gh26443.h5 | Bin 0 -> 7168 bytes
 pandas/tests/io/pytables/test_pytables.py  |  13 +++++++++++++
 4 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 pandas/tests/io/data/legacy_hdf/gh26443.h5

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 83beec5607986..3b6288146bdf2 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -97,6 +97,7 @@ Datetimelike
 ^^^^^^^^^^^^
 - Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`)
 - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`)
+- Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`)
 -

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index fbe413f820c90..1ff3400323e54 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2902,7 +2902,12 @@ def read_index_node(self, node, start=None, stop=None):
             kwargs["freq"] = node._v_attrs["freq"]

         if "tz" in node._v_attrs:
-            kwargs["tz"] = node._v_attrs["tz"]
+            if isinstance(node._v_attrs["tz"], bytes):
+                # created by python2
+                kwargs["tz"] = node._v_attrs["tz"].decode("utf-8")
+            else:
+                # created by python3
+                kwargs["tz"] = node._v_attrs["tz"]

         if kind in ("date", "datetime"):
             index = factory(
diff --git a/pandas/tests/io/data/legacy_hdf/gh26443.h5 b/pandas/tests/io/data/legacy_hdf/gh26443.h5
new file mode 100644
index 0000000000000000000000000000000000000000..45aa64324530f943b48fa5c63390392af1110c6b
GIT binary patch
literal 7168
[base85-encoded binary data for the Python 2-created gh26443.h5 test file]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py
index 7306393a1339e..77cac00882771 100644
--- a/pandas/tests/io/pytables/test_pytables.py
+++ b/pandas/tests/io/pytables/test_pytables.py
@@ -5447,3 +5447,16 @@ def test_read_with_where_tz_aware_index(self):
             store.append(key, expected, format="table", append=True)
             result = pd.read_hdf(path, key, where="DATE > 20151130")
             assert_frame_equal(result, expected)
+
+    def test_py2_created_with_datetimez(self, datapath):
+        # The test HDF5 file was created in Python 2, but could not be read in
+        # Python 3.
+        #
+        # GH26443
+        index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")]
+        expected = DataFrame({"data": 123}, index=index)
+        with ensure_clean_store(
+            datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r"
+        ) as store:
+            result = store["key"]
+            assert_frame_equal(result, expected)

From 621ad9df37911ea577029d8cac5de0920f07f33e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?=
Date: Fri, 30 Aug 2019 19:09:03 +0200
Subject: [PATCH 186/191] DOC: Document existing functionality of
 pandas.DataFrame.to_sql() #11886 (#26795)

* DOC: add single dtype to NDFrame.to_sql
---
 pandas/core/generic.py | 15 ++++++++-------
 pandas/io/sql.py       | 23 ++++++++++++-----------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6ade69fb4ca9d..1a5b36b07e93c 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2594,13 +2594,14 @@ def to_sql(
             `index` is True, then the index names are used. A sequence should
             be given if the DataFrame uses MultiIndex.
         chunksize : int, optional
-            Rows will be written in batches of this size at a time. By default,
-            all rows will be written at once.
-        dtype : dict, optional
-            Specifying the datatype for columns. The keys should be the column
-            names and the values should be the SQLAlchemy types or strings for
-            the sqlite3 legacy mode.
-        method : {None, 'multi', callable}, default None
+            Specify the number of rows in each batch to be written at a time.
+            By default, all rows will be written at once.
+        dtype : dict or scalar, optional
+            Specifying the datatype for columns. If a dictionary is used, the
+            keys should be the column names and the values should be the
+            SQLAlchemy types or strings for the sqlite3 legacy mode. If a
+            scalar is provided, it will be applied to all columns.
+ method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: * None : Uses standard SQL ``INSERT`` clause (one per row). diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 72df00fd4c5a1..44cb399336d62 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -456,14 +456,14 @@ def to_sql( Parameters ---------- frame : DataFrame, Series - name : string + name : str Name of SQL table. con : SQLAlchemy connectable(engine/connection) or database string URI or sqlite3 DBAPI2 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - schema : string, default None + schema : str, optional Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). if_exists : {'fail', 'replace', 'append'}, default 'fail' @@ -472,18 +472,19 @@ def to_sql( - append: If table exists, insert data. Create if does not exist. index : boolean, default True Write DataFrame index as a column. - index_label : string or sequence, default None + index_label : str or sequence, optional Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. - dtype : single SQLtype or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type, or a string for sqlite3 fallback connection. - If all columns are of the same type, one single value can be used. - method : {None, 'multi', callable}, default None + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 fallback mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: - None : Uses standard SQL ``INSERT`` clause (one per row). 
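To illustrate the two dtype forms documented above, a hedged sketch; the engine URL, table names, and frame here are invented for the example:

import sqlalchemy
import pandas as pd

engine = sqlalchemy.create_engine("sqlite:///:memory:")
df = pd.DataFrame({"name": ["a", "b"], "score": [1.5, 2.5]})

# dict form: map column names to SQLAlchemy types
df.to_sql(
    "scores",
    engine,
    dtype={"name": sqlalchemy.types.Text, "score": sqlalchemy.types.Float},
)

# scalar form (the behaviour this commit documents): one type for all columns
df.to_sql("scores_as_text", engine, dtype=sqlalchemy.types.Text)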
From bfdbebec423d781ebde189de24f5413298ab7c81 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 11:43:48 -0700 Subject: [PATCH 187/191] CLN: catch less inside try/except (#28203) * CLN: catch less inside try/except --- pandas/_libs/reduction.pyx | 4 ---- pandas/core/groupby/generic.py | 17 +++++++++-------- pandas/core/groupby/groupby.py | 3 ++- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index f95685c337969..c892c1cf1b8a3 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -296,8 +296,6 @@ cdef class SeriesBinGrouper: islider.advance(group_size) vslider.advance(group_size) - except: - raise finally: # so we don't free the wrong memory islider.reset() @@ -425,8 +423,6 @@ cdef class SeriesGrouper: group_size = 0 - except: - raise finally: # so we don't free the wrong memory islider.reset() diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b0bcd1cc1e27c..5e463d50d43d6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -242,15 +242,18 @@ def aggregate(self, func, *args, **kwargs): # grouper specific aggregations if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + result = self._aggregate_generic(func, *args, **kwargs) else: # try to treat as if we are passing a list try: - assert not args and not kwargs result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - + except Exception: + result = self._aggregate_generic(func) + else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name ) @@ -260,8 +263,6 @@ def aggregate(self, func, *args, **kwargs): # values. concat no longer converts DataFrame[Sparse] # to SparseDataFrame, so we do it here. result = SparseDataFrame(result._data) - except Exception: - result = self._aggregate_generic(func, *args, **kwargs) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -313,10 +314,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): cannot_agg = [] errors = None for item in obj: - try: - data = obj[item] - colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + data = obj[item] + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + try: cast = self._transform_should_cast(func) result[item] = colg.aggregate(func, *args, **kwargs) @@ -684,7 +685,7 @@ def _transform_item_by_item(self, obj, wrapper): return DataFrame(output, index=obj.index, columns=columns) - def filter(self, func, dropna=True, *args, **kwargs): # noqa + def filter(self, func, dropna=True, *args, **kwargs): """ Return a copy of a DataFrame excluding elements from groups that do not satisfy the boolean criterion specified by func. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4d21b5810470a..6deef16bdec13 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -653,7 +653,8 @@ def curried(x): # mark this column as an error try: return self._aggregate_item_by_item(name, *args, **kwargs) - except (AttributeError): + except AttributeError: + # e.g. 
SparseArray has no flags attr raise ValueError return wrapper From f8a924bcc3191ea7c82482ddf22728e629e808f3 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Fri, 30 Aug 2019 13:54:02 -0700 Subject: [PATCH 188/191] DOC: fix DatetimeIndex.tz_localize doc string example (#28237) --- pandas/core/arrays/datetimes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 70df708d36b3b..732f819e743a4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1063,6 +1063,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): Be careful with DST changes. When there is sequential data, pandas can infer the DST time: + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', ... '2018-10-28 02:00:00', ... '2018-10-28 02:30:00', @@ -1094,6 +1095,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', ... '2015-03-29 03:30:00'])) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') From 850e315284d15bb51f12df3ba56755057b826c9c Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 2 Sep 2019 15:30:07 +0200 Subject: [PATCH 189/191] Adding updates --- pandas/core/arrays/datetimes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 70df708d36b3b..732f819e743a4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1063,6 +1063,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): Be careful with DST changes. When there is sequential data, pandas can infer the DST time: + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', ... '2018-10-28 02:00:00', ... '2018-10-28 02:30:00', @@ -1094,6 +1095,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', ... 
'2015-03-29 03:30:00'])) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') From 8d2547a754e37295f96a2d0a1ccf899fc8f3d6a8 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 2 Sep 2019 15:46:41 +0200 Subject: [PATCH 190/191] BUG: Fix numpy boolean subtraction error in Series.diff (#27755) --- pandas/core/algorithms.py | 4 ++ pandas/tests/series/test_analytics.py | 54 ++++++++++++++++++++++++++ pandas/tests/series/test_timeseries.py | 42 -------------------- 3 files changed, 58 insertions(+), 42 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c0ed198e200f1..1132f7d6ffdfd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1915,6 +1915,7 @@ def diff(arr, n, axis=0): dtype = arr.dtype is_timedelta = False + is_bool = False if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view("i8") @@ -1923,6 +1924,7 @@ def diff(arr, n, axis=0): elif is_bool_dtype(dtype): dtype = np.object_ + is_bool = True elif is_integer_dtype(dtype): dtype = np.float64 @@ -1962,6 +1964,8 @@ def diff(arr, n, axis=0): result = res - lag result[mask] = na out_arr[res_indexer] = result + elif is_bool: + out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer] else: out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1ddaa4692d741..d6cb7f8d6a8be 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -20,6 +20,7 @@ from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp +from pandas.core.indexes.timedeltas import TimedeltaIndex import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, @@ -237,6 +238,59 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + def test_dt_nm_bool_diff(self): + # Combined datetime diff, normal diff and boolean diff test + ts = tm.makeTimeSeries(name="ts") + ts.diff() + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = s.diff() + assert rs[1] == 1 + + # neg n + rs = ts.diff(-1) + xp = ts - ts.shift(-1) + assert_series_equal(rs, xp) + + # 0 + rs = ts.diff(0) + xp = ts - ts + assert_series_equal(rs, xp) + + # datetime diff (GH3100) + s = Series(date_range("20130102", periods=5)) + rs = s - s.shift(1) + xp = s.diff() + assert_series_equal(rs, xp) + + # timedelta diff + nrs = rs - rs.shift(1) + nxp = xp.diff() + assert_series_equal(nrs, nxp) + + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s.diff() + assert_series_equal( + result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + ) + + # boolean series + s = Series([False, True, True, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, True, False, True, False])) + + # boolean nan series + s = Series([False, True, nan, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) + def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index d0ca5d82c6b33..fbe3f929cf5b5 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -355,48 +355,6 @@ def test_asfreq_datetimeindex_empty_series(self): 
) tm.assert_index_equal(expected.index, result.index) - def test_diff(self): - # Just run the function - self.ts.diff() - - # int dtype - a = 10000000000000000 - b = a + 1 - s = Series([a, b]) - - rs = s.diff() - assert rs[1] == 1 - - # neg n - rs = self.ts.diff(-1) - xp = self.ts - self.ts.shift(-1) - assert_series_equal(rs, xp) - - # 0 - rs = self.ts.diff(0) - xp = self.ts - self.ts - assert_series_equal(rs, xp) - - # datetime diff (GH3100) - s = Series(date_range("20130102", periods=5)) - rs = s - s.shift(1) - xp = s.diff() - assert_series_equal(rs, xp) - - # timedelta diff - nrs = rs - rs.shift(1) - nxp = xp.diff() - assert_series_equal(nrs, nxp) - - # with tz - s = Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s.diff() - assert_series_equal( - result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - ) - def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) From 5530d5c6632749c6ff7e122236d89ce7df495e73 Mon Sep 17 00:00:00 2001 From: Unprocessable Date: Mon, 2 Sep 2019 16:15:30 +0200 Subject: [PATCH 191/191] Update v0.25.1.rst --- doc/source/whatsnew/v0.25.1.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index f246a7f87c9b4..63dd56f4a3793 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -109,10 +109,7 @@ Other ^^^^^ - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) -- Bug in :meth:`Series.diff` where a boolean series would cause a TypeError (the - operator is deprecated) when using NumPy >= 0.13.0 (:issue:`17294`) -- -- ->>>>>>> patch-1 +- Bug in :meth:`Series.rename` when using a custom type indexer. Now any value that isn't callable or dict-like is treated as a scalar. (:issue:`27814`) .. _whatsnew_0.251.contributors:
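Taken together with PATCH 190 above, the user-visible effect of the boolean diff fix can be sketched as follows; the values follow the new test_dt_nm_bool_diff test, and the session output shown in comments is indicative:

import numpy as np
import pandas as pd

s = pd.Series([False, True, True, False, False])

# Previously this raised "TypeError: numpy boolean subtract, the `-`
# operator, is deprecated" on recent NumPy; diff() now applies xor to
# boolean input and returns an object-dtype result with NaN in slot 0.
s.diff()
# 0      NaN
# 1     True
# 2    False
# 3     True
# 4    False
# dtype: object

# the same values, computed directly with the xor the patch uses:
np.logical_xor(s.values[1:], s.values[:-1])
# array([ True, False,  True, False])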