From 912c2d0679fe09f951aa0adf5c250b1d3e25c21d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 17 Mar 2020 18:22:37 -0700
Subject: [PATCH 01/40] BUG/API: _values_for_factorize/_from_factorized
 round-trip

---
 pandas/core/arrays/boolean.py            | 3 ++-
 pandas/tests/extension/base/reshaping.py | 7 +++++++
 pandas/tests/extension/json/array.py     | 4 +++-
 pandas/tests/extension/test_datetime.py  | 8 ++++++++
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index d93b5fbc83312..e6fc2993ff53c 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -320,7 +320,8 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
 
     @classmethod
     def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray":
-        return cls._from_sequence(values, dtype=original.dtype)
+        mask = values == -1
+        return cls(values.astype(bool), mask)
 
     _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
 
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index ec21898852888..3e82e9d9fa37f 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -324,3 +324,10 @@ def test_transpose(self, data):
         self.assert_frame_equal(result, expected)
         self.assert_frame_equal(np.transpose(np.transpose(df)), df)
         self.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])
+
+    def test_factorize_roundtrip(self, data):
+        # GH#32673
+        values = data._values_for_factorize()[0]
+        result = type(data)._from_factorized(values, data)
+
+        self.assert_equal(result, data)
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 1f026e405dc17..d576228674968 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -65,7 +65,9 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
 
     @classmethod
     def _from_factorized(cls, values, original):
-        return cls([UserDict(x) for x in values if x != ()])
+        return cls(
+            [UserDict(x) if x != () else original.dtype.na_value for x in values]
+        )
 
     def __getitem__(self, item):
         if isinstance(item, numbers.Integral):
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py
index 3aa188098620d..38666a1709092 100644
--- a/pandas/tests/extension/test_datetime.py
+++ b/pandas/tests/extension/test_datetime.py
@@ -4,6 +4,7 @@
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 
 import pandas as pd
+import pandas._testing as tm
 from pandas.core.arrays import DatetimeArray
 from pandas.tests.extension import base
 
@@ -201,6 +202,13 @@ def test_unstack(self, obj):
         result = ser.unstack(0)
         self.assert_equal(result, expected)
 
+    def test_factorize_roundtrip(self, data):
+        # GH#32673, for DTA we don't preserve freq
+        values = data._values_for_factorize()[0]
+        result = type(data)._from_factorized(values, data)
+
+        tm.assert_numpy_array_equal(result.asi8, data.asi8)
+
 
 class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests):
     pass

From c33ba806df0064e5db4921091ddf77442a46c0da Mon Sep 17 00:00:00 2001
From: jamin-aws-ospo <60270728+jamin-aws-ospo@users.noreply.github.com>
Date: Tue, 17 Mar 2020 17:00:57 -0700
Subject: [PATCH 02/40] copy license text from: tidyverse/haven (#32756)

---
 LICENSES/HAVEN_LICENSE | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE
index 2f444cb44d505..ce1b07b783e74 100644
--- a/LICENSES/HAVEN_LICENSE
+++ b/LICENSES/HAVEN_LICENSE @@ -1,2 +1,21 @@ -YEAR: 2013-2016 -COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller +# MIT License + +Copyright (c) 2019 Hadley Wickham; RStudio; and Evan Miller + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 0a422279af7c3b70c9e3ba346028435284559b34 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Wed, 18 Mar 2020 04:35:09 +0200 Subject: [PATCH 03/40] BLD: Suppressing warnings when compiling pandas/_libs/writers (#32795) --- pandas/_libs/writers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 9e95dea979577..ebf98232da58b 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -36,7 +36,7 @@ def write_csv_rows( """ # In crude testing, N>100 yields little marginal improvement cdef: - Py_ssize_t i, j, k = len(data_index), N = 100, ncols = len(cols) + Py_ssize_t i, j = 0, k = len(data_index), N = 100, ncols = len(cols) list rows # pre-allocate rows From 4e401cb0638546653843456bbcc943bac9ec42b3 Mon Sep 17 00:00:00 2001 From: Derek McCammond Date: Wed, 18 Mar 2020 05:12:00 -0400 Subject: [PATCH 04/40] Avoid bare pytest.raises in dtypes/test_dtypes.py (#32672) --- pandas/core/dtypes/dtypes.py | 2 +- .../arrays/categorical/test_constructors.py | 4 +- pandas/tests/dtypes/test_dtypes.py | 38 +++++++++++-------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 181f0c8906853..d29102cbd4604 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -558,7 +558,7 @@ def validate_categories(categories, fastpath: bool = False): if not fastpath: if categories.hasnans: - raise ValueError("Categorial categories cannot be null") + raise ValueError("Categorical categories cannot be null") if not categories.is_unique: raise ValueError("Categorical categories must be unique") diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index c6b4c4904735c..3e31c1acbe09d 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -252,7 +252,7 @@ def test_constructor_not_sequence(self): def test_constructor_with_null(self): # Cannot have NaN in categories - msg = "Categorial categories cannot be null" + msg = "Categorical categories cannot be null" with pytest.raises(ValueError, match=msg): Categorical([np.nan, "a", 
"b", "c"], categories=[np.nan, "a", "b", "c"]) @@ -500,7 +500,7 @@ def test_from_codes_non_unique_categories(self): Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) def test_from_codes_nan_cat_included(self): - with pytest.raises(ValueError, match="Categorial categories cannot be null"): + with pytest.raises(ValueError, match="Categorical categories cannot be null"): Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) def test_from_codes_too_negative(self): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 55b1ac819049d..16ee7c27780ca 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -361,7 +361,7 @@ def test_hash_vs_equality(self, dtype): assert hash(dtype) == hash(dtype3) def test_construction(self): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid frequency: xx"): PeriodDtype("xx") for s in ["period[D]", "Period[D]", "D"]: @@ -414,21 +414,25 @@ def test_construction_from_string(self, dtype): assert is_dtype_equal(dtype, result) result = PeriodDtype.construct_from_string("period[D]") assert is_dtype_equal(dtype, result) - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("foo") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("period[foo]") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("foo[D]") - - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("datetime64[ns]") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") with pytest.raises(TypeError, match="list"): PeriodDtype.construct_from_string([1, 2, 3]) + @pytest.mark.parametrize( + "string", + [ + "foo", + "period[foo]", + "foo[D]", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) + def test_construct_dtype_from_string_invalid_raises(self, string): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + with pytest.raises(TypeError, match=re.escape(msg)): + PeriodDtype.construct_from_string(string) + def test_is_dtype(self, dtype): assert PeriodDtype.is_dtype(dtype) assert PeriodDtype.is_dtype("period[D]") @@ -475,7 +479,9 @@ def test_basic(self, dtype): def test_empty(self): dt = PeriodDtype() - with pytest.raises(AttributeError): + # https://github.com/pandas-dev/pandas/issues/27388 + msg = "object has no attribute 'freqstr'" + with pytest.raises(AttributeError, match=msg): str(dt) def test_not_string(self): @@ -764,11 +770,13 @@ def test_order_hashes_different(self, v1, v2): assert c1 is not c3 def test_nan_invalid(self): - with pytest.raises(ValueError): + msg = "Categorical categories cannot be null" + with pytest.raises(ValueError, match=msg): CategoricalDtype([1, 2, np.nan]) def test_non_unique_invalid(self): - with pytest.raises(ValueError): + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): CategoricalDtype([1, 2, 1]) def test_same_categories_different_order(self): From 679e5d373153caa2f803543f0bb6a5f132ae1685 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Wed, 18 Mar 2020 21:24:42 +0200 Subject: [PATCH 05/40] PERF: Using Numpy C-API when calling `np.arange` (#32804) Co-authored-by: MomIsBestFriend <> --- pandas/_libs/internals.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 5545302fcbfc4..63f076b7ee993 100644 --- a/pandas/_libs/internals.pyx +++ 
b/pandas/_libs/internals.pyx @@ -308,7 +308,10 @@ cdef slice_getitem(slice slc, ind): return slice(s_start, s_stop, s_step) else: - return np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind] + # NOTE: + # this is the C-optimized equivalent of + # `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]` + return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] @cython.boundscheck(False) From 84f287e7cd79df9cf898909453823557aa366656 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 18 Mar 2020 12:25:00 -0700 Subject: [PATCH 06/40] TYP: annotate to_numpy (#32809) --- pandas/core/arrays/base.py | 4 +++- pandas/core/arrays/numpy_.py | 5 ++++- pandas/core/frame.py | 2 +- pandas/tests/extension/decimal/array.py | 4 +++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6aa303dd04703..c42c1539daa5a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -356,7 +356,9 @@ def __iter__(self): for i in range(len(self)): yield self[i] - def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: """ Convert to a NumPy ndarray. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0e64967ce93a6..e8333606ec54c 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -435,7 +435,10 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: result = np.asarray(self._ndarray, dtype=dtype) if (copy or na_value is not lib.no_default) and result is self._ndarray: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index baa6fb07ff233..b9e43b1cd9b05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1253,7 +1253,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False) -> np.ndarray: + def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: """ Convert the DataFrame to a NumPy array. 
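
The hunks in this patch only tighten type annotations; runtime behavior of
``to_numpy`` is unchanged. A minimal sketch of the annotated signature in use
(the frame and values below are illustrative, not taken from the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

    arr = df.to_numpy()                # dtype inferred; float64 for this frame
    ints = df.to_numpy(dtype="int64")  # cast to an explicit dtype
    fresh = df.to_numpy(copy=True)     # copy=True guarantees a new ndarray

    assert isinstance(arr, np.ndarray)  # matches the annotated return type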
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9384ed5199c1f..85d8ad6ec6e38 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -79,7 +79,9 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) - def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + def to_numpy( + self, dtype=None, copy: bool = False, na_value=no_default, decimals=None + ) -> np.ndarray: result = np.asarray(self, dtype=dtype) if decimals is not None: result = np.asarray([round(x, decimals) for x in result]) From 5b92d03c6fb23d716a641be82520551252de3d2a Mon Sep 17 00:00:00 2001 From: mglasder Date: Wed, 18 Mar 2020 23:09:44 +0100 Subject: [PATCH 07/40] fstring format added in pandas//tests/io/test_common.py:144: (#32813) --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 730043e6ec7d7..0c79ef4378b66 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -141,7 +141,7 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): pytest.importorskip(module) path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" From 0f0ec28e7c6bee99eef1fbb653b9f7a82f797b83 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 18 Mar 2020 17:02:06 -0700 Subject: [PATCH 08/40] BUG: Series.__getitem__ with downstream scalars (#32684) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/series.py | 4 ++++ pandas/tests/dtypes/test_inference.py | 6 ++++++ pandas/tests/series/indexing/test_indexing.py | 12 ++++++++++++ 4 files changed, 23 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0d3a9a8f969a4..441c6cee32b2a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -303,6 +303,7 @@ Indexing - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) - Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) +- Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`) Missing ^^^^^^^ diff --git a/pandas/core/series.py b/pandas/core/series.py index 8a6839b4fb181..21477cce48e63 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -926,6 +926,10 @@ def _get_with(self, key): elif isinstance(key, tuple): return self._get_values_tuple(key) + elif not is_list_like(key): + # e.g. 
scalars that aren't recognized by lib.is_scalar, GH#32684
+            return self.loc[key]
+
         if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)):
             key = list(key)
 
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index e0fef833d4ced..ab9916eea8e5a 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1437,6 +1437,7 @@ def test_is_scalar_pandas_scalars(self):
         assert is_scalar(Period("2014-01-01"))
         assert is_scalar(Interval(left=0, right=1))
         assert is_scalar(DateOffset(days=1))
+        assert is_scalar(pd.offsets.Minute(3))
 
     def test_is_scalar_pandas_containers(self):
         assert not is_scalar(Series(dtype=object))
@@ -1445,6 +1446,11 @@ def test_is_scalar_pandas_containers(self):
         assert not is_scalar(DataFrame([[1]]))
         assert not is_scalar(Index([]))
         assert not is_scalar(Index([1]))
+        assert not is_scalar(Categorical([]))
+        assert not is_scalar(DatetimeIndex([])._data)
+        assert not is_scalar(TimedeltaIndex([])._data)
+        assert not is_scalar(DatetimeIndex([])._data.to_period("D"))
+        assert not is_scalar(pd.array([1, 2, 3]))
 
     def test_is_scalar_number(self):
         # Number() is not recognized by PyNumber_Check, so by extension
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index 18fcbea683dd3..5b3786e1a0d3c 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -923,3 +923,15 @@ def test_getitem_2d_no_warning():
     series = pd.Series([1, 2, 3], index=[1, 2, 3])
     with tm.assert_produces_warning(None):
         series[:, None]
+
+
+def test_getitem_unrecognized_scalar():
+    # GH#32684 a scalar key that is not recognized by lib.is_scalar
+
+    # a series that might be produced via `frame.dtypes`
+    ser = pd.Series([1, 2], index=[np.dtype("O"), np.dtype("i8")])
+
+    key = ser.index[1]
+
+    result = ser[key]
+    assert result == 2

From 02ac9761ee501eaa82ef8617e8dff60cca2f6c1d Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 19 Mar 2020 02:09:46 +0200
Subject: [PATCH 09/40] CLN: Using clearer imports (#32459)

Co-authored-by: MomIsBestFriend <>
---
 pandas/_libs/tslibs/strptime.pyx | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index dfe050c7bbff7..a48c3365947dc 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -4,7 +4,7 @@ import time
 import locale
 import calendar
 import re
-from datetime import date as datetime_date
+import datetime
 
 from _thread import allocate_lock as _thread_allocate_lock
 
@@ -288,20 +288,20 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise'
         elif iso_year != -1 and iso_week != -1:
             year, julian = _calc_julian_from_V(iso_year, iso_week, weekday + 1)
 
-        # Cannot pre-calculate datetime_date() since can change in Julian
+        # Cannot pre-calculate datetime.date() since can change in Julian
         # calculation and thus could have different value for the day of the wk
         # calculation.
         try:
             if julian == -1:
                 # Need to add 1 to result since first day of the year is 1, not
                 # 0.
-                ordinal = datetime_date(year, month, day).toordinal()
-                julian = ordinal - datetime_date(year, 1, 1).toordinal() + 1
+                ordinal = datetime.date(year, month, day).toordinal()
+                julian = ordinal - datetime.date(year, 1, 1).toordinal() + 1
             else:
                 # Assume that if they bothered to include Julian day it will
                 # be accurate.
- datetime_result = datetime_date.fromordinal( - (julian - 1) + datetime_date(year, 1, 1).toordinal()) + datetime_result = datetime.date.fromordinal( + (julian - 1) + datetime.date(year, 1, 1).toordinal()) year = datetime_result.year month = datetime_result.month day = datetime_result.day @@ -311,7 +311,7 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise' continue raise if weekday == -1: - weekday = datetime_date(year, month, day).weekday() + weekday = datetime.date(year, month, day).weekday() dts.year = year dts.month = month @@ -649,7 +649,7 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, cdef: int first_weekday, week_0_length, days_to_week - first_weekday = datetime_date(year, 1, 1).weekday() + first_weekday = datetime.date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's # easier to just shift the view to Sunday being the first day of the # week. @@ -692,14 +692,14 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) cdef: int correction, ordinal - correction = datetime_date(iso_year, 1, 4).isoweekday() + 3 + correction = datetime.date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: - ordinal += datetime_date(iso_year, 1, 1).toordinal() + ordinal += datetime.date(iso_year, 1, 1).toordinal() iso_year -= 1 - ordinal -= datetime_date(iso_year, 1, 1).toordinal() + ordinal -= datetime.date(iso_year, 1, 1).toordinal() return iso_year, ordinal From 0ad9c82471a45655a5353fcd3d0099d99c7e54de Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 18 Mar 2020 17:19:52 -0700 Subject: [PATCH 10/40] REF: Implement core._algos (#32767) --- pandas/core/array_algos/__init__.py | 9 ++++++++ pandas/core/array_algos/transforms.py | 33 +++++++++++++++++++++++++++ pandas/core/arrays/datetimelike.py | 22 ++---------------- pandas/core/internals/blocks.py | 22 ++---------------- 4 files changed, 46 insertions(+), 40 deletions(-) create mode 100644 pandas/core/array_algos/__init__.py create mode 100644 pandas/core/array_algos/transforms.py diff --git a/pandas/core/array_algos/__init__.py b/pandas/core/array_algos/__init__.py new file mode 100644 index 0000000000000..a7655a013c6cf --- /dev/null +++ b/pandas/core/array_algos/__init__.py @@ -0,0 +1,9 @@ +""" +core.array_algos is for algorithms that operate on ndarray and ExtensionArray. +These should: + +- Assume that any Index, Series, or DataFrame objects have already been unwrapped. +- Assume that any list arguments have already been cast to ndarray/EA. +- Not depend on Index, Series, or DataFrame, nor import any of these. +- May dispatch to ExtensionArray methods, but should not import from core.arrays. +""" diff --git a/pandas/core/array_algos/transforms.py b/pandas/core/array_algos/transforms.py new file mode 100644 index 0000000000000..f775b6d733d9c --- /dev/null +++ b/pandas/core/array_algos/transforms.py @@ -0,0 +1,33 @@ +""" +transforms.py is for shape-preserving functions. 
+""" + +import numpy as np + +from pandas.core.dtypes.common import ensure_platform_int + + +def shift(values: np.ndarray, periods: int, axis: int, fill_value) -> np.ndarray: + new_values = values + + # make sure array sent to np.roll is c_contiguous + f_ordered = values.flags.f_contiguous + if f_ordered: + new_values = new_values.T + axis = new_values.ndim - axis - 1 + + if np.prod(new_values.shape): + new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) + + axis_indexer = [slice(None)] * values.ndim + if periods > 0: + axis_indexer[axis] = slice(None, periods) + else: + axis_indexer[axis] = slice(periods, None) + new_values[tuple(axis_indexer)] = fill_value + + # restore original order + if f_ordered: + new_values = new_values.T + + return new_values diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 105d9581b1a25..7510bfd1f67ad 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -40,6 +40,7 @@ from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array @@ -773,26 +774,7 @@ def shift(self, periods=1, fill_value=None, axis=0): fill_value = self._unbox_scalar(fill_value) - new_values = self._data - - # make sure array sent to np.roll is c_contiguous - f_ordered = new_values.flags.f_contiguous - if f_ordered: - new_values = new_values.T - axis = new_values.ndim - axis - 1 - - new_values = np.roll(new_values, periods, axis=axis) - - axis_indexer = [slice(None)] * self.ndim - if periods > 0: - axis_indexer[axis] = slice(None, periods) - else: - axis_indexer[axis] = slice(periods, None) - new_values[tuple(axis_indexer)] = fill_value - - # restore original order - if f_ordered: - new_values = new_values.T + new_values = shift(self._data, periods, axis, fill_value) return type(self)._simple_new(new_values, dtype=self.dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cab2bd5146745..adeb1ae04a58d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -29,7 +29,6 @@ from pandas.core.dtypes.common import ( _NS_DTYPE, _TD_DTYPE, - ensure_platform_int, is_bool_dtype, is_categorical, is_categorical_dtype, @@ -66,6 +65,7 @@ ) import pandas.core.algorithms as algos +from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, DatetimeArray, @@ -1316,25 +1316,7 @@ def shift(self, periods, axis: int = 0, fill_value=None): # that, handle boolean etc also new_values, fill_value = maybe_upcast(self.values, fill_value) - # make sure array sent to np.roll is c_contiguous - f_ordered = new_values.flags.f_contiguous - if f_ordered: - new_values = new_values.T - axis = new_values.ndim - axis - 1 - - if np.prod(new_values.shape): - new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) - - axis_indexer = [slice(None)] * self.ndim - if periods > 0: - axis_indexer[axis] = slice(None, periods) - else: - axis_indexer[axis] = slice(periods, None) - new_values[tuple(axis_indexer)] = fill_value - - # restore original order - if f_ordered: - new_values = new_values.T + new_values = shift(new_values, periods, axis, fill_value) return [self.make_block(new_values)] From 964cedbaceb2063242e881447bded274f294c29d Mon Sep 
17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 18 Mar 2020 17:23:25 -0700 Subject: [PATCH 11/40] CLN: Consolidate numba facilities (#32770) --- pandas/core/util/numba_.py | 58 ++++++++++++++++++ pandas/core/window/numba_.py | 110 +++++++++-------------------------- 2 files changed, 87 insertions(+), 81 deletions(-) create mode 100644 pandas/core/util/numba_.py diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py new file mode 100644 index 0000000000000..e4debab2c22ee --- /dev/null +++ b/pandas/core/util/numba_.py @@ -0,0 +1,58 @@ +"""Common utilities for Numba operations""" +import types +from typing import Callable, Dict, Optional + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + + +def check_kwargs_and_nopython( + kwargs: Optional[Dict] = None, nopython: Optional[bool] = None +): + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + +def get_jit_arguments(engine_kwargs: Optional[Dict[str, bool]] = None): + """ + Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. + """ + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + return nopython, nogil, parallel + + +def jit_user_function(func: Callable, nopython: bool, nogil: bool, parallel: bool): + """ + JIT the user's function given the configurable arguments. + """ + numba = import_optional_dependency("numba") + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(data, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(data, *_args): + return jf(data, *_args) + + return impl + + return numba_func diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index d6e8194c861fa..5d35ec7457ab0 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -1,4 +1,3 @@ -import types from typing import Any, Callable, Dict, Optional, Tuple import numpy as np @@ -6,35 +5,49 @@ from pandas._typing import Scalar from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import ( + check_kwargs_and_nopython, + get_jit_arguments, + jit_user_function, +) -def make_rolling_apply( - func: Callable[..., Scalar], + +def generate_numba_apply_func( args: Tuple, - nogil: bool, - parallel: bool, - nopython: bool, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], ): """ - Creates a JITted rolling apply function with a JITted version of - the user's function. + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. 
Parameters ---------- - func : function - function to be applied to each window and will be JITed args : tuple *args to be passed into the function - nogil : bool - nogil parameter from engine_kwargs for numba.jit - parallel : bool - parallel parameter from engine_kwargs for numba.jit - nopython : bool - nopython parameter from engine_kwargs for numba.jit + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit Returns ------- Numba function """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + check_kwargs_and_nopython(kwargs, nopython) + + numba_func = jit_user_function(func, nopython, nogil, parallel) + numba = import_optional_dependency("numba") if parallel: @@ -42,25 +55,6 @@ def make_rolling_apply( else: loop_range = range - if isinstance(func, numba.targets.registry.CPUDispatcher): - # Don't jit a user passed jitted function - numba_func = func - else: - - @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) - def numba_func(window, *_args): - if getattr(np, func.__name__, False) is func or isinstance( - func, types.BuiltinFunctionType - ): - jf = func - else: - jf = numba.jit(func, nopython=nopython, nogil=nogil) - - def impl(window, *_args): - return jf(window, *_args) - - return impl - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, @@ -78,49 +72,3 @@ def roll_apply( return result return roll_apply - - -def generate_numba_apply_func( - args: Tuple, - kwargs: Dict[str, Any], - func: Callable[..., Scalar], - engine_kwargs: Optional[Dict[str, bool]], -): - """ - Generate a numba jitted apply function specified by values from engine_kwargs. - - 1. jit the user's function - 2. Return a rolling apply function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. 
- - Parameters - ---------- - args : tuple - *args to be passed into the function - kwargs : dict - **kwargs to be passed into the function - func : function - function to be applied to each window and will be JITed - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - - Returns - ------- - Numba function - """ - if engine_kwargs is None: - engine_kwargs = {} - - nopython = engine_kwargs.get("nopython", True) - nogil = engine_kwargs.get("nogil", False) - parallel = engine_kwargs.get("parallel", False) - - if kwargs and nopython: - raise ValueError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) - - return make_rolling_apply(func, args, nogil, parallel, nopython) From 23ac98a7d0bac6db643f9c2b71f3964f2a504eca Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 18 Mar 2020 17:24:23 -0700 Subject: [PATCH 12/40] CLN: remove _ndarray_values (#32768) --- doc/source/development/internals.rst | 10 ++---- doc/source/reference/extensions.rst | 1 - pandas/core/arrays/base.py | 17 ---------- pandas/core/arrays/categorical.py | 13 ++------ pandas/core/arrays/datetimelike.py | 4 --- pandas/core/arrays/integer.py | 12 ------- pandas/core/base.py | 17 ---------- pandas/core/indexes/base.py | 31 ++++++++----------- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/extension.py | 4 --- pandas/core/series.py | 22 ++++++------- pandas/tests/base/test_conversion.py | 28 ----------------- pandas/tests/indexes/common.py | 10 +----- .../indexes/interval/test_constructors.py | 4 +-- .../tests/indexes/interval/test_interval.py | 2 +- .../tests/indexes/period/test_constructors.py | 4 +-- pandas/tests/indexes/period/test_period.py | 6 ++-- pandas/tests/reductions/test_reductions.py | 4 +-- 18 files changed, 37 insertions(+), 154 deletions(-) diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 4ad045a91b5fe..8f1c3d5d818c2 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -89,16 +89,10 @@ pandas extends NumPy's type system with custom types, like ``Categorical`` or datetimes with a timezone, so we have multiple notions of "values". For 1-D containers (``Index`` classes and ``Series``) we have the following convention: -* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, - ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, - this returns the codes, not the array of objects. * ``cls._values`` refers is the "best possible" array. This could be an - ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the - process of removing the index subclasses here so that it's always an - ``ndarray`` or ``ExtensionArray``). + ``ndarray`` or ``ExtensionArray``. -So, for example, ``Series[category]._values`` is a ``Categorical``, while -``Series[category]._ndarray_values`` is the underlying codes. +So, for example, ``Series[category]._values`` is a ``Categorical``. .. _ref-subclassing-pandas: diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 78fdfbfd28144..4c0763e091b75 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -37,7 +37,6 @@ objects. 
api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings - api.extensions.ExtensionArray._ndarray_values api.extensions.ExtensionArray._reduce api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c42c1539daa5a..ab24beb0da4fc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -93,7 +93,6 @@ class ExtensionArray: _from_factorized _from_sequence _from_sequence_of_strings - _ndarray_values _reduce _values_for_argsort _values_for_factorize @@ -1046,22 +1045,6 @@ def _concat_same_type( # of objects _can_hold_na = True - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - Returns - ------- - array : ndarray - """ - return np.array(self) - def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 497a9893e6c66..bfccc6f244219 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -451,10 +451,6 @@ def dtype(self) -> CategoricalDtype: """ return self._dtype - @property - def _ndarray_values(self) -> np.ndarray: - return self.codes - @property def _constructor(self) -> Type["Categorical"]: return Categorical @@ -2567,12 +2563,7 @@ def _get_codes_for_values(values, categories): """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) - if dtype_equal: - # To prevent erroneous dtype coercion in _get_data_algo, retrieve - # the underlying numpy array. gh-22702 - values = getattr(values, "_ndarray_values", values) - categories = getattr(categories, "_ndarray_values", categories) - elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values): + if is_extension_array_dtype(categories.dtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. 
# Categorical(array[Period, Period], categories=PeriodIndex(...)) @@ -2582,7 +2573,7 @@ def _get_codes_for_values(values, categories): # exception raised in _from_sequence values = ensure_object(values) categories = ensure_object(categories) - else: + elif not dtype_equal: values = ensure_object(values) categories = ensure_object(categories) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7510bfd1f67ad..7cf50ff2b88af 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -456,10 +456,6 @@ def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self._data.view("i8") - @property - def _ndarray_values(self): - return self._data - # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index fb33840ad757c..f2880c5cbee42 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -478,18 +478,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - """ - return self._data - def _values_for_factorize(self) -> Tuple[np.ndarray, float]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than object-dtype / np.nan. diff --git a/pandas/core/base.py b/pandas/core/base.py index bf2ed02c57a29..9281d2f72b409 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -855,23 +855,6 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): result[self.isna()] = na_value return result - @property - def _ndarray_values(self) -> np.ndarray: - """ - The data as an ndarray, possibly losing information. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - - categorical -> codes - """ - if is_extension_array_dtype(self): - return self.array._ndarray_values - # As a mixin, we depend on the mixing class having values. - # Special mixin syntax may be developed in the future: - # https://github.com/python/typing/issues/246 - return self.values # type: ignore - @property def empty(self): return not self.size diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 162d69d957669..4fa771dfbcf82 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -464,8 +464,7 @@ def _simple_new(cls, values, name: Label = None): # _index_data is a (temporary?) fix to ensure that the direct data # manipulation we do in `_libs/reduction.pyx` continues to work. # We need access to the actual ndarray, since we're messing with - # data buffers and strides. We don't re-use `_ndarray_values`, since - # we actually set this value too. + # data buffers and strides. 
result._index_data = values result._name = name result._cache = {} @@ -625,7 +624,8 @@ def ravel(self, order="C"): -------- numpy.ndarray.ravel """ - return self._ndarray_values.ravel(order=order) + values = self._get_engine_target() + return values.ravel(order=order) def view(self, cls=None): @@ -3846,29 +3846,24 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: """ The best array representation. - This is an ndarray or ExtensionArray. This differs from - ``_ndarray_values``, which always returns an ndarray. + This is an ndarray or ExtensionArray. - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index`` (except for datetime64[ns], which returns - a DatetimeArray for _values on the Index, but ndarray[M8ns] on the - Series). + ``_values`` are consistent between``Series`` and ``Index``. It may differ from the public '.values' method. - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + index | values | _values | + ----------------- | --------------- | ------------- | + Index | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | + DatetimeIndex | ndarray[M8ns] | DatetimeArray | + DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | + PeriodIndex | ndarray[object] | PeriodArray | + IntervalIndex | IntervalArray | IntervalArray | See Also -------- values - _ndarray_values """ return self._data diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9c55d2de946a8..2f641a3d4c111 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -179,7 +179,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 + # NB: using asi8 instead of _data matters in numpy 1.18 # because the treatment of NaT has been changed to put NaT last # instead of first. sorted_values = np.sort(self.asi8) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 6d5f0dbb830f9..6851aeec0ca40 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -228,10 +228,6 @@ def __iter__(self): def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._data, dtype=dtype) - @property - def _ndarray_values(self) -> np.ndarray: - return self._data._ndarray_values - def _get_engine_target(self) -> np.ndarray: return self._data._values_for_argsort() diff --git a/pandas/core/series.py b/pandas/core/series.py index 21477cce48e63..006a98a6cddcb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -550,21 +550,17 @@ def _values(self): timedelta64 dtypes), while ``.array`` ensures to always return an ExtensionArray. - Differs from ``._ndarray_values``, as that ensures to always return a - numpy array (it will call ``_ndarray_values`` on the ExtensionArray, if - the Series was backed by an ExtensionArray). 
- Overview: - dtype | values | _values | array | _ndarray_values | - ----------- | ------------- | ------------- | ------------- | --------------- | - Numeric | ndarray | ndarray | PandasArray | ndarray | - Category | Categorical | Categorical | Categorical | ndarray[int] | - dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | ndarray[m8ns] | - Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | - Nullable | EA | EA | EA | ndarray | + dtype | values | _values | array | + ----------- | ------------- | ------------- | ------------- | + Numeric | ndarray | ndarray | PandasArray | + Category | Categorical | Categorical | Categorical | + dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | + Period | ndarray[obj] | PeriodArray | PeriodArray | + Nullable | EA | EA | EA | """ return self._data.internal_values() diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 46fd1551e6170..59f9103072fe9 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -220,34 +220,6 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_equal(l_values, r_values) -@pytest.mark.parametrize( - "array, expected", - [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), - (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"]), - np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), - ), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), - np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), - ), - (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), - ( - pd.PeriodIndex(["2017", "2018"], freq="D"), - np.array([17167, 17532], dtype=np.int64), - ), - ], -) -def test_ndarray_values(array, expected): - l_values = pd.Series(array)._ndarray_values - r_values = pd.Index(array)._ndarray_values - tm.assert_numpy_array_equal(l_values, r_values) - tm.assert_numpy_array_equal(l_values, expected) - - @pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): ser = pd.Series(arr) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c13385c135e9f..43f696e0b13db 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -313,16 +313,11 @@ def test_ensure_copied_data(self, indices): result = result.tz_localize("UTC").tz_convert(indices.tz) tm.assert_index_equal(indices, result) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="copy" - ) if isinstance(indices, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) + tm.assert_numpy_array_equal(indices.asi8, result.asi8, check_same="same") elif isinstance(indices, IntervalIndex): # checked in test_interval.py pass @@ -331,9 +326,6 @@ def test_ensure_copied_data(self, indices): tm.assert_numpy_array_equal( indices.values, result.values, check_same="same" ) - tm.assert_numpy_array_equal( - indices._ndarray_values, 
result._ndarray_values, check_same="same" - ) def test_memory_usage(self, indices): indices._engine.clear_mapping() diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 837c124db2bed..fa881df8139c6 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -91,7 +91,7 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", @@ -114,7 +114,7 @@ def test_constructor_empty(self, constructor, breaks, closed): assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c2b209c810af9..efdd3fc9907a2 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -147,7 +147,7 @@ def test_ensure_copied_data(self, closed): ) # by-definition make a copy - result = IntervalIndex(index._ndarray_values, copy=False) + result = IntervalIndex(np.array(index), copy=False) tm.assert_numpy_array_equal( index.left.values, result.left.values, check_same="copy" ) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index b5ff83ec7514d..cb2140d0b4025 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -147,9 +147,9 @@ def test_constructor_fromarraylike(self): msg = "freq not specified and cannot be inferred" with pytest.raises(ValueError, match=msg): - PeriodIndex(idx._ndarray_values) + PeriodIndex(idx.asi8) with pytest.raises(ValueError, match=msg): - PeriodIndex(list(idx._ndarray_values)) + PeriodIndex(list(idx.asi8)) msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 03f0be3f368cb..df2f85cd7f1e2 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -161,7 +161,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01", NaT], freq="M") @@ -169,7 +169,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01-01", NaT], freq="D") @@ -177,7 +177,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) def test_period_index_length(self): pi = period_range(freq="A", start="1/1/2001", 
end="12/1/2009") diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 211d0d52d8357..abd99aadfb484 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -55,9 +55,7 @@ def test_ops(self, opname, obj): if not isinstance(obj, PeriodIndex): expected = getattr(obj.values, opname)() else: - expected = pd.Period( - ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq - ) + expected = pd.Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq) try: assert result == expected except TypeError: From 8e5ba59ca8a9ba2f9dbd6314e8bc373a16c1e734 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Thu, 19 Mar 2020 02:25:32 +0200 Subject: [PATCH 13/40] BLD: Suppressing errors while compling pandas/_libs/groupby (#32794) --- pandas/_libs/groupby.pyx | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 35a6963165194..e7ac3b8442c6d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -869,7 +869,9 @@ def group_last(rank_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -960,7 +962,9 @@ def group_nth(rank_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -1254,7 +1258,9 @@ def group_max(groupby_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -1327,7 +1333,9 @@ def group_min(groupby_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) From 3aa42268ce38b0cb3882cd0ad08468511b3ee925 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 19 Mar 2020 00:52:09 +0000 Subject: [PATCH 14/40] TYP: PandasObject._cache (#32775) --- pandas/core/base.py | 6 ++++-- pandas/core/indexes/base.py | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9281d2f72b409..e1c6bef66239d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,7 +4,7 @@ import builtins import textwrap -from typing import Dict, FrozenSet, List, Optional, Union +from typing import Any, Dict, FrozenSet, List, Optional, Union import numpy as np @@ -49,6 +49,8 @@ class PandasObject(DirNamesMixin): Baseclass for various pandas objects. 
""" + _cache: Dict[str, Any] + @property def _constructor(self): """ @@ -63,7 +65,7 @@ def __repr__(self) -> str: # Should be overwritten by base classes return object.__repr__(self) - def _reset_cache(self, key=None): + def _reset_cache(self, key: Optional[str] = None) -> None: """ Reset cached properties. If ``key`` is passed, only clears that key. """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4fa771dfbcf82..4501dd1ddd887 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import TYPE_CHECKING, Any, Dict, FrozenSet, Hashable, Union +from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Union import warnings import numpy as np @@ -249,7 +249,6 @@ def _outer_indexer(self, left, right): _typ = "index" _data: Union[ExtensionArray, np.ndarray] - _cache: Dict[str, Any] _id = None _name: Label = None # MultiIndex.levels previously allowed setting the index name. We From bc24a8ccead373869ba43a2a96b66d18c172899f Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 18 Mar 2020 17:57:21 -0700 Subject: [PATCH 15/40] Implement C Level Timedelta ISO Function; fix JSON usage (#30903) --- doc/source/whatsnew/v1.1.0.rst | 1 + .../_libs/src/ujson/python/date_conversions.c | 26 ++++++ .../_libs/src/ujson/python/date_conversions.h | 2 + pandas/_libs/src/ujson/python/objToJSON.c | 81 +++++++++---------- .../tslibs/src/datetime/np_datetime_strings.c | 34 ++++++++ .../tslibs/src/datetime/np_datetime_strings.h | 10 +++ .../tests/io/json/test_json_table_schema.py | 3 +- pandas/tests/io/json/test_pandas.py | 23 ++++++ pandas/tests/io/json/test_ujson.py | 23 +++++- 9 files changed, 155 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 441c6cee32b2a..fbde1727fbb27 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -343,6 +343,7 @@ I/O timestamps with ``version="2.0"`` (:issue:`31652`). 
- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) - :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index fc4bdef8463af..bcb1334d978ef 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -116,3 +116,29 @@ npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); return NpyDateTimeToEpoch(npy_dt, base); } + +/* Converts the int64_t representation of a duration to ISO; mutates len */ +char *int64ToIsoDuration(int64_t value, size_t *len) { + pandas_timedeltastruct tds; + int ret_code; + + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + PyObject_Free(result); + return NULL; + } + + return result; +} diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h index 45455f4d6128b..1b5cbf2a7e307 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.h +++ b/pandas/_libs/src/ujson/python/date_conversions.h @@ -28,4 +28,6 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len); // Convert a Python Date/Datetime to Unix epoch with resolution base npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base); +char *int64ToIsoDuration(int64_t value, size_t *len); + #endif diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index db1feccb4d0c6..95e98779c2368 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -165,7 +165,6 @@ void *initObjToJSON(void) { cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -357,6 +356,12 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), return int64ToIso(GET_TC(tc)->longValue, base, len); } +/* JSON callback. 
returns a char* and mutates the pointer to *len */ +static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { + return int64ToIsoDuration(GET_TC(tc)->longValue, len); +} + /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { @@ -1445,7 +1450,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, 1000000000LL; // nanoseconds per second } else { // datetime.* objects don't follow above rules - nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + nanosecVal = + PyDateTimeToEpoch((PyDateTime_Date *)item, NPY_FR_ns); } } } @@ -1457,31 +1463,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, strncpy(cLabel, "null", len); } else { if (enc->datetimeIso) { - // TODO: Vectorized Timedelta function if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - PyObject *td = - PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *iso = - PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - len = strlen(PyUnicode_AsUTF8(iso)); - cLabel = PyObject_Malloc(len + 1); - memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1); - Py_DECREF(iso); + cLabel = int64ToIsoDuration(nanosecVal, &len); } else { if (type_num == NPY_DATETIME) { cLabel = int64ToIso(nanosecVal, base, &len); @@ -1614,7 +1597,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + if (enc->npyType == NPY_TIMEDELTA) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + } else { + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + } // Currently no way to pass longVal to iso function, so use // state management GET_TC(tc)->longValue = longVal; @@ -1695,7 +1682,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch((PyDateTime_Date *)obj, base); tc->type = JT_LONG; } return; @@ -1721,7 +1709,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch((PyDateTime_Date *)obj, base); tc->type = JT_LONG; } return; @@ -1734,28 +1723,30 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - // TODO: Add some kind of error handling here - } - - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - PRINTMARK(); - goto INVALID; - } - + PRINTMARK(); if (value == get_nat()) { PRINTMARK(); tc->type = JT_NULL; return; - } + } else if (enc->datetimeIso) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; + } else { + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&value, unit) != 0) { + // TODO: Add some kind of error handling here + } - GET_TC(tc)->longValue = value; + exc = PyErr_Occurred(); - PRINTMARK(); - tc->type = JT_LONG; 
+ if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } + + tc->type = JT_LONG; + } + GET_TC(tc)->longValue = value; return; } else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 54ed6ecff21e2..b245ae5880ecb 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -905,3 +905,37 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, outlen); return -1; } + + +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, + char *outstr, size_t *outlen) { + *outlen = 0; + *outlen += snprintf(outstr, 60, // NOLINT + "P%" NPY_INT64_FMT + "DT%" NPY_INT32_FMT + "H%" NPY_INT32_FMT + "M%" NPY_INT32_FMT, + tds->days, tds->hrs, tds->min, tds->sec); + outstr += *outlen; + + if (tds->ns != 0) { + *outlen += snprintf(outstr, 12, // NOLINT + ".%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "S", tds->ms, tds->us, tds->ns); + } else if (tds->us != 0) { + *outlen += snprintf(outstr, 9, // NOLINT + ".%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "S", tds->ms, tds->us); + } else if (tds->ms != 0) { + *outlen += snprintf(outstr, 6, // NOLINT + ".%03" NPY_INT32_FMT "S", tds->ms); + } else { + *outlen += snprintf(outstr, 2, // NOLINT + "%s", "S"); + } + + return 0; +} diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 880c34ea77638..200a71ff0c2b7 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -79,4 +79,14 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, NPY_DATETIMEUNIT base); + +/* + * Converts an pandas_timedeltastruct to an ISO 8601 string. + * + * Mutates outlen to provide size of (non-NULL terminated) string. 
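The layout this function writes can be cross-checked against a small pure-Python model of the same rule (an illustrative sketch only; pandas ships the C implementation above, and `Timedelta.isoformat` agrees for these inputs):

```python
import pandas as pd

def iso_duration(td: pd.Timedelta) -> str:
    # Mirrors make_iso_8601_timedelta: P<days>DT<hours>H<minutes>M<seconds>,
    # appending a fractional part only when sub-second components are nonzero.
    c = td.components
    out = f"P{c.days}DT{c.hours}H{c.minutes}M{c.seconds}"
    if c.nanoseconds:
        out += f".{c.milliseconds:03d}{c.microseconds:03d}{c.nanoseconds:03d}"
    elif c.microseconds:
        out += f".{c.milliseconds:03d}{c.microseconds:03d}"
    elif c.milliseconds:
        out += f".{c.milliseconds:03d}"
    return out + "S"

assert iso_duration(pd.Timedelta(days=1)) == "P1DT0H0M0S"
assert iso_duration(pd.Timedelta(nanoseconds=1)) == "P0DT0H0M0.000000001S"
```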
+ *
+ * Currently has no error handling
+ */
+int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr,
+                            size_t *outlen);
 #endif  // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index 2ac2acc6748d1..c0d40048a72fe 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -603,8 +603,7 @@ def test_timestamp_in_columns(self):
         result = df.to_json(orient="table")
         js = json.loads(result)
         assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
-        # TODO - below expectation is not correct; see GH 28256
-        assert js["schema"]["fields"][2]["name"] == 10000
+        assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
 
     @pytest.mark.parametrize(
         "case",
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 276dfd666c5d0..d56ddb98fa4fa 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1057,6 +1057,29 @@ def test_mixed_timedelta_datetime(self):
         result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"})
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
+    @pytest.mark.parametrize("as_object", [True, False])
+    @pytest.mark.parametrize("date_format", ["iso", "epoch"])
+    @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
+    def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
+        # GH28256: to_json not correctly formatting Timedelta
+        data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT]
+        if as_object:
+            data.append("a")
+
+        ser = pd.Series(data, index=data)
+        if date_format == "iso":
+            expected = (
+                '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
+            )
+        else:
+            expected = '{"86400000":86400000,"172800000":172800000,"null":null}'
+
+        if as_object:
+            expected = expected.replace("}", ',"a":"a"}')
+
+        result = ser.to_json(date_format=date_format)
+        assert result == expected
+
     def test_default_handler(self):
         value = object()
         frame = DataFrame({"a": [7, value]})
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index e86667626deda..34dd9ba9bc7b6 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -16,7 +16,7 @@
 from pandas._libs.tslib import Timestamp
 import pandas.compat as compat
 
-from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range
+from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range
 import pandas._testing as tm
 
 
@@ -1103,3 +1103,24 @@ def test_encode_set(self):
 
         for v in dec:
             assert v in s
+
+    @pytest.mark.parametrize(
+        "td",
+        [
+            Timedelta(days=366),
+            Timedelta(days=-1),
+            Timedelta(hours=13, minutes=5, seconds=5),
+            Timedelta(hours=13, minutes=20, seconds=30),
+            Timedelta(days=-1, nanoseconds=5),
+            Timedelta(nanoseconds=1),
+            Timedelta(microseconds=1, nanoseconds=1),
+            Timedelta(milliseconds=1, microseconds=1, nanoseconds=1),
+            Timedelta(milliseconds=999, microseconds=999, nanoseconds=999),
+        ],
+    )
+    def test_encode_timedelta_iso(self, td):
+        # GH 28256
+        result = ujson.encode(td, iso_dates=True)
+        expected = f'"{td.isoformat()}"'
+
+        assert result == expected

From 628513e51e67bc3fdc061932a00cf61f927d7af4 Mon Sep 17 00:00:00 2001
From: Robert de Vries
Date: Thu, 19 Mar 2020 02:40:40 +0100
Subject: [PATCH 16/40] BUG: read_csv: fix wrong exception on permissions issue (#32737)

* Generate exception from
the C code in the proper manner Get rid of all error printf's and produce proper Python exceptions * Declare some more exceptions from C code * Remove special case error message for c parser * Add whatsnew entry * Fix missing semicolons * Add regression test * Remove special case handling for Windows PyErr_SetFromErrnoWithFilename works for Unix and Windows * Remove call to GetLastError(), when using 0, the python error code handles this * black fixes * Fix indentation of assert statement (also in previous test, same error) * Skip the test on windows * Fix black issue * Let new_mmap fail without exception to allow fallback * Do not create a python error in new_mmap to allow the fallback to work silently * Remove the NULL pointer check for new_rd_source now that it will raise an exception * Update doc/source/whatsnew/v1.1.0.rst Co-Authored-By: gfyoung Co-authored-by: Jeff Reback Co-authored-by: gfyoung --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/parsers.pyx | 18 ++---------------- pandas/_libs/src/parser/io.c | 19 +++++++++++-------- pandas/tests/io/parser/test_common.py | 16 +++++++++++++--- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fbde1727fbb27..2db61a17858de 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -348,6 +348,7 @@ I/O - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) - Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) +- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) Plotting diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4f7d75e0aaad6..39195585ebfa6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -241,9 +241,9 @@ cdef extern from "parser/io.h": void* buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) - void *new_file_source(char *fname, size_t buffer_size) + void *new_file_source(char *fname, size_t buffer_size) except NULL - void *new_rd_source(object obj) + void *new_rd_source(object obj) except NULL int del_file_source(void *src) int del_rd_source(void *src) @@ -667,26 +667,12 @@ cdef class TextReader: ptr = new_file_source(source, self.parser.chunksize) self.parser.cb_io = &buffer_file_bytes self.parser.cb_cleanup = &del_file_source - - if ptr == NULL: - if not os.path.exists(source): - - raise FileNotFoundError( - ENOENT, - f'File {usource} does not exist', - usource) - raise IOError('Initializing from file failed') - self.parser.source = ptr elif hasattr(source, 'read'): # e.g., StringIO ptr = new_rd_source(source) - if ptr == NULL: - raise IOError('Initializing parser from file-like ' - 'object failed') - self.parser.source = ptr self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 1e3295fcb6fc7..51504527de5a2 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -28,6 +28,7 @@ The full license is in the LICENSE file, distributed with this software. 
void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); if (fs == NULL) { + PyErr_NoMemory(); return NULL; } @@ -41,17 +42,20 @@ void *new_file_source(char *fname, size_t buffer_size) { int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); if (required == 0) { free(fs); + PyErr_SetFromWindowsErr(0); return NULL; } wname = (wchar_t*)malloc(required * sizeof(wchar_t)); if (wname == NULL) { free(fs); + PyErr_NoMemory(); return NULL; } if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) { free(wname); free(fs); + PyErr_SetFromWindowsErr(0); return NULL; } fs->fd = _wopen(wname, O_RDONLY | O_BINARY); @@ -62,6 +66,7 @@ void *new_file_source(char *fname, size_t buffer_size) { #endif if (fs->fd == -1) { free(fs); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); return NULL; } @@ -71,6 +76,7 @@ void *new_file_source(char *fname, size_t buffer_size) { if (fs->buffer == NULL) { close(fs->fd); free(fs); + PyErr_NoMemory(); return NULL; } @@ -83,6 +89,10 @@ void *new_file_source(char *fname, size_t buffer_size) { void *new_rd_source(PyObject *obj) { rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); + if (rds == NULL) { + PyErr_NoMemory(); + return NULL; + } /* hold on to this object */ Py_INCREF(obj); rds->obj = obj; @@ -220,20 +230,15 @@ void *new_mmap(char *fname) { mm = (memory_map *)malloc(sizeof(memory_map)); if (mm == NULL) { - fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return (NULL); + return NULL; } mm->fd = open(fname, O_RDONLY | O_BINARY); if (mm->fd == -1) { - fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", - fname, errno); free(mm); return NULL; } if (fstat(mm->fd, &stat) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", - errno); close(mm->fd); free(mm); return NULL; @@ -242,8 +247,6 @@ void *new_mmap(char *fname) { mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); if (mm->memmap == MAP_FAILED) { - /* XXX Eventually remove this print statement. 
*/ - fprintf(stderr, "new_file_buffer: mmap() failed.\n"); close(mm->fd); free(mm); return NULL; diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 0f3a5be76ae60..9de2ec9799353 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -960,13 +960,23 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = f"{tm.rands(10)}.csv" - msg = f"File {path} does not exist" if parser.engine == "c" else r"\[Errno 2\]" + msg = r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) + assert path == e.value.filename - filename = e.value.filename - assert path == filename +@td.skip_if_windows # os.chmod does not work in windows +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename def test_missing_trailing_delimiters(all_parsers): From 92cf475ac33c804a4d1d01b68499f43d2924823d Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 19 Mar 2020 02:32:49 -0700 Subject: [PATCH 17/40] Fixturize JSON tests (#31191) --- pandas/conftest.py | 25 +++++++++ pandas/tests/io/json/test_pandas.py | 80 +++++++++++------------------ pandas/tests/resample/conftest.py | 2 +- pandas/tests/resample/test_base.py | 20 ++++---- 4 files changed, 65 insertions(+), 62 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d8f96021cdb15..e12acb5dd56d5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -878,6 +878,11 @@ def tick_classes(request): ) +@pytest.fixture +def empty_series(): + return pd.Series([], index=[], dtype=np.float64) + + @pytest.fixture def datetime_series(): """ @@ -888,6 +893,26 @@ def datetime_series(): return s +@pytest.fixture +def string_series(): + """ + Fixture for Series of floats with Index of unique strings + """ + s = tm.makeStringSeries() + s.name = "series" + return s + + +@pytest.fixture +def object_series(): + """ + Fixture for Series of dtype object with Index of unique strings + """ + s = tm.makeObjectSeries() + s.name = "objects" + return s + + @pytest.fixture def float_frame(): """ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d56ddb98fa4fa..e13b2b34d611b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -42,22 +42,9 @@ def assert_json_roundtrip_equal(result, expected, orient): @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: - @pytest.fixture(scope="function", autouse=True) - def setup(self, datapath): - self.dirpath = datapath("io", "json", "data") - - self.ts = tm.makeTimeSeries() - self.ts.name = "ts" - - self.series = tm.makeStringSeries() - self.series.name = "series" - - self.objSeries = tm.makeObjectSeries() - self.objSeries.name = "objects" - - self.empty_series = Series([], index=[], dtype=np.float64) + @pytest.fixture(autouse=True) + def setup(self): self.empty_frame = DataFrame() - self.frame = _frame.copy() self.frame2 = _frame2.copy() self.intframe = _intframe.copy() @@ -67,15 +54,6 @@ def setup(self, datapath): yield - del self.dirpath - - del self.ts - - del self.series - - del self.objSeries - - del self.empty_series del self.empty_frame del self.frame @@ -457,7 +435,7 @@ def test_frame_mixedtype_orient(self): # GH10289 left = 
read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) - def test_v12_compat(self): + def test_v12_compat(self, datapath): df = DataFrame( [ [1.56808523, 0.65727391, 1.81021139, -0.17251653], @@ -474,12 +452,13 @@ def test_v12_compat(self): df["modified"] = df["date"] df.iloc[1, df.columns.get_loc("modified")] = pd.NaT - v12_json = os.path.join(self.dirpath, "tsframe_v012.json") + dirpath = datapath("io", "json", "data") + v12_json = os.path.join(dirpath, "tsframe_v012.json") df_unser = pd.read_json(v12_json) tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) - v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json") + v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = pd.read_json(v12_iso_json) tm.assert_frame_equal(df_iso, df_unser_iso) @@ -633,15 +612,15 @@ def test_series_non_unique_index(self): unser = read_json(s.to_json(orient="records"), orient="records", typ="series") tm.assert_numpy_array_equal(s.values, unser.values) - def test_series_default_orient(self): - assert self.series.to_json() == self.series.to_json(orient="index") + def test_series_default_orient(self, string_series): + assert string_series.to_json() == string_series.to_json(orient="index") @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_simple(self, orient, numpy): - data = self.series.to_json(orient=orient) + def test_series_roundtrip_simple(self, orient, numpy, string_series): + data = string_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.series.copy() + expected = string_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -651,13 +630,13 @@ def test_series_roundtrip_simple(self, orient, numpy): @pytest.mark.parametrize("dtype", [False, None]) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_object(self, orient, numpy, dtype): - data = self.objSeries.to_json(orient=orient) + def test_series_roundtrip_object(self, orient, numpy, dtype, object_series): + data = object_series.to_json(orient=orient) result = pd.read_json( data, typ="series", orient=orient, numpy=numpy, dtype=dtype ) - expected = self.objSeries.copy() + expected = object_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -666,12 +645,11 @@ def test_series_roundtrip_object(self, orient, numpy, dtype): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_empty(self, orient, numpy): - data = self.empty_series.to_json(orient=orient) + def test_series_roundtrip_empty(self, orient, numpy, empty_series): + data = empty_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.empty_series.copy() - # TODO: see what causes inconsistency + expected = empty_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) else: @@ -680,11 +658,11 @@ def test_series_roundtrip_empty(self, orient, numpy): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_timeseries(self, orient, numpy): - data = self.ts.to_json(orient=orient) + def test_series_roundtrip_timeseries(self, orient, numpy, datetime_series): + data = datetime_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - 
expected = self.ts.copy() + expected = datetime_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -772,7 +750,7 @@ def test_path(self): df.to_json(path) read_json(path) - def test_axis_dates(self): + def test_axis_dates(self, datetime_series): # frame json = self.tsframe.to_json() @@ -780,12 +758,12 @@ def test_axis_dates(self): tm.assert_frame_equal(result, self.tsframe) # series - json = self.ts.to_json() + json = datetime_series.to_json() result = read_json(json, typ="series") - tm.assert_series_equal(result, self.ts, check_names=False) + tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - def test_convert_dates(self): + def test_convert_dates(self, datetime_series): # frame df = self.tsframe.copy() @@ -805,7 +783,7 @@ def test_convert_dates(self): tm.assert_frame_equal(result, expected) # series - ts = Series(Timestamp("20130101"), index=self.ts.index) + ts = Series(Timestamp("20130101"), index=datetime_series.index) json = ts.to_json() result = read_json(json, typ="series") tm.assert_series_equal(result, ts) @@ -900,8 +878,8 @@ def test_date_format_frame_raises(self): ("20130101 20:43:42.123456789", "ns"), ], ) - def test_date_format_series(self, date, date_unit): - ts = Series(Timestamp(date), index=self.ts.index) + def test_date_format_series(self, date, date_unit, datetime_series): + ts = Series(Timestamp(date), index=datetime_series.index) ts.iloc[1] = pd.NaT ts.iloc[5] = pd.NaT if date_unit: @@ -914,8 +892,8 @@ def test_date_format_series(self, date, date_unit): expected = expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) - def test_date_format_series_raises(self): - ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) + def test_date_format_series_raises(self, datetime_series): + ts = Series(Timestamp("20130101 20:43:42.123"), index=datetime_series.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index d5b71a6e4cee1..fb2111a60a261 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -134,7 +134,7 @@ def series(index, _series_name, _static_values): @pytest.fixture -def empty_series(series): +def empty_series_dti(series): """ Fixture for parametrization of empty Series with date_range, period_range and timedelta_range indexes diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index c84a5bf653b0a..3384c2a94487b 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -94,13 +94,13 @@ def test_raises_on_non_datetimelike_index(): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) -def test_resample_empty_series(freq, empty_series, resample_method): +def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 if resample_method == "ohlc": pytest.skip("need to test for ohlc from GH13083") - s = empty_series + s = empty_series_dti result = getattr(s.resample(freq), resample_method)() expected = s.copy() @@ -114,13 +114,13 @@ def test_resample_empty_series(freq, empty_series, resample_method): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) -def test_resample_count_empty_series(freq, empty_series, resample_method): +def 
test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 - result = getattr(empty_series.resample(freq), resample_method)() + result = getattr(empty_series_dti.resample(freq), resample_method)() - index = _asfreq_compat(empty_series.index, freq) + index = _asfreq_compat(empty_series_dti.index, freq) - expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + expected = pd.Series([], dtype="int64", index=index, name=empty_series_dti.name) tm.assert_series_equal(result, expected) @@ -188,9 +188,9 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. (GH #10228) - empty_series = Series([], index, dtype) + empty_series_dti = Series([], index, dtype) try: - getattr(empty_series.resample("d"), resample_method)() + getattr(empty_series_dti.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) @@ -227,9 +227,9 @@ def test_resample_loffset_arg_type(frame, create_index, arg): @all_ts -def test_apply_to_empty_series(empty_series): +def test_apply_to_empty_series(empty_series_dti): # GH 14313 - s = empty_series + s = empty_series_dti for freq in ["M", "D", "H"]: result = s.resample(freq).apply(lambda x: 1) expected = s.resample(freq).apply(np.sum) From ebeb6bc7f24507ddb7ec3101fb65f59c7f2cdc68 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Mar 2020 11:47:15 +0100 Subject: [PATCH 18/40] PERF: fix SparseArray._simple_new object initialization (#32821) --- pandas/core/arrays/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 93091555201e8..963c2f3d53138 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -399,7 +399,7 @@ def __init__( def _simple_new( cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype ) -> "SparseArray": - new = cls([]) + new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array new._dtype = dtype From 6473fcd8d1530cf694e69edb4b61a5ef47ddb176 Mon Sep 17 00:00:00 2001 From: Derek McCammond Date: Thu, 19 Mar 2020 06:57:49 -0400 Subject: [PATCH 19/40] Avoid bare pytest.raises in indexes/categorical/test_indexing.py (#32797) --- pandas/tests/indexes/categorical/test_indexing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 507e38d9acac2..1d41e17e327a8 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -65,7 +65,8 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_fill_value_datetime(self): @@ -104,7 +105,8 @@ def test_take_fill_value_datetime(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): 
idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): From 192d736b5297869121ab27f091884e17b34aa0e9 Mon Sep 17 00:00:00 2001 From: Farhan Reynaldo Date: Thu, 19 Mar 2020 18:09:02 +0700 Subject: [PATCH 20/40] See also (#32820) --- pandas/core/series.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 006a98a6cddcb..aaaeadc0cf618 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1692,6 +1692,10 @@ def count(self, level=None): int or Series (if level specified) Number of non-null values in the Series. + See Also + -------- + DataFrame.count : Count non-NA cells for each column or row. + Examples -------- >>> s = pd.Series([0.0, 1.0, np.nan]) @@ -2222,6 +2226,12 @@ def corr(self, other, method="pearson", min_periods=None) -> float: float Correlation with other. + See Also + -------- + DataFrame.corr : Compute pairwise correlation between columns. + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Examples -------- >>> def histogram_intersection(a, b): @@ -2264,6 +2274,10 @@ def cov(self, other, min_periods=None) -> float: Covariance between Series and other normalized by N-1 (unbiased estimator). + See Also + -------- + DataFrame.cov : Compute pairwise covariance of columns. + Examples -------- >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) From 10c7b04c869f64f6658259e6b5bb576c9a573288 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Mar 2020 04:33:30 -0700 Subject: [PATCH 21/40] TYP: annotate (#32730) --- pandas/core/arrays/base.py | 22 ++++++++-------- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/period.py | 42 ++++++++++++------------------ pandas/core/ops/dispatch.py | 10 +++---- pandas/io/pytables.py | 6 ++--- 5 files changed, 36 insertions(+), 46 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ab24beb0da4fc..67e3807c477fb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -21,7 +21,7 @@ from pandas.core.dtypes.common import is_array_like, is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -591,7 +591,7 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": """ Shift values by desired number. @@ -728,7 +728,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: """ Encode the extension array as an enumerated type. @@ -833,7 +833,7 @@ def repeat(self, repeats, axis=None): def take( self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> ABCExtensionArray: + ) -> "ExtensionArray": """ Take elements from an array. @@ -922,7 +922,7 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self) -> ABCExtensionArray: + def copy(self) -> "ExtensionArray": """ Return a copy of the array. 
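The signature changes in this patch all follow the same idea: annotate with the class itself via a string forward reference (usable before the class statement has finished executing) rather than the `ABCExtensionArray` registration class, which type checkers cannot see through. A standalone sketch of the pattern, not pandas code:

```python
import typing

class ExtensionArray:
    def copy(self) -> "ExtensionArray":
        # the quoted name resolves later, once the class object exists;
        # a concrete subclass would copy its backing data here
        return self

hints = typing.get_type_hints(ExtensionArray.copy)
assert hints["return"] is ExtensionArray  # the string resolved to the class
```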
@@ -932,7 +932,7 @@ def copy(self) -> ABCExtensionArray: """ raise AbstractMethodError(self) - def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: + def view(self, dtype=None) -> ArrayLike: """ Return a view on the array. @@ -943,8 +943,8 @@ def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: Returns ------- - ExtensionArray - A view of the :class:`ExtensionArray`. + ExtensionArray or np.ndarray + A view on the :class:`ExtensionArray`'s data. """ # NB: # - This must return a *new* object referencing the same data, not self. @@ -1002,7 +1002,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: # Reshaping # ------------------------------------------------------------------------ - def ravel(self, order="C") -> ABCExtensionArray: + def ravel(self, order="C") -> "ExtensionArray": """ Return a flattened view on this array. @@ -1023,8 +1023,8 @@ def ravel(self, order="C") -> ABCExtensionArray: @classmethod def _concat_same_type( - cls, to_concat: Sequence[ABCExtensionArray] - ) -> ABCExtensionArray: + cls, to_concat: Sequence["ExtensionArray"] + ) -> "ExtensionArray": """ Concatenate multiple array. diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7cf50ff2b88af..1972b7e18d804 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -131,7 +131,7 @@ class AttributesMixin: _data: np.ndarray @classmethod - def _simple_new(cls, values, **kwargs): + def _simple_new(cls, values: np.ndarray, **kwargs): raise AbstractMethodError(cls) @property diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5eeee644b3854..680b37c955278 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -31,13 +31,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.generic import ( - ABCIndexClass, - ABCPeriod, - ABCPeriodArray, - ABCPeriodIndex, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos @@ -48,7 +42,7 @@ from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick -def _field_accessor(name, alias, docstring=None): +def _field_accessor(name: str, alias: int, docstring=None): def f(self): base, mult = libfrequencies.get_freq_code(self.freq) result = get_period_field_arr(alias, self.asi8, base) @@ -170,7 +164,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values: np.ndarray, freq=None, **kwargs): + def _simple_new(cls, values: np.ndarray, freq=None, **kwargs) -> "PeriodArray": # alias for PeriodArray.__init__ assert isinstance(values, np.ndarray) and values.dtype == "i8" return cls(values, freq=freq, **kwargs) @@ -181,7 +175,7 @@ def _from_sequence( scalars: Sequence[Optional[Period]], dtype: Optional[PeriodDtype] = None, copy: bool = False, - ) -> ABCPeriodArray: + ) -> "PeriodArray": if dtype: freq = dtype.freq else: @@ -202,11 +196,13 @@ def _from_sequence( return cls(ordinals, freq=freq) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings( + cls, strings, dtype=None, copy=False + ) -> "PeriodArray": return cls._from_sequence(strings, dtype, copy) @classmethod - def _from_datetime64(cls, data, freq, tz=None): + def _from_datetime64(cls, data, freq, tz=None) -> "PeriodArray": """ Construct a 
PeriodArray from a datetime64 array @@ -270,12 +266,12 @@ def _check_compatible_with(self, other, setitem: bool = False): # Data / Attributes @cache_readonly - def dtype(self): + def dtype(self) -> PeriodDtype: return self._dtype # error: Read-only property cannot override read-write property [misc] @property # type: ignore - def freq(self): + def freq(self) -> DateOffset: """ Return the frequency object for this PeriodArray. """ @@ -402,7 +398,7 @@ def __arrow_array__(self, type=None): daysinmonth = days_in_month @property - def is_leap_year(self): + def is_leap_year(self) -> np.ndarray: """ Logical indicating if the date belongs to a leap year. """ @@ -458,12 +454,6 @@ def to_timestamp(self, freq=None, how="start"): new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) return DatetimeArray._from_sequence(new_data, freq="infer") - # -------------------------------------------------------------------- - # Array-like / EA-Interface Methods - - def _values_for_argsort(self): - return self._data - # -------------------------------------------------------------------- def _time_shift(self, periods, freq=None): @@ -495,7 +485,7 @@ def _time_shift(self, periods, freq=None): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) - def asfreq(self, freq=None, how="E"): + def asfreq(self, freq=None, how="E") -> "PeriodArray": """ Convert the Period Array/Index to the specified frequency `freq`. @@ -557,7 +547,7 @@ def asfreq(self, freq=None, how="E"): # ------------------------------------------------------------------ # Rendering Methods - def _formatter(self, boxed=False): + def _formatter(self, boxed: bool = False): if boxed: return str return "'{}'".format @@ -584,7 +574,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ------------------------------------------------------------------ - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): # We handle Period[T] -> Period[U] # Our parent handles everything else. dtype = pandas_dtype(dtype) @@ -965,8 +955,8 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): if end is not None: end = Period(end, freq) - is_start_per = isinstance(start, ABCPeriod) - is_end_per = isinstance(end, ABCPeriod) + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) if is_start_per and is_end_per and start.freq != end.freq: raise ValueError("start and end must have same freq") diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index 61a3032c7a02c..5c34cb20be266 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,10 +1,12 @@ """ Functions for defining unary operations. """ -from typing import Any, Union +from typing import Any import numpy as np +from pandas._typing import ArrayLike + from pandas.core.dtypes.common import ( is_datetime64_dtype, is_extension_array_dtype, @@ -13,7 +15,7 @@ is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import array @@ -93,9 +95,7 @@ def should_series_dispatch(left, right, op): return False -def dispatch_to_extension_op( - op, left: Union[ABCExtensionArray, np.ndarray], right: Any, -): +def dispatch_to_extension_op(op, left: ArrayLike, right: Any): """ Assume that left or right is a Series backed by an ExtensionArray, apply the operator defined by op. 
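In practice the dispatch amounts to handing the operation to the extension array's own dunder method, which knows how to broadcast against ndarrays and propagate NA. A minimal illustration using only public API (not the internal call path itself):

```python
import numpy as np
import pandas as pd

left = pd.array([1, 2, None], dtype="Int64")  # an ExtensionArray
right = np.array([10, 20, 30])

# IntegerArray.__add__ performs the op and keeps masked-NA semantics;
# nothing is materialized to object dtype along the way.
result = left + right
print(result)  # [11, 22, <NA>], dtype Int64
```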
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7aeed5c316d7f..544d45999c14b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2202,7 +2202,7 @@ def __eq__(self, other: Any) -> bool: for a in ["name", "cname", "dtype", "pos"] ) - def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): + def set_data(self, data: ArrayLike): assert data is not None assert self.dtype is None @@ -4959,11 +4959,11 @@ def _dtype_to_kind(dtype_str: str) -> str: return kind -def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): +def _get_data_and_dtype_name(data: ArrayLike): """ Convert the passed data into a storable form and a dtype string. """ - if is_categorical_dtype(data.dtype): + if isinstance(data, Categorical): data = data.codes # For datetime64tz we need to drop the TZ in tests TODO: why? From 662aef3e12dae0051cd62e1106c7f63636d31b43 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Thu, 19 Mar 2020 19:40:36 +0200 Subject: [PATCH 22/40] TST: Parametrize in pandas/tests/internals/test_internals.py (#32687) * TST: Parametrize in pandas/tests/internals/test_internals.py * Addressed lint issues * Addressing lint issues Co-authored-by: MomIsBestFriend <> --- pandas/tests/internals/test_internals.py | 305 +++++++++++++---------- 1 file changed, 170 insertions(+), 135 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 1a7d5839d9a11..deffeb0a1800c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -333,13 +333,9 @@ def test_pickle(self, mgr): assert not mgr2._is_consolidated assert not mgr2._known_consolidated - def test_non_unique_pickle(self): - - mgr = create_mgr("a,a,a:f8") - mgr2 = tm.round_trip_pickle(mgr) - tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - - mgr = create_mgr("a: f8; a: i8") + @pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"]) + def test_non_unique_pickle(self, mgr_string): + mgr = create_mgr(mgr_string) mgr2 = tm.round_trip_pickle(mgr) tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) @@ -427,22 +423,25 @@ def test_sparse_mixed(self): # TODO: what to test here? 
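The hunks below all apply one mechanical refactor: a test body that repeats near-identical asserts becomes a single `pytest.mark.parametrize` case list, so each input gets its own test id and a failure points at the exact case. The shape of the transformation, as a hypothetical standalone example:

```python
import numpy as np
import pytest

@pytest.mark.parametrize(
    "dtype_string, expected",
    [("f4", np.float32), ("f8", np.float64), ("i8", np.int64)],
)
def test_astype_dtype(dtype_string, expected):
    # one test id per (dtype_string, expected) pair
    arr = np.arange(3).astype(dtype_string)
    assert arr.dtype == expected
```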
- def test_as_array_float(self): - mgr = create_mgr("c: f4; d: f2; e: f8") - assert mgr.as_array().dtype == np.float64 - - mgr = create_mgr("c: f4; d: f2") - assert mgr.as_array().dtype == np.float32 - - def test_as_array_int_bool(self): - mgr = create_mgr("a: bool-1; b: bool-2") - assert mgr.as_array().dtype == np.bool_ - - mgr = create_mgr("a: i8-1; b: i8-2; c: i4; d: i2; e: u1") - assert mgr.as_array().dtype == np.int64 + @pytest.mark.parametrize( + "mgr_string, dtype", + [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)], + ) + def test_as_array_float(self, mgr_string, dtype): + mgr = create_mgr(mgr_string) + assert mgr.as_array().dtype == dtype - mgr = create_mgr("c: i4; d: i2; e: u1") - assert mgr.as_array().dtype == np.int32 + @pytest.mark.parametrize( + "mgr_string, dtype", + [ + ("a: bool-1; b: bool-2", np.bool_), + ("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64), + ("c: i4; d: i2; e: u1", np.int32), + ], + ) + def test_as_array_int_bool(self, mgr_string, dtype): + mgr = create_mgr(mgr_string) + assert mgr.as_array().dtype == dtype def test_as_array_datetime(self): mgr = create_mgr("h: datetime-1; g: datetime-2") @@ -548,7 +547,6 @@ def test_invalid_ea_block(self): create_mgr("a: category2; b: category2") def test_interleave(self): - # self for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: mgr = create_mgr(f"a: {dtype}") @@ -556,6 +554,30 @@ def test_interleave(self): mgr = create_mgr(f"a: {dtype}; b: {dtype}") assert mgr.as_array().dtype == dtype + @pytest.mark.parametrize( + "mgr_string, dtype", + [ + ("a: category", "i8"), + ("a: category; b: category", "i8"), + ("a: category; b: category2", "object"), + ("a: category2", "object"), + ("a: category2; b: category2", "object"), + ("a: f8", "f8"), + ("a: f8; b: i8", "f8"), + ("a: f4; b: i8", "f8"), + ("a: f4; b: i8; d: object", "object"), + ("a: bool; b: i8", "object"), + ("a: complex", "complex"), + ("a: f8; b: category", "object"), + ("a: M8[ns]; b: category", "object"), + ("a: M8[ns]; b: bool", "object"), + ("a: M8[ns]; b: i8", "object"), + ("a: m8[ns]; b: bool", "object"), + ("a: m8[ns]; b: i8", "object"), + ("a: M8[ns]; b: m8[ns]", "object"), + ], + ) + def test_interleave_dtype(self, mgr_string, dtype): # will be converted according the actual dtype of the underlying mgr = create_mgr("a: category") assert mgr.as_array().dtype == "i8" @@ -689,13 +711,12 @@ def test_get_bool_data(self): def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) - def test_equals(self): + @pytest.mark.parametrize( + "mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"] + ) + def test_equals(self, mgr_string): # unique items - bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") - bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) - assert bm1.equals(bm2) - - bm1 = create_mgr("a,a,a: i8-1; b,b,b: i8-2") + bm1 = create_mgr(mgr_string) bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) @@ -905,97 +926,111 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): class TestBlockPlacement: - def test_slice_len(self): - assert len(BlockPlacement(slice(0, 4))) == 4 - assert len(BlockPlacement(slice(0, 4, 2))) == 2 - assert len(BlockPlacement(slice(0, 3, 2))) == 2 - - assert len(BlockPlacement(slice(0, 1, 2))) == 1 - assert len(BlockPlacement(slice(1, 0, -1))) == 1 + @pytest.mark.parametrize( + "slc, expected", + [ + (slice(0, 4), 4), + (slice(0, 4, 2), 2), + (slice(0, 3, 2), 2), + (slice(0, 1, 2), 1), + (slice(1, 0, -1), 1), + ], + ) + def 
test_slice_len(self, slc, expected): + assert len(BlockPlacement(slc)) == expected - def test_zero_step_raises(self): + @pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)]) + def test_zero_step_raises(self, slc): msg = "slice step cannot be zero" - with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 1, 0)) + BlockPlacement(slc) + + @pytest.mark.parametrize( + "slc", + [ + slice(None, None), + slice(10, None), + slice(None, None, -1), + slice(None, 10, -1), + # These are "unbounded" because negative index will + # change depending on container shape. + slice(-1, None), + slice(None, -1), + slice(-1, -1), + slice(-1, None, -1), + slice(None, -1, -1), + slice(-1, -1, -1), + ], + ) + def test_unbounded_slice_raises(self, slc): + msg = "unbounded slice" with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 2, 0)) - - def test_unbounded_slice_raises(self): - def assert_unbounded_slice_error(slc): - with pytest.raises(ValueError, match="unbounded slice"): - BlockPlacement(slc) - - assert_unbounded_slice_error(slice(None, None)) - assert_unbounded_slice_error(slice(10, None)) - assert_unbounded_slice_error(slice(None, None, -1)) - assert_unbounded_slice_error(slice(None, 10, -1)) - - # These are "unbounded" because negative index will change depending on - # container shape. - assert_unbounded_slice_error(slice(-1, None)) - assert_unbounded_slice_error(slice(None, -1)) - assert_unbounded_slice_error(slice(-1, -1)) - assert_unbounded_slice_error(slice(-1, None, -1)) - assert_unbounded_slice_error(slice(None, -1, -1)) - assert_unbounded_slice_error(slice(-1, -1, -1)) - - def test_not_slice_like_slices(self): - def assert_not_slice_like(slc): - assert not BlockPlacement(slc).is_slice_like - - assert_not_slice_like(slice(0, 0)) - assert_not_slice_like(slice(100, 0)) - - assert_not_slice_like(slice(100, 100, -1)) - assert_not_slice_like(slice(0, 100, -1)) - - assert not BlockPlacement(slice(0, 0)).is_slice_like - assert not BlockPlacement(slice(100, 100)).is_slice_like - - def test_array_to_slice_conversion(self): - def assert_as_slice_equals(arr, slc): - assert BlockPlacement(arr).as_slice == slc - - assert_as_slice_equals([0], slice(0, 1, 1)) - assert_as_slice_equals([100], slice(100, 101, 1)) - - assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) - assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) - assert_as_slice_equals([0, 100], slice(0, 200, 100)) - - assert_as_slice_equals([2, 1], slice(2, 0, -1)) - - def test_not_slice_like_arrays(self): - def assert_not_slice_like(arr): - assert not BlockPlacement(arr).is_slice_like - - assert_not_slice_like([]) - assert_not_slice_like([-1]) - assert_not_slice_like([-1, -2, -3]) - assert_not_slice_like([-10]) - assert_not_slice_like([-1]) - assert_not_slice_like([-1, 0, 1, 2]) - assert_not_slice_like([-2, 0, 2, 4]) - assert_not_slice_like([1, 0, -1]) - assert_not_slice_like([1, 1, 1]) - - def test_slice_iter(self): - assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2] - assert list(BlockPlacement(slice(0, 0))) == [] - assert list(BlockPlacement(slice(3, 0))) == [] - - def test_slice_to_array_conversion(self): - def assert_as_array_equals(slc, asarray): - tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, np.asarray(asarray, dtype=np.int64) - ) + BlockPlacement(slc) - assert_as_array_equals(slice(0, 3), [0, 1, 2]) - assert_as_array_equals(slice(0, 0), []) - assert_as_array_equals(slice(3, 0), []) + @pytest.mark.parametrize( + "slc", + [ + slice(0, 0), + slice(100, 0), + slice(100, 100), + slice(100, 
100, -1), + slice(0, 100, -1), + ], + ) + def test_not_slice_like_slices(self, slc): + assert not BlockPlacement(slc).is_slice_like + + @pytest.mark.parametrize( + "arr, slc", + [ + ([0], slice(0, 1, 1)), + ([100], slice(100, 101, 1)), + ([0, 1, 2], slice(0, 3, 1)), + ([0, 5, 10], slice(0, 15, 5)), + ([0, 100], slice(0, 200, 100)), + ([2, 1], slice(2, 0, -1)), + ], + ) + def test_array_to_slice_conversion(self, arr, slc): + assert BlockPlacement(arr).as_slice == slc - assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + @pytest.mark.parametrize( + "arr", + [ + [], + [-1], + [-1, -2, -3], + [-10], + [-1], + [-1, 0, 1, 2], + [-2, 0, 2, 4], + [1, 0, -1], + [1, 1, 1], + ], + ) + def test_not_slice_like_arrays(self, arr): + assert not BlockPlacement(arr).is_slice_like + + @pytest.mark.parametrize( + "slc, expected", + [(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])], + ) + def test_slice_iter(self, slc, expected): + assert list(BlockPlacement(slc)) == expected + + @pytest.mark.parametrize( + "slc, arr", + [ + (slice(0, 3), [0, 1, 2]), + (slice(0, 0), []), + (slice(3, 0), []), + (slice(3, 0, -1), [3, 2, 1]), + ], + ) + def test_slice_to_array_conversion(self, slc, arr): + tm.assert_numpy_array_equal( + BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64) + ) def test_blockplacement_add(self): bpl = BlockPlacement(slice(0, 5)) @@ -1003,30 +1038,30 @@ def test_blockplacement_add(self): assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2) assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5] - def test_blockplacement_add_int(self): - def assert_add_equals(val, inc, result): - assert list(BlockPlacement(val).add(inc)) == result - - assert_add_equals(slice(0, 0), 0, []) - assert_add_equals(slice(1, 4), 0, [1, 2, 3]) - assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) - assert_add_equals([1, 2, 4], 0, [1, 2, 4]) - - assert_add_equals(slice(0, 0), 10, []) - assert_add_equals(slice(1, 4), 10, [11, 12, 13]) - assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) - assert_add_equals([1, 2, 4], 10, [11, 12, 14]) - - assert_add_equals(slice(0, 0), -1, []) - assert_add_equals(slice(1, 4), -1, [0, 1, 2]) - assert_add_equals([1, 2, 4], -1, [0, 1, 3]) + @pytest.mark.parametrize( + "val, inc, expected", + [ + (slice(0, 0), 0, []), + (slice(1, 4), 0, [1, 2, 3]), + (slice(3, 0, -1), 0, [3, 2, 1]), + ([1, 2, 4], 0, [1, 2, 4]), + (slice(0, 0), 10, []), + (slice(1, 4), 10, [11, 12, 13]), + (slice(3, 0, -1), 10, [13, 12, 11]), + ([1, 2, 4], 10, [11, 12, 14]), + (slice(0, 0), -1, []), + (slice(1, 4), -1, [0, 1, 2]), + ([1, 2, 4], -1, [0, 1, 3]), + ], + ) + def test_blockplacement_add_int(self, val, inc, expected): + assert list(BlockPlacement(val).add(inc)) == expected + @pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]]) + def test_blockplacement_add_int_raises(self, val): msg = "iadd causes length change" - - with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 4)).add(-10) with pytest.raises(ValueError, match=msg): - BlockPlacement([1, 2, 4]).add(-10) + BlockPlacement(val).add(-10) class DummyElement: From b295c023759f12e1078a932048e003e887069a91 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 19 Mar 2020 18:23:40 +0000 Subject: [PATCH 23/40] TYP: update setup.cfg (#32829) --- setup.cfg | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 42c507a2b6b01..87802190ea26a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -196,9 +196,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False 
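Each of these sections opts one module out of mypy's `check_untyped_defs`; deleting a section, as below, commits that module to passing with the flag on. What the flag controls, in a hypothetical standalone file:

```python
# mypy skips the body of a function with no annotations at all, so this
# error is reported only when check_untyped_defs = True for the module.
def unannotated():
    n: int = "not an int"  # error: incompatible types in assignment
    return n
```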
-[mypy-pandas.core.indexing] -check_untyped_defs=False - [mypy-pandas.core.internals.blocks] check_untyped_defs=False From 804dfc60852ac3a80c23aa73d19bf53a39ada620 Mon Sep 17 00:00:00 2001 From: RaisaDZ <34237447+RaisaDZ@users.noreply.github.com> Date: Thu, 19 Mar 2020 18:56:22 +0000 Subject: [PATCH 24/40] CLN: Update docstring decorator from Appender to doc (#32828) --- pandas/core/arrays/numpy_.py | 4 ++-- pandas/core/indexes/extension.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e8333606ec54c..3058e1d6073f3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -6,7 +6,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -449,7 +449,7 @@ def to_numpy( return result - @Appender(ExtensionArray.searchsorted.__doc__) + @doc(ExtensionArray.searchsorted) def searchsorted(self, value, side="left", sorter=None): return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 6851aeec0ca40..f38a4fb83c64f 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -7,7 +7,7 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -231,7 +231,7 @@ def __array__(self, dtype=None) -> np.ndarray: def _get_engine_target(self) -> np.ndarray: return self._data._values_for_argsort() - @Appender(Index.dropna.__doc__) + @doc(Index.dropna) def dropna(self, how="any"): if how not in ("any", "all"): raise ValueError(f"invalid how option: {how}") @@ -253,7 +253,7 @@ def _concat_same_dtype(self, to_concat, name): arr = type(self._data)._concat_same_type(to_concat) return type(self)._simple_new(arr, name=name) - @Appender(Index.take.__doc__) + @doc(Index.take) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -283,7 +283,7 @@ def _get_unique_index(self, dropna=False): result = result[~result.isna()] return self._shallow_copy(result) - @Appender(Index.map.__doc__) + @doc(Index.map) def map(self, mapper, na_action=None): # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -300,7 +300,7 @@ def map(self, mapper, na_action=None): except Exception: return self.astype(object).map(mapper) - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype) and copy is False: # Ensure that self.astype(self.dtype) is self From 1db3b097c2f668ca08654b2818cbebb313d30145 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 19 Mar 2020 20:50:38 +0100 Subject: [PATCH 25/40] BUG: Fix segfault on dir of a DataFrame with a unicode surrogate character in the column name (#32701) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/hashtable_class_helper.pxi.in | 6 ++++++ pandas/tests/frame/test_api.py | 8 ++++++++ pandas/tests/io/parser/test_dtypes.py | 2 +- 4 files changed, 16 insertions(+), 1 deletion(-) diff 
--git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2db61a17858de..720ce7af47a18 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -405,6 +405,7 @@ Other - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) +- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 811025a4b5764..d662e03304e2e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA +cdef extern from "Python.h": + void PyErr_Clear() + {{py: # name, dtype, c_type @@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. v = get_c_string(val) + if v == NULL: + PyErr_Clear() + v = get_c_string(repr(val)) vecs[i] = v # compute diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a021dd91a7d26..940a76601b75e 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -127,6 +127,14 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + def test_column_name_contains_unicode_surrogate(self): + # GH 25509 + colname = "\ud83d" + df = DataFrame({colname: []}) + # this should not crash + assert colname not in dir(df) + assert df.columns[0] == colname + def test_new_empty_index(self): df1 = DataFrame(np.random.randn(0, 3)) df2 = DataFrame(np.random.randn(0, 3)) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 11dcf7f04f76b..e68dcb3aa577e 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): pth = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers encoding = "utf-16" - sep = "," + sep = "\t" expected = parser.read_csv(pth, sep=sep, encoding=encoding) expected = expected.apply(Categorical) From 563da98c920ad85331193f71633a0541fbc08337 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Mar 2020 21:16:30 +0100 Subject: [PATCH 26/40] PERF: skip non-consolidatable blocks when checking consolidation (#32826) --- pandas/core/arrays/sparse/array.py | 1 - pandas/core/internals/blocks.py | 13 ------------- pandas/core/internals/managers.py | 4 ++-- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 963c2f3d53138..8021e0babe4e0 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -268,7 +268,6 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): Indices: array([2, 3], dtype=int32) """ - _pandas_ftype = "sparse" _subtyp = "sparse_array" # register ABCSparseArray 
_deprecations = PandasObject._deprecations | frozenset(["get_values"]) _sparse_index: SparseIndex diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index adeb1ae04a58d..fec8639f5a44d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -110,7 +110,6 @@ class Block(PandasObject): _can_consolidate = True _verify_integrity = True _validate_ndim = True - _ftype = "dense" _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): @@ -322,14 +321,6 @@ def shape(self): def dtype(self): return self.values.dtype - @property - def ftype(self) -> str: - if getattr(self.values, "_pandas_ftype", False): - dtype = self.dtype.subtype - else: - dtype = self.dtype - return f"{dtype}:{self._ftype}" - def merge(self, other): return _merge_blocks([self, other]) @@ -1956,10 +1947,6 @@ def where( return [self.make_block_same_class(result, placement=self.mgr_locs)] - @property - def _ftype(self): - return getattr(self.values, "_pandas_ftype", Block._ftype) - def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index da334561385d6..66e96af05eb71 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -676,8 +676,8 @@ def is_consolidated(self) -> bool: return self._is_consolidated def _consolidate_check(self) -> None: - ftypes = [blk.ftype for blk in self.blocks] - self._is_consolidated = len(ftypes) == len(set(ftypes)) + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True @property From 78e0ccd2440007ee3035f68fbe452acaca031f01 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Mar 2020 14:15:53 -0700 Subject: [PATCH 27/40] CLN: remove DatetimeLikeArray._add_delta (#32799) --- pandas/core/arrays/datetimelike.py | 63 +++++++++++++----------------- pandas/core/arrays/datetimes.py | 17 -------- pandas/core/arrays/period.py | 40 ++++++------------- pandas/core/arrays/timedeltas.py | 17 -------- 4 files changed, 38 insertions(+), 99 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1972b7e18d804..b2bff0b0142e2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1129,56 +1129,46 @@ def _sub_period(self, other): def _add_offset(self, offset): raise AbstractMethodError(self) - def _add_delta(self, other): + def _add_timedeltalike_scalar(self, other): """ - Add a timedelta-like, Tick or TimedeltaIndex-like object - to self, yielding an int64 numpy array - - Parameters - ---------- - delta : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} + Add a delta of a timedeltalike Returns ------- - result : ndarray[int64] - - Notes - ----- - The result's name is set outside of _add_delta by the calling - method (__add__ or __sub__), if necessary (i.e. for Indexes). 
- """ - if isinstance(other, (Tick, timedelta, np.timedelta64)): - new_values = self._add_timedeltalike_scalar(other) - elif is_timedelta64_dtype(other): - # ndarray[timedelta64] or TimedeltaArray/index - new_values = self._add_delta_tdi(other) - - return new_values - - def _add_timedeltalike_scalar(self, other): - """ - Add a delta of a timedeltalike - return the i8 result view + Same type as self """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds new_values = np.empty(self.shape, dtype="i8") new_values[:] = iNaT - return new_values + return type(self)(new_values, dtype=self.dtype) inc = delta_to_nanoseconds(other) new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view( "i8" ) new_values = self._maybe_mask_results(new_values) - return new_values.view("i8") - def _add_delta_tdi(self, other): + new_freq = None + if isinstance(self.freq, Tick) or is_period_dtype(self.dtype): + # adding a scalar preserves freq + new_freq = self.freq + + if new_freq is not None: + # fastpath that doesnt require inference + return type(self)(new_values, dtype=self.dtype, freq=new_freq) + return type(self)._from_sequence(new_values, dtype=self.dtype, freq="infer") + + def _add_timedelta_arraylike(self, other): """ Add a delta of a TimedeltaIndex - return the i8 result view + + Returns + ------- + Same type as self """ + # overriden by PeriodArray + if len(self) != len(other): raise ValueError("cannot add indices of unequal length") @@ -1196,7 +1186,8 @@ def _add_delta_tdi(self, other): if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT - return new_values.view("i8") + + return type(self)._from_sequence(new_values, dtype=self.dtype, freq="infer") def _add_nat(self): """ @@ -1338,7 +1329,7 @@ def __add__(self, other): if other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): - result = self._add_delta(other) + result = self._add_timedeltalike_scalar(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) @@ -1354,7 +1345,7 @@ def __add__(self, other): # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] - result = self._add_delta(other) + result = self._add_timedelta_arraylike(other) elif is_object_dtype(other): # e.g. Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.add) @@ -1390,7 +1381,7 @@ def __sub__(self, other): if other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): - result = self._add_delta(-other) + result = self._add_timedeltalike_scalar(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) @@ -1409,7 +1400,7 @@ def __sub__(self, other): # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] - result = self._add_delta(-other) + result = self._add_timedelta_arraylike(-other) elif is_object_dtype(other): # e.g. 
Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.sub) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 2110f782330fb..2d74582b049f7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -718,23 +718,6 @@ def _sub_datetimelike_scalar(self, other): result = self._maybe_mask_results(result) return result.view("timedelta64[ns]") - def _add_delta(self, delta): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new DatetimeArray - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : DatetimeArray - """ - new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, tz=self.tz, freq="infer") - # ----------------------------------------------------------------- # Timezone Conversion and Localization Methods diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 680b37c955278..6ee439de414f1 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -647,10 +647,11 @@ def _add_timedeltalike_scalar(self, other): Returns ------- - result : ndarray[int64] + PeriodArray """ - assert isinstance(self.freq, Tick) # checked by calling function - assert isinstance(other, (timedelta, np.timedelta64, Tick)) + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) if notna(other): # special handling for np.timedelta64("NaT"), avoid calling @@ -660,10 +661,9 @@ def _add_timedeltalike_scalar(self, other): # Note: when calling parent class's _add_timedeltalike_scalar, # it will call delta_to_nanoseconds(delta). Because delta here # is an integer, delta_to_nanoseconds will return it unchanged. 
- ordinals = super()._add_timedeltalike_scalar(other) - return ordinals + return super()._add_timedeltalike_scalar(other) - def _add_delta_tdi(self, other): + def _add_timedelta_arraylike(self, other): """ Parameters ---------- @@ -673,7 +673,9 @@ def _add_delta_tdi(self, other): ------- result : ndarray[int64] """ - assert isinstance(self.freq, Tick) # checked by calling function + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) if not np.all(isna(other)): delta = self._check_timedeltalike_freq_compat(other) @@ -681,28 +683,8 @@ def _add_delta_tdi(self, other): # all-NaT TimedeltaIndex is equivalent to a single scalar td64 NaT return self + np.timedelta64("NaT") - return self._addsub_int_array(delta, operator.add).asi8 - - def _add_delta(self, other): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new PeriodArray - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : PeriodArray - """ - if not isinstance(self.freq, Tick): - # We cannot add timedelta-like to non-tick PeriodArray - raise raise_on_incompatible(self, other) - - new_ordinals = super()._add_delta(other) - return type(self)(new_ordinals, freq=self.freq) + ordinals = self._addsub_int_array(delta, operator.add).asi8 + return type(self)(ordinals, dtype=self.dtype) def _check_timedeltalike_freq_compat(self, other): """ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index dbc0b0b3ccbbf..a25426c5c99cc 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -400,23 +400,6 @@ def _add_offset(self, other): f"cannot add the type {type(other).__name__} to a {type(self).__name__}" ) - def _add_delta(self, delta): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new TimedeltaArray. - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : TimedeltaArray - """ - new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, freq="infer") - def _add_datetime_arraylike(self, other): """ Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray. 
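For orientation, a minimal sketch of the two code paths that replace ``_add_delta`` above, using only public pandas API (illustrative only, not part of the patch; the freq behavior follows from the new ``_add_timedeltalike_scalar``):

    import numpy as np
    import pandas as pd

    dti = pd.date_range("2016-01-01", periods=3, freq="D")

    # scalar path: now goes through _add_timedeltalike_scalar, which returns
    # an array of the same type; the Tick freq ("D") is kept without re-inference
    dti + pd.Timedelta(hours=1)

    # array path: now goes through _add_timedelta_arraylike; the result freq
    # is re-inferred from the new values (freq="infer" in _from_sequence)
    dti + pd.to_timedelta(np.arange(3), unit="D")
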
From e8acc2694e8e8bd72028701dae0c9d8156e6db5d Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 19 Mar 2020 14:29:18 -0700 Subject: [PATCH 28/40] Error on C Warnings (#32163) --- pandas/_libs/hashtable_class_helper.pxi.in | 4 +-- pandas/_libs/internals.pyx | 25 +++++++++---------- pandas/_libs/parsers.pyx | 1 - .../_libs/src/ujson/python/date_conversions.c | 4 +-- .../_libs/src/ujson/python/date_conversions.h | 5 ++-- pandas/_libs/src/ujson/python/objToJSON.c | 9 +++---- .../_libs/tslibs/src/datetime/np_datetime.c | 10 +++----- .../_libs/tslibs/src/datetime/np_datetime.h | 3 +-- setup.py | 11 ++++++-- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index d662e03304e2e..4c2b6b8c5a8aa 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -196,7 +196,7 @@ cdef class StringVector: append_data_string(self.data, x) - cdef extend(self, ndarray[:] x): + cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) @@ -241,7 +241,7 @@ cdef class ObjectVector: self.external_view_exists = True return self.ao - cdef extend(self, ndarray[:] x): + cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 63f076b7ee993..c65205e406607 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -378,25 +378,23 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): object blkno object group_dict = defaultdict(list) - int64_t[:] res_view n = blknos.shape[0] - - if n == 0: - return - + result = list() start = 0 cur_blkno = blknos[start] - if group is False: + if n == 0: + pass + elif group is False: for i in range(1, n): if blknos[i] != cur_blkno: - yield cur_blkno, slice(start, i) + result.append((cur_blkno, slice(start, i))) start = i cur_blkno = blknos[i] - yield cur_blkno, slice(start, n) + result.append((cur_blkno, slice(start, n))) else: for i in range(1, n): if blknos[i] != cur_blkno: @@ -409,19 +407,20 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): for blkno, slices in group_dict.items(): if len(slices) == 1: - yield blkno, slice(slices[0][0], slices[0][1]) + result.append((blkno, slice(slices[0][0], slices[0][1]))) else: tot_len = sum(stop - start for start, stop in slices) - result = np.empty(tot_len, dtype=np.int64) - res_view = result + arr = np.empty(tot_len, dtype=np.int64) i = 0 for start, stop in slices: for diff in range(start, stop): - res_view[i] = diff + arr[i] = diff i += 1 - yield blkno, result + result.append((blkno, arr)) + + return result def get_blkno_placements(blknos, group: bool = True): diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 39195585ebfa6..2085e91d69ed0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -792,7 +792,6 @@ cdef class TextReader: self._tokenize_rows(1) header = [ self.names ] - data_line = 0 if self.parser.lines < 1: field_count = len(header[0]) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index bcb1334d978ef..4c25ab572bebe 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -67,7 +67,7 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { } /* Convert PyDatetime To ISO C-string. 
mutates len */ -char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, +char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len) { npy_datetimestruct dts; int ret; @@ -98,7 +98,7 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, return result; } -npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { +npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; int ret; diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h index 1b5cbf2a7e307..23e36999be43f 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.h +++ b/pandas/_libs/src/ujson/python/date_conversions.h @@ -4,7 +4,6 @@ #define PY_SSIZE_T_CLEAN #include #include -#include "datetime.h" // Scales value inplace from nanosecond resolution to unit resolution int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); @@ -23,10 +22,10 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base); // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string -char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len); +char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len); // Convert a Python Date/Datetime to Unix epoch with resolution base -npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base); +npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base); char *int64ToIsoDuration(int64_t value, size_t *len); diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 95e98779c2368..965d6aec2c1cf 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1451,7 +1451,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } else { // datetime.* objects don't follow above rules nanosecVal = - PyDateTimeToEpoch((PyDateTime_Date *)item, NPY_FR_ns); + PyDateTimeToEpoch(item, NPY_FR_ns); } } } @@ -1469,8 +1469,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, if (type_num == NPY_DATETIME) { cLabel = int64ToIso(nanosecVal, base, &len); } else { - cLabel = PyDateTimeToIso((PyDateTime_Date *)item, - base, &len); + cLabel = PyDateTimeToIso(item, base, &len); } } if (cLabel == NULL) { @@ -1683,7 +1682,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; GET_TC(tc)->longValue = - PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; @@ -1710,7 +1709,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; GET_TC(tc)->longValue = - PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index a8a47e2e90f93..f647098140528 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -21,7 +21,6 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include -#include #include #include @@ -313,15 +312,14 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) * to convert to UTC time. * - * While the C API has PyDate_* and PyDateTime_* functions, the following - * implementation just asks for attributes, and thus supports - * datetime duck typing. The tzinfo time zone conversion would require - * this style of access anyway. + * The following implementation just asks for attributes, and thus + * supports datetime duck typing. The tzinfo time zone conversion + * requires this style of access as well. * * Returns -1 on error, 0 on success, and 1 (with no error set) * if obj doesn't have the needed date or datetime attributes. */ -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyObject *dtobj, npy_datetimestruct *out) { // Assumes that obj is a valid datetime object PyObject *tmp; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 549d38409ca83..0bbc24ed822c5 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -22,7 +22,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include -#include typedef struct { npy_int64 days; @@ -35,7 +34,7 @@ extern const npy_datetimestruct _NS_MAX_DTS; // stuff pandas needs // ---------------------------------------------------------------------------- -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyObject *dtobj, npy_datetimestruct *out); npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, diff --git a/setup.py b/setup.py index 2d49d7e1e85f2..461ef005c3df3 100755 --- a/setup.py +++ b/setup.py @@ -433,8 +433,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - # args to ignore warnings - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") @@ -477,6 +476,14 @@ def run(self): # we can't do anything about these warnings because they stem from # cython+numpy version mismatches. 
macros.append(("NPY_NO_DEPRECATED_API", "0")) +if "-Werror" in extra_compile_args: + try: + import numpy as np + except ImportError: + pass + else: + if np.__version__ < LooseVersion("1.16.0"): + extra_compile_args.remove("-Werror") # ---------------------------------------------------------------------- From 6d74398e3eb2ca40a4066a2e5ea4a77569c5ea0b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 19 Mar 2020 21:30:21 +0000 Subject: [PATCH 29/40] CLN: simplify MultiIndex._shallow_copy (#32772) --- pandas/core/indexes/multi.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d30765217390f..303fc62d6ad35 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -990,15 +990,11 @@ def _constructor(self): def _shallow_copy(self, values=None, **kwargs): if values is not None: names = kwargs.pop("names", kwargs.pop("name", self.names)) - # discards freq - kwargs.pop("freq", None) return MultiIndex.from_tuples(values, names=names, **kwargs) result = self.copy(**kwargs) result._cache = self._cache.copy() - # GH32669 - if "levels" in result._cache: - del result._cache["levels"] + result._cache.pop("levels", None) # GH32669 return result def _shallow_copy_with_infer(self, values, **kwargs): From 21d385989571cd0997b319e6717bbbe149eecaa7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Mar 2020 23:20:35 +0100 Subject: [PATCH 30/40] DOC: use new pydata-sphinx-theme name (#32840) --- doc/source/conf.py | 2 +- environment.yml | 2 +- requirements-dev.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index a95cd4ab696f7..35833627f6c05 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -195,7 +195,7 @@ # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = "pandas_sphinx_theme" +html_theme = "pydata_sphinx_theme" # The style sheet to use for HTML and HTML Help pages. 
A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths diff --git a/environment.yml b/environment.yml index cbdaf8e6c4217..532c36038fcaf 100644 --- a/environment.yml +++ b/environment.yml @@ -104,5 +104,5 @@ dependencies: - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown - pip: - - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master + - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master - git+https://github.com/numpy/numpydoc diff --git a/requirements-dev.txt b/requirements-dev.txt index a469cbdd93ceb..9ee67c56ab8ca 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -70,5 +70,5 @@ sqlalchemy xarray pyreadstat tabulate>=0.8.3 -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master +git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master git+https://github.com/numpy/numpydoc \ No newline at end of file From c8651ede519d0187ca6a99d789161105775e8b52 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Mar 2020 16:32:06 -0700 Subject: [PATCH 31/40] REF: pass align_keys to apply --- pandas/core/internals/managers.py | 38 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 66e96af05eb71..e83b71bd966cc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -27,7 +27,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos @@ -375,7 +375,7 @@ def reduce(self, func, *args, **kwargs): return res - def apply(self: T, f, filter=None, **kwargs) -> T: + def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: """ Iterate over the blocks, collect and create a new BlockManager. @@ -390,6 +390,7 @@ def apply(self: T, f, filter=None, **kwargs) -> T: ------- BlockManager """ + align_keys = align_keys or [] result_blocks = [] # fillna: Series/DataFrame is responsible for making sure value is aligned @@ -404,28 +405,14 @@ def apply(self: T, f, filter=None, **kwargs) -> T: self._consolidate_inplace() + align_copy = False if f == "where": align_copy = True - if kwargs.get("align", True): - align_keys = ["other", "cond"] - else: - align_keys = ["cond"] - elif f == "putmask": - align_copy = False - if kwargs.get("align", True): - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - else: - align_keys = [] - # TODO(EA): may interfere with ExtensionBlock.setitem for blocks - # with a .values attribute. 
aligned_args = { k: kwargs[k] for k in align_keys - if not isinstance(kwargs[k], ABCExtensionArray) - and hasattr(kwargs[k], "values") + if isinstance(kwargs[k], (ABCSeries, ABCDataFrame)) } for b in self.blocks: @@ -561,13 +548,24 @@ def isna(self, func) -> "BlockManager": return self.apply("apply", func=func) def where(self, **kwargs) -> "BlockManager": - return self.apply("where", **kwargs) + if kwargs.pop("align", True): + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + + return self.apply("where", align_keys=align_keys, **kwargs) def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) def putmask(self, **kwargs): - return self.apply("putmask", **kwargs) + + if kwargs.pop("align", True): + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + + return self.apply("putmask", align_keys=align_keys, **kwargs) def diff(self, n: int, axis: int) -> "BlockManager": return self.apply("diff", n=n, axis=axis) From 5edf4e1a6935af7a3eec844fb224929c6abf0908 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Fri, 20 Mar 2020 02:22:26 +0200 Subject: [PATCH 32/40] DOC: FutureWarning in Sphinx build when calling read_parquet (#32833) --- doc/source/user_guide/scale.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 43bb4966ec5bf..61fa24bb77cfc 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -246,6 +246,7 @@ We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. .. ipython:: python + :okwarning: import dask.dataframe as dd From 5717ae5c8a73949849cd4bbb5ce474c623a27512 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 20 Mar 2020 19:39:12 -0700 Subject: [PATCH 33/40] checkpoint passing --- pandas/core/array_algos/npcompat.py | 14 ++++++++++++++ pandas/core/arrays/timedeltas.py | 7 ++++++- pandas/core/ops/__init__.py | 13 +++++++++++++ pandas/tests/arithmetic/test_datetime64.py | 4 ++-- pandas/tests/arithmetic/test_timedelta64.py | 14 +++++++++----- pandas/tests/frame/test_arithmetic.py | 2 +- 6 files changed, 45 insertions(+), 9 deletions(-) create mode 100644 pandas/core/array_algos/npcompat.py diff --git a/pandas/core/array_algos/npcompat.py b/pandas/core/array_algos/npcompat.py new file mode 100644 index 0000000000000..a459b86981783 --- /dev/null +++ b/pandas/core/array_algos/npcompat.py @@ -0,0 +1,14 @@ +""" +Implementations of high-level numpy functions that are ExtensionArray-compatible. +""" +import numpy as np + +from pandas._typing import ArrayLike + + +def tile(arr: ArrayLike, shape) -> ArrayLike: + raise NotImplementedError + + +def broadcast_to(arr: ArrayLike, shape) -> ArrayLike: + raise NotImplementedError diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a25426c5c99cc..61beaa0636e5b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -39,7 +39,7 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import extract_array, array from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick @@ -523,6 +523,11 @@ def __truediv__(self, other): # will be returned. 
GH#23829 result = [self[n] / other[n] for n in range(len(self))] result = np.array(result) + if self.ndim == 2: + # FIXME: kludge, just trying to get the tests passing + result = extract_array(array(result.ravel()), extract_numpy=True).reshape(result.shape) + if result.dtype.kind == "m": + result = np.asarray(result) # TODO: no real reason for this, but we test it return result else: diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 3153a9ac28c10..bd44cc35a9dc4 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -393,6 +393,19 @@ def column_op(a, b): # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) + if isinstance(right.dtype, np.dtype) and left._data.nblocks == 1: + # includes TDA/DTA-naive + rvals = right._values + right = rvals.reshape(1, -1) + right = np.broadcast_to(right, left.shape).T # Needs TDA/DTA compat + if not isinstance(rvals, np.ndarray): + # re-wrap DTA/TDA + right = type(rvals)(right) + + array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=right) # TODO: BlockManager.apply needs to know to align right + return type(left)(bm) + if right.dtype == "timedelta64[ns]": # ensure we treat NaT values as the correct dtype # Note: we do not do this unconditionally as it may be lossy or diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f7211ab5f9fd4..55d90c71cd6c1 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1417,7 +1417,7 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) - warn = None if box_with_array is pd.DataFrame else PerformanceWarning + warn = None if (box_with_array is pd.DataFrame and tz is not None) else PerformanceWarning with tm.assert_produces_warning(warn): res = dtarr + other expected = DatetimeIndex( @@ -2378,7 +2378,7 @@ def test_dti_addsub_object_arraylike( expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - warn = None if box_with_array is pd.DataFrame else PerformanceWarning + warn = None if (box_with_array is pd.DataFrame and tz is not None) else PerformanceWarning with tm.assert_produces_warning(warn): result = dtarr + other tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index b11fcfd20b8c4..ce2376a49d53b 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1320,7 +1320,7 @@ def test_td64arr_add_offset_index(self, names, box): # The DataFrame operation is transposed and so operates as separate # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning if box is not pd.DataFrame else None + warn = PerformanceWarning #if box is not pd.DataFrame else None with tm.assert_produces_warning(warn): res = tdi + other tm.assert_equal(res, expected) @@ -1346,7 +1346,7 @@ def test_td64arr_add_offset_array(self, box_with_array): # The DataFrame operation is transposed and so operates as separate # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning if box is not pd.DataFrame else None + warn = PerformanceWarning #if box is not pd.DataFrame else None with tm.assert_produces_warning(warn): res = tdi + other tm.assert_equal(res, expected) @@ -1382,7 
+1382,7 @@ def test_td64arr_sub_offset_index(self, names, box_with_array): # The DataFrame operation is transposed and so operates as separate # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning if box is not pd.DataFrame else None + warn = PerformanceWarning #if box is not pd.DataFrame else None with tm.assert_produces_warning(warn): res = tdi - other tm.assert_equal(res, expected) @@ -1401,7 +1401,7 @@ def test_td64arr_sub_offset_array(self, box_with_array): # The DataFrame operation is transposed and so operates as separate # scalar operations, which do not issue a PerformanceWarning - warn = None if box_with_array is pd.DataFrame else PerformanceWarning + warn = PerformanceWarning#None if box_with_array is pd.DataFrame else PerformanceWarning with tm.assert_produces_warning(warn): res = tdi - other tm.assert_equal(res, expected) @@ -1473,7 +1473,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] ) - warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + warn = PerformanceWarning# if box_with_array is not pd.DataFrame else None with tm.assert_produces_warning(warn): result = tdarr + other @@ -1481,6 +1481,8 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] ) expected = tm.box_expected(expected, box_with_array) + if box_with_array is pd.DataFrame: + expected = expected.astype(object) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" @@ -1495,6 +1497,8 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] ) expected = tm.box_expected(expected, box_with_array) + if box_with_array is pd.DataFrame: + expected = expected.astype(object) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index b39c58b9931ab..ef6805f736b1b 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -594,7 +594,7 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): expected = pd.DataFrame(exvals, columns=df.columns, index=df.index) - if opname in ["__rmod__", "__rfloordiv__"]: + if False:#opname in ["__rmod__", "__rfloordiv__"]: # exvals will have dtypes [f8, i8, i8] so expected will be # all-f8, but the DataFrame operation will return mixed dtypes # use exvals[-1].dtype instead of "i8" for compat with 32-bit From 579a31a49426bc863e1f46fc78068fced9ee027b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 21 Mar 2020 14:15:24 -0700 Subject: [PATCH 34/40] checkpoint passing --- pandas/core/array_algos/npcompat.py | 9 ++++++++- pandas/core/arrays/datetimelike.py | 6 ++++++ pandas/core/internals/managers.py | 7 ++++++- pandas/core/ops/__init__.py | 11 ++++++----- pandas/core/ops/array_ops.py | 1 + 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/core/array_algos/npcompat.py b/pandas/core/array_algos/npcompat.py index a459b86981783..7bb3f507a8110 100644 --- a/pandas/core/array_algos/npcompat.py +++ b/pandas/core/array_algos/npcompat.py @@ -11,4 +11,11 @@ def tile(arr: ArrayLike, shape) -> ArrayLike: def broadcast_to(arr: ArrayLike, shape) -> ArrayLike: - raise NotImplementedError + if isinstance(arr, np.ndarray): + return np.broadcast_to(arr, shape) + + values = arr._values_for_factorize()[0] + + btvalues = 
np.broadcast_to(values, shape) + result = type(arr)._from_factorized(btvalues, arr) + return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b2bff0b0142e2..87cf33764a924 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -427,6 +427,12 @@ def ravel(self, *args, **kwargs): data = self._data.ravel(*args, **kwargs) return type(self)(data, dtype=self.dtype) + @property + def T(self): + # Note: we drop any freq + data = self._data.T + return type(self)(data, dtype=self.dtype) + @property def _box_func(self): """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e83b71bd966cc..3144137d256a5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -426,7 +426,12 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: for k, obj in aligned_args.items(): axis = obj._info_axis_number - kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) + if isinstance(obj, (ABCSeries, ABCDataFrame)): + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) + else: + # We should have an ndarray or ExtensionArray + assert obj.shape[0] == self.shape[0], (obj.shape, self.shape) + kwargs[k] = obj[b.mgr_locs.indexer] if callable(f): applied = b.apply(f, **kwargs) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index bd44cc35a9dc4..858475fb8072b 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -55,6 +55,7 @@ rtruediv, rxor, ) +from pandas.core.array_algos.npcompat import broadcast_to if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 @@ -397,16 +398,16 @@ def column_op(a, b): # includes TDA/DTA-naive rvals = right._values right = rvals.reshape(1, -1) - right = np.broadcast_to(right, left.shape).T # Needs TDA/DTA compat - if not isinstance(rvals, np.ndarray): - # re-wrap DTA/TDA - right = type(rvals)(right) + right = broadcast_to(right, left.shape).T # Needs TDA/DTA compat + #if not isinstance(rvals, np.ndarray): + # # re-wrap DTA/TDA + # right = type(rvals)(right) array_op = get_array_op(func, str_rep=str_rep) bm = left._data.apply(array_op, right=right) # TODO: BlockManager.apply needs to know to align right return type(left)(bm) - if right.dtype == "timedelta64[ns]": + if right.dtype == "timedelta64[ns]": # still needed for two tests with PeriodArray # ensure we treat NaT values as the correct dtype # Note: we do not do this unconditionally as it may be lossy or # expensive for EA dtypes. 
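For reference, the numpy mechanics behind the ``npcompat.broadcast_to`` helper above, sketched with a plain ndarray (for an ExtensionArray the same values are round-tripped through ``_values_for_factorize``/``_from_factorized``; illustrative only, not part of the patch):

    import numpy as np

    row = np.array([10, 20, 30], dtype="i8")

    # reshape the row-like values to (1, n), then broadcast to the frame's
    # 2D shape; np.broadcast_to returns a read-only view, so nothing is copied
    bt = np.broadcast_to(row.reshape(1, -1), (4, 3))

    # transposed to line up with how the ops code consumes it, i.e.
    # broadcast_to(right, left.shape).T, since block values are laid out
    # transposed relative to the frame
    bt.T
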
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index e285c53d9813e..0fdfc94c4d5a1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -33,6 +33,7 @@ ) from pandas.core.dtypes.missing import isna, notna +from pandas.core.array_algos.npcompat import broadcast_to from pandas.core.ops import missing from pandas.core.ops.dispatch import dispatch_to_extension_op, should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison From 0617a1798623a11ba02d3d45f2822cc63b46ffe1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 22 Mar 2020 19:23:49 -0700 Subject: [PATCH 35/40] Checkpoint 2 failures, both arguable --- pandas/core/arrays/datetimelike.py | 5 +++-- pandas/core/internals/managers.py | 10 ++++++---- pandas/core/ops/__init__.py | 17 +++++++++-------- pandas/tests/frame/test_operators.py | 1 + 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 87cf33764a924..f976a6b1997ff 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1186,9 +1186,10 @@ def _add_timedelta_arraylike(self, other): self_i8 = self.asi8 other_i8 = other.asi8 + # TODO: do we need to worry about these having the same row/column order? new_values = checked_add_with_arr( - self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan - ) + self_i8.ravel(), other_i8.ravel(), arr_mask=self._isnan.ravel(), b_mask=other._isnan.ravel() + ).reshape(self.shape) if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3144137d256a5..bee701534b129 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -412,7 +412,7 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: aligned_args = { k: kwargs[k] for k in align_keys - #if isinstance(kwargs[k], (ABCSeries, ABCDataFrame)) } for b in self.blocks: @@ -425,13 +425,15 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: b_items = self.items[b.mgr_locs.indexer] for k, obj in aligned_args.items(): - axis = obj._info_axis_number if isinstance(obj, (ABCSeries, ABCDataFrame)): + axis = obj._info_axis_number kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) else: # We should have an ndarray or ExtensionArray - assert obj.shape[0] == self.shape[0], (obj.shape, self.shape) - kwargs[k] = obj[b.mgr_locs.indexer] + if obj.ndim == 2: + # FIXME: kludge; shouldn't need the ndim restriction + assert obj.shape[0] == self.shape[0], (obj.shape, self.shape) + kwargs[k] = obj[b.mgr_locs.indexer] if callable(f): applied = b.apply(f, **kwargs) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 858475fb8072b..ae236916535d0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -394,20 +394,21 @@ def column_op(a, b): # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) - if isinstance(right.dtype, np.dtype) and left._data.nblocks == 1: + # FIXME: we really don't want separate code-paths for nblocks==1; + # tests are likely biased towards nblocks==1.
+ if isinstance(right.dtype, np.dtype):# and left._data.nblocks == 1: # includes TDA/DTA-naive rvals = right._values right = rvals.reshape(1, -1) - right = broadcast_to(right, left.shape).T # Needs TDA/DTA compat - #if not isinstance(rvals, np.ndarray): - # # re-wrap DTA/TDA - # right = type(rvals)(right) + right = broadcast_to(right, left.shape).T array_op = get_array_op(func, str_rep=str_rep) - bm = left._data.apply(array_op, right=right) # TODO: BlockManager.apply needs to know to align right + bm = left._data.apply(array_op, right=right, align_keys=["right"]) + # TODO: BlockManager.apply needs to know to align right return type(left)(bm) - if right.dtype == "timedelta64[ns]": # still needed for two tests with PeriodArray + # still needed for two tests with PeriodArray + if right.dtype == "timedelta64[ns]": # ensure we treat NaT values as the correct dtype # Note: we do not do this unconditionally as it may be lossy or # expensive for EA dtypes. @@ -417,7 +418,7 @@ def column_op(a, b): return {i: func(a.iloc[:, i], b[i]) for i in range(len(a.columns))} else: - + # FIXME: this will be wrong for Categorical `b` def column_op(a, b): return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))} diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 542d9835bb5d3..90165b1268426 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -452,6 +452,7 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): added = float_frame + mixed_int_frame _check_mixed_float(added, dtype="float64") + # TODO: arithmetic test def test_combine_series( self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame ): From 97388938c04b3e1739c4597d3b6c31fb793ec6ad Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 20:49:41 -0700 Subject: [PATCH 36/40] Implement block-wise ops for frame-series with axis=1 --- pandas/core/array_algos/npcompat.py | 31 +++++++++++----- pandas/core/arrays/datetimelike.py | 9 +++-- pandas/core/arrays/period.py | 16 +++++++-- pandas/core/arrays/timedeltas.py | 9 +++-- pandas/core/indexes/period.py | 12 +++++++ pandas/core/internals/managers.py | 11 +++--- pandas/core/ops/__init__.py | 14 +++----- pandas/core/ops/array_ops.py | 1 - pandas/tests/arithmetic/test_datetime64.py | 11 ++++-- pandas/tests/arithmetic/test_timedelta64.py | 39 +++++++++------------ pandas/tests/frame/common.py | 16 ++++----- pandas/tests/frame/test_arithmetic.py | 7 ---- 12 files changed, 102 insertions(+), 74 deletions(-) diff --git a/pandas/core/array_algos/npcompat.py b/pandas/core/array_algos/npcompat.py index 7bb3f507a8110..b7d71cfb312f2 100644 --- a/pandas/core/array_algos/npcompat.py +++ b/pandas/core/array_algos/npcompat.py @@ -1,5 +1,5 @@ """ -Implementations of high-level numpy functions that are ExtensionArray-compatible. +Implementations of high-level numpy functions that are ExtensionArray-compatible. 
""" import numpy as np @@ -7,15 +7,28 @@ def tile(arr: ArrayLike, shape) -> ArrayLike: - raise NotImplementedError + raise NotImplementedError -def broadcast_to(arr: ArrayLike, shape) -> ArrayLike: - if isinstance(arr, np.ndarray): - return np.broadcast_to(arr, shape) +def broadcast_to(arr: ArrayLike, shape, orient=None) -> ArrayLike: + if isinstance(arr, np.ndarray): + values = arr + else: + # ExtensionArray + values = arr._values_for_factorize()[0] - values = arr._values_for_factorize()[0] + # TODO: if we are ndim==size==1 it shouldnt matter whether rowlike/columnlike? + if values.ndim == 1 and orient is not None: + # SUpport treating a 1-dimensional array as either a row or column + assert orient in ["rowlike", "columnlike"] + if orient == "rowlike": + values = values.reshape(1, -1) + else: + values = values.reshape(-1, 1) - btvalues = np.broadcast_to(values, shape) - result = type(arr)._from_factorized(btvalues, arr) - return result + btvalues = np.broadcast_to(values, shape) + if isinstance(arr, np.ndarray): + result = btvalues + else: + result = type(arr)._from_factorized(btvalues, arr) + return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index d3d907d88979f..2f93df291ad55 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -567,7 +567,7 @@ def __getitem__(self, key): else: key = check_array_indexer(self, key) - is_period = is_period_dtype(self) + is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq else: @@ -583,7 +583,7 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) - if result.ndim > 1: + if result.ndim > 1 and not is_period and not is_datetime64tz_dtype(self.dtype): # To support MPL which performs slicing with 2 dim # even though it only has 1 dim by definition return result @@ -1216,7 +1216,10 @@ def _add_timedelta_arraylike(self, other): other_i8 = other.asi8 # TODO: do we need to worry about these having the same row/column order? new_values = checked_add_with_arr( - self_i8.ravel(), other_i8.ravel(), arr_mask=self._isnan.ravel(), b_mask=other._isnan.ravel() + self_i8.ravel(), + other_i8.ravel(), + arr_mask=self._isnan.ravel(), + b_mask=other._isnan.ravel(), ).reshape(self.shape) if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c24b0b5fa64b8..b90502d129793 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -623,10 +623,20 @@ def _addsub_int_array( assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + + mask = self._isnan + if self.ndim == self.size == 1 and other.ndim == 2: + # TODO: more general case? should this be handled by DataFrame + # op before we get here? 
+ arr = np.broadcast_to(self._data[:, None], other.shape) + self = type(self)(arr, freq=self.freq) + + res_values = algos.checked_add_with_arr( + self.asi8.ravel(), other.ravel(), arr_mask=self._isnan, + ) res_values = res_values.view("i8") - res_values[self._isnan] = iNaT - return type(self)(res_values, freq=self.freq) + res_values[mask] = iNaT + return type(self)(res_values.reshape(self.shape), freq=self.freq) def _add_offset(self, other): assert not isinstance(other, Tick) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 61beaa0636e5b..ca0d6424eb92b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -39,7 +39,7 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.construction import extract_array, array +from pandas.core.construction import array, extract_array from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick @@ -521,13 +521,16 @@ def __truediv__(self, other): # Note: we do not do type inference on the result, so either # an object array or numeric-dtyped (if numpy does inference) # will be returned. GH#23829 + # FIXME: the above comment is no longer accurate... sometimes result = [self[n] / other[n] for n in range(len(self))] result = np.array(result) if self.ndim == 2: # FIXME: kludge, just trying to get the tests passing - result = extract_array(array(result.ravel()), extract_numpy=True).reshape(result.shape) + res = extract_array(array(result.ravel()), extract_numpy=True) + result = res.reshape(result.shape) if result.dtype.kind == "m": - result = np.asarray(result) # TODO: no real reason for this, but we test it + # TODO: no real reason for this, but we test it + result = np.asarray(result) return result else: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0a906c87866f8..ac17ffcd6a1a4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -32,6 +32,7 @@ validate_dtype_freq, ) import pandas.core.common as com +from pandas.core.indexers import deprecate_ndim_indexing import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( InvalidIndexError, @@ -349,6 +350,17 @@ def _int64index(self) -> Int64Index: # ------------------------------------------------------------------------ # Index Methods + def __getitem__(self, key): + # PeriodArray.__getitem__ returns PeriodArray for 2D lookups, + # so we need to issue deprecation warning and cast here + result = super().__getitem__(key) + + if isinstance(result, PeriodIndex) and result._data.ndim == 2: + # this is not actually a valid Index object + deprecate_ndim_indexing(result._data) + return result._data._data + return result + def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. Needs additional handling as
Needs additional handling as diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8a84d309adc5d..d09e2f73091a3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -409,11 +409,7 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: if f == "where": align_copy = True - aligned_args = { - k: kwargs[k] - for k in align_keys - #if isinstance(kwargs[k], (ABCSeries, ABCDataFrame)) - } + aligned_args = {k: kwargs[k] for k in align_keys} for b in self.blocks: if filter is not None: @@ -432,7 +428,10 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: # We should have an ndarray or ExtensionArray if obj.ndim == 2: # FIXME: kludge; shouldnt need the ndim restriction - assert obj.shape[0] == self.shape[0], (obj.shape, self.shape) + assert obj.shape[0] == self.shape[0], ( + obj.shape, + self.shape, + ) kwargs[k] = obj[b.mgr_locs.indexer] if callable(f): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f4490fdc33114..b35a382ef9224 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna +from pandas.core.array_algos.npcompat import broadcast_to from pandas.core.construction import extract_array from pandas.core.ops.array_ops import ( arithmetic_op, @@ -54,7 +55,6 @@ rtruediv, rxor, ) -from pandas.core.array_algos.npcompat import broadcast_to if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 @@ -334,17 +334,13 @@ def column_op(a, b): # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) - # FIXME: we really dont want separate code-paths for nblocks==1; - # tests are likely biased towards nblocks==1. - if isinstance(right.dtype, np.dtype):# and left._data.nblocks == 1: - # includes TDA/DTA-naive - rvals = right._values - right = rvals.reshape(1, -1) - right = broadcast_to(right, left.shape).T + rvals = right._values + if hasattr(rvals, "reshape"): + # i.e. 
ndarray, DatetimeArray, TimedeltaArray, PeriodArray + right = broadcast_to(rvals, left.shape, orient="rowlike").T array_op = get_array_op(func, str_rep=str_rep) bm = left._data.apply(array_op, right=right, align_keys=["right"]) - # TODO: BlockManager.apply needs to know to align right return type(left)(bm) # still needed for two tests with PeriodArray diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index bf03c61400127..5dd7af454cbd1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -28,7 +28,6 @@ from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna -from pandas.core.array_algos.npcompat import broadcast_to from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 55d90c71cd6c1..90d2f7d2930bf 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1065,6 +1065,7 @@ def test_dt64arr_add_sub_parr( "unsupported operand", "descriptor.*requires", "ufunc.*cannot use operands", + "Addition/subtraction of integers and integer-arrays", ] ) assert_invalid_addsub_type(dtarr, parr, msg) @@ -1417,7 +1418,10 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) - warn = None if (box_with_array is pd.DataFrame and tz is not None) else PerformanceWarning + warn = PerformanceWarning + if box_with_array is pd.DataFrame and tz is not None: + warn = None + with tm.assert_produces_warning(warn): res = dtarr + other expected = DatetimeIndex( @@ -2378,7 +2382,10 @@ def test_dti_addsub_object_arraylike( expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - warn = None if (box_with_array is pd.DataFrame and tz is not None) else PerformanceWarning + warn = PerformanceWarning + if box_with_array is pd.DataFrame and tz is not None: + warn = None + with tm.assert_produces_warning(warn): result = dtarr + other tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index ce2376a49d53b..3ba8dd0395485 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1066,7 +1066,13 @@ def test_td64arr_sub_periodlike(self, box_with_array, tdi_freq, pi_freq): # TODO: parametrize over box for pi? 
tdi = tm.box_expected(tdi, box_with_array) - msg = "cannot subtract|unsupported operand type" + msg = "|".join( + [ + "cannot subtract", + "unsupported operand type", + "Addition/subtraction of integers and integer-arrays", + ] + ) with pytest.raises(TypeError, match=msg): tdi - pi @@ -1318,14 +1324,11 @@ def test_td64arr_add_offset_index(self, names, box): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning #if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res2 = other + tdi tm.assert_equal(res2, expected) @@ -1344,14 +1347,11 @@ def test_td64arr_add_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning #if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res2 = other + tdi tm.assert_equal(res2, expected) @@ -1380,10 +1380,7 @@ def test_td64arr_sub_offset_index(self, names, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, xbox) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning #if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi - other tm.assert_equal(res, expected) @@ -1399,10 +1396,7 @@ def test_td64arr_sub_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) - # The DataFrame operation is transposed and so operates as separate - # scalar operations, which do not issue a PerformanceWarning - warn = PerformanceWarning#None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = tdi - other tm.assert_equal(res, expected) @@ -1473,8 +1467,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array): [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] ) - warn = PerformanceWarning# if box_with_array is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = tdarr + other expected = pd.Index( @@ -1487,10 +1480,10 @@ def test_td64arr_add_sub_object_array(self, box_with_array): msg = "unsupported operand type|cannot subtract a datelike" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): tdarr - other - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = other - tdarr expected = pd.Index( diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 463a140972ab5..778f07ee6223c 100644 --- 
a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -6,13 +6,13 @@ def _check_mixed_float(df, dtype=None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get("A"): - assert df.dtypes["A"] == dtypes["A"] + assert df.dtypes["A"] == dtypes["A"], (df.dtypes, dtypes) if dtypes.get("B"): - assert df.dtypes["B"] == dtypes["B"] + assert df.dtypes["B"] == dtypes["B"], (df.dtypes, dtypes) if dtypes.get("C"): - assert df.dtypes["C"] == dtypes["C"] + assert df.dtypes["C"] == dtypes["C"], (df.dtypes, dtypes) if dtypes.get("D"): - assert df.dtypes["D"] == dtypes["D"] + assert df.dtypes["D"] == dtypes["D"], (df.dtypes, dtypes) def _check_mixed_int(df, dtype=None): @@ -22,10 +22,10 @@ def _check_mixed_int(df, dtype=None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get("A"): - assert df.dtypes["A"] == dtypes["A"] + assert df.dtypes["A"] == dtypes["A"], (df.dtypes, dtypes) if dtypes.get("B"): - assert df.dtypes["B"] == dtypes["B"] + assert df.dtypes["B"] == dtypes["B"], (df.dtypes, dtypes) if dtypes.get("C"): - assert df.dtypes["C"] == dtypes["C"] + assert df.dtypes["C"] == dtypes["C"], (df.dtypes, dtypes) if dtypes.get("D"): - assert df.dtypes["D"] == dtypes["D"] + assert df.dtypes["D"] == dtypes["D"], (df.dtypes, dtypes) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 1c28508bb5812..b06d0ac786f45 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -611,13 +611,6 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): expected = pd.DataFrame(exvals, columns=df.columns, index=df.index) - if False:#opname in ["__rmod__", "__rfloordiv__"]: - # exvals will have dtypes [f8, i8, i8] so expected will be - # all-f8, but the DataFrame operation will return mixed dtypes - # use exvals[-1].dtype instead of "i8" for compat with 32-bit - # systems/pythons - expected[False] = expected[False].astype(exvals[-1].dtype) - result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) From 37938dbef19945c4ef57f6c2e6838fbc2a2a016a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 30 Mar 2020 11:05:55 -0700 Subject: [PATCH 37/40] rebase --- pandas/core/internals/managers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 98ea3092beb7b..e8f6a3b74b16d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -424,7 +424,9 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: for k, obj in aligned_args.items(): if isinstance(obj, (ABCSeries, ABCDataFrame)): axis = obj._info_axis_number - kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)._values + kwargs[k] = obj.reindex( + b_items, axis=axis, copy=align_copy + )._values else: # We should have an ndarray or ExtensionArray if obj.ndim == 2: From f9d3895b2add5bca54df8435b89378d718220a18 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Apr 2020 11:50:51 -0700 Subject: [PATCH 38/40] update test --- pandas/tests/series/test_operators.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 1340f514e31ce..ba1b3e9d0ca8e 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -266,23 +266,24 @@ def test_scalar_na_logical_ops_corners(self): result = s & list(s) 
tm.assert_series_equal(result, expected) + def test_scalar_na_logical_ops_corners_aligns(self): + s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) + s[::2] = np.nan d = DataFrame({"A": s}) - # TODO: Fix this exception - needs to be fixed! (see GH5035) - # (previously this was a TypeError because series returned - # NotImplemented - # this is an alignment issue; these are equivalent - # https://github.com/pandas-dev/pandas/issues/5284 + expected = DataFrame(False, index=range(9), columns=["A"] + list(range(9))) - with pytest.raises(TypeError): - d.__and__(s, axis="columns") - with pytest.raises(TypeError): - d.__and__(s, axis=1) + result = d.__and__(s, axis="columns") + tm.assert_frame_equal(result, expected) - with pytest.raises(TypeError): - s & d - with pytest.raises(TypeError): - d & s + result = d.__and__(s, axis=1) + tm.assert_frame_equal(result, expected) + + result = s & d + tm.assert_frame_equal(result, expected) + + result = d & s + tm.assert_frame_equal(result, expected) expected = (s & s).to_frame("A") result = d.__and__(s, axis="index") From 5abec0b1d9f14696b98df881cafd9e46f26a6e62 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Apr 2020 11:55:19 -0700 Subject: [PATCH 39/40] update test --- pandas/tests/frame/test_arithmetic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index dd579aa1d0530..4f5cedb189085 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1033,9 +1033,8 @@ def test_combine_series( assert "E" in larger_added assert np.isnan(larger_added["E"]).all() - # no upcast needed added = mixed_float_frame + series - _check_mixed_float(added) + assert np.all(added.dtypes == series.dtype) # vs mix (upcast) as needed added = mixed_float_frame + series.astype("float32") From ae38398650327bbce33db8057b1e28e54d0ef1f4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Apr 2020 11:56:45 -0700 Subject: [PATCH 40/40] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25f847c698278..41ede51b18d27 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -266,6 +266,7 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) +- Performance improvement in arithmetic operations between :class:`DataFrame` and :class:`Series` (:issue:`32997`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
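A few usage sketches for review, keyed to the patches above.

First, the core change in pandas/core/ops/__init__.py builds the right-hand operand with `broadcast_to(rvals, left.shape, orient="rowlike").T`. That `orient="rowlike"` helper is internal to this series, so the sketch below uses plain numpy to show the intended shape arithmetic; the frame shape and values are invented for illustration:

    import numpy as np

    # a DataFrame-shaped left operand: 3 rows x 2 columns
    left_shape = (3, 2)
    # a row-like right operand: one value per column
    rvals = np.array([10.0, 20.0])

    # replicate the row down the frame, then transpose into the
    # (n_columns, n_rows) layout that BlockManager blocks use
    right = np.broadcast_to(rvals, left_shape).T
    assert right.shape == (2, 3)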
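The tightened warning expectations in test_timedelta64.py reflect that, with the op dispatched blockwise instead of being transposed into per-column scalar ops, the object-dtype fallback now warns for DataFrame as well. A minimal reproduction, mirroring the fixtures those tests already use:

    import numpy as np
    import pandas as pd
    import pandas._testing as tm
    from pandas.errors import PerformanceWarning

    tdi = pd.TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"])
    # an object-dtype array of offsets forces an element-wise fallback,
    # which pandas flags with a PerformanceWarning
    other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)])

    with tm.assert_produces_warning(PerformanceWarning):
        res = tdi + other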
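For the align_keys path touched in patch 37 (managers.py), here is a toy model of blockwise alignment; `Block` and `apply_op` are invented stand-ins for illustration, not the real internals:

    import numpy as np

    class Block:
        def __init__(self, values, mgr_locs):
            self.values = values      # shape (n_block_columns, n_rows)
            self.mgr_locs = mgr_locs  # which frame columns this block holds

    def apply_op(blocks, op, right):
        # `right` carries one row per frame column; slicing it with each
        # block's locations lines the operand up with that block's columns
        return [Block(op(b.values, right[b.mgr_locs]), b.mgr_locs)
                for b in blocks]

    blocks = [
        Block(np.ones((2, 3)), np.array([0, 2])),
        Block(np.zeros((1, 3)), np.array([1])),
    ]
    right = np.arange(9.0).reshape(3, 3)  # (n_columns, n_rows)
    out = apply_op(blocks, np.add, right)
    assert [b.values.shape for b in out] == [(2, 3), (1, 3)]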
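The rewritten test in patch 38 pins down the new alignment behaviour for logical ops. Run against this branch, the snippet below is that test in miniature (before the series, `s & d` raised TypeError; see GH#5284):

    from datetime import datetime

    import numpy as np
    import pandas as pd
    import pandas._testing as tm

    s = pd.Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)])
    s[::2] = np.nan
    d = pd.DataFrame({"A": s})

    # the Series labels 0..8 share nothing with the frame's "A" column,
    # so everything is missing after alignment and the result is all-False
    expected = pd.DataFrame(
        False, index=range(9), columns=["A"] + list(range(9))
    )
    tm.assert_frame_equal(s & d, expected)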
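Patch 39's updated expectation in test_combine_series can be checked in isolation; the two-column frame below is a stand-in for the `mixed_float_frame` fixture:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": np.array([1, 2], dtype="float32"),
            "B": np.array([3, 4], dtype="float16"),
        }
    )
    ser = pd.Series([1.0, 2.0], index=["A", "B"])  # float64

    # blockwise arithmetic broadcasts a float64 array against each block,
    # so on this branch every result column upcasts to the Series dtype
    added = df + ser
    assert np.all(added.dtypes == ser.dtype)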