From e41387dd7c64e0a2032f8aad12a3c00ae50e3165 Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Fri, 24 May 2019 11:45:59 -0400 Subject: [PATCH 01/34] BUG: None comparison evaluates to True #26504 --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/arrays/categorical.py | 10 +++++++++- pandas/tests/arrays/categorical/test_operators.py | 13 +++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 91b70334dc9bc..df7f952d4180a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -305,7 +305,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- +- Bug in :func:`_cat_compare_op` that would evaluate comparison with None to True (:issue:`26504`) - Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d25ccd1b158be..eeacb88f33f55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -100,7 +100,15 @@ def f(self, other): if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) - return getattr(self._codes, op)(i) + f = getattr(self._codes, op) + ret = f(i) + + # check for NaN in self + na_mask = (self._codes == -1) + if na_mask.any(): + # In other series, this leads to False, so do that here too + ret[na_mask] = False + return ret else: if op == '__eq__': return np.repeat(False, len(self)) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index dc6e1a5bc36b3..b323cb2b6a7c3 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -186,6 +186,19 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + def 
test_comparison_with_known_scalars(self): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons with scalars in categories with None should + # be evaluated as False + + cat1 = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + cat2 = Categorical([None, 1, 2, 3], categories=[1, 2, 3], ordered=True) + + tm.assert_numpy_array_equal(cat1 <= 2, + np.array([True, True, False, False])) + tm.assert_numpy_array_equal(cat2 <= 2, + np.array([False, True, True, False])) + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] From 9af03ce19c8e2184afc62340138881fb7c804dfd Mon Sep 17 00:00:00 2001 From: ArtinSarraf Date: Tue, 21 May 2019 16:41:44 -0400 Subject: [PATCH 02/34] ENH - Index set operation modifications to address issue #23525 (#23538) --- doc/source/whatsnew/v0.25.0.rst | 27 +++++ pandas/core/indexes/base.py | 104 +++++++++++++++--- pandas/core/indexes/datetimes.py | 34 +----- pandas/core/indexes/interval.py | 26 ++--- pandas/core/indexes/numeric.py | 8 ++ pandas/core/indexes/period.py | 12 +- pandas/core/indexes/range.py | 10 +- pandas/core/indexes/timedeltas.py | 21 +--- pandas/tests/indexes/common.py | 24 +--- pandas/tests/indexes/conftest.py | 36 +++--- .../tests/indexes/datetimes/test_datetime.py | 6 +- pandas/tests/indexes/datetimes/test_setops.py | 22 +++- .../tests/indexes/datetimes/test_timezones.py | 5 +- .../tests/indexes/interval/test_interval.py | 13 ++- pandas/tests/indexes/period/test_setops.py | 4 - pandas/tests/indexes/test_base.py | 2 + pandas/tests/indexes/test_setops.py | 76 +++++++++++++ pandas/tests/reshape/test_concat.py | 23 ++-- pandas/tests/series/test_combine_concat.py | 1 + pandas/tests/series/test_missing.py | 14 ++- pandas/tests/series/test_operators.py | 36 +++++- 21 files changed, 343 insertions(+), 161 deletions(-) create mode 100644 pandas/tests/indexes/test_setops.py diff --git 
a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index df7f952d4180a..0c69d1a4ce013 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -154,6 +154,33 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. +.. _whatsnew_0250.api_breaking.incompatible_index_unions: + +Incompatible Index Type Unions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When performing :func:`Index.union` operations between objects of incompatible dtypes, +the result will be a base :class:`Index` of dtype ``object``. This behavior holds true for +unions between :class:`Index` objects that previously would have been prohibited. The dtype +of empty :class:`Index` objects will now be evaluated before performing union operations +rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be +considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). + +*Previous Behavior*: + + In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + ... + ValueError: can only call with other PeriodIndex-ed objects + + In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) + Out[2]: Int64Index([1, 2, 3], dtype='int64') + +*New Behavior*: + +.. 
ipython:: python + + pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) ``DataFrame`` groupby ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dd3717813ce3f..eff7ff2c9f347 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -20,11 +20,10 @@ ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype, - is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator, - is_list_like, is_object_dtype, is_period_dtype, is_scalar, - is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, - pandas_dtype) + is_extension_array_dtype, is_float, is_float_dtype, is_hashable, + is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, + is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, + is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass, @@ -2262,6 +2261,47 @@ def _get_reconciled_name_object(self, other): return self._shallow_copy(name=name) return self + def _union_incompatible_dtypes(self, other, sort): + """ + Casts this and other index to object dtype to allow the formation + of a union between incompatible types. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. 
+ + Returns + ------- + Index + """ + this = self.astype(object, copy=False) + # cast to Index for when `other` is list-like + other = Index(other).astype(object, copy=False) + return Index.union(this, other, sort=sort).astype(object, copy=False) + + def _is_compatible_with_other(self, other): + """ + Check whether this and the other dtype are compatible with each other. + Meaning a union can be formed between them without needing to be cast + to dtype object. + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + bool + """ + return (type(self) is type(other) + and is_dtype_equal(self.dtype, other.dtype)) + def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError("The 'sort' keyword only takes the values of " @@ -2271,6 +2311,11 @@ def union(self, other, sort=None): """ Form the union of two Index objects. + If the Index objects are incompatible, both Index objects will be + cast to dtype('object') first. + + .. versionchanged:: 0.25.0 + Parameters ---------- other : Index or array-like @@ -2300,30 +2345,54 @@ def union(self, other, sort=None): Examples -------- + Union matching dtypes + >>> idx1 = pd.Index([1, 2, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.union(idx2) Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + + Union mismatched dtypes + + >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx2 = pd.Index([1, 2, 3, 4]) + >>> idx1.union(idx2) + Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other = ensure_index(other) - if len(other) == 0 or self.equals(other): + if not self._is_compatible_with_other(other): + return self._union_incompatible_dtypes(other, sort=sort) + + return self._union(other, sort=sort) + + def _union(self, other, sort): + """ + Specific union logic should go here. In subclasses, union behavior + should be overwritten here rather than in `self.union`. 
+ + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + Index + """ + + if not len(other) or self.equals(other): return self._get_reconciled_name_object(other) - if len(self) == 0: + if not len(self): return other._get_reconciled_name_object(self) - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other, sort=sort) - # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values @@ -2370,6 +2439,7 @@ def union(self, other, sort=None): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) + # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): """ Form the intersection of two Index objects. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9c735a5598f4a..7fd537fb9989a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -451,35 +451,9 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def union(self, other, sort=None): - """ - Specialized union for DatetimeIndex objects. If combine - overlapping ranges with the same DateOffset, will be much - faster than Index.union - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : bool or None, default None - Whether to sort the resulting Index. 
- - * None : Sort the result, except when - - 1. `self` and `other` are equal. - 2. `self` or `other` has length 0. - 3. Some values in `self` or `other` cannot be compared. - A RuntimeWarning is issued in this case. - - * False : do not sort the result - - .. versionadded:: 0.25.0 - - Returns - ------- - y : Index or DatetimeIndex - """ - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) if len(other) == 0 or self.equals(other) or len(self) == 0: return super().union(other, sort=sort) @@ -495,7 +469,7 @@ def union(self, other, sort=None): if this._can_fast_union(other): return this._fast_union(other, sort=sort) else: - result = Index.union(this, other, sort=sort) + result = Index._union(this, other, sort=sort) if isinstance(result, DatetimeIndex): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a3dbf2e03957b..87216dcc7b957 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -964,19 +964,6 @@ def insert(self, loc, item): new_right = self.right.insert(loc, right_insert) return self._shallow_copy(new_left, new_right) - def _as_like_interval_index(self, other): - self._assert_can_do_setop(other) - other = ensure_index(other) - if not isinstance(other, IntervalIndex): - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type {}').format(other.__class__.__name__) - raise TypeError(msg) - elif self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - raise ValueError(msg) - return other - def _concat_same_dtype(self, to_concat, name): """ assert that we all have the same .closed @@ -1092,7 +1079,17 @@ def overlaps(self, other): def _setop(op_name, 
sort=None): def func(self, other, sort=sort): - other = self._as_like_interval_index(other) + self._assert_can_do_setop(other) + other = ensure_index(other) + if not isinstance(other, IntervalIndex): + result = getattr(self.astype(object), op_name)(other) + if op_name in ('difference',): + result = result.astype(self.dtype) + return result + elif self.closed != other.closed: + msg = ('can only do set operations between two IntervalIndex ' + 'objects that are closed on the same side') + raise ValueError(msg) # GH 19016: ensure set op will not return a prohibited dtype subtypes = [self.dtype.subtype, other.dtype.subtype] @@ -1114,6 +1111,7 @@ def func(self, other, sort=sort): return type(self).from_tuples(result, closed=self.closed, name=result_name) + return func @property diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a11f34cbdcceb..b6c8ba588f9d6 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -9,6 +9,7 @@ is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex from pandas.core.dtypes.missing import isna from pandas.core import algorithms @@ -221,6 +222,13 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCInt64Index, ABCRangeIndex)) + for obj in [self, other]) + ) + Int64Index._add_numeric_methods() Int64Index._add_logical_methods() diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ed08de54ad6f2..044951ceda502 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -791,6 +791,11 @@ def join(self, other, how='left', level=None, 
return_indexers=False, """ self._assert_can_do_setop(other) + if not isinstance(other, PeriodIndex): + return self.astype(object).join(other, how=how, level=level, + return_indexers=return_indexers, + sort=sort) + result = Int64Index.join(self, other, how=how, level=level, return_indexers=return_indexers, sort=sort) @@ -807,10 +812,9 @@ def intersection(self, other, sort=False): def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) - if not isinstance(other, PeriodIndex): - raise ValueError('can only call with other PeriodIndex-ed objects') - - if self.freq != other.freq: + # *Can't* use PeriodIndexes of different freqs + # *Can* use PeriodIndex/DatetimeIndex + if isinstance(other, PeriodIndex) and self.freq != other.freq: msg = DIFFERENT_FREQ.format(cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 160e6284d3c59..ea14a4c789cd3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -470,7 +470,7 @@ def _extended_gcd(self, a, b): old_t, t = t, old_t - quotient * t return old_r, old_s, old_t - def union(self, other, sort=None): + def _union(self, other, sort): """ Form the union of two Index objects and sorts if possible @@ -490,9 +490,8 @@ def union(self, other, sort=None): ------- union : Index """ - self._assert_can_do_setop(other) - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other, sort=sort) + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) if isinstance(other, RangeIndex) and sort is None: start_s, step_s = self._start, self._step @@ -530,8 +529,7 @@ def union(self, other, sort=None): (start_s + step_o >= start_o) and (end_s - step_o <= end_o)): return RangeIndex(start_r, end_r + step_o, step_o) - - return self._int64index.union(other, sort=sort) + return self._int64index._union(other, sort=sort) 
@Appender(_index_shared_docs['join']) def join(self, other, how='left', level=None, return_indexers=False, diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5e62c2ef881e9..6ae17e62b49c6 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -329,24 +329,9 @@ def astype(self, dtype, copy=True): return Index(result.astype('i8'), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) - def union(self, other): - """ - Specialized union for TimedeltaIndex objects. If combine - overlapping ranges with the same DateOffset, will be much - faster than Index.union - - Parameters - ---------- - other : TimedeltaIndex or array-like - - Returns - ------- - y : Index or TimedeltaIndex - """ - self._assert_can_do_setop(other) - + def _union(self, other, sort): if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other) + return super()._union(other, sort=sort) if not isinstance(other, TimedeltaIndex): try: @@ -358,7 +343,7 @@ def union(self, other): if this._can_fast_union(other): return this._fast_union(other) else: - result = Index.union(this, other) + result = Index._union(this, other, sort=sort) if isinstance(result, TimedeltaIndex): if result.freq is None: result.freq = to_offset(result.inferred_freq) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 71d1e686f5c02..674f600bc8693 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -447,11 +447,7 @@ def test_intersection_base(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.intersection(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.intersection(case) @@ -474,11 +470,7 @@ def 
test_union_base(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.union(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.union(case) @@ -506,11 +498,7 @@ def test_difference_base(self, sort): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.difference(case, sort) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): assert result.__class__ == answer.__class__ @@ -540,11 +528,7 @@ def test_symmetric_difference(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.symmetric_difference(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.symmetric_difference(case) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 632d5b2875a5a..83f1f22b158b1 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,23 +5,25 @@ from pandas.core.indexes.api import Index, MultiIndex import pandas.util.testing as tm - -@pytest.fixture(params=[tm.makeUnicodeIndex(100), - tm.makeStringIndex(100), - tm.makeDateIndex(100), - tm.makePeriodIndex(100), - tm.makeTimedeltaIndex(100), - tm.makeIntIndex(100), - tm.makeUIntIndex(100), - tm.makeRangeIndex(100), - tm.makeFloatIndex(100), - Index([True, False]), - tm.makeCategoricalIndex(100), - Index([]), - 
MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - Index([0, 0, 1, 1, 2, 2])], - ids=lambda x: type(x).__name__) +indices_list = [tm.makeUnicodeIndex(100), + tm.makeStringIndex(100), + tm.makeDateIndex(100), + tm.makePeriodIndex(100), + tm.makeTimedeltaIndex(100), + tm.makeIntIndex(100), + tm.makeUIntIndex(100), + tm.makeRangeIndex(100), + tm.makeFloatIndex(100), + Index([True, False]), + tm.makeCategoricalIndex(100), + tm.makeIntervalIndex(100), + Index([]), + MultiIndex.from_tuples(zip( + ['foo', 'bar', 'baz'], [1, 2, 3])), + Index([0, 0, 1, 1, 2, 2])] + + +@pytest.fixture(params=indices_list, ids=lambda x: type(x).__name__) def indices(request): return request.param diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index efa6d006bad6f..01649cb4646de 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -298,9 +298,9 @@ def test_join_with_period_index(self, join_type): c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] - msg = 'can only call with other PeriodIndex-ed objects' - with pytest.raises(ValueError, match=msg): - df.columns.join(s.index, how=join_type) + expected = df.columns.astype('O').join(s.index, how=join_type) + result = df.columns.join(s.index, how=join_type) + tm.assert_index_equal(expected, result) def test_factorize(self): idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 45a3a64216cab..fd666f3d56c9d 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -29,11 +29,20 @@ def test_union2(self, sort): union = first.union(second, sort=sort) tm.assert_index_equal(union, everything) + @pytest.mark.parametrize("box", [np.array, Series, list]) + @pytest.mark.parametrize("sort", [None, False]) + def test_union3(self, sort, 
box): + everything = tm.makeDateIndex(10) + first = everything[:5] + second = everything[5:] + # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.union(case, sort=sort) - tm.assert_index_equal(result, everything) + expected = first.astype('O').union( + pd.Index(second.values, dtype='O') + ).astype('O') + case = box(second.values) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) @@ -303,11 +312,12 @@ def test_datetimeindex_union_join_empty(self, sort): empty = Index([]) result = dti.union(empty, sort=sort) - assert isinstance(result, DatetimeIndex) - assert result is result + expected = dti.astype('O') + tm.assert_index_equal(result, expected) result = dti.join(empty) assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) def test_join_nonunique(self): idx1 = to_datetime(['2012-11-06 16:00:11.477563', diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 3f876565119cb..368dc68e516df 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -1077,7 +1077,10 @@ def test_dti_union_aware(self): tz="US/Eastern") result = rng.union(rng2) - assert result.tz.zone == 'UTC' + expected = rng.astype('O').union(rng2.astype('O')) + tm.assert_index_equal(result, expected) + assert result[0].tz.zone == 'US/Central' + assert result[-1].tz.zone == 'US/Eastern' @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", dateutil.tz.tzoffset(None, -28800)]) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 61465d8454383..f4f63aaecd336 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -901,15 +901,18 @@ def 
test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize('op_name', [ 'union', 'intersection', 'difference', 'symmetric_difference']) @pytest.mark.parametrize("sort", [None, False]) - def test_set_operation_errors(self, closed, op_name, sort): + def test_set_incompatible_types(self, closed, op_name, sort): index = self.create_index(closed=closed) set_op = getattr(index, op_name) + # TODO: standardize return type of non-union setops type(self vs other) # non-IntervalIndex - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type Int64Index') - with pytest.raises(TypeError, match=msg): - set_op(Index([1, 2, 3]), sort=sort) + if op_name == 'difference': + expected = index + else: + expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) # mixed closed msg = ('can only do set operations between two IntervalIndex objects ' diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 29d07a0985574..a9102aeec060c 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -127,10 +127,6 @@ def test_union_misc(self, sort): with pytest.raises(period.IncompatibleFrequency): index.union(index2, sort=sort) - msg = 'can only call with other PeriodIndex-ed objects' - with pytest.raises(ValueError, match=msg): - index.join(index.to_timestamp()) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') with pytest.raises(period.IncompatibleFrequency): index.join(index3) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7b507a9de6b5d..7e70d77ea70fc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -889,6 +889,8 @@ def test_union_identity(self, sort): # i.e. 
identity is not preserved when sort is True assert (union is first) is (not sort) + # This should no longer be the same object, since [] is not consistent, + # both objects will be recast to dtype('O') union = first.union([], sort=sort) assert (union is first) is (not sort) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py new file mode 100644 index 0000000000000..b626ced2ccb1b --- /dev/null +++ b/pandas/tests/indexes/test_setops.py @@ -0,0 +1,76 @@ +''' +The tests in this package are to ensure the proper resultant dtypes of +set operations. +''' +import itertools as it + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_dtype_equal + +import pandas as pd +from pandas import Int64Index, RangeIndex +from pandas.tests.indexes.conftest import indices_list +import pandas.util.testing as tm + +COMPATIBLE_INCONSISTENT_PAIRS = { + (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex) +} + + +@pytest.fixture(params=list(it.combinations(indices_list, 2)), + ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__) +def index_pair(request): + """ + Create all combinations of 2 index types. 
+ """ + return request.param + + +def test_union_same_types(indices): + # Union with a non-unique, non-monotonic index raises error + # Only needed for bool index factory + idx1 = indices.sort_values() + idx2 = indices.sort_values() + assert idx1.union(idx2).dtype == idx1.dtype + + +def test_union_different_types(index_pair): + # GH 23525 + idx1, idx2 = index_pair + type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) + if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: + pytest.xfail('This test only considers non compatible indexes.') + + if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): + pytest.xfail('This test doesn\'t consider multiindixes.') + + if is_dtype_equal(idx1.dtype, idx2.dtype): + pytest.xfail('This test only considers non matching dtypes.') + + # A union with a CategoricalIndex (even as dtype('O')) and a + # non-CategoricalIndex can only be made if both indices are monotonic. + # This is true before this PR as well. + + # Union with a non-unique, non-monotonic index raises error + # This applies to the boolean index + idx1 = idx1.sort_values() + idx2 = idx2.sort_values() + + assert idx1.union(idx2).dtype == np.dtype('O') + assert idx2.union(idx1).dtype == np.dtype('O') + + +@pytest.mark.parametrize('idx_fact1,idx_fact2', + COMPATIBLE_INCONSISTENT_PAIRS.values()) +def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): + # GH 23525 + idx1 = idx_fact1(10) + idx2 = idx_fact2(20) + + res1 = idx1.union(idx2) + res2 = idx2.union(idx1) + + assert res1.dtype in (idx1.dtype, idx2.dtype) + assert res2.dtype in (idx1.dtype, idx2.dtype) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 3d9f3da75306a..ecd62380d8c65 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -960,22 +960,23 @@ def test_append_different_columns_types_raises( df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], 
index=index_cannot_append_with_other, name=2) - msg = ("the other index needs to be an IntervalIndex too, but was" + msg = (r"unorderable types: (Interval|int)\(\) (<|>) " + r"(int|long|float|str|Timestamp)\(\)|" + r"Expected tuple, got (int|long|float|str)|" + r"Cannot compare type 'Timestamp' with type '(int|long)'|" + r"'(<|>)' not supported between instances of 'int' " + r"and '(str|Timestamp)'|" + r"the other index needs to be an IntervalIndex too, but was" r" type {}|" r"object of type '(int|float|Timestamp)' has no len\(\)|" "Expected tuple, got str") - with pytest.raises(TypeError, match=msg.format( - index_can_append.__class__.__name__)): + with pytest.raises(TypeError, match=msg): df.append(ser) df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) - msg = (r"unorderable types: (Interval|int)\(\) > " - r"(int|float|str)\(\)|" - r"Expected tuple, got (int|float|str)|" - r"Cannot compare type 'Timestamp' with type 'int'|" - r"'>' not supported between instances of 'int' and 'str'") + with pytest.raises(TypeError, match=msg): df.append(ser) @@ -2029,7 +2030,8 @@ def test_concat_empty_series(self): s1 = pd.Series([1, 2, 3], name='x') s2 = pd.Series(name='y') res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}, + index=pd.Index([0, 1, 2], dtype='O')) tm.assert_frame_equal(res, exp) s1 = pd.Series([1, 2, 3], name='x') @@ -2044,7 +2046,8 @@ def test_concat_empty_series(self): s2 = pd.Series(name=None) res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0]) + columns=['x', 0], + index=pd.Index([0, 1, 2], dtype='O')) tm.assert_frame_equal(res, exp) @pytest.mark.parametrize('tz', [None, 'UTC']) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py 
index 41c3e220ad06f..ed5cf2d6b2c51 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -100,6 +100,7 @@ def test_combine_first(self): # corner case s = Series([1., 2, 3], index=[0, 1, 2]) result = s.combine_first(Series([], index=[])) + s.index = s.index.astype('O') assert_series_equal(s, result) def test_update(self): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 13e8d6c885029..11ad238eecd77 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -912,7 +912,7 @@ def test_interpolate_pchip(self): # interpolate at new_index new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5, - 50.75])) + 50.75])).astype(float) interp_s = ser.reindex(new_index).interpolate(method='pchip') # does not blow up, GH5977 interp_s[49:51] @@ -928,7 +928,9 @@ def test_interpolate_akima(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate(method='akima') assert_series_equal(interp_s[1:3], expected) @@ -941,7 +943,9 @@ def test_interpolate_piecewise_polynomial(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate( method='piecewise_polynomial') assert_series_equal(interp_s[1:3], expected) @@ -955,7 +959,9 @@ def test_interpolate_from_derivatives(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = 
ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate( method='from_derivatives') assert_series_equal(interp_s[1:3], expected) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index fee1976698b04..215fa9f22277e 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -8,10 +8,12 @@ from pandas import ( Categorical, DataFrame, Index, Series, bdate_range, date_range, isna) from pandas.core import ops +from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, assert_frame_equal, assert_index_equal, + assert_series_equal) from .common import TestData @@ -171,7 +173,6 @@ def test_scalar_na_logical_ops_corners(self): operator.and_, operator.or_, operator.xor, - ]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 @@ -190,6 +191,37 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + @pytest.mark.parametrize('op', [ + pytest.param(ops.rand_, + marks=pytest.mark.xfail(reason="GH#22092 Index " + "implementation returns " + "Index", + raises=AssertionError, + strict=True)), + pytest.param(ops.ror_, + marks=pytest.mark.xfail(reason="Index.get_indexer " + "with non unique index", + raises=InvalidIndexError, + strict=True)), + ops.rxor, + ]) + def test_reversed_logical_ops_with_index(self, op): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + # symmetric_difference is only for rxor, but other 2 should fail + expected = idx1.symmetric_difference(ser) + + result = op(ser, idx1) + assert_index_equal(result, expected) + + expected = idx2.symmetric_difference(ser) + + result = op(ser, 
idx2) + assert_index_equal(result, expected) + @pytest.mark.parametrize("op, expected", [ (ops.rand_, pd.Index([False, True])), (ops.ror_, pd.Index([False, True])), From 620fa592e2403bf56de46945d1c7363006491173 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 21 May 2019 23:33:41 +0100 Subject: [PATCH 03/34] DOC/CLN: wil -> will (#26484) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 76910f425836e..623e2b4863029 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -926,7 +926,7 @@ def squeeze(self, axis=None): a 1 Name: 0, dtype: int64 - Squeezing all axes wil project directly into a scalar: + Squeezing all axes will project directly into a scalar: >>> df_0a.squeeze() 1 From aad1bf9904ef83e9d35022c9df9ec7080ca45f2e Mon Sep 17 00:00:00 2001 From: Brett Randall Date: Wed, 22 May 2019 23:58:19 +1000 Subject: [PATCH 04/34] Fixed typo mutiplication -> multiplication. (#26489) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afe37bf198ab7..6bfa63012689d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -918,7 +918,7 @@ def __len__(self): def dot(self, other): """ - Compute the matrix mutiplication between the DataFrame and other. + Compute the matrix multiplication between the DataFrame and other. This method computes the matrix product between the DataFrame and the values of an other Series, DataFrame or a numpy array. 
From ef87d02a71fe075075a3a32df8af8804f390da2d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 23 May 2019 14:04:56 +0100 Subject: [PATCH 05/34] DOC: fix SyntaxError in doc build on Windows (#26499) --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index e7d358c7961ab..971aa04ba866a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -319,7 +319,7 @@ pd.options.display.max_rows = 15 import os - os.chdir('{}') + os.chdir(r'{}') """.format(os.path.dirname(os.path.dirname(__file__))) From 89cc7f2ce39f046dd9877d569fe226d6a1cbfe5a Mon Sep 17 00:00:00 2001 From: Mats Maiwald <32721837+matsmaiwald@users.noreply.github.com> Date: Thu, 23 May 2019 18:46:20 +0200 Subject: [PATCH 06/34] DOC: Highlighted role of index alignment in DataFrame.dot(other) (#26480) (#26496) --- pandas/core/frame.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6bfa63012689d..7d501e8095921 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -944,7 +944,9 @@ def dot(self, other): Notes ----- The dimensions of DataFrame and other must be compatible in order to - compute the matrix multiplication. + compute the matrix multiplication. In addition, the column names of + DataFrame and the index of other must contain the same values, as they + will be aligned prior to the multiplication. The dot method for Series computes the inner product, instead of the matrix product here. @@ -982,6 +984,14 @@ def dot(self, other): 0 1 0 1 4 1 2 2 + + Note how shuffling of the objects does not change the result. 
+ + >>> s2 = s.reindex([1, 0, 2, 3]) + >>> df.dot(s2) + 0 -4 + 1 5 + dtype: int64 """ if isinstance(other, (Series, DataFrame)): common = self.columns.union(other.index) From babd5720f67e66fc817f85925b1ef9cf0b746576 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 24 May 2019 16:11:25 +0100 Subject: [PATCH 07/34] DOC/CLN: Change API reference section title (#26486) --- doc/source/reference/indexing.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index 680cb7e3dac91..42ebf648f299f 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -2,9 +2,9 @@ .. _api.indexing: -======== -Indexing -======== +============= +Index Objects +============= Index ----- From 4c231a77858b06f5ac4c48faf3155394c166da38 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 24 May 2019 17:29:32 +0200 Subject: [PATCH 08/34] CLN: Remove StringMixin from PandasObject (#26505) --- doc/source/whatsnew/v0.25.0.rst | 14 ++++++++++++++ pandas/core/arrays/categorical.py | 6 +----- pandas/core/arrays/sparse.py | 2 +- pandas/core/base.py | 4 ++-- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/frozen.py | 2 +- pandas/core/internals/blocks.py | 3 +-- pandas/core/internals/managers.py | 2 +- pandas/core/panel.py | 2 +- pandas/core/series.py | 2 +- pandas/core/sparse/series.py | 5 ++--- pandas/core/window.py | 2 +- 15 files changed, 31 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0c69d1a4ce013..d86379c4d0703 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -213,6 +213,20 @@ are returned. 
(:issue:`21521`) df.groupby("a").ffill() +``__str__`` methods now call ``__repr__`` rather than vica-versa +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has until now mostly defined string representations in a Pandas objects's +``__str__``/``__unicode__``/``__bytes__`` methods, and called ``__str__`` from the ``__repr__`` +method, if a specific ``__repr__`` method is not found. This is not needed for Python3. +In Pandas 0.25, the string representations of Pandas objects are now generally +defined in ``__repr__``, and calls to ``__str__`` in general now pass the call on to +the ``__repr__``, if a specific ``__str__`` method doesn't exist, as is standard for Python. +This change is backward compatible for direct usage of Pandas, but if you subclass +Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, +you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). + + .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eeacb88f33f55..1d6b906158125 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2030,7 +2030,7 @@ def _get_repr(self, length=True, na_rep='NaN', footer=True): result = formatter.to_string() return str(result) - def __str__(self): + def __repr__(self): """ String representation. 
""" @@ -2045,10 +2045,6 @@ def __str__(self): return result - def __repr__(self): - # We want to bypass the ExtensionArray.__repr__ - return str(self) - def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 7a66e0ff33cc7..b0236cb393c1c 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1831,7 +1831,7 @@ def _add_comparison_ops(cls): # ---------- # Formatting # ----------- - def __str__(self): + def __repr__(self): return '{self}\nFill: {fill}\n{index}'.format( self=printing.pprint_thing(self), fill=printing.pprint_thing(self.fill_value), diff --git a/pandas/core/base.py b/pandas/core/base.py index f7837c60c0b82..3f59871fb5b38 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -55,7 +55,7 @@ def __repr__(self): return str(self) -class PandasObject(StringMixin, DirNamesMixin): +class PandasObject(DirNamesMixin): """baseclass for various pandas objects""" @@ -64,7 +64,7 @@ def _constructor(self): """class constructor (for this class it's just `__class__`""" return self.__class__ - def __str__(self): + def __repr__(self): """ Return a string representation for a particular object. """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7d501e8095921..7cf200506e853 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -610,7 +610,7 @@ def _info_repr(self): return info_repr_option and not (self._repr_fits_horizontal_() and self._repr_fits_vertical_()) - def __str__(self): + def __repr__(self): """ Return a string representation for a particular DataFrame. 
""" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 623e2b4863029..76c73fc40977c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2022,7 +2022,7 @@ def __setstate__(self, state): # ---------------------------------------------------------------------- # Rendering Methods - def __str__(self): + def __repr__(self): # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) prepr = '[%s]' % ','.join(map(pprint_thing, self)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e9e3b4963b6d..aa04b7505afe4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -373,8 +373,8 @@ def __init__(self, obj, keys=None, axis=0, level=None, def __len__(self): return len(self.groups) - def __str__(self): - # TODO: Better str/repr for GroupBy object + def __repr__(self): + # TODO: Better repr for GroupBy object return object.__repr__(self) def _assure_grouper(self): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eff7ff2c9f347..a4544e79e2dfa 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -932,7 +932,7 @@ def __deepcopy__(self, memo=None): # -------------------------------------------------------------------- # Rendering Methods - def __str__(self): + def __repr__(self): """ Return a string representation for this object. """ diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 60e4253e3101b..aeb0fa119ab33 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -149,7 +149,7 @@ def values(self): arr = self.view(np.ndarray).copy() return arr - def __str__(self): + def __repr__(self): """ Return a string representation for this object. 
""" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0ac87c653cfff..f86ef40a97299 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -233,8 +233,7 @@ def make_block_same_class(self, values, placement=None, ndim=None, return make_block(values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype) - def __str__(self): - + def __repr__(self): # don't want to print out all of the items here name = pprint_thing(self.__class__.__name__) if self._is_single_block: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 96a672b60da70..0b63588c9f5d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -291,7 +291,7 @@ def _post_setstate(self): def __len__(self): return len(self.items) - def __str__(self): + def __repr__(self): output = pprint_thing(self.__class__.__name__) for i, ax in enumerate(self.axes): if i == 0: diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b6b957c543df6..c65a73bd0d3f0 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -340,7 +340,7 @@ def _compare_constructor(self, other, func): # ---------------------------------------------------------------------- # Magic methods - def __str__(self): + def __repr__(self): """ Return a string representation for a particular Panel. """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 5b59fd6e7b38d..55b5bdcbf53f4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1384,7 +1384,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): # ---------------------------------------------------------------------- # Rendering Methods - def __str__(self): + def __repr__(self): """ Return a string representation for a particular Series. 
""" diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index ae1c94e136475..eac59e2c0f5eb 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -217,9 +217,8 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): return SparseArray(self.values, sparse_index=self.sp_index, fill_value=fill_value, kind=kind, copy=copy) - def __str__(self): - # currently, unicode is same as repr...fixes infinite loop - series_rep = Series.__str__(self) + def __repr__(self): + series_rep = Series.__repr__(self) rep = '{series}\n{index!r}'.format(series=series_rep, index=self.sp_index) return rep diff --git a/pandas/core/window.py b/pandas/core/window.py index deb64f1fb089d..d51e12035c829 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -157,7 +157,7 @@ def _get_window(self, other=None): def _window_type(self): return self.__class__.__name__ - def __str__(self): + def __repr__(self): """ Provide a nice str repr of our rolling object. 
""" From cffbaac126fb1043e44cac9ca4ed872d5424fc52 Mon Sep 17 00:00:00 2001 From: Vaibhav Vishal Date: Fri, 24 May 2019 21:02:14 +0530 Subject: [PATCH 09/34] Fix type annotations in pandas.core.indexes.datetimes (#26404) --- mypy.ini | 6 ------ pandas/core/indexes/datetimelike.py | 14 +++++++------- pandas/core/indexes/datetimes.py | 12 +++++++----- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/mypy.ini b/mypy.ini index 584c747a26f2e..3df8fd13a2a75 100644 --- a/mypy.ini +++ b/mypy.ini @@ -8,11 +8,5 @@ ignore_errors=True [mypy-pandas.core.indexes.datetimelike] ignore_errors=True -[mypy-pandas.core.indexes.datetimes] -ignore_errors=True - [mypy-pandas.core.indexes.period] ignore_errors=True - -[mypy-pandas.core.indexes.timedeltas] -ignore_errors=True \ No newline at end of file diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7454b015cb556..092cec00228cd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -57,7 +57,7 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): """ common ops mixin to support a unified interface datetimelike Index """ - _data = None # type: DatetimeLikeArrayMixin + _data = None # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. 
They can be made into cache_readonly for Index @@ -220,9 +220,9 @@ def __contains__(self, key): # Try to run function on index first, and then on elements of index # Especially important for group-by functionality - def map(self, f): + def map(self, mapper, na_action=None): try: - result = f(self) + result = mapper(self) # Try to use this result if we can if isinstance(result, np.ndarray): @@ -232,7 +232,7 @@ def map(self, f): raise TypeError('The map function must return an Index object') return result except Exception: - return self.astype(object).map(f) + return self.astype(object).map(mapper) def sort_values(self, return_indexer=False, ascending=True): """ @@ -430,8 +430,8 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) + def _format_with_header(self, header, na_rep='NaT', **kwargs): + return header + list(self._format_native_types(na_rep, **kwargs)) @property def _formatter_func(self): @@ -509,7 +509,7 @@ def __rsub__(self, other): cls.__rsub__ = __rsub__ - def isin(self, values): + def isin(self, values, level=None): """ Compute boolean array of whether each index value is found in the passed set of values. 
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 7fd537fb9989a..e68431b79dcd3 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -4,8 +4,8 @@ import numpy as np -from pandas._libs import ( - Timestamp, index as libindex, join as libjoin, lib, tslib as libts) +from pandas._libs import Timestamp, index as libindex, lib, tslib as libts +import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -1087,9 +1087,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): _is_monotonic_decreasing = Index.is_monotonic_decreasing _is_unique = Index.is_unique - _timezone = cache_readonly(DatetimeArray._timezone.fget) - is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) - _resolution = cache_readonly(DatetimeArray._resolution.fget) + _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore + is_normalized = cache_readonly( + DatetimeArray.is_normalized.fget) # type: ignore + _resolution = cache_readonly( + DatetimeArray._resolution.fget) # type: ignore strftime = ea_passthrough(DatetimeArray.strftime) _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) From a8af7a1a73164e97627bb5b96dacfdb2ba10b24e Mon Sep 17 00:00:00 2001 From: Mats Maiwald <32721837+matsmaiwald@users.noreply.github.com> Date: Fri, 24 May 2019 17:47:01 +0200 Subject: [PATCH 10/34] =?UTF-8?q?Better=20error=20message=20for=20DataFram?= =?UTF-8?q?e.hist()=20without=20numerical=20columns=20(=E2=80=A6=20(#26483?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/plotting/_core.py | 4 ++++ pandas/tests/plotting/test_hist_method.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 90297ecfa3415..fed4b0d90983c 100644 --- 
a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2426,6 +2426,10 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, data = data._get_numeric_data() naxes = len(data.columns) + if naxes == 0: + raise ValueError("hist method requires numerical columns, " + "nothing to plot.") + fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, sharex=sharex, sharey=sharey, figsize=figsize, layout=layout) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index c62ed21c2fb17..f3f6c9c7fc2d4 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -209,6 +209,16 @@ def test_hist_df_legacy(self): with pytest.raises(AttributeError): ser.hist(foo='bar') + @pytest.mark.slow + def test_hist_non_numerical_raises(self): + # gh-10444 + df = DataFrame(np.random.rand(10, 2)) + df_o = df.astype(np.object) + + msg = "hist method requires numerical columns, nothing to plot." + with pytest.raises(ValueError, match=msg): + df_o.hist() + @pytest.mark.slow def test_hist_layout(self): df = DataFrame(randn(100, 3)) From ac026742dfc74b6c26ef867fa846cc322a602847 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 24 May 2019 09:01:09 -0700 Subject: [PATCH 11/34] Excel Test Cleanup - ReadWriteClass (#26473) --- pandas/tests/io/test_excel.py | 427 +++++++++++++++++----------------- 1 file changed, 216 insertions(+), 211 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 112d14795d9bf..f9926cd26d8da 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -49,7 +49,6 @@ def ignore_xlrd_time_clock_warning(): yield -@td.skip_if_no('xlrd', '1.0.0') class SharedItems: @pytest.fixture(autouse=True) @@ -60,6 +59,20 @@ def setup_method(self, datapath): self.tsframe = _tsframe.copy() self.mixed_frame = _mixed_frame.copy() + +@td.skip_if_no('xlrd', '1.0.0') +class ReadingTestsBase(SharedItems): + # This is based 
on ExcelWriterBase + + @pytest.fixture(autouse=True, params=['xlrd', None]) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. @@ -114,19 +127,6 @@ def get_exceldf(self, basename, ext, *args, **kwds): pth = os.path.join(self.dirpath, basename + ext) return read_excel(pth, *args, **kwds) - -class ReadingTestsBase(SharedItems): - # This is based on ExcelWriterBase - - @pytest.fixture(autouse=True, params=['xlrd', None]) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_usecols_int(self, ext): @@ -565,74 +565,6 @@ def test_read_excel_blank_with_header(self, ext): actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) - def test_read_one_empty_col_no_header(self, ext, header, expected): - # xref gh-12292 - filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) - def 
test_read_one_empty_col_with_header(self, ext, header, expected): - filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_set_column_names_in_parameter(self, ext): - # GH 12870 : pass down column names associated with - # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) - - refdf.columns = ['A', 'B'] - - with ExcelFile(pth) as reader: - xlsdf_no_head = read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) - xlsdf_with_head = read_excel(reader, 'Data_with_head', - index_col=None, names=['A', 'B']) - - tm.assert_frame_equal(xlsdf_no_head, refdf) - tm.assert_frame_equal(xlsdf_with_head, refdf) - def test_date_conversion_overflow(self, ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], @@ -741,7 +673,6 @@ def test_read_from_file_url(self, ext): tm.assert_frame_equal(url_table, local_table) - @td.skip_if_no('pathlib') def test_read_from_pathlib_path(self, ext): # GH12655 @@ -780,32 +711,6 @@ def test_reader_closes_file(self, ext): assert f.closed - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_creating_and_reading_multiple_sheets(self, ext): - # see gh-9450 - # - # Test reading multiple sheets, from a runtime - # created Excel file with multiple sheets. 
- def tdf(col_sheet_name): - d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[col_sheet_name]) - - sheets = ["AAA", "BBB", "CCC"] - - dfs = [tdf(s) for s in sheets] - dfs = dict(zip(sheets, dfs)) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as ew: - for sheetname, df in dfs.items(): - df.to_excel(ew, sheetname) - - dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) - - for s in sheets: - tm.assert_frame_equal(dfs[s], dfs_returned[s]) - def test_reader_seconds(self, ext): # Test reading times with and without milliseconds. GH5945. @@ -902,84 +807,6 @@ def test_read_excel_multiindex_header_only(self, ext): expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) tm.assert_frame_equal(result, expected) - @td.skip_if_no("xlsxwriter") - def test_read_excel_multiindex_empty_level(self, ext): - # see gh-12453 - with ensure_clean(ext) as path: - df = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlsxwriter") - @pytest.mark.parametrize("c_idx_names", [True, False]) - @pytest.mark.parametrize("r_idx_names", [True, False]) - @pytest.mark.parametrize("c_idx_levels", [1, 3]) - @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, 
r_idx_names, - c_idx_levels, r_idx_levels): - # see gh-4679 - with ensure_clean(ext) as pth: - if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") - - # Empty name case current read in as - # unnamed levels, not Nones. - check_names = r_idx_names or r_idx_levels <= 1 - - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[0, :] = np.nan - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - def test_excel_old_index_format(self, ext): # see gh-4679 filename = "test_index_name_pre17" + ext @@ -1054,30 +881,6 @@ def test_read_excel_chunksize(self, ext): pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), chunksize=100) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_read_excel_parse_dates(self, ext): - # see gh-11544, gh-12051 - df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) - df2 = df.copy() - df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") - - with ensure_clean(ext) as pth: - df2.to_excel(pth) - - res = read_excel(pth, index_col=0) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) - - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) - tm.assert_frame_equal(df, res) - def 
test_read_excel_skiprows_list(self, ext): # GH 4903 actual = pd.read_excel(os.path.join(self.dirpath, @@ -1141,6 +944,208 @@ def test_read_excel_squeeze(self, ext): tm.assert_series_equal(actual, expected) +@td.skip_if_no('xlrd', '1.0.0') +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestRoundTrip: + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, 'with_header', index=False, header=True) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], + [3, 'baz']], columns=['a', 'b']) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel(writer, 'Data_no_head', + header=False, index=False) + refdf.to_excel(writer, 'Data_with_head', index=False) + + refdf.columns = ['A', 'B'] + + 
with ExcelFile(pth) as reader: + xlsdf_no_head = read_excel(reader, 'Data_no_head', + header=None, names=['A', 'B']) + xlsdf_with_head = read_excel(reader, 'Data_with_head', + index_col=None, names=['A', 'B']) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in dfs.items(): + df.to_excel(ew, sheetname) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + @td.skip_if_no("xlsxwriter") + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with ensure_clean(ext) as path: + df = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} + }) + + expected = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + expected = pd.DataFrame({ + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlsxwriter") + 
@pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) + 
tm.assert_frame_equal(df, res) + + +@td.skip_if_no('xlrd', '1.0.0') @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestXlrdReader(ReadingTestsBase): """ From 91512111bc0f42ac15695d2af94e3fff3d6ba536 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 24 May 2019 15:07:54 -0700 Subject: [PATCH 12/34] CLN: pd.TimeGrouper (#26477) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/__init__.py | 2 +- pandas/core/api.py | 12 ---------- pandas/tests/api/test_api.py | 13 +---------- pandas/tests/groupby/test_timegrouper.py | 7 +++--- pandas/tests/resample/test_base.py | 4 ++-- pandas/tests/resample/test_datetime_index.py | 14 ++++++------ pandas/tests/resample/test_time_grouper.py | 24 ++++++++------------ 8 files changed, 25 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d86379c4d0703..d4104ab1d79a1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -313,7 +313,7 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) - -- +- Removed previously deprecated ``TimeGrouper`` (:issue:`16942`) - .. 
_whatsnew_0250.performance: diff --git a/pandas/__init__.py b/pandas/__init__.py index bd367bbe27d5e..6af6f3093c120 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, TimeGrouper, Grouper, factorize, unique, value_counts, + np, Grouper, factorize, unique, value_counts, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index 96f623bda9a8a..b7398e433f28f 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -45,15 +45,3 @@ from pandas.tseries.offsets import DateOffset from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.timedeltas import to_timedelta - - -# Deprecation: xref gh-16747 -class TimeGrouper: - - def __new__(cls, *args, **kwargs): - from pandas.core.resample import TimeGrouper - import warnings - warnings.warn("pd.TimeGrouper is deprecated and will be removed; " - "Please use pd.Grouper(freq=...)", - FutureWarning, stacklevel=2) - return TimeGrouper(*args, **kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 7ee0225723675..c92808200ebea 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -50,7 +50,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_classes = ['TimeGrouper', 'Panel'] + deprecated_classes = ['Panel'] # these should be deprecated in the future deprecated_classes_in_future = [] @@ -132,17 +132,6 @@ def test_testing(self): self.check(testing, self.funcs) -class TestTopLevelDeprecations: - - # top-level API deprecations - # GH 13790 - - def test_TimeGrouper(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - pd.TimeGrouper(freq='D') - - class TestCDateRange: def test_deprecation_cdaterange(self): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 21c71154c95ef..ef05e6ada4890 100644 --- 
a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range +from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -365,10 +366,8 @@ def sumfunc_value(x): return x.value.sum() expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) - .apply(sumfunc_value)) + result = (df_dt.groupby(Grouper(freq='M', key='date')) + .apply(sumfunc_value)) assert_series_equal(result.reset_index(drop=True), expected.reset_index(drop=True)) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index c3c908f4b0d1b..63fa2007e401d 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -6,10 +6,10 @@ import pandas as pd from pandas import DataFrame, Series from pandas.core.groupby.groupby import DataError +from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -from pandas.core.resample import TimeGrouper import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_index_equal, @@ -214,7 +214,7 @@ def test_apply_to_empty_series(empty_series): def test_resampler_is_iterable(series): # GH 15314 freq = 'H' - tg = TimeGrouper(freq, convention='start') + tg = Grouper(freq=freq, convention='start') grouped = series.groupby(tg) resampled = series.resample(freq) for (rk, rv), (gk, gv) in zip(resampled, grouped): diff --git 
a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index c2868979e9d8d..5711174ef0c9f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -10,10 +10,10 @@ import pandas as pd from pandas import DataFrame, Series, Timedelta, Timestamp, isna, notna +from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range -from pandas.core.resample import ( - DatetimeIndex, TimeGrouper, _get_timestamp_range_edges) +from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -42,7 +42,7 @@ def test_custom_grouper(index): dti = index s = Series(np.array([1] * len(dti)), index=dti, dtype='int64') - b = TimeGrouper(Minute(5)) + b = Grouper(freq=Minute(5)) g = s.groupby(b) # check all cython functions work @@ -50,7 +50,7 @@ def test_custom_grouper(index): for f in funcs: g._cython_agg_general(f) - b = TimeGrouper(Minute(5), closed='right', label='right') + b = Grouper(freq=Minute(5), closed='right', label='right') g = s.groupby(b) # check all cython functions work funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] @@ -116,7 +116,7 @@ def test_resample_integerarray(): def test_resample_basic_grouper(series): s = series result = s.resample('5Min').last() - grouper = TimeGrouper(Minute(5), closed='left', label='left') + grouper = Grouper(freq=Minute(5), closed='left', label='left') expected = s.groupby(grouper).agg(lambda x: x[-1]) assert_series_equal(result, expected) @@ -373,7 +373,7 @@ def test_resample_upsampling_picked_but_not_correct(): def test_resample_frame_basic(): df = tm.makeTimeDataFrame() - b = TimeGrouper('M') + b = Grouper(freq='M') g = df.groupby(b) # check all cython functions work @@ -521,7 +521,7 @@ def 
test_nearest_upsample_with_limit(): def test_resample_ohlc(series): s = series - grouper = TimeGrouper(Minute(5)) + grouper = Grouper(freq=Minute(5)) expect = s.groupby(grouper).agg(lambda x: x[-1]) result = s.resample('5Min').ohlc() diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 2f330d1f2484b..3f767f8e7100f 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Series +from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -from pandas.core.resample import TimeGrouper import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -16,9 +16,7 @@ def test_apply(): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - grouper = pd.TimeGrouper(freq='A', label='right', closed='right') + grouper = Grouper(freq='A', label='right', closed='right') grouped = test_series.groupby(grouper) @@ -38,9 +36,7 @@ def test_count(): expected = test_series.groupby(lambda x: x.year).count() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - grouper = pd.TimeGrouper(freq='A', label='right', closed='right') + grouper = Grouper(freq='A', label='right', closed='right') result = test_series.groupby(grouper).count() expected.index = result.index assert_series_equal(result, expected) @@ -64,7 +60,7 @@ def test_apply_iteration(): N = 1000 ind = pd.date_range(start="2000-01-01", freq="D", periods=N) df = DataFrame({'open': 1, 'close': 2}, index=ind) - tg = TimeGrouper('M') + tg = Grouper(freq='M') _, grouper, _ = tg._get_grouper(df) @@ -93,7 +89,7 @@ def test_fails_on_no_datetime_index(name, func): msg = ("Only valid with DatetimeIndex, TimedeltaIndex " "or PeriodIndex, but got an instance of '{}'".format(name)) with pytest.raises(TypeError, match=msg): - 
df.groupby(TimeGrouper('D')) + df.groupby(Grouper(freq='D')) def test_aaa_group_order(): @@ -105,7 +101,7 @@ def test_aaa_group_order(): df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 - grouped = df.groupby(TimeGrouper(key='key', freq='D')) + grouped = df.groupby(Grouper(key='key', freq='D')) tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5]) @@ -135,7 +131,7 @@ def test_aggregate_normal(resample_method): datetime(2013, 1, 5)] * 4 normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) expected = getattr(normal_grouped, resample_method)() dt_result = getattr(dt_grouped, resample_method)() @@ -195,7 +191,7 @@ def test_aggregate_with_nat(func, fill_value): datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) normal_result = getattr(normal_grouped, func)() dt_result = getattr(dt_grouped, func)() @@ -222,7 +218,7 @@ def test_aggregate_with_nat_size(): datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) normal_result = normal_grouped.size() dt_result = dt_grouped.size() @@ -238,7 +234,7 @@ def test_aggregate_with_nat_size(): def test_repr(): # GH18203 - result = repr(TimeGrouper(key='A', freq='H')) + result = repr(Grouper(key='A', freq='H')) expected = ("TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " "convention='e', base=0)") From 8c8a1759a92c87ff3a56f8cef2d6ba2d9d500bc1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 24 May 2019 18:18:04 -0700 Subject: [PATCH 13/34] CLN: Remove 
ExcelWriter.sheetname (#26464) xref gh-6581 --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/io/excel/_base.py | 24 ++++------------------- pandas/tests/io/test_excel.py | 34 +++++++++++---------------------- 3 files changed, 16 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d4104ab1d79a1..29cc14b638996 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -312,7 +312,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) -- +- Removed the previously deprecated ``sheetname`` keyword in :func:`read_excel` (:issue:`16442`, :issue:`20938`) - Removed previously deprecated ``TimeGrouper`` (:issue:`16942`) - diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c0678575fd6f0..a0d51e85aa4f3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -5,7 +5,6 @@ import os from textwrap import fill from urllib.request import urlopen -import warnings from pandas._config import config @@ -291,15 +290,10 @@ def read_excel(io, mangle_dupe_cols=True, **kwds): - # Can't use _deprecate_kwarg since sheetname=None has a special meaning - if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: - warnings.warn("The `sheetname` keyword is deprecated, use " - "`sheet_name` instead", FutureWarning, stacklevel=2) - sheet_name = kwds.pop("sheetname") - - if 'sheet' in kwds: - raise TypeError("read_excel() got an unexpected keyword argument " - "`sheet`") + for arg in ('sheet', 'sheetname'): + if arg in kwds: + raise TypeError("read_excel() got an unexpected keyword argument " + "`{}`".format(arg)) if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -833,16 +827,6 @@ def parse(self, DataFrame or dict of DataFrames DataFrame from the passed in Excel file. 
""" - - # Can't use _deprecate_kwarg since sheetname=None has a special meaning - if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: - warnings.warn("The `sheetname` keyword is deprecated, use " - "`sheet_name` instead", FutureWarning, stacklevel=2) - sheet_name = kwds.pop("sheetname") - elif 'sheetname' in kwds: - raise TypeError("Cannot specify both `sheet_name` " - "and `sheetname`. Use just `sheet_name`") - if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index f9926cd26d8da..44ce3111c3a1e 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -342,15 +342,15 @@ def test_excel_passes_na(self, ext): tm.assert_frame_equal(parsed, expected) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_deprecated_sheetname(self, ext): + @pytest.mark.parametrize('arg', ['sheet', 'sheetname']) + def test_unexpected_kwargs_raises(self, ext, arg): # gh-17964 excel = self.get_excelfile('test1', ext) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - read_excel(excel, sheetname='Sheet1') - - with pytest.raises(TypeError): - read_excel(excel, sheet='Sheet1') + kwarg = {arg: 'Sheet1'} + msg = "unexpected keyword argument `{}`".format(arg) + with pytest.raises(TypeError, match=msg): + read_excel(excel, **kwarg) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_excel_table_sheet_by_index(self, ext): @@ -588,32 +588,20 @@ def test_sheet_name_and_sheetname(self, ext): df_ref = self.get_csv_refdf(filename) df1 = self.get_exceldf(filename, ext, sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf(filename, ext, index_col=0, - sheetname=sheet_name) # backward compat + with ignore_xlrd_time_clock_warning(): + df2 = self.get_exceldf(filename, ext, index_col=0, 
+ sheet_name=sheet_name) excel = self.get_excelfile(filename, ext) df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(index_col=0, - sheetname=sheet_name) # backward compat + df2_parse = excel.parse(index_col=0, + sheet_name=sheet_name) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - def test_sheet_name_both_raises(self, ext): - with pytest.raises(TypeError, match="Cannot specify both"): - self.get_exceldf('test1', ext, sheetname='Sheet1', - sheet_name='Sheet1') - - excel = self.get_excelfile('test1', ext) - with pytest.raises(TypeError, match="Cannot specify both"): - excel.parse(sheetname='Sheet1', - sheet_name='Sheet1') - def test_excel_read_buffer(self, ext): pth = os.path.join(self.dirpath, 'test1' + ext) From 9d6d95994ad2d58bad0ae0910ea9b5ab2df6be50 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 25 May 2019 11:40:15 -0700 Subject: [PATCH 14/34] CLN: Remove deprecated parse_cols from read_excel (#26522) xref gh-6581 --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- pandas/io/excel/_base.py | 10 +--------- pandas/tests/io/test_excel.py | 22 +--------------------- 3 files changed, 4 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 29cc14b638996..af59a34245660 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -313,8 +313,8 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) - Removed the previously deprecated ``sheetname`` keyword in :func:`read_excel` (:issue:`16442`, :issue:`20938`) -- Removed previously deprecated ``TimeGrouper`` (:issue:`16942`) -- 
+- Removed the previously deprecated ``TimeGrouper`` (:issue:`16942`) +- Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) .. _whatsnew_0250.performance: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a0d51e85aa4f3..3af6be7a371e7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -64,12 +64,6 @@ those columns will be combined into a ``MultiIndex``. If a subset of data is selected with ``usecols``, index_col is based on the subset. -parse_cols : int or list, default None - Alias of `usecols`. - - .. deprecated:: 0.21.0 - Use `usecols` instead. - usecols : int, str, list-like, or callable default None Return a subset of the columns. @@ -260,14 +254,12 @@ @Appender(_read_excel_doc) -@deprecate_kwarg("parse_cols", "usecols") @deprecate_kwarg("skip_footer", "skipfooter") def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, - parse_cols=None, usecols=None, squeeze=False, dtype=None, @@ -290,7 +282,7 @@ def read_excel(io, mangle_dupe_cols=True, **kwds): - for arg in ('sheet', 'sheetname'): + for arg in ('sheet', 'sheetname', 'parse_cols'): if arg in kwds: raise TypeError("read_excel() got an unexpected keyword argument " "`{}`".format(arg)) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 44ce3111c3a1e..100de227aa97c 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -147,17 +147,9 @@ def test_usecols_int(self, ext): df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], index_col=0, usecols=3) - # parse_cols instead of usecols, usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, parse_cols=3) - # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - 
tm.assert_frame_equal(df3, df_ref, check_names=False) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_usecols_list(self, ext): @@ -169,15 +161,9 @@ def test_usecols_list(self, ext): df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, usecols=[0, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols=[0, 2, 3]) - # TODO add index to xls file) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df3, dfref, check_names=False) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_usecols_str(self, ext): @@ -190,15 +176,9 @@ def test_usecols_str(self, ext): df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, usecols='A:D') - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols='A:D') - # TODO add index to xls, read xls ignores index name ? 
tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - tm.assert_frame_equal(df4, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, @@ -342,7 +322,7 @@ def test_excel_passes_na(self, ext): tm.assert_frame_equal(parsed, expected) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - @pytest.mark.parametrize('arg', ['sheet', 'sheetname']) + @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) def test_unexpected_kwargs_raises(self, ext, arg): # gh-17964 excel = self.get_excelfile('test1', ext) From 3bb47664e28ae5e3d33748cbf1825c4acbd4297e Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Sun, 26 May 2019 03:24:39 +0800 Subject: [PATCH 15/34] [TEST] Add two more parameters to the test_dti_add_sub_nonzero_mth_offset (#26392) * Add two more parameters to the test * Add array into the boy and add parameter freq --- pandas/tests/arithmetic/test_datetime64.py | 38 ++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 910fa4818c5de..13adae279c989 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1435,27 +1435,39 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - @pytest.mark.parametrize("op, offset, exp", [ + @pytest.mark.parametrize("op, offset, exp, exp_freq", [ ('__add__', pd.DateOffset(months=3, days=10), - DatetimeIndex([Timestamp('2014-04-11'), Timestamp('2015-04-11'), - Timestamp('2016-04-11'), Timestamp('2017-04-11')])), + [Timestamp('2014-04-11'), Timestamp('2015-04-11'), + Timestamp('2016-04-11'), Timestamp('2017-04-11')], + None), ('__add__', pd.DateOffset(months=3), - DatetimeIndex([Timestamp('2014-04-01'), Timestamp('2015-04-01'), - 
Timestamp('2016-04-01'), Timestamp('2017-04-01')])), + [Timestamp('2014-04-01'), Timestamp('2015-04-01'), + Timestamp('2016-04-01'), Timestamp('2017-04-01')], + "AS-APR"), ('__sub__', pd.DateOffset(months=3, days=10), - DatetimeIndex([Timestamp('2013-09-21'), Timestamp('2014-09-21'), - Timestamp('2015-09-21'), Timestamp('2016-09-21')])), + [Timestamp('2013-09-21'), Timestamp('2014-09-21'), + Timestamp('2015-09-21'), Timestamp('2016-09-21')], + None), ('__sub__', pd.DateOffset(months=3), - DatetimeIndex([Timestamp('2013-10-01'), Timestamp('2014-10-01'), - Timestamp('2015-10-01'), Timestamp('2016-10-01')])) - + [Timestamp('2013-10-01'), Timestamp('2014-10-01'), + Timestamp('2015-10-01'), Timestamp('2016-10-01')], + "AS-OCT") ]) - def test_dti_add_sub_nonzero_mth_offset(self, op, offset, exp): + def test_dti_add_sub_nonzero_mth_offset(self, op, offset, + exp, exp_freq, + tz_aware_fixture, + box_with_array): # GH 26258 - date = date_range(start='01 Jan 2014', end='01 Jan 2017', freq='AS') + tz = tz_aware_fixture + date = date_range(start='01 Jan 2014', end='01 Jan 2017', freq='AS', + tz=tz) + date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) - tm.assert_equal(result, exp) + + expected = pd.DatetimeIndex(exp, tz=tz, freq=exp_freq) + expected = tm.box_expected(expected, box_with_array, False) + tm.assert_equal(result, expected) class TestDatetime64OverflowHandling: From 014abdc3553bb49c681bff11e09fb7c55f4500db Mon Sep 17 00:00:00 2001 From: Nanda H Krishna Date: Sun, 26 May 2019 07:18:02 +0530 Subject: [PATCH 16/34] Remove py.path special handling from io.common (#26458) --- pandas/io/common.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index f9cd1806763e2..34635ebf64ad6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,6 +9,7 @@ import lzma import mmap import os +import pathlib from urllib.error import URLError # noqa from 
urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, @@ -115,24 +116,10 @@ def _stringify_path(filepath_or_buffer): Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ - try: - import pathlib - _PATHLIB_INSTALLED = True - except ImportError: - _PATHLIB_INSTALLED = False - - try: - from py.path import local as LocalPath - _PY_PATH_INSTALLED = True - except ImportError: - _PY_PATH_INSTALLED = False - if hasattr(filepath_or_buffer, '__fspath__'): return filepath_or_buffer.__fspath__() - if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path): + elif isinstance(filepath_or_buffer, pathlib.Path): return str(filepath_or_buffer) - if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath): - return filepath_or_buffer.strpath return _expand_user(filepath_or_buffer) From 420eee5bf7b8458bddfc6dd3ff2c9020da38dbef Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 26 May 2019 16:31:43 +0200 Subject: [PATCH 17/34] CLN: remove StringMixin from code base, except core.computation (#26523) --- pandas/io/pytables.py | 19 +++++++++---------- pandas/io/stata.py | 3 +-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0f7f6fe399256..11f705e88179d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -32,7 +32,6 @@ to_datetime) from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.sparse import BlockIndex, IntIndex -from pandas.core.base import StringMixin import pandas.core.common as com from pandas.core.computation.pytables import Expr, maybe_expression from pandas.core.index import ensure_index @@ -398,7 +397,7 @@ def _is_metadata_of(group, parent_group): return False -class HDFStore(StringMixin): +class HDFStore: """ Dict-like IO interface for storing pandas objects in PyTables @@ -520,7 +519,7 @@ def __contains__(self, key): def 
__len__(self): return len(self.groups()) - def __str__(self): + def __repr__(self): return '{type}\nFile path: {path}\n'.format( type=type(self), path=pprint_thing(self._path)) @@ -1519,7 +1518,7 @@ def get_result(self, coordinates=False): return results -class IndexCol(StringMixin): +class IndexCol: """ an index column description class @@ -1587,7 +1586,7 @@ def set_table(self, table): self.table = table return self - def __str__(self): + def __repr__(self): temp = tuple( map(pprint_thing, (self.name, @@ -1881,7 +1880,7 @@ def __init__(self, values=None, kind=None, typ=None, self.set_data(data) self.set_metadata(metadata) - def __str__(self): + def __repr__(self): temp = tuple( map(pprint_thing, (self.name, @@ -2286,7 +2285,7 @@ def get_attr(self): pass -class Fixed(StringMixin): +class Fixed: """ represent an object in my store facilitate read/write of various types of objects @@ -2336,7 +2335,7 @@ def pandas_type(self): def format_type(self): return 'fixed' - def __str__(self): + def __repr__(self): """ return a pretty representation of myself """ self.infer_axes() s = self.shape @@ -3077,8 +3076,8 @@ def table_type_short(self): def format_type(self): return 'table' - def __str__(self): - """ return a pretty representatgion of myself """ + def __repr__(self): + """ return a pretty representation of myself """ self.infer_axes() dc = ",dc->[{columns}]".format(columns=(','.join( self.data_columns) if len(self.data_columns) else '')) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 27ddc4ef6f594..d8dfd15477974 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -31,7 +31,6 @@ from pandas import ( Categorical, DatetimeIndex, NaT, Timestamp, concat, isna, to_datetime, to_timedelta) -from pandas.core.base import StringMixin from pandas.core.frame import DataFrame from pandas.core.series import Series @@ -712,7 +711,7 @@ def generate_value_label(self, byteorder, encoding): return bio.read() -class StataMissingValue(StringMixin): +class 
StataMissingValue: """ An observation's missing value. From 48a4b8cf966529dcd441ece139afe82fc4873742 Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Sun, 26 May 2019 13:57:47 -0400 Subject: [PATCH 18/34] MAINT: port numpy#13188 for np_datetime simplification (#26516) * MAINT: port numpy#13188 for np_datetime simplificaion Bring numpy changes about emulating the behavior of python's divmod to pandas. * cpplint fix * Add reference numpy change into comment * fix typo --- doc/source/whatsnew/v0.25.0.rst | 2 +- .../_libs/tslibs/src/datetime/np_datetime.c | 208 +++++++----------- pandas/core/arrays/categorical.py | 8 +- .../arrays/categorical/test_operators.py | 17 +- 4 files changed, 94 insertions(+), 141 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index af59a34245660..9ea690a11259d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -346,7 +346,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Bug in :func:`_cat_compare_op` that would valuate comparison with None to True (:issue:`26504`) +- Fixed Bug in :func:`_cat_compare_op` that would evaluate comparison of ordered `Categorical` with missing values with scalar to True sometimes (:issue:`26504`) - Datetimelike diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index 87866d804503e..a8a47e2e90f93 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -498,6 +498,27 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, return ret; } +/* + * Port numpy#13188 https://github.com/numpy/numpy/pull/13188/ + * + * Computes the python `ret, d = divmod(d, unit)`. 
+ * + * Note that GCC is smart enough at -O2 to eliminate the `if(*d < 0)` branch + * for subsequent calls to this command - it is able to deduce that `*d >= 0`. + */ +npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) { + assert(unit > 0); + npy_int64 div = *d / unit; + npy_int64 mod = *d % unit; + if (mod < 0) { + mod += unit; + div -= 1; + } + assert(mod >= 0); + *d = mod; + return div; +} + /* * Converts a datetime based on the given metadata into a datetimestruct */ @@ -522,13 +543,8 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, break; case NPY_FR_M: - if (dt >= 0) { - out->year = 1970 + dt / 12; - out->month = dt % 12 + 1; - } else { - out->year = 1969 + (dt + 1) / 12; - out->month = 12 + (dt + 1) % 12; - } + out->year = 1970 + extract_unit(&dt, 12); + out->month = dt + 1; break; case NPY_FR_W: @@ -543,167 +559,105 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, case NPY_FR_h: perday = 24LL; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } + set_datetimestruct_days(extract_unit(&dt, perday), out); out->hour = dt; break; case NPY_FR_m: perday = 24LL * 60; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / 60; - out->min = dt % 60; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 60); + out->min = (int)dt; break; case NPY_FR_s: perday = 24LL * 60 * 60; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 
0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60); - out->min = (dt / 60) % 60; - out->sec = dt % 60; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 60 * 60); + out->min = (int)extract_unit(&dt, 60); + out->sec = (int)dt; break; case NPY_FR_ms: perday = 24LL * 60 * 60 * 1000; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000LL); - out->min = (dt / (60 * 1000LL)) % 60; - out->sec = (dt / 1000LL) % 60; - out->us = (dt % 1000LL) * 1000; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 60); + out->sec = (int)extract_unit(&dt, 1000LL); + out->us = (int)(dt * 1000); break; case NPY_FR_us: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000000LL); - out->min = (dt / (60 * 1000000LL)) % 60; - out->sec = (dt / 1000000LL) % 60; - out->us = dt % 1000000LL; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000); + out->us = (int)dt; break; case NPY_FR_ns: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 
0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000000000LL); - out->min = (dt / (60 * 1000000000LL)) % 60; - out->sec = (dt / 1000000000LL) % 60; - out->us = (dt / 1000LL) % 1000000LL; - out->ps = (dt % 1000LL) * 1000; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL); + out->ps = (int)(dt * 1000); break; case NPY_FR_ps: perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000000000000LL); - out->min = (dt / (60 * 1000000000000LL)) % 60; - out->sec = (dt / 1000000000000LL) % 60; - out->us = (dt / 1000000LL) % 1000000LL; - out->ps = dt % 1000000LL; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL); + out->ps = (int)(dt * 1000); break; case NPY_FR_fs: /* entire range is only +- 2.6 hours */ - if (dt >= 0) { - out->hour = dt / (60 * 60 * 1000000000000000LL); - out->min = (dt / (60 * 1000000000000000LL)) % 60; - out->sec = (dt / 1000000000000000LL) % 60; - out->us = (dt / 1000000000LL) % 1000000LL; - out->ps = (dt / 1000LL) % 1000000LL; - out->as = (dt % 1000LL) * 1000; - } else { - npy_datetime minutes; - - minutes = dt / (60 * 1000000000000000LL); - dt = dt % (60 * 1000000000000000LL); - if (dt < 0) { - dt += (60 * 1000000000000000LL); - --minutes; - } - /* Offset the negative minutes */ - 
add_minutes_to_datetimestruct(out, minutes); - out->sec = (dt / 1000000000000000LL) % 60; - out->us = (dt / 1000000000LL) % 1000000LL; - out->ps = (dt / 1000LL) % 1000000LL; - out->as = (dt % 1000LL) * 1000; + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60 * 60); + if (out->hour < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour += 24; + assert(out->hour >= 0); } + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000); + out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->ps = (int)extract_unit(&dt, 1000LL); + out->as = (int)(dt * 1000); break; case NPY_FR_as: /* entire range is only +- 9.2 seconds */ - if (dt >= 0) { - out->sec = (dt / 1000000000000000000LL) % 60; - out->us = (dt / 1000000000000LL) % 1000000LL; - out->ps = (dt / 1000000LL) % 1000000LL; - out->as = dt % 1000000LL; - } else { - npy_datetime seconds; - - seconds = dt / 1000000000000000000LL; - dt = dt % 1000000000000000000LL; - if (dt < 0) { - dt += 1000000000000000000LL; - --seconds; - } - /* Offset the negative seconds */ - add_seconds_to_datetimestruct(out, seconds); - out->us = (dt / 1000000000000LL) % 1000000LL; - out->ps = (dt / 1000000LL) % 1000000LL; - out->as = dt % 1000000LL; + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 1000); + if (out->sec < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour = 23; + out->min = 59; + out->sec += 60; + assert(out->sec >= 0); } + out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->ps = (int)extract_unit(&dt, 1000LL * 1000); + out->as = (int)dt; break; default: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1d6b906158125..df01c6bee8917 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -100,13 +100,12 @@ def f(self, other): if is_scalar(other): if 
other in self.categories: i = self.categories.get_loc(other) - f = getattr(self._codes, op) - ret = f(i) + ret = getattr(self._codes, op)(i) # check for NaN in self na_mask = (self._codes == -1) if na_mask.any(): - # In other series, the leads to False, so do that here too + # comparison to missing values NaN leads to False ret[na_mask] = False return ret else: @@ -1412,6 +1411,7 @@ def isna(self): ret = self._codes == -1 return ret + isnull = isna def notna(self): @@ -1433,6 +1433,7 @@ def notna(self): """ return ~self.isna() + notnull = notna def put(self, *args, **kwargs): @@ -2555,6 +2556,7 @@ def index(self): stacklevel=2) return self._index + # utility routines diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b323cb2b6a7c3..f2f4871a70d92 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,6 @@ def test_categories_none_comparisons(self): tm.assert_categorical_equal(factor, self.factor) def test_comparisons(self): - result = self.factor[self.factor == 'a'] expected = self.factor[np.asarray(self.factor) == 'a'] tm.assert_categorical_equal(result, expected) @@ -186,23 +185,21 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_with_known_scalars(self): + def test_comparison_of_ordered_categorical_with_missing_values(self): # https://github.com/pandas-dev/pandas/issues/26504 - # and following comparisons with scalars in categories with None should - # be evaluated as False + # BUG: fix ordered categorical comparison with missing values (#26504 ) + # and following comparisons with scalars in categories with missing values + # should be evaluated as False - cat1 = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - cat2 = Categorical([None, 1, 2, 3], categories=[1, 2, 3], ordered=True) + cat = 
Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - tm.assert_numpy_array_equal(cat1 <= 2, + tm.assert_numpy_array_equal(cat <= 2, np.array([True, True, False, False])) - tm.assert_numpy_array_equal(cat2 <= 2, - np.array([False, True, True, False])) @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] - ) + ) def test_comparisons(self, data, reverse, base): cat_rev = Series( Categorical(data, categories=reverse, ordered=True)) From 3e205694ef3fe3447b5da626a05e03a29d977ab0 Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Tue, 28 May 2019 21:41:31 -0400 Subject: [PATCH 19/34] fix categorical comparison with missing values #26504 --- pandas/tests/arrays/categorical/test_operators.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index f2f4871a70d92..24a274ed8c115 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -188,8 +188,8 @@ def test_comparison_with_unknown_scalars(self): def test_comparison_of_ordered_categorical_with_missing_values(self): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) - # and following comparisons with scalars in categories with missing values - # should be evaluated as False + # and following comparisons with scalars in categories with missing + # values should be evaluated as False cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) @@ -198,8 +198,7 @@ def test_comparison_of_ordered_categorical_with_missing_values(self): @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), - ([1, 2, 3], [3, 2, 1], [2, 2, 2])] - ) + ([1, 2, 3], [3, 2, 1], [2, 2, 2])]) def test_comparisons(self, data, reverse, base): cat_rev = Series( 
Categorical(data, categories=reverse, ordered=True)) From 7e6662d70bd6c0008a48ed7f5c806fb841f22a2b Mon Sep 17 00:00:00 2001 From: Big Head Date: Tue, 28 May 2019 22:46:38 -0400 Subject: [PATCH 20/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 24a274ed8c115..fe10823254a12 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -185,7 +185,7 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_of_ordered_categorical_with_missing_values(self): + def test_comparison_of_ordered_categorical_with_missing_values_to_scalar(self): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing From 16dac3a3ca1a64f603c6e4466a884fafe0771c99 Mon Sep 17 00:00:00 2001 From: Big Head Date: Tue, 28 May 2019 22:51:02 -0400 Subject: [PATCH 21/34] Update categorical.py --- pandas/core/arrays/categorical.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index df01c6bee8917..93346488aaf1d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -89,12 +89,12 @@ def f(self, other): else: other_codes = other._codes - na_mask = (self._codes == -1) | (other_codes == -1) + mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) ret = f(other_codes) - if na_mask.any(): + if mask.any(): # In other series, the leads to False, so do that here too - ret[na_mask] = False + ret[mask] = False return ret if is_scalar(other): @@ -103,10 +103,10 @@ def 
f(self, other): ret = getattr(self._codes, op)(i) # check for NaN in self - na_mask = (self._codes == -1) - if na_mask.any(): + mask = (self._codes == -1) + if mask.any(): # comparison to missing values NaN leads to False - ret[na_mask] = False + ret[mask] = False return ret else: if op == '__eq__': From 9464f72944e48c1809b0e5430873eb63cb1655a0 Mon Sep 17 00:00:00 2001 From: Big Head Date: Tue, 28 May 2019 22:55:19 -0400 Subject: [PATCH 22/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index fe10823254a12..c4c977b3a84e2 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -185,7 +185,7 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_of_ordered_categorical_with_missing_values_to_scalar(self): + def test_comparison_of_ordered_categorical_with_nan_to_scalar(self): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing From c2b73438c68eadb07a479658fe51fc8998978a50 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 01:10:19 -0400 Subject: [PATCH 23/34] Update test_operators.py --- .../tests/arrays/categorical/test_operators.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index c4c977b3a84e2..13d037b76ec18 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -185,7 +185,8 @@ def test_comparison_with_unknown_scalars(self): 
tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_of_ordered_categorical_with_nan_to_scalar(self): + def test_comparison_of_ordered_categorical_with_nan_to_scalar( + self, compare_operators_no_eq_ne): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing @@ -193,9 +194,18 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar(self): cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - tm.assert_numpy_array_equal(cat <= 2, - np.array([True, True, False, False])) + assert getattr(cat, compare_operators_no_eq_ne)(2)[-1] == False + def test_comparison_of_ordered_categorical_with_nan_to_listlike( + self, compare_operators_no_eq_ne): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical + # with listlike should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + assert getattr(cat, compare_operators_no_eq_ne)(other)[-1] == False + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])]) From 65014e76780a0533d22d748d980b2a592d3d15fd Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 12:22:13 -0400 Subject: [PATCH 24/34] Update doc/source/whatsnew/v0.25.0.rst Co-Authored-By: Joris Van den Bossche --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 9f10d758c72c4..c65ba8ae54f2e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -348,7 +348,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would 
raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed Bug in :func:`_cat_compare_op` that would evaluate comparison of ordered `Categorical` with missing values with scalar to True sometimes (:issue:`26504`) +- Fixed bug in comparison of ordered `Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) - Datetimelike From 8964f0acd8b1b15d2e6e642056380718205e6288 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 20:57:14 -0400 Subject: [PATCH 25/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 13d037b76ec18..b6b4713a7d87e 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -186,7 +186,7 @@ def test_comparison_with_unknown_scalars(self): np.array([True, True, True])) def test_comparison_of_ordered_categorical_with_nan_to_scalar( - self, compare_operators_no_eq_ne): + self, compare_operators_no_eq_ne): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing @@ -198,8 +198,8 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): - # https://github.com/pandas-dev/pandas/issues/26504 - # and following comparisons of missing values in ordered Categorical + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical # with listlike should be evaluated as False cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) From 
7f404d2a5e931196eadeceb7308642d14fc8afd2 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 21:02:59 -0400 Subject: [PATCH 26/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b6b4713a7d87e..101731b4e96cb 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -208,7 +208,8 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), - ([1, 2, 3], [3, 2, 1], [2, 2, 2])]) + ([1, 2, 3], [3, 2, 1], [2, 2, 2])] + ) def test_comparisons(self, data, reverse, base): cat_rev = Series( Categorical(data, categories=reverse, ordered=True)) From 19e3711343c1ef5d926ea78705033c37e185f2a8 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 21:23:35 -0400 Subject: [PATCH 27/34] Update v0.25.0.rst --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c65ba8ae54f2e..ffa9fc5977709 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -348,7 +348,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed bug in comparison of ordered `Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) +- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) - Datetimelike From 2fc1d278d2a33faaf0ac482433f6b44323515f54 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 
2019 21:40:01 -0400 Subject: [PATCH 28/34] Update test_operators.py --- .../tests/arrays/categorical/test_operators.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 101731b4e96cb..80f218bd3a30c 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -7,6 +7,7 @@ from pandas import Categorical, DataFrame, Series, date_range from pandas.tests.arrays.categorical.common import TestCategorical import pandas.util.testing as tm +import warnings class TestCategoricalOpsWithFactor(TestCategorical): @@ -193,8 +194,14 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( # values should be evaluated as False cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - - assert getattr(cat, compare_operators_no_eq_ne)(2)[-1] == False + scalar = 2 + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) + def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): @@ -204,7 +211,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) - assert getattr(cat, compare_operators_no_eq_ne)(other)[-1] == False + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) + tm.assert_numpy_array_equal(actual, expected) @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), 
From c80c2dcab5cafd8604d09eab9ceef4a6314b072e Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 22:49:09 -0400 Subject: [PATCH 29/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 80f218bd3a30c..05bbb30bd00a1 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -195,13 +195,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) scalar = 2 - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) actual = getattr(cat, compare_operators_no_eq_ne)(scalar) - expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) + expected = getattr(np.array(cat), + compare_operators_no_eq_ne)(scalar) tm.assert_numpy_array_equal(actual, expected) - def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): @@ -211,13 +210,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) actual = getattr(cat, compare_operators_no_eq_ne)(other) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) tm.assert_numpy_array_equal(actual, expected) - + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] From 2e01686a7e6c6eb9a1e4ce20c0f0862210c4b046 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 23:23:44 -0400 Subject: [PATCH 30/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 05bbb30bd00a1..1f85a22e03d96 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -198,7 +198,7 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) actual = getattr(cat, compare_operators_no_eq_ne)(scalar) - expected = getattr(np.array(cat), + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) tm.assert_numpy_array_equal(actual, expected) From 924f6937db462de93c9b6ba70bd1e653b3465d42 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 23:51:40 -0400 Subject: [PATCH 31/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 1f85a22e03d96..72275444ccaa1 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -1,4 +1,5 @@ import operator +import warnings import numpy as np import pytest @@ -7,7 +8,6 @@ from pandas import Categorical, DataFrame, Series, date_range from pandas.tests.arrays.categorical.common import TestCategorical import pandas.util.testing as tm -import warnings class TestCategoricalOpsWithFactor(TestCategorical): From 3b4a42a58126f47b8e3e16167660238810246743 Mon Sep 17 00:00:00 2001 From: Big Head Date: Thu, 30 May 2019 10:18:45 -0400 Subject: [PATCH 32/34] Update categorical.py --- pandas/core/arrays/categorical.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 93346488aaf1d..8bbd161ee107b 100644 --- a/pandas/core/arrays/categorical.py +++ 
b/pandas/core/arrays/categorical.py @@ -1411,7 +1411,6 @@ def isna(self): ret = self._codes == -1 return ret - isnull = isna def notna(self): @@ -1433,7 +1432,6 @@ def notna(self): """ return ~self.isna() - notnull = notna def put(self, *args, **kwargs): @@ -2556,7 +2554,6 @@ def index(self): stacklevel=2) return self._index - # utility routines From 57480bd3c3be622c2fafbdd5b73dcffac0ee22ae Mon Sep 17 00:00:00 2001 From: Big Head Date: Thu, 30 May 2019 10:24:59 -0400 Subject: [PATCH 33/34] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 72275444ccaa1..b67d430667682 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -195,12 +195,13 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) scalar = 2 + with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - actual = getattr(cat, compare_operators_no_eq_ne)(scalar) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) - tm.assert_numpy_array_equal(actual, expected) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): @@ -210,11 +211,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - actual = getattr(cat, compare_operators_no_eq_ne)(other) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) - 
tm.assert_numpy_array_equal(actual, expected) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + tm.assert_numpy_array_equal(actual, expected) @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), From 8bb9bcf27a4dbc1220b6eeaffa605a3aef286da3 Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Fri, 31 May 2019 12:22:55 -0400 Subject: [PATCH 34/34] fix categorical comparison with missing values --- pandas/core/arrays/categorical.py | 4 +--- pandas/tests/arrays/categorical/test_operators.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6d34a8b66c5ea..44bb44457bc25 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -104,9 +104,7 @@ def f(self, other): # check for NaN in self mask = (self._codes == -1) - if mask.any(): - # comparison to missing values NaN leads to False - ret[mask] = False + ret[mask] = False return ret else: if op == '__eq__': diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b67d430667682..a443408bf9479 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -195,7 +195,6 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) scalar = 2 - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) expected = getattr(np.array(cat), @@ -211,7 +210,6 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2)