From e41387dd7c64e0a2032f8aad12a3c00ae50e3165 Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Fri, 24 May 2019 11:45:59 -0400 Subject: [PATCH 01/38] BUG: None comparison evaluates to True #26504 --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/arrays/categorical.py | 10 +++++++++- pandas/tests/arrays/categorical/test_operators.py | 13 +++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 91b70334dc9bc..df7f952d4180a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -305,7 +305,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- +- Bug in :func:`_cat_compare_op` that would valuate comparison with None to True (:issue:`26504`) - Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d25ccd1b158be..eeacb88f33f55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -100,7 +100,15 @@ def f(self, other): if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) - return getattr(self._codes, op)(i) + f = getattr(self._codes, op) + ret = f(i) + + # check for NaN in self + na_mask = (self._codes == -1) + if na_mask.any(): + # In other series, the leads to False, so do that here too + ret[na_mask] = False + return ret else: if op == '__eq__': return np.repeat(False, len(self)) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index dc6e1a5bc36b3..b323cb2b6a7c3 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -186,6 +186,19 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + def 
test_comparison_with_known_scalars(self): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons with scalars in categories with None should + # be evaluated as False + + cat1 = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + cat2 = Categorical([None, 1, 2, 3], categories=[1, 2, 3], ordered=True) + + tm.assert_numpy_array_equal(cat1 <= 2, + np.array([True, True, False, False])) + tm.assert_numpy_array_equal(cat2 <= 2, + np.array([False, True, True, False])) + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] From 9af03ce19c8e2184afc62340138881fb7c804dfd Mon Sep 17 00:00:00 2001 From: ArtinSarraf Date: Tue, 21 May 2019 16:41:44 -0400 Subject: [PATCH 02/38] ENH - Index set operation modifications to address issue #23525 (#23538) --- doc/source/whatsnew/v0.25.0.rst | 27 +++++ pandas/core/indexes/base.py | 104 +++++++++++++++--- pandas/core/indexes/datetimes.py | 34 +----- pandas/core/indexes/interval.py | 26 ++--- pandas/core/indexes/numeric.py | 8 ++ pandas/core/indexes/period.py | 12 +- pandas/core/indexes/range.py | 10 +- pandas/core/indexes/timedeltas.py | 21 +--- pandas/tests/indexes/common.py | 24 +--- pandas/tests/indexes/conftest.py | 36 +++--- .../tests/indexes/datetimes/test_datetime.py | 6 +- pandas/tests/indexes/datetimes/test_setops.py | 22 +++- .../tests/indexes/datetimes/test_timezones.py | 5 +- .../tests/indexes/interval/test_interval.py | 13 ++- pandas/tests/indexes/period/test_setops.py | 4 - pandas/tests/indexes/test_base.py | 2 + pandas/tests/indexes/test_setops.py | 76 +++++++++++++ pandas/tests/reshape/test_concat.py | 23 ++-- pandas/tests/series/test_combine_concat.py | 1 + pandas/tests/series/test_missing.py | 14 ++- pandas/tests/series/test_operators.py | 36 +++++- 21 files changed, 343 insertions(+), 161 deletions(-) create mode 100644 pandas/tests/indexes/test_setops.py diff --git 
a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index df7f952d4180a..0c69d1a4ce013 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -154,6 +154,33 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. +.. _whatsnew_0250.api_breaking.incompatible_index_unions + +Incompatible Index Type Unions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When performing :func:`Index.union` operations between objects of incompatible dtypes, +the result will be a base :class:`Index` of dtype ``object``. This behavior holds true for +unions between :class:`Index` objects that previously would have been prohibited. The dtype +of empty :class:`Index` objects will now be evaluated before performing union operations +rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be +considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). + +*Previous Behavior*: + + In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + ... + ValueError: can only call with other PeriodIndex-ed objects + + In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) + Out[2]: Int64Index([1, 2, 3], dtype='int64') + +*New Behavior*: + +.. 
ipython:: python + + pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) ``DataFrame`` groupby ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dd3717813ce3f..eff7ff2c9f347 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -20,11 +20,10 @@ ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype, - is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator, - is_list_like, is_object_dtype, is_period_dtype, is_scalar, - is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, - pandas_dtype) + is_extension_array_dtype, is_float, is_float_dtype, is_hashable, + is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, + is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, + is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass, @@ -2262,6 +2261,47 @@ def _get_reconciled_name_object(self, other): return self._shallow_copy(name=name) return self + def _union_incompatible_dtypes(self, other, sort): + """ + Casts this and other index to object dtype to allow the formation + of a union between incompatible types. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. 
+ + Returns + ------- + Index + """ + this = self.astype(object, copy=False) + # cast to Index for when `other` is list-like + other = Index(other).astype(object, copy=False) + return Index.union(this, other, sort=sort).astype(object, copy=False) + + def _is_compatible_with_other(self, other): + """ + Check whether this and the other dtype are compatible with each other. + Meaning a union can be formed between them without needing to be cast + to dtype object. + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + bool + """ + return (type(self) is type(other) + and is_dtype_equal(self.dtype, other.dtype)) + def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError("The 'sort' keyword only takes the values of " @@ -2271,6 +2311,11 @@ def union(self, other, sort=None): """ Form the union of two Index objects. + If the Index objects are incompatible, both Index objects will be + cast to dtype('object') first. + + .. versionchanged:: 0.25.0 + Parameters ---------- other : Index or array-like @@ -2300,30 +2345,54 @@ def union(self, other, sort=None): Examples -------- + Union matching dtypes + >>> idx1 = pd.Index([1, 2, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.union(idx2) Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + + Union mismatched dtypes + + >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx2 = pd.Index([1, 2, 3, 4]) + >>> idx1.union(idx2) + Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other = ensure_index(other) - if len(other) == 0 or self.equals(other): + if not self._is_compatible_with_other(other): + return self._union_incompatible_dtypes(other, sort=sort) + + return self._union(other, sort=sort) + + def _union(self, other, sort): + """ + Specific union logic should go here. In subclasses, union behavior + should be overwritten here rather than in `self.union`. 
+ + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + Index + """ + + if not len(other) or self.equals(other): return self._get_reconciled_name_object(other) - if len(self) == 0: + if not len(self): return other._get_reconciled_name_object(self) - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other, sort=sort) - # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values @@ -2370,6 +2439,7 @@ def union(self, other, sort=None): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) + # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): """ Form the intersection of two Index objects. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9c735a5598f4a..7fd537fb9989a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -451,35 +451,9 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def union(self, other, sort=None): - """ - Specialized union for DatetimeIndex objects. If combine - overlapping ranges with the same DateOffset, will be much - faster than Index.union - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : bool or None, default None - Whether to sort the resulting Index. 
- - * None : Sort the result, except when - - 1. `self` and `other` are equal. - 2. `self` or `other` has length 0. - 3. Some values in `self` or `other` cannot be compared. - A RuntimeWarning is issued in this case. - - * False : do not sort the result - - .. versionadded:: 0.25.0 - - Returns - ------- - y : Index or DatetimeIndex - """ - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) if len(other) == 0 or self.equals(other) or len(self) == 0: return super().union(other, sort=sort) @@ -495,7 +469,7 @@ def union(self, other, sort=None): if this._can_fast_union(other): return this._fast_union(other, sort=sort) else: - result = Index.union(this, other, sort=sort) + result = Index._union(this, other, sort=sort) if isinstance(result, DatetimeIndex): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a3dbf2e03957b..87216dcc7b957 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -964,19 +964,6 @@ def insert(self, loc, item): new_right = self.right.insert(loc, right_insert) return self._shallow_copy(new_left, new_right) - def _as_like_interval_index(self, other): - self._assert_can_do_setop(other) - other = ensure_index(other) - if not isinstance(other, IntervalIndex): - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type {}').format(other.__class__.__name__) - raise TypeError(msg) - elif self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - raise ValueError(msg) - return other - def _concat_same_dtype(self, to_concat, name): """ assert that we all have the same .closed @@ -1092,7 +1079,17 @@ def overlaps(self, other): def _setop(op_name, 
sort=None): def func(self, other, sort=sort): - other = self._as_like_interval_index(other) + self._assert_can_do_setop(other) + other = ensure_index(other) + if not isinstance(other, IntervalIndex): + result = getattr(self.astype(object), op_name)(other) + if op_name in ('difference',): + result = result.astype(self.dtype) + return result + elif self.closed != other.closed: + msg = ('can only do set operations between two IntervalIndex ' + 'objects that are closed on the same side') + raise ValueError(msg) # GH 19016: ensure set op will not return a prohibited dtype subtypes = [self.dtype.subtype, other.dtype.subtype] @@ -1114,6 +1111,7 @@ def func(self, other, sort=sort): return type(self).from_tuples(result, closed=self.closed, name=result_name) + return func @property diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a11f34cbdcceb..b6c8ba588f9d6 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -9,6 +9,7 @@ is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex from pandas.core.dtypes.missing import isna from pandas.core import algorithms @@ -221,6 +222,13 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCInt64Index, ABCRangeIndex)) + for obj in [self, other]) + ) + Int64Index._add_numeric_methods() Int64Index._add_logical_methods() diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ed08de54ad6f2..044951ceda502 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -791,6 +791,11 @@ def join(self, other, how='left', level=None, 
return_indexers=False, """ self._assert_can_do_setop(other) + if not isinstance(other, PeriodIndex): + return self.astype(object).join(other, how=how, level=level, + return_indexers=return_indexers, + sort=sort) + result = Int64Index.join(self, other, how=how, level=level, return_indexers=return_indexers, sort=sort) @@ -807,10 +812,9 @@ def intersection(self, other, sort=False): def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) - if not isinstance(other, PeriodIndex): - raise ValueError('can only call with other PeriodIndex-ed objects') - - if self.freq != other.freq: + # *Can't* use PeriodIndexes of different freqs + # *Can* use PeriodIndex/DatetimeIndex + if isinstance(other, PeriodIndex) and self.freq != other.freq: msg = DIFFERENT_FREQ.format(cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 160e6284d3c59..ea14a4c789cd3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -470,7 +470,7 @@ def _extended_gcd(self, a, b): old_t, t = t, old_t - quotient * t return old_r, old_s, old_t - def union(self, other, sort=None): + def _union(self, other, sort): """ Form the union of two Index objects and sorts if possible @@ -490,9 +490,8 @@ def union(self, other, sort=None): ------- union : Index """ - self._assert_can_do_setop(other) - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other, sort=sort) + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) if isinstance(other, RangeIndex) and sort is None: start_s, step_s = self._start, self._step @@ -530,8 +529,7 @@ def union(self, other, sort=None): (start_s + step_o >= start_o) and (end_s - step_o <= end_o)): return RangeIndex(start_r, end_r + step_o, step_o) - - return self._int64index.union(other, sort=sort) + return self._int64index._union(other, sort=sort) 
@Appender(_index_shared_docs['join']) def join(self, other, how='left', level=None, return_indexers=False, diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5e62c2ef881e9..6ae17e62b49c6 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -329,24 +329,9 @@ def astype(self, dtype, copy=True): return Index(result.astype('i8'), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) - def union(self, other): - """ - Specialized union for TimedeltaIndex objects. If combine - overlapping ranges with the same DateOffset, will be much - faster than Index.union - - Parameters - ---------- - other : TimedeltaIndex or array-like - - Returns - ------- - y : Index or TimedeltaIndex - """ - self._assert_can_do_setop(other) - + def _union(self, other, sort): if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other) + return super()._union(other, sort=sort) if not isinstance(other, TimedeltaIndex): try: @@ -358,7 +343,7 @@ def union(self, other): if this._can_fast_union(other): return this._fast_union(other) else: - result = Index.union(this, other) + result = Index._union(this, other, sort=sort) if isinstance(result, TimedeltaIndex): if result.freq is None: result.freq = to_offset(result.inferred_freq) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 71d1e686f5c02..674f600bc8693 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -447,11 +447,7 @@ def test_intersection_base(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.intersection(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.intersection(case) @@ -474,11 +470,7 @@ def 
test_union_base(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.union(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.union(case) @@ -506,11 +498,7 @@ def test_difference_base(self, sort): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.difference(case, sort) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): assert result.__class__ == answer.__class__ @@ -540,11 +528,7 @@ def test_symmetric_difference(self): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(idx, PeriodIndex): - msg = "can only call with other PeriodIndex-ed objects" - with pytest.raises(ValueError, match=msg): - first.symmetric_difference(case) - elif isinstance(idx, CategoricalIndex): + if isinstance(idx, CategoricalIndex): pass else: result = first.symmetric_difference(case) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 632d5b2875a5a..83f1f22b158b1 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,23 +5,25 @@ from pandas.core.indexes.api import Index, MultiIndex import pandas.util.testing as tm - -@pytest.fixture(params=[tm.makeUnicodeIndex(100), - tm.makeStringIndex(100), - tm.makeDateIndex(100), - tm.makePeriodIndex(100), - tm.makeTimedeltaIndex(100), - tm.makeIntIndex(100), - tm.makeUIntIndex(100), - tm.makeRangeIndex(100), - tm.makeFloatIndex(100), - Index([True, False]), - tm.makeCategoricalIndex(100), - Index([]), - 
MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - Index([0, 0, 1, 1, 2, 2])], - ids=lambda x: type(x).__name__) +indices_list = [tm.makeUnicodeIndex(100), + tm.makeStringIndex(100), + tm.makeDateIndex(100), + tm.makePeriodIndex(100), + tm.makeTimedeltaIndex(100), + tm.makeIntIndex(100), + tm.makeUIntIndex(100), + tm.makeRangeIndex(100), + tm.makeFloatIndex(100), + Index([True, False]), + tm.makeCategoricalIndex(100), + tm.makeIntervalIndex(100), + Index([]), + MultiIndex.from_tuples(zip( + ['foo', 'bar', 'baz'], [1, 2, 3])), + Index([0, 0, 1, 1, 2, 2])] + + +@pytest.fixture(params=indices_list, ids=lambda x: type(x).__name__) def indices(request): return request.param diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index efa6d006bad6f..01649cb4646de 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -298,9 +298,9 @@ def test_join_with_period_index(self, join_type): c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] - msg = 'can only call with other PeriodIndex-ed objects' - with pytest.raises(ValueError, match=msg): - df.columns.join(s.index, how=join_type) + expected = df.columns.astype('O').join(s.index, how=join_type) + result = df.columns.join(s.index, how=join_type) + tm.assert_index_equal(expected, result) def test_factorize(self): idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 45a3a64216cab..fd666f3d56c9d 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -29,11 +29,20 @@ def test_union2(self, sort): union = first.union(second, sort=sort) tm.assert_index_equal(union, everything) + @pytest.mark.parametrize("box", [np.array, Series, list]) + @pytest.mark.parametrize("sort", [None, False]) + def test_union3(self, sort, 
box): + everything = tm.makeDateIndex(10) + first = everything[:5] + second = everything[5:] + # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.union(case, sort=sort) - tm.assert_index_equal(result, everything) + expected = first.astype('O').union( + pd.Index(second.values, dtype='O') + ).astype('O') + case = box(second.values) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) @@ -303,11 +312,12 @@ def test_datetimeindex_union_join_empty(self, sort): empty = Index([]) result = dti.union(empty, sort=sort) - assert isinstance(result, DatetimeIndex) - assert result is result + expected = dti.astype('O') + tm.assert_index_equal(result, expected) result = dti.join(empty) assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) def test_join_nonunique(self): idx1 = to_datetime(['2012-11-06 16:00:11.477563', diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 3f876565119cb..368dc68e516df 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -1077,7 +1077,10 @@ def test_dti_union_aware(self): tz="US/Eastern") result = rng.union(rng2) - assert result.tz.zone == 'UTC' + expected = rng.astype('O').union(rng2.astype('O')) + tm.assert_index_equal(result, expected) + assert result[0].tz.zone == 'US/Central' + assert result[-1].tz.zone == 'US/Eastern' @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", dateutil.tz.tzoffset(None, -28800)]) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 61465d8454383..f4f63aaecd336 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -901,15 +901,18 @@ def 
test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize('op_name', [ 'union', 'intersection', 'difference', 'symmetric_difference']) @pytest.mark.parametrize("sort", [None, False]) - def test_set_operation_errors(self, closed, op_name, sort): + def test_set_incompatible_types(self, closed, op_name, sort): index = self.create_index(closed=closed) set_op = getattr(index, op_name) + # TODO: standardize return type of non-union setops type(self vs other) # non-IntervalIndex - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type Int64Index') - with pytest.raises(TypeError, match=msg): - set_op(Index([1, 2, 3]), sort=sort) + if op_name == 'difference': + expected = index + else: + expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) # mixed closed msg = ('can only do set operations between two IntervalIndex objects ' diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 29d07a0985574..a9102aeec060c 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -127,10 +127,6 @@ def test_union_misc(self, sort): with pytest.raises(period.IncompatibleFrequency): index.union(index2, sort=sort) - msg = 'can only call with other PeriodIndex-ed objects' - with pytest.raises(ValueError, match=msg): - index.join(index.to_timestamp()) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') with pytest.raises(period.IncompatibleFrequency): index.join(index3) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7b507a9de6b5d..7e70d77ea70fc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -889,6 +889,8 @@ def test_union_identity(self, sort): # i.e. 
identity is not preserved when sort is True assert (union is first) is (not sort) + # This should no longer be the same object, since [] is not consistent, + # both objects will be recast to dtype('O') union = first.union([], sort=sort) assert (union is first) is (not sort) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py new file mode 100644 index 0000000000000..b626ced2ccb1b --- /dev/null +++ b/pandas/tests/indexes/test_setops.py @@ -0,0 +1,76 @@ +''' +The tests in this package are to ensure the proper resultant dtypes of +set operations. +''' +import itertools as it + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_dtype_equal + +import pandas as pd +from pandas import Int64Index, RangeIndex +from pandas.tests.indexes.conftest import indices_list +import pandas.util.testing as tm + +COMPATIBLE_INCONSISTENT_PAIRS = { + (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex) +} + + +@pytest.fixture(params=list(it.combinations(indices_list, 2)), + ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__) +def index_pair(request): + """ + Create all combinations of 2 index types. 
+ """ + return request.param + + +def test_union_same_types(indices): + # Union with a non-unique, non-monotonic index raises error + # Only needed for bool index factory + idx1 = indices.sort_values() + idx2 = indices.sort_values() + assert idx1.union(idx2).dtype == idx1.dtype + + +def test_union_different_types(index_pair): + # GH 23525 + idx1, idx2 = index_pair + type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) + if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: + pytest.xfail('This test only considers non compatible indexes.') + + if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): + pytest.xfail('This test doesn\'t consider multiindixes.') + + if is_dtype_equal(idx1.dtype, idx2.dtype): + pytest.xfail('This test only considers non matching dtypes.') + + # A union with a CategoricalIndex (even as dtype('O')) and a + # non-CategoricalIndex can only be made if both indices are monotonic. + # This is true before this PR as well. + + # Union with a non-unique, non-monotonic index raises error + # This applies to the boolean index + idx1 = idx1.sort_values() + idx2 = idx2.sort_values() + + assert idx1.union(idx2).dtype == np.dtype('O') + assert idx2.union(idx1).dtype == np.dtype('O') + + +@pytest.mark.parametrize('idx_fact1,idx_fact2', + COMPATIBLE_INCONSISTENT_PAIRS.values()) +def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): + # GH 23525 + idx1 = idx_fact1(10) + idx2 = idx_fact2(20) + + res1 = idx1.union(idx2) + res2 = idx2.union(idx1) + + assert res1.dtype in (idx1.dtype, idx2.dtype) + assert res2.dtype in (idx1.dtype, idx2.dtype) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 3d9f3da75306a..ecd62380d8c65 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -960,22 +960,23 @@ def test_append_different_columns_types_raises( df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], 
index=index_cannot_append_with_other, name=2) - msg = ("the other index needs to be an IntervalIndex too, but was" + msg = (r"unorderable types: (Interval|int)\(\) (<|>) " + r"(int|long|float|str|Timestamp)\(\)|" + r"Expected tuple, got (int|long|float|str)|" + r"Cannot compare type 'Timestamp' with type '(int|long)'|" + r"'(<|>)' not supported between instances of 'int' " + r"and '(str|Timestamp)'|" + r"the other index needs to be an IntervalIndex too, but was" r" type {}|" r"object of type '(int|float|Timestamp)' has no len\(\)|" "Expected tuple, got str") - with pytest.raises(TypeError, match=msg.format( - index_can_append.__class__.__name__)): + with pytest.raises(TypeError, match=msg): df.append(ser) df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) - msg = (r"unorderable types: (Interval|int)\(\) > " - r"(int|float|str)\(\)|" - r"Expected tuple, got (int|float|str)|" - r"Cannot compare type 'Timestamp' with type 'int'|" - r"'>' not supported between instances of 'int' and 'str'") + with pytest.raises(TypeError, match=msg): df.append(ser) @@ -2029,7 +2030,8 @@ def test_concat_empty_series(self): s1 = pd.Series([1, 2, 3], name='x') s2 = pd.Series(name='y') res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}, + index=pd.Index([0, 1, 2], dtype='O')) tm.assert_frame_equal(res, exp) s1 = pd.Series([1, 2, 3], name='x') @@ -2044,7 +2046,8 @@ def test_concat_empty_series(self): s2 = pd.Series(name=None) res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0]) + columns=['x', 0], + index=pd.Index([0, 1, 2], dtype='O')) tm.assert_frame_equal(res, exp) @pytest.mark.parametrize('tz', [None, 'UTC']) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py 
index 41c3e220ad06f..ed5cf2d6b2c51 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -100,6 +100,7 @@ def test_combine_first(self): # corner case s = Series([1., 2, 3], index=[0, 1, 2]) result = s.combine_first(Series([], index=[])) + s.index = s.index.astype('O') assert_series_equal(s, result) def test_update(self): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 13e8d6c885029..11ad238eecd77 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -912,7 +912,7 @@ def test_interpolate_pchip(self): # interpolate at new_index new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5, - 50.75])) + 50.75])).astype(float) interp_s = ser.reindex(new_index).interpolate(method='pchip') # does not blow up, GH5977 interp_s[49:51] @@ -928,7 +928,9 @@ def test_interpolate_akima(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate(method='akima') assert_series_equal(interp_s[1:3], expected) @@ -941,7 +943,9 @@ def test_interpolate_piecewise_polynomial(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate( method='piecewise_polynomial') assert_series_equal(interp_s[1:3], expected) @@ -955,7 +959,9 @@ def test_interpolate_from_derivatives(self): index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0])) # interpolate at new_index - new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])) + new_index = 
ser.index.union( + Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) + ).astype(float) interp_s = ser.reindex(new_index).interpolate( method='from_derivatives') assert_series_equal(interp_s[1:3], expected) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index fee1976698b04..215fa9f22277e 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -8,10 +8,12 @@ from pandas import ( Categorical, DataFrame, Index, Series, bdate_range, date_range, isna) from pandas.core import ops +from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, assert_frame_equal, assert_index_equal, + assert_series_equal) from .common import TestData @@ -171,7 +173,6 @@ def test_scalar_na_logical_ops_corners(self): operator.and_, operator.or_, operator.xor, - ]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 @@ -190,6 +191,37 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + @pytest.mark.parametrize('op', [ + pytest.param(ops.rand_, + marks=pytest.mark.xfail(reason="GH#22092 Index " + "implementation returns " + "Index", + raises=AssertionError, + strict=True)), + pytest.param(ops.ror_, + marks=pytest.mark.xfail(reason="Index.get_indexer " + "with non unique index", + raises=InvalidIndexError, + strict=True)), + ops.rxor, + ]) + def test_reversed_logical_ops_with_index(self, op): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + # symmetric_difference is only for rxor, but other 2 should fail + expected = idx1.symmetric_difference(ser) + + result = op(ser, idx1) + assert_index_equal(result, expected) + + expected = idx2.symmetric_difference(ser) + + result = op(ser, 
idx2) + assert_index_equal(result, expected) + @pytest.mark.parametrize("op, expected", [ (ops.rand_, pd.Index([False, True])), (ops.ror_, pd.Index([False, True])), From 620fa592e2403bf56de46945d1c7363006491173 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 21 May 2019 23:33:41 +0100 Subject: [PATCH 03/38] DOC/CLN: wil -> will (#26484) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 76910f425836e..623e2b4863029 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -926,7 +926,7 @@ def squeeze(self, axis=None): a 1 Name: 0, dtype: int64 - Squeezing all axes wil project directly into a scalar: + Squeezing all axes will project directly into a scalar: >>> df_0a.squeeze() 1 From aad1bf9904ef83e9d35022c9df9ec7080ca45f2e Mon Sep 17 00:00:00 2001 From: Brett Randall Date: Wed, 22 May 2019 23:58:19 +1000 Subject: [PATCH 04/38] Fixed typo mutiplication -> multiplication. (#26489) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afe37bf198ab7..6bfa63012689d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -918,7 +918,7 @@ def __len__(self): def dot(self, other): """ - Compute the matrix mutiplication between the DataFrame and other. + Compute the matrix multiplication between the DataFrame and other. This method computes the matrix product between the DataFrame and the values of an other Series, DataFrame or a numpy array. 
From ef87d02a71fe075075a3a32df8af8804f390da2d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 23 May 2019 14:04:56 +0100 Subject: [PATCH 05/38] DOC: fix SyntaxError in doc build on Windows (#26499) --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index e7d358c7961ab..971aa04ba866a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -319,7 +319,7 @@ pd.options.display.max_rows = 15 import os - os.chdir('{}') + os.chdir(r'{}') """.format(os.path.dirname(os.path.dirname(__file__))) From 89cc7f2ce39f046dd9877d569fe226d6a1cbfe5a Mon Sep 17 00:00:00 2001 From: Mats Maiwald <32721837+matsmaiwald@users.noreply.github.com> Date: Thu, 23 May 2019 18:46:20 +0200 Subject: [PATCH 06/38] DOC: Highlighted role of index alignment in DataFrame.dot(other) (#26480) (#26496) --- pandas/core/frame.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6bfa63012689d..7d501e8095921 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -944,7 +944,9 @@ def dot(self, other): Notes ----- The dimensions of DataFrame and other must be compatible in order to - compute the matrix multiplication. + compute the matrix multiplication. In addition, the column names of + DataFrame and the index of other must contain the same values, as they + will be aligned prior to the multiplication. The dot method for Series computes the inner product, instead of the matrix product here. @@ -982,6 +984,14 @@ def dot(self, other): 0 1 0 1 4 1 2 2 + + Note how shuffling of the objects does not change the result. 
+ + >>> s2 = s.reindex([1, 0, 2, 3]) + >>> df.dot(s2) + 0 -4 + 1 5 + dtype: int64 """ if isinstance(other, (Series, DataFrame)): common = self.columns.union(other.index) From babd5720f67e66fc817f85925b1ef9cf0b746576 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 24 May 2019 16:11:25 +0100 Subject: [PATCH 07/38] DOC/CLN: Change API reference section title (#26486) --- doc/source/reference/indexing.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index 680cb7e3dac91..42ebf648f299f 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -2,9 +2,9 @@ .. _api.indexing: -======== -Indexing -======== +============= +Index Objects +============= Index ----- From 4c231a77858b06f5ac4c48faf3155394c166da38 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 24 May 2019 17:29:32 +0200 Subject: [PATCH 08/38] CLN: Remove StringMixin from PandasObject (#26505) --- doc/source/whatsnew/v0.25.0.rst | 14 ++++++++++++++ pandas/core/arrays/categorical.py | 6 +----- pandas/core/arrays/sparse.py | 2 +- pandas/core/base.py | 4 ++-- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/frozen.py | 2 +- pandas/core/internals/blocks.py | 3 +-- pandas/core/internals/managers.py | 2 +- pandas/core/panel.py | 2 +- pandas/core/series.py | 2 +- pandas/core/sparse/series.py | 5 ++--- pandas/core/window.py | 2 +- 15 files changed, 31 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0c69d1a4ce013..d86379c4d0703 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -213,6 +213,20 @@ are returned. 
(:issue:`21521`) df.groupby("a").ffill() +``__str__`` methods now call ``__repr__`` rather than vice versa +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has until now mostly defined string representations in a Pandas object's +``__str__``/``__unicode__``/``__bytes__`` methods, and called ``__str__`` from the ``__repr__`` +method, if a specific ``__repr__`` method is not found. This is not needed for Python3. +In Pandas 0.25, the string representations of Pandas objects are now generally +defined in ``__repr__``, and calls to ``__str__`` in general now pass the call on to +the ``__repr__``, if a specific ``__str__`` method doesn't exist, as is standard for Python. +This change is backward compatible for direct usage of Pandas, but if you subclass +Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, +you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). + + .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eeacb88f33f55..1d6b906158125 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2030,7 +2030,7 @@ def _get_repr(self, length=True, na_rep='NaN', footer=True): result = formatter.to_string() return str(result) - def __str__(self): + def __repr__(self): """ String representation. 
""" @@ -2045,10 +2045,6 @@ def __str__(self): return result - def __repr__(self): - # We want to bypass the ExtensionArray.__repr__ - return str(self) - def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 7a66e0ff33cc7..b0236cb393c1c 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1831,7 +1831,7 @@ def _add_comparison_ops(cls): # ---------- # Formatting # ----------- - def __str__(self): + def __repr__(self): return '{self}\nFill: {fill}\n{index}'.format( self=printing.pprint_thing(self), fill=printing.pprint_thing(self.fill_value), diff --git a/pandas/core/base.py b/pandas/core/base.py index f7837c60c0b82..3f59871fb5b38 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -55,7 +55,7 @@ def __repr__(self): return str(self) -class PandasObject(StringMixin, DirNamesMixin): +class PandasObject(DirNamesMixin): """baseclass for various pandas objects""" @@ -64,7 +64,7 @@ def _constructor(self): """class constructor (for this class it's just `__class__`""" return self.__class__ - def __str__(self): + def __repr__(self): """ Return a string representation for a particular object. """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7d501e8095921..7cf200506e853 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -610,7 +610,7 @@ def _info_repr(self): return info_repr_option and not (self._repr_fits_horizontal_() and self._repr_fits_vertical_()) - def __str__(self): + def __repr__(self): """ Return a string representation for a particular DataFrame. 
""" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 623e2b4863029..76c73fc40977c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2022,7 +2022,7 @@ def __setstate__(self, state): # ---------------------------------------------------------------------- # Rendering Methods - def __str__(self): + def __repr__(self): # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) prepr = '[%s]' % ','.join(map(pprint_thing, self)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e9e3b4963b6d..aa04b7505afe4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -373,8 +373,8 @@ def __init__(self, obj, keys=None, axis=0, level=None, def __len__(self): return len(self.groups) - def __str__(self): - # TODO: Better str/repr for GroupBy object + def __repr__(self): + # TODO: Better repr for GroupBy object return object.__repr__(self) def _assure_grouper(self): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eff7ff2c9f347..a4544e79e2dfa 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -932,7 +932,7 @@ def __deepcopy__(self, memo=None): # -------------------------------------------------------------------- # Rendering Methods - def __str__(self): + def __repr__(self): """ Return a string representation for this object. """ diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 60e4253e3101b..aeb0fa119ab33 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -149,7 +149,7 @@ def values(self): arr = self.view(np.ndarray).copy() return arr - def __str__(self): + def __repr__(self): """ Return a string representation for this object. 
""" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0ac87c653cfff..f86ef40a97299 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -233,8 +233,7 @@ def make_block_same_class(self, values, placement=None, ndim=None, return make_block(values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype) - def __str__(self): - + def __repr__(self): # don't want to print out all of the items here name = pprint_thing(self.__class__.__name__) if self._is_single_block: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 96a672b60da70..0b63588c9f5d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -291,7 +291,7 @@ def _post_setstate(self): def __len__(self): return len(self.items) - def __str__(self): + def __repr__(self): output = pprint_thing(self.__class__.__name__) for i, ax in enumerate(self.axes): if i == 0: diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b6b957c543df6..c65a73bd0d3f0 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -340,7 +340,7 @@ def _compare_constructor(self, other, func): # ---------------------------------------------------------------------- # Magic methods - def __str__(self): + def __repr__(self): """ Return a string representation for a particular Panel. """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 5b59fd6e7b38d..55b5bdcbf53f4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1384,7 +1384,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): # ---------------------------------------------------------------------- # Rendering Methods - def __str__(self): + def __repr__(self): """ Return a string representation for a particular Series. 
""" diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index ae1c94e136475..eac59e2c0f5eb 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -217,9 +217,8 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): return SparseArray(self.values, sparse_index=self.sp_index, fill_value=fill_value, kind=kind, copy=copy) - def __str__(self): - # currently, unicode is same as repr...fixes infinite loop - series_rep = Series.__str__(self) + def __repr__(self): + series_rep = Series.__repr__(self) rep = '{series}\n{index!r}'.format(series=series_rep, index=self.sp_index) return rep diff --git a/pandas/core/window.py b/pandas/core/window.py index deb64f1fb089d..d51e12035c829 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -157,7 +157,7 @@ def _get_window(self, other=None): def _window_type(self): return self.__class__.__name__ - def __str__(self): + def __repr__(self): """ Provide a nice str repr of our rolling object. 
""" From cffbaac126fb1043e44cac9ca4ed872d5424fc52 Mon Sep 17 00:00:00 2001 From: Vaibhav Vishal Date: Fri, 24 May 2019 21:02:14 +0530 Subject: [PATCH 09/38] Fix type annotations in pandas.core.indexes.datetimes (#26404) --- mypy.ini | 6 ------ pandas/core/indexes/datetimelike.py | 14 +++++++------- pandas/core/indexes/datetimes.py | 12 +++++++----- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/mypy.ini b/mypy.ini index 584c747a26f2e..3df8fd13a2a75 100644 --- a/mypy.ini +++ b/mypy.ini @@ -8,11 +8,5 @@ ignore_errors=True [mypy-pandas.core.indexes.datetimelike] ignore_errors=True -[mypy-pandas.core.indexes.datetimes] -ignore_errors=True - [mypy-pandas.core.indexes.period] ignore_errors=True - -[mypy-pandas.core.indexes.timedeltas] -ignore_errors=True \ No newline at end of file diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7454b015cb556..092cec00228cd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -57,7 +57,7 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): """ common ops mixin to support a unified interface datetimelike Index """ - _data = None # type: DatetimeLikeArrayMixin + _data = None # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. 
They can be made into cache_readonly for Index @@ -220,9 +220,9 @@ def __contains__(self, key): # Try to run function on index first, and then on elements of index # Especially important for group-by functionality - def map(self, f): + def map(self, mapper, na_action=None): try: - result = f(self) + result = mapper(self) # Try to use this result if we can if isinstance(result, np.ndarray): @@ -232,7 +232,7 @@ def map(self, f): raise TypeError('The map function must return an Index object') return result except Exception: - return self.astype(object).map(f) + return self.astype(object).map(mapper) def sort_values(self, return_indexer=False, ascending=True): """ @@ -430,8 +430,8 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) + def _format_with_header(self, header, na_rep='NaT', **kwargs): + return header + list(self._format_native_types(na_rep, **kwargs)) @property def _formatter_func(self): @@ -509,7 +509,7 @@ def __rsub__(self, other): cls.__rsub__ = __rsub__ - def isin(self, values): + def isin(self, values, level=None): """ Compute boolean array of whether each index value is found in the passed set of values. 
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 7fd537fb9989a..e68431b79dcd3 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -4,8 +4,8 @@ import numpy as np -from pandas._libs import ( - Timestamp, index as libindex, join as libjoin, lib, tslib as libts) +from pandas._libs import Timestamp, index as libindex, lib, tslib as libts +import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -1087,9 +1087,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): _is_monotonic_decreasing = Index.is_monotonic_decreasing _is_unique = Index.is_unique - _timezone = cache_readonly(DatetimeArray._timezone.fget) - is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) - _resolution = cache_readonly(DatetimeArray._resolution.fget) + _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore + is_normalized = cache_readonly( + DatetimeArray.is_normalized.fget) # type: ignore + _resolution = cache_readonly( + DatetimeArray._resolution.fget) # type: ignore strftime = ea_passthrough(DatetimeArray.strftime) _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) From a8af7a1a73164e97627bb5b96dacfdb2ba10b24e Mon Sep 17 00:00:00 2001 From: Mats Maiwald <32721837+matsmaiwald@users.noreply.github.com> Date: Fri, 24 May 2019 17:47:01 +0200 Subject: [PATCH 10/38] =?UTF-8?q?Better=20error=20message=20for=20DataFram?= =?UTF-8?q?e.hist()=20without=20numerical=20columns=20(=E2=80=A6=20(#26483?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/plotting/_core.py | 4 ++++ pandas/tests/plotting/test_hist_method.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 90297ecfa3415..fed4b0d90983c 100644 --- 
a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2426,6 +2426,10 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, data = data._get_numeric_data() naxes = len(data.columns) + if naxes == 0: + raise ValueError("hist method requires numerical columns, " + "nothing to plot.") + fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, sharex=sharex, sharey=sharey, figsize=figsize, layout=layout) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index c62ed21c2fb17..f3f6c9c7fc2d4 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -209,6 +209,16 @@ def test_hist_df_legacy(self): with pytest.raises(AttributeError): ser.hist(foo='bar') + @pytest.mark.slow + def test_hist_non_numerical_raises(self): + # gh-10444 + df = DataFrame(np.random.rand(10, 2)) + df_o = df.astype(np.object) + + msg = "hist method requires numerical columns, nothing to plot." + with pytest.raises(ValueError, match=msg): + df_o.hist() + @pytest.mark.slow def test_hist_layout(self): df = DataFrame(randn(100, 3)) From ac026742dfc74b6c26ef867fa846cc322a602847 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 24 May 2019 09:01:09 -0700 Subject: [PATCH 11/38] Excel Test Cleanup - ReadWriteClass (#26473) --- pandas/tests/io/test_excel.py | 427 +++++++++++++++++----------------- 1 file changed, 216 insertions(+), 211 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 112d14795d9bf..f9926cd26d8da 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -49,7 +49,6 @@ def ignore_xlrd_time_clock_warning(): yield -@td.skip_if_no('xlrd', '1.0.0') class SharedItems: @pytest.fixture(autouse=True) @@ -60,6 +59,20 @@ def setup_method(self, datapath): self.tsframe = _tsframe.copy() self.mixed_frame = _mixed_frame.copy() + +@td.skip_if_no('xlrd', '1.0.0') +class ReadingTestsBase(SharedItems): + # This is based 
on ExcelWriterBase + + @pytest.fixture(autouse=True, params=['xlrd', None]) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. @@ -114,19 +127,6 @@ def get_exceldf(self, basename, ext, *args, **kwds): pth = os.path.join(self.dirpath, basename + ext) return read_excel(pth, *args, **kwds) - -class ReadingTestsBase(SharedItems): - # This is based on ExcelWriterBase - - @pytest.fixture(autouse=True, params=['xlrd', None]) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_usecols_int(self, ext): @@ -565,74 +565,6 @@ def test_read_excel_blank_with_header(self, ext): actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) - def test_read_one_empty_col_no_header(self, ext, header, expected): - # xref gh-12292 - filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) - def 
test_read_one_empty_col_with_header(self, ext, header, expected): - filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_set_column_names_in_parameter(self, ext): - # GH 12870 : pass down column names associated with - # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) - - refdf.columns = ['A', 'B'] - - with ExcelFile(pth) as reader: - xlsdf_no_head = read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) - xlsdf_with_head = read_excel(reader, 'Data_with_head', - index_col=None, names=['A', 'B']) - - tm.assert_frame_equal(xlsdf_no_head, refdf) - tm.assert_frame_equal(xlsdf_with_head, refdf) - def test_date_conversion_overflow(self, ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], @@ -741,7 +673,6 @@ def test_read_from_file_url(self, ext): tm.assert_frame_equal(url_table, local_table) - @td.skip_if_no('pathlib') def test_read_from_pathlib_path(self, ext): # GH12655 @@ -780,32 +711,6 @@ def test_reader_closes_file(self, ext): assert f.closed - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_creating_and_reading_multiple_sheets(self, ext): - # see gh-9450 - # - # Test reading multiple sheets, from a runtime - # created Excel file with multiple sheets. 
- def tdf(col_sheet_name): - d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[col_sheet_name]) - - sheets = ["AAA", "BBB", "CCC"] - - dfs = [tdf(s) for s in sheets] - dfs = dict(zip(sheets, dfs)) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as ew: - for sheetname, df in dfs.items(): - df.to_excel(ew, sheetname) - - dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) - - for s in sheets: - tm.assert_frame_equal(dfs[s], dfs_returned[s]) - def test_reader_seconds(self, ext): # Test reading times with and without milliseconds. GH5945. @@ -902,84 +807,6 @@ def test_read_excel_multiindex_header_only(self, ext): expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) tm.assert_frame_equal(result, expected) - @td.skip_if_no("xlsxwriter") - def test_read_excel_multiindex_empty_level(self, ext): - # see gh-12453 - with ensure_clean(ext) as path: - df = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlsxwriter") - @pytest.mark.parametrize("c_idx_names", [True, False]) - @pytest.mark.parametrize("r_idx_names", [True, False]) - @pytest.mark.parametrize("c_idx_levels", [1, 3]) - @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, 
r_idx_names, - c_idx_levels, r_idx_levels): - # see gh-4679 - with ensure_clean(ext) as pth: - if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") - - # Empty name case current read in as - # unnamed levels, not Nones. - check_names = r_idx_names or r_idx_levels <= 1 - - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[0, :] = np.nan - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - def test_excel_old_index_format(self, ext): # see gh-4679 filename = "test_index_name_pre17" + ext @@ -1054,30 +881,6 @@ def test_read_excel_chunksize(self, ext): pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), chunksize=100) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_read_excel_parse_dates(self, ext): - # see gh-11544, gh-12051 - df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) - df2 = df.copy() - df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") - - with ensure_clean(ext) as pth: - df2.to_excel(pth) - - res = read_excel(pth, index_col=0) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) - - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) - tm.assert_frame_equal(df, res) - def 
test_read_excel_skiprows_list(self, ext): # GH 4903 actual = pd.read_excel(os.path.join(self.dirpath, @@ -1141,6 +944,208 @@ def test_read_excel_squeeze(self, ext): tm.assert_series_equal(actual, expected) +@td.skip_if_no('xlrd', '1.0.0') +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestRoundTrip: + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, 'with_header', index=False, header=True) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], + [3, 'baz']], columns=['a', 'b']) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel(writer, 'Data_no_head', + header=False, index=False) + refdf.to_excel(writer, 'Data_with_head', index=False) + + refdf.columns = ['A', 'B'] + + 
with ExcelFile(pth) as reader: + xlsdf_no_head = read_excel(reader, 'Data_no_head', + header=None, names=['A', 'B']) + xlsdf_with_head = read_excel(reader, 'Data_with_head', + index_col=None, names=['A', 'B']) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in dfs.items(): + df.to_excel(ew, sheetname) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + @td.skip_if_no("xlsxwriter") + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with ensure_clean(ext) as path: + df = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} + }) + + expected = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + expected = pd.DataFrame({ + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlsxwriter") + 
@pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) + 
tm.assert_frame_equal(df, res) + + +@td.skip_if_no('xlrd', '1.0.0') @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestXlrdReader(ReadingTestsBase): """ From 91512111bc0f42ac15695d2af94e3fff3d6ba536 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 24 May 2019 15:07:54 -0700 Subject: [PATCH 12/38] CLN: pd.TimeGrouper (#26477) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/__init__.py | 2 +- pandas/core/api.py | 12 ---------- pandas/tests/api/test_api.py | 13 +---------- pandas/tests/groupby/test_timegrouper.py | 7 +++--- pandas/tests/resample/test_base.py | 4 ++-- pandas/tests/resample/test_datetime_index.py | 14 ++++++------ pandas/tests/resample/test_time_grouper.py | 24 ++++++++------------ 8 files changed, 25 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d86379c4d0703..d4104ab1d79a1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -313,7 +313,7 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) - -- +- Removed previously deprecated ``TimeGrouper`` (:issue:`16942`) - .. 
_whatsnew_0250.performance: diff --git a/pandas/__init__.py b/pandas/__init__.py index bd367bbe27d5e..6af6f3093c120 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, TimeGrouper, Grouper, factorize, unique, value_counts, + np, Grouper, factorize, unique, value_counts, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index 96f623bda9a8a..b7398e433f28f 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -45,15 +45,3 @@ from pandas.tseries.offsets import DateOffset from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.timedeltas import to_timedelta - - -# Deprecation: xref gh-16747 -class TimeGrouper: - - def __new__(cls, *args, **kwargs): - from pandas.core.resample import TimeGrouper - import warnings - warnings.warn("pd.TimeGrouper is deprecated and will be removed; " - "Please use pd.Grouper(freq=...)", - FutureWarning, stacklevel=2) - return TimeGrouper(*args, **kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 7ee0225723675..c92808200ebea 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -50,7 +50,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_classes = ['TimeGrouper', 'Panel'] + deprecated_classes = ['Panel'] # these should be deprecated in the future deprecated_classes_in_future = [] @@ -132,17 +132,6 @@ def test_testing(self): self.check(testing, self.funcs) -class TestTopLevelDeprecations: - - # top-level API deprecations - # GH 13790 - - def test_TimeGrouper(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - pd.TimeGrouper(freq='D') - - class TestCDateRange: def test_deprecation_cdaterange(self): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 21c71154c95ef..ef05e6ada4890 100644 --- 
a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range +from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -365,10 +366,8 @@ def sumfunc_value(x): return x.value.sum() expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) - .apply(sumfunc_value)) + result = (df_dt.groupby(Grouper(freq='M', key='date')) + .apply(sumfunc_value)) assert_series_equal(result.reset_index(drop=True), expected.reset_index(drop=True)) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index c3c908f4b0d1b..63fa2007e401d 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -6,10 +6,10 @@ import pandas as pd from pandas import DataFrame, Series from pandas.core.groupby.groupby import DataError +from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -from pandas.core.resample import TimeGrouper import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_index_equal, @@ -214,7 +214,7 @@ def test_apply_to_empty_series(empty_series): def test_resampler_is_iterable(series): # GH 15314 freq = 'H' - tg = TimeGrouper(freq, convention='start') + tg = Grouper(freq=freq, convention='start') grouped = series.groupby(tg) resampled = series.resample(freq) for (rk, rv), (gk, gv) in zip(resampled, grouped): diff --git 
a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index c2868979e9d8d..5711174ef0c9f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -10,10 +10,10 @@ import pandas as pd from pandas import DataFrame, Series, Timedelta, Timestamp, isna, notna +from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range -from pandas.core.resample import ( - DatetimeIndex, TimeGrouper, _get_timestamp_range_edges) +from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -42,7 +42,7 @@ def test_custom_grouper(index): dti = index s = Series(np.array([1] * len(dti)), index=dti, dtype='int64') - b = TimeGrouper(Minute(5)) + b = Grouper(freq=Minute(5)) g = s.groupby(b) # check all cython functions work @@ -50,7 +50,7 @@ def test_custom_grouper(index): for f in funcs: g._cython_agg_general(f) - b = TimeGrouper(Minute(5), closed='right', label='right') + b = Grouper(freq=Minute(5), closed='right', label='right') g = s.groupby(b) # check all cython functions work funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] @@ -116,7 +116,7 @@ def test_resample_integerarray(): def test_resample_basic_grouper(series): s = series result = s.resample('5Min').last() - grouper = TimeGrouper(Minute(5), closed='left', label='left') + grouper = Grouper(freq=Minute(5), closed='left', label='left') expected = s.groupby(grouper).agg(lambda x: x[-1]) assert_series_equal(result, expected) @@ -373,7 +373,7 @@ def test_resample_upsampling_picked_but_not_correct(): def test_resample_frame_basic(): df = tm.makeTimeDataFrame() - b = TimeGrouper('M') + b = Grouper(freq='M') g = df.groupby(b) # check all cython functions work @@ -521,7 +521,7 @@ def 
test_nearest_upsample_with_limit(): def test_resample_ohlc(series): s = series - grouper = TimeGrouper(Minute(5)) + grouper = Grouper(freq=Minute(5)) expect = s.groupby(grouper).agg(lambda x: x[-1]) result = s.resample('5Min').ohlc() diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 2f330d1f2484b..3f767f8e7100f 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Series +from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -from pandas.core.resample import TimeGrouper import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -16,9 +16,7 @@ def test_apply(): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - grouper = pd.TimeGrouper(freq='A', label='right', closed='right') + grouper = Grouper(freq='A', label='right', closed='right') grouped = test_series.groupby(grouper) @@ -38,9 +36,7 @@ def test_count(): expected = test_series.groupby(lambda x: x.year).count() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - grouper = pd.TimeGrouper(freq='A', label='right', closed='right') + grouper = Grouper(freq='A', label='right', closed='right') result = test_series.groupby(grouper).count() expected.index = result.index assert_series_equal(result, expected) @@ -64,7 +60,7 @@ def test_apply_iteration(): N = 1000 ind = pd.date_range(start="2000-01-01", freq="D", periods=N) df = DataFrame({'open': 1, 'close': 2}, index=ind) - tg = TimeGrouper('M') + tg = Grouper(freq='M') _, grouper, _ = tg._get_grouper(df) @@ -93,7 +89,7 @@ def test_fails_on_no_datetime_index(name, func): msg = ("Only valid with DatetimeIndex, TimedeltaIndex " "or PeriodIndex, but got an instance of '{}'".format(name)) with pytest.raises(TypeError, match=msg): - 
df.groupby(TimeGrouper('D')) + df.groupby(Grouper(freq='D')) def test_aaa_group_order(): @@ -105,7 +101,7 @@ def test_aaa_group_order(): df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 - grouped = df.groupby(TimeGrouper(key='key', freq='D')) + grouped = df.groupby(Grouper(key='key', freq='D')) tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5]) @@ -135,7 +131,7 @@ def test_aggregate_normal(resample_method): datetime(2013, 1, 5)] * 4 normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) expected = getattr(normal_grouped, resample_method)() dt_result = getattr(dt_grouped, resample_method)() @@ -195,7 +191,7 @@ def test_aggregate_with_nat(func, fill_value): datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) normal_result = getattr(normal_grouped, func)() dt_result = getattr(dt_grouped, func)() @@ -222,7 +218,7 @@ def test_aggregate_with_nat_size(): datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) normal_result = normal_grouped.size() dt_result = dt_grouped.size() @@ -238,7 +234,7 @@ def test_aggregate_with_nat_size(): def test_repr(): # GH18203 - result = repr(TimeGrouper(key='A', freq='H')) + result = repr(Grouper(key='A', freq='H')) expected = ("TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " "convention='e', base=0)") From 8c8a1759a92c87ff3a56f8cef2d6ba2d9d500bc1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 24 May 2019 18:18:04 -0700 Subject: [PATCH 13/38] CLN: Remove 
ExcelWriter.sheetname (#26464) xref gh-6581 --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/io/excel/_base.py | 24 ++++------------------- pandas/tests/io/test_excel.py | 34 +++++++++++---------------------- 3 files changed, 16 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d4104ab1d79a1..29cc14b638996 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -312,7 +312,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) -- +- Removed the previously deprecated ``sheetname`` keyword in :func:`read_excel` (:issue:`16442`, :issue:`20938`) - Removed previously deprecated ``TimeGrouper`` (:issue:`16942`) - diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c0678575fd6f0..a0d51e85aa4f3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -5,7 +5,6 @@ import os from textwrap import fill from urllib.request import urlopen -import warnings from pandas._config import config @@ -291,15 +290,10 @@ def read_excel(io, mangle_dupe_cols=True, **kwds): - # Can't use _deprecate_kwarg since sheetname=None has a special meaning - if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: - warnings.warn("The `sheetname` keyword is deprecated, use " - "`sheet_name` instead", FutureWarning, stacklevel=2) - sheet_name = kwds.pop("sheetname") - - if 'sheet' in kwds: - raise TypeError("read_excel() got an unexpected keyword argument " - "`sheet`") + for arg in ('sheet', 'sheetname'): + if arg in kwds: + raise TypeError("read_excel() got an unexpected keyword argument " + "`{}`".format(arg)) if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -833,16 +827,6 @@ def parse(self, DataFrame or dict of DataFrames DataFrame from the passed in Excel file. 
""" - - # Can't use _deprecate_kwarg since sheetname=None has a special meaning - if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: - warnings.warn("The `sheetname` keyword is deprecated, use " - "`sheet_name` instead", FutureWarning, stacklevel=2) - sheet_name = kwds.pop("sheetname") - elif 'sheetname' in kwds: - raise TypeError("Cannot specify both `sheet_name` " - "and `sheetname`. Use just `sheet_name`") - if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index f9926cd26d8da..44ce3111c3a1e 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -342,15 +342,15 @@ def test_excel_passes_na(self, ext): tm.assert_frame_equal(parsed, expected) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_deprecated_sheetname(self, ext): + @pytest.mark.parametrize('arg', ['sheet', 'sheetname']) + def test_unexpected_kwargs_raises(self, ext, arg): # gh-17964 excel = self.get_excelfile('test1', ext) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - read_excel(excel, sheetname='Sheet1') - - with pytest.raises(TypeError): - read_excel(excel, sheet='Sheet1') + kwarg = {arg: 'Sheet1'} + msg = "unexpected keyword argument `{}`".format(arg) + with pytest.raises(TypeError, match=msg): + read_excel(excel, **kwarg) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_excel_table_sheet_by_index(self, ext): @@ -588,32 +588,20 @@ def test_sheet_name_and_sheetname(self, ext): df_ref = self.get_csv_refdf(filename) df1 = self.get_exceldf(filename, ext, sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf(filename, ext, index_col=0, - sheetname=sheet_name) # backward compat + with ignore_xlrd_time_clock_warning(): + df2 = self.get_exceldf(filename, ext, index_col=0, 
+ sheet_name=sheet_name) excel = self.get_excelfile(filename, ext) df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(index_col=0, - sheetname=sheet_name) # backward compat + df2_parse = excel.parse(index_col=0, + sheet_name=sheet_name) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - def test_sheet_name_both_raises(self, ext): - with pytest.raises(TypeError, match="Cannot specify both"): - self.get_exceldf('test1', ext, sheetname='Sheet1', - sheet_name='Sheet1') - - excel = self.get_excelfile('test1', ext) - with pytest.raises(TypeError, match="Cannot specify both"): - excel.parse(sheetname='Sheet1', - sheet_name='Sheet1') - def test_excel_read_buffer(self, ext): pth = os.path.join(self.dirpath, 'test1' + ext) From 9d6d95994ad2d58bad0ae0910ea9b5ab2df6be50 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 25 May 2019 11:40:15 -0700 Subject: [PATCH 14/38] CLN: Remove deprecated parse_cols from read_excel (#26522) xref gh-6581 --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- pandas/io/excel/_base.py | 10 +--------- pandas/tests/io/test_excel.py | 22 +--------------------- 3 files changed, 4 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 29cc14b638996..af59a34245660 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -313,8 +313,8 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) - Removed the previously deprecated ``sheetname`` keyword in :func:`read_excel` (:issue:`16442`, :issue:`20938`) -- Removed previously deprecated ``TimeGrouper`` (:issue:`16942`) -- 
+- Removed the previously deprecated ``TimeGrouper`` (:issue:`16942`) +- Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) .. _whatsnew_0250.performance: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a0d51e85aa4f3..3af6be7a371e7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -64,12 +64,6 @@ those columns will be combined into a ``MultiIndex``. If a subset of data is selected with ``usecols``, index_col is based on the subset. -parse_cols : int or list, default None - Alias of `usecols`. - - .. deprecated:: 0.21.0 - Use `usecols` instead. - usecols : int, str, list-like, or callable default None Return a subset of the columns. @@ -260,14 +254,12 @@ @Appender(_read_excel_doc) -@deprecate_kwarg("parse_cols", "usecols") @deprecate_kwarg("skip_footer", "skipfooter") def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, - parse_cols=None, usecols=None, squeeze=False, dtype=None, @@ -290,7 +282,7 @@ def read_excel(io, mangle_dupe_cols=True, **kwds): - for arg in ('sheet', 'sheetname'): + for arg in ('sheet', 'sheetname', 'parse_cols'): if arg in kwds: raise TypeError("read_excel() got an unexpected keyword argument " "`{}`".format(arg)) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 44ce3111c3a1e..100de227aa97c 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -147,17 +147,9 @@ def test_usecols_int(self, ext): df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], index_col=0, usecols=3) - # parse_cols instead of usecols, usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, parse_cols=3) - # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - 
tm.assert_frame_equal(df3, df_ref, check_names=False) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_usecols_list(self, ext): @@ -169,15 +161,9 @@ def test_usecols_list(self, ext): df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, usecols=[0, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols=[0, 2, 3]) - # TODO add index to xls file) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df3, dfref, check_names=False) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_usecols_str(self, ext): @@ -190,15 +176,9 @@ def test_usecols_str(self, ext): df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, usecols='A:D') - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols='A:D') - # TODO add index to xls, read xls ignores index name ? 
tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - tm.assert_frame_equal(df4, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, @@ -342,7 +322,7 @@ def test_excel_passes_na(self, ext): tm.assert_frame_equal(parsed, expected) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - @pytest.mark.parametrize('arg', ['sheet', 'sheetname']) + @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) def test_unexpected_kwargs_raises(self, ext, arg): # gh-17964 excel = self.get_excelfile('test1', ext) From 3bb47664e28ae5e3d33748cbf1825c4acbd4297e Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Sun, 26 May 2019 03:24:39 +0800 Subject: [PATCH 15/38] [TEST] Add two more parameters to the test_dti_add_sub_nonzero_mth_offset (#26392) * Add two more parameters to the test * Add array into the boy and add parameter freq --- pandas/tests/arithmetic/test_datetime64.py | 38 ++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 910fa4818c5de..13adae279c989 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1435,27 +1435,39 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - @pytest.mark.parametrize("op, offset, exp", [ + @pytest.mark.parametrize("op, offset, exp, exp_freq", [ ('__add__', pd.DateOffset(months=3, days=10), - DatetimeIndex([Timestamp('2014-04-11'), Timestamp('2015-04-11'), - Timestamp('2016-04-11'), Timestamp('2017-04-11')])), + [Timestamp('2014-04-11'), Timestamp('2015-04-11'), + Timestamp('2016-04-11'), Timestamp('2017-04-11')], + None), ('__add__', pd.DateOffset(months=3), - DatetimeIndex([Timestamp('2014-04-01'), Timestamp('2015-04-01'), - 
Timestamp('2016-04-01'), Timestamp('2017-04-01')])), + [Timestamp('2014-04-01'), Timestamp('2015-04-01'), + Timestamp('2016-04-01'), Timestamp('2017-04-01')], + "AS-APR"), ('__sub__', pd.DateOffset(months=3, days=10), - DatetimeIndex([Timestamp('2013-09-21'), Timestamp('2014-09-21'), - Timestamp('2015-09-21'), Timestamp('2016-09-21')])), + [Timestamp('2013-09-21'), Timestamp('2014-09-21'), + Timestamp('2015-09-21'), Timestamp('2016-09-21')], + None), ('__sub__', pd.DateOffset(months=3), - DatetimeIndex([Timestamp('2013-10-01'), Timestamp('2014-10-01'), - Timestamp('2015-10-01'), Timestamp('2016-10-01')])) - + [Timestamp('2013-10-01'), Timestamp('2014-10-01'), + Timestamp('2015-10-01'), Timestamp('2016-10-01')], + "AS-OCT") ]) - def test_dti_add_sub_nonzero_mth_offset(self, op, offset, exp): + def test_dti_add_sub_nonzero_mth_offset(self, op, offset, + exp, exp_freq, + tz_aware_fixture, + box_with_array): # GH 26258 - date = date_range(start='01 Jan 2014', end='01 Jan 2017', freq='AS') + tz = tz_aware_fixture + date = date_range(start='01 Jan 2014', end='01 Jan 2017', freq='AS', + tz=tz) + date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) - tm.assert_equal(result, exp) + + expected = pd.DatetimeIndex(exp, tz=tz, freq=exp_freq) + expected = tm.box_expected(expected, box_with_array, False) + tm.assert_equal(result, expected) class TestDatetime64OverflowHandling: From 014abdc3553bb49c681bff11e09fb7c55f4500db Mon Sep 17 00:00:00 2001 From: Nanda H Krishna Date: Sun, 26 May 2019 07:18:02 +0530 Subject: [PATCH 16/38] Remove py.path special handling from io.common (#26458) --- pandas/io/common.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index f9cd1806763e2..34635ebf64ad6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,6 +9,7 @@ import lzma import mmap import os +import pathlib from urllib.error import URLError # noqa from 
urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, @@ -115,24 +116,10 @@ def _stringify_path(filepath_or_buffer): Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ - try: - import pathlib - _PATHLIB_INSTALLED = True - except ImportError: - _PATHLIB_INSTALLED = False - - try: - from py.path import local as LocalPath - _PY_PATH_INSTALLED = True - except ImportError: - _PY_PATH_INSTALLED = False - if hasattr(filepath_or_buffer, '__fspath__'): return filepath_or_buffer.__fspath__() - if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path): + elif isinstance(filepath_or_buffer, pathlib.Path): return str(filepath_or_buffer) - if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath): - return filepath_or_buffer.strpath return _expand_user(filepath_or_buffer) From 420eee5bf7b8458bddfc6dd3ff2c9020da38dbef Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 26 May 2019 16:31:43 +0200 Subject: [PATCH 17/38] CLN: remove StringMixin from code base, except core.computation (#26523) --- pandas/io/pytables.py | 19 +++++++++---------- pandas/io/stata.py | 3 +-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0f7f6fe399256..11f705e88179d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -32,7 +32,6 @@ to_datetime) from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.sparse import BlockIndex, IntIndex -from pandas.core.base import StringMixin import pandas.core.common as com from pandas.core.computation.pytables import Expr, maybe_expression from pandas.core.index import ensure_index @@ -398,7 +397,7 @@ def _is_metadata_of(group, parent_group): return False -class HDFStore(StringMixin): +class HDFStore: """ Dict-like IO interface for storing pandas objects in PyTables @@ -520,7 +519,7 @@ def __contains__(self, key): def 
__len__(self): return len(self.groups()) - def __str__(self): + def __repr__(self): return '{type}\nFile path: {path}\n'.format( type=type(self), path=pprint_thing(self._path)) @@ -1519,7 +1518,7 @@ def get_result(self, coordinates=False): return results -class IndexCol(StringMixin): +class IndexCol: """ an index column description class @@ -1587,7 +1586,7 @@ def set_table(self, table): self.table = table return self - def __str__(self): + def __repr__(self): temp = tuple( map(pprint_thing, (self.name, @@ -1881,7 +1880,7 @@ def __init__(self, values=None, kind=None, typ=None, self.set_data(data) self.set_metadata(metadata) - def __str__(self): + def __repr__(self): temp = tuple( map(pprint_thing, (self.name, @@ -2286,7 +2285,7 @@ def get_attr(self): pass -class Fixed(StringMixin): +class Fixed: """ represent an object in my store facilitate read/write of various types of objects @@ -2336,7 +2335,7 @@ def pandas_type(self): def format_type(self): return 'fixed' - def __str__(self): + def __repr__(self): """ return a pretty representation of myself """ self.infer_axes() s = self.shape @@ -3077,8 +3076,8 @@ def table_type_short(self): def format_type(self): return 'table' - def __str__(self): - """ return a pretty representatgion of myself """ + def __repr__(self): + """ return a pretty representation of myself """ self.infer_axes() dc = ",dc->[{columns}]".format(columns=(','.join( self.data_columns) if len(self.data_columns) else '')) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 27ddc4ef6f594..d8dfd15477974 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -31,7 +31,6 @@ from pandas import ( Categorical, DatetimeIndex, NaT, Timestamp, concat, isna, to_datetime, to_timedelta) -from pandas.core.base import StringMixin from pandas.core.frame import DataFrame from pandas.core.series import Series @@ -712,7 +711,7 @@ def generate_value_label(self, byteorder, encoding): return bio.read() -class StataMissingValue(StringMixin): +class 
StataMissingValue: """ An observation's missing value. From 48a4b8cf966529dcd441ece139afe82fc4873742 Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Sun, 26 May 2019 13:57:47 -0400 Subject: [PATCH 18/38] MAINT: port numpy#13188 for np_datetime simplification (#26516) * MAINT: port numpy#13188 for np_datetime simplificaion Bring numpy changes about emulating the behavior of python's divmod to pandas. * cpplint fix * Add reference numpy change into comment * fix typo --- doc/source/whatsnew/v0.25.0.rst | 2 +- .../_libs/tslibs/src/datetime/np_datetime.c | 208 +++++++----------- pandas/core/arrays/categorical.py | 8 +- .../arrays/categorical/test_operators.py | 17 +- 4 files changed, 94 insertions(+), 141 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index af59a34245660..9ea690a11259d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -346,7 +346,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Bug in :func:`_cat_compare_op` that would valuate comparison with None to True (:issue:`26504`) +- Fixed Bug in :func:`_cat_compare_op` that would evaluate comparison of ordered `Categorical` with missing values with scalar to True sometimes (:issue:`26504`) - Datetimelike diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index 87866d804503e..a8a47e2e90f93 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -498,6 +498,27 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, return ret; } +/* + * Port numpy#13188 https://github.com/numpy/numpy/pull/13188/ + * + * Computes the python `ret, d = divmod(d, unit)`. 
+ * + * Note that GCC is smart enough at -O2 to eliminate the `if(*d < 0)` branch + * for subsequent calls to this command - it is able to deduce that `*d >= 0`. + */ +npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) { + assert(unit > 0); + npy_int64 div = *d / unit; + npy_int64 mod = *d % unit; + if (mod < 0) { + mod += unit; + div -= 1; + } + assert(mod >= 0); + *d = mod; + return div; +} + /* * Converts a datetime based on the given metadata into a datetimestruct */ @@ -522,13 +543,8 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, break; case NPY_FR_M: - if (dt >= 0) { - out->year = 1970 + dt / 12; - out->month = dt % 12 + 1; - } else { - out->year = 1969 + (dt + 1) / 12; - out->month = 12 + (dt + 1) % 12; - } + out->year = 1970 + extract_unit(&dt, 12); + out->month = dt + 1; break; case NPY_FR_W: @@ -543,167 +559,105 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, case NPY_FR_h: perday = 24LL; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } + set_datetimestruct_days(extract_unit(&dt, perday), out); out->hour = dt; break; case NPY_FR_m: perday = 24LL * 60; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / 60; - out->min = dt % 60; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 60); + out->min = (int)dt; break; case NPY_FR_s: perday = 24LL * 60 * 60; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 
0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60); - out->min = (dt / 60) % 60; - out->sec = dt % 60; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 60 * 60); + out->min = (int)extract_unit(&dt, 60); + out->sec = (int)dt; break; case NPY_FR_ms: perday = 24LL * 60 * 60 * 1000; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000LL); - out->min = (dt / (60 * 1000LL)) % 60; - out->sec = (dt / 1000LL) % 60; - out->us = (dt % 1000LL) * 1000; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 60); + out->sec = (int)extract_unit(&dt, 1000LL); + out->us = (int)(dt * 1000); break; case NPY_FR_us: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000000LL); - out->min = (dt / (60 * 1000000LL)) % 60; - out->sec = (dt / 1000000LL) % 60; - out->us = dt % 1000000LL; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000); + out->us = (int)dt; break; case NPY_FR_ns: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 
0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000000000LL); - out->min = (dt / (60 * 1000000000LL)) % 60; - out->sec = (dt / 1000000000LL) % 60; - out->us = (dt / 1000LL) % 1000000LL; - out->ps = (dt % 1000LL) * 1000; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL); + out->ps = (int)(dt * 1000); break; case NPY_FR_ps: perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; - if (dt >= 0) { - set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } else { - set_datetimestruct_days( - dt / perday - (dt % perday == 0 ? 0 : 1), out); - dt = (perday - 1) + (dt + 1) % perday; - } - out->hour = dt / (60 * 60 * 1000000000000LL); - out->min = (dt / (60 * 1000000000000LL)) % 60; - out->sec = (dt / 1000000000000LL) % 60; - out->us = (dt / 1000000LL) % 1000000LL; - out->ps = dt % 1000000LL; + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL); + out->ps = (int)(dt * 1000); break; case NPY_FR_fs: /* entire range is only +- 2.6 hours */ - if (dt >= 0) { - out->hour = dt / (60 * 60 * 1000000000000000LL); - out->min = (dt / (60 * 1000000000000000LL)) % 60; - out->sec = (dt / 1000000000000000LL) % 60; - out->us = (dt / 1000000000LL) % 1000000LL; - out->ps = (dt / 1000LL) % 1000000LL; - out->as = (dt % 1000LL) * 1000; - } else { - npy_datetime minutes; - - minutes = dt / (60 * 1000000000000000LL); - dt = dt % (60 * 1000000000000000LL); - if (dt < 0) { - dt += (60 * 1000000000000000LL); - --minutes; - } - /* Offset the negative minutes */ - 
add_minutes_to_datetimestruct(out, minutes); - out->sec = (dt / 1000000000000000LL) % 60; - out->us = (dt / 1000000000LL) % 1000000LL; - out->ps = (dt / 1000LL) % 1000000LL; - out->as = (dt % 1000LL) * 1000; + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60 * 60); + if (out->hour < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour += 24; + assert(out->hour >= 0); } + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000); + out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->ps = (int)extract_unit(&dt, 1000LL); + out->as = (int)(dt * 1000); break; case NPY_FR_as: /* entire range is only +- 9.2 seconds */ - if (dt >= 0) { - out->sec = (dt / 1000000000000000000LL) % 60; - out->us = (dt / 1000000000000LL) % 1000000LL; - out->ps = (dt / 1000000LL) % 1000000LL; - out->as = dt % 1000000LL; - } else { - npy_datetime seconds; - - seconds = dt / 1000000000000000000LL; - dt = dt % 1000000000000000000LL; - if (dt < 0) { - dt += 1000000000000000000LL; - --seconds; - } - /* Offset the negative seconds */ - add_seconds_to_datetimestruct(out, seconds); - out->us = (dt / 1000000000000LL) % 1000000LL; - out->ps = (dt / 1000000LL) % 1000000LL; - out->as = dt % 1000000LL; + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 1000); + if (out->sec < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour = 23; + out->min = 59; + out->sec += 60; + assert(out->sec >= 0); } + out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->ps = (int)extract_unit(&dt, 1000LL * 1000); + out->as = (int)dt; break; default: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1d6b906158125..df01c6bee8917 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -100,13 +100,12 @@ def f(self, other): if is_scalar(other): if 
other in self.categories: i = self.categories.get_loc(other) - f = getattr(self._codes, op) - ret = f(i) + ret = getattr(self._codes, op)(i) # check for NaN in self na_mask = (self._codes == -1) if na_mask.any(): - # In other series, the leads to False, so do that here too + # comparison to missing values NaN leads to False ret[na_mask] = False return ret else: @@ -1412,6 +1411,7 @@ def isna(self): ret = self._codes == -1 return ret + isnull = isna def notna(self): @@ -1433,6 +1433,7 @@ def notna(self): """ return ~self.isna() + notnull = notna def put(self, *args, **kwargs): @@ -2555,6 +2556,7 @@ def index(self): stacklevel=2) return self._index + # utility routines diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b323cb2b6a7c3..f2f4871a70d92 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,6 @@ def test_categories_none_comparisons(self): tm.assert_categorical_equal(factor, self.factor) def test_comparisons(self): - result = self.factor[self.factor == 'a'] expected = self.factor[np.asarray(self.factor) == 'a'] tm.assert_categorical_equal(result, expected) @@ -186,23 +185,21 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_with_known_scalars(self): + def test_comparison_of_ordered_categorical_with_missing_values(self): # https://github.com/pandas-dev/pandas/issues/26504 - # and following comparisons with scalars in categories with None should - # be evaluated as False + # BUG: fix ordered categorical comparison with missing values (#26504 ) + # and following comparisons with scalars in categories with missing values + # should be evaluated as False - cat1 = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - cat2 = Categorical([None, 1, 2, 3], categories=[1, 2, 3], ordered=True) + cat = 
Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - tm.assert_numpy_array_equal(cat1 <= 2, + tm.assert_numpy_array_equal(cat <= 2, np.array([True, True, False, False])) - tm.assert_numpy_array_equal(cat2 <= 2, - np.array([False, True, True, False])) @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] - ) + ) def test_comparisons(self, data, reverse, base): cat_rev = Series( Categorical(data, categories=reverse, ordered=True)) From 3e205694ef3fe3447b5da626a05e03a29d977ab0 Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Tue, 28 May 2019 21:41:31 -0400 Subject: [PATCH 19/38] fix categorical comparison with missing values #26504 --- pandas/tests/arrays/categorical/test_operators.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index f2f4871a70d92..24a274ed8c115 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -188,8 +188,8 @@ def test_comparison_with_unknown_scalars(self): def test_comparison_of_ordered_categorical_with_missing_values(self): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) - # and following comparisons with scalars in categories with missing values - # should be evaluated as False + # and following comparisons with scalars in categories with missing + # values should be evaluated as False cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) @@ -198,8 +198,7 @@ def test_comparison_of_ordered_categorical_with_missing_values(self): @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), - ([1, 2, 3], [3, 2, 1], [2, 2, 2])] - ) + ([1, 2, 3], [3, 2, 1], [2, 2, 2])]) def test_comparisons(self, data, reverse, base): cat_rev = Series( 
Categorical(data, categories=reverse, ordered=True)) From 7e6662d70bd6c0008a48ed7f5c806fb841f22a2b Mon Sep 17 00:00:00 2001 From: Big Head Date: Tue, 28 May 2019 22:46:38 -0400 Subject: [PATCH 20/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 24a274ed8c115..fe10823254a12 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -185,7 +185,7 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_of_ordered_categorical_with_missing_values(self): + def test_comparison_of_ordered_categorical_with_missing_values_to_scalar(self): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing From 16dac3a3ca1a64f603c6e4466a884fafe0771c99 Mon Sep 17 00:00:00 2001 From: Big Head Date: Tue, 28 May 2019 22:51:02 -0400 Subject: [PATCH 21/38] Update categorical.py --- pandas/core/arrays/categorical.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index df01c6bee8917..93346488aaf1d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -89,12 +89,12 @@ def f(self, other): else: other_codes = other._codes - na_mask = (self._codes == -1) | (other_codes == -1) + mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) ret = f(other_codes) - if na_mask.any(): + if mask.any(): # In other series, the leads to False, so do that here too - ret[na_mask] = False + ret[mask] = False return ret if is_scalar(other): @@ -103,10 +103,10 @@ def 
f(self, other): ret = getattr(self._codes, op)(i) # check for NaN in self - na_mask = (self._codes == -1) - if na_mask.any(): + mask = (self._codes == -1) + if mask.any(): # comparison to missing values NaN leads to False - ret[na_mask] = False + ret[mask] = False return ret else: if op == '__eq__': From 9464f72944e48c1809b0e5430873eb63cb1655a0 Mon Sep 17 00:00:00 2001 From: Big Head Date: Tue, 28 May 2019 22:55:19 -0400 Subject: [PATCH 22/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index fe10823254a12..c4c977b3a84e2 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -185,7 +185,7 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_of_ordered_categorical_with_missing_values_to_scalar(self): + def test_comparison_of_ordered_categorical_with_nan_to_scalar(self): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing From c2b73438c68eadb07a479658fe51fc8998978a50 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 01:10:19 -0400 Subject: [PATCH 23/38] Update test_operators.py --- .../tests/arrays/categorical/test_operators.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index c4c977b3a84e2..13d037b76ec18 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -185,7 +185,8 @@ def test_comparison_with_unknown_scalars(self): 
tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) - def test_comparison_of_ordered_categorical_with_nan_to_scalar(self): + def test_comparison_of_ordered_categorical_with_nan_to_scalar( + self, compare_operators_no_eq_ne): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing @@ -193,9 +194,18 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar(self): cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - tm.assert_numpy_array_equal(cat <= 2, - np.array([True, True, False, False])) + assert getattr(cat, compare_operators_no_eq_ne)(2)[-1] == False + def test_comparison_of_ordered_categorical_with_nan_to_listlike( + self, compare_operators_no_eq_ne): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical + # with listlike should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + assert getattr(cat, compare_operators_no_eq_ne)(other)[-1] == False + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])]) From 65014e76780a0533d22d748d980b2a592d3d15fd Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 12:22:13 -0400 Subject: [PATCH 24/38] Update doc/source/whatsnew/v0.25.0.rst Co-Authored-By: Joris Van den Bossche --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 9f10d758c72c4..c65ba8ae54f2e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -348,7 +348,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would 
raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed Bug in :func:`_cat_compare_op` that would evaluate comparison of ordered `Categorical` with missing values with scalar to True sometimes (:issue:`26504`) +- Fixed bug in comparison of ordered `Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) - Datetimelike From 8964f0acd8b1b15d2e6e642056380718205e6288 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 20:57:14 -0400 Subject: [PATCH 25/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 13d037b76ec18..b6b4713a7d87e 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -186,7 +186,7 @@ def test_comparison_with_unknown_scalars(self): np.array([True, True, True])) def test_comparison_of_ordered_categorical_with_nan_to_scalar( - self, compare_operators_no_eq_ne): + self, compare_operators_no_eq_ne): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing @@ -198,8 +198,8 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): - # https://github.com/pandas-dev/pandas/issues/26504 - # and following comparisons of missing values in ordered Categorical + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical # with listlike should be evaluated as False cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) From 
7f404d2a5e931196eadeceb7308642d14fc8afd2 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 21:02:59 -0400 Subject: [PATCH 26/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b6b4713a7d87e..101731b4e96cb 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -208,7 +208,8 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), - ([1, 2, 3], [3, 2, 1], [2, 2, 2])]) + ([1, 2, 3], [3, 2, 1], [2, 2, 2])] + ) def test_comparisons(self, data, reverse, base): cat_rev = Series( Categorical(data, categories=reverse, ordered=True)) From 19e3711343c1ef5d926ea78705033c37e185f2a8 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 21:23:35 -0400 Subject: [PATCH 27/38] Update v0.25.0.rst --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c65ba8ae54f2e..ffa9fc5977709 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -348,7 +348,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed bug in comparison of ordered `Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) +- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) - Datetimelike From 2fc1d278d2a33faaf0ac482433f6b44323515f54 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 
2019 21:40:01 -0400 Subject: [PATCH 28/38] Update test_operators.py --- .../tests/arrays/categorical/test_operators.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 101731b4e96cb..80f218bd3a30c 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -7,6 +7,7 @@ from pandas import Categorical, DataFrame, Series, date_range from pandas.tests.arrays.categorical.common import TestCategorical import pandas.util.testing as tm +import warnings class TestCategoricalOpsWithFactor(TestCategorical): @@ -193,8 +194,14 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( # values should be evaluated as False cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) - - assert getattr(cat, compare_operators_no_eq_ne)(2)[-1] == False + scalar = 2 + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) + def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): @@ -204,7 +211,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) - assert getattr(cat, compare_operators_no_eq_ne)(other)[-1] == False + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) + tm.assert_numpy_array_equal(actual, expected) @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), 
From c80c2dcab5cafd8604d09eab9ceef4a6314b072e Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 22:49:09 -0400 Subject: [PATCH 29/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 80f218bd3a30c..05bbb30bd00a1 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -195,13 +195,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) scalar = 2 - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) actual = getattr(cat, compare_operators_no_eq_ne)(scalar) - expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) + expected = getattr(np.array(cat), + compare_operators_no_eq_ne)(scalar) tm.assert_numpy_array_equal(actual, expected) - def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): @@ -211,13 +210,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) actual = getattr(cat, compare_operators_no_eq_ne)(other) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) tm.assert_numpy_array_equal(actual, expected) - + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] From 2e01686a7e6c6eb9a1e4ce20c0f0862210c4b046 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 23:23:44 -0400 Subject: [PATCH 30/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 05bbb30bd00a1..1f85a22e03d96 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -198,7 +198,7 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) actual = getattr(cat, compare_operators_no_eq_ne)(scalar) - expected = getattr(np.array(cat), + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) tm.assert_numpy_array_equal(actual, expected) From 924f6937db462de93c9b6ba70bd1e653b3465d42 Mon Sep 17 00:00:00 2001 From: Big Head Date: Wed, 29 May 2019 23:51:40 -0400 Subject: [PATCH 31/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 1f85a22e03d96..72275444ccaa1 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -1,4 +1,5 @@ import operator +import warnings import numpy as np import pytest @@ -7,7 +8,6 @@ from pandas import Categorical, DataFrame, Series, date_range from pandas.tests.arrays.categorical.common import TestCategorical import pandas.util.testing as tm -import warnings class TestCategoricalOpsWithFactor(TestCategorical): From 3b4a42a58126f47b8e3e16167660238810246743 Mon Sep 17 00:00:00 2001 From: Big Head Date: Thu, 30 May 2019 10:18:45 -0400 Subject: [PATCH 32/38] Update categorical.py --- pandas/core/arrays/categorical.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 93346488aaf1d..8bbd161ee107b 100644 --- a/pandas/core/arrays/categorical.py +++ 
b/pandas/core/arrays/categorical.py @@ -1411,7 +1411,6 @@ def isna(self): ret = self._codes == -1 return ret - isnull = isna def notna(self): @@ -1433,7 +1432,6 @@ def notna(self): """ return ~self.isna() - notnull = notna def put(self, *args, **kwargs): @@ -2556,7 +2554,6 @@ def index(self): stacklevel=2) return self._index - # utility routines From 57480bd3c3be622c2fafbdd5b73dcffac0ee22ae Mon Sep 17 00:00:00 2001 From: Big Head Date: Thu, 30 May 2019 10:24:59 -0400 Subject: [PATCH 33/38] Update test_operators.py --- pandas/tests/arrays/categorical/test_operators.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 72275444ccaa1..b67d430667682 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -195,12 +195,13 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) scalar = 2 + with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - actual = getattr(cat, compare_operators_no_eq_ne)(scalar) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) - tm.assert_numpy_array_equal(actual, expected) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) def test_comparison_of_ordered_categorical_with_nan_to_listlike( self, compare_operators_no_eq_ne): @@ -210,11 +211,12 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - actual = getattr(cat, compare_operators_no_eq_ne)(other) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) - 
tm.assert_numpy_array_equal(actual, expected) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + tm.assert_numpy_array_equal(actual, expected) @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), From 8bb9bcf27a4dbc1220b6eeaffa605a3aef286da3 Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Fri, 31 May 2019 12:22:55 -0400 Subject: [PATCH 34/38] fix categorical comparison with missing values --- pandas/core/arrays/categorical.py | 4 +--- pandas/tests/arrays/categorical/test_operators.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6d34a8b66c5ea..44bb44457bc25 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -104,9 +104,7 @@ def f(self, other): # check for NaN in self mask = (self._codes == -1) - if mask.any(): - # comparison to missing values NaN leads to False - ret[mask] = False + ret[mask] = False return ret else: if op == '__eq__': diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b67d430667682..a443408bf9479 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -195,7 +195,6 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) scalar = 2 - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) expected = getattr(np.array(cat), @@ -211,7 +210,6 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) - with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) From 0bf85270b5605e1533b47c58d2cfcb0a2e3fb8dc 
Mon Sep 17 00:00:00 2001 From: "Michael P. Moran" <30704827+mpmoran@users.noreply.github.com> Date: Wed, 17 Jul 2019 15:33:48 -0500 Subject: [PATCH 35/38] DOC: cleanup docstring for read_json and fix error in contribution guide (#27280) * DOC: cleanup docstring for read_json and fix error in contribution guide --- doc/source/development/contributing.rst | 8 ++--- pandas/io/json/_json.py | 42 +++++++++++++++---------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 92d7cf1a79d8c..80dc8b0d8782b 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -288,7 +288,7 @@ complex changes to the documentation as well. Some other important things to know about the docs: * The *pandas* documentation consists of two parts: the docstrings in the code - itself and the docs in this folder ``pandas/doc/``. + itself and the docs in this folder ``doc/``. The docstrings provide a clear explanation of the usage of the individual functions, while the documentation in this folder consists of tutorial-like @@ -404,11 +404,11 @@ Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ So how do you build the docs? Navigate to your local -``pandas/doc/`` directory in the console and run:: +``doc/`` directory in the console and run:: python make.py html -Then you can find the HTML output in the folder ``pandas/doc/build/html/``. +Then you can find the HTML output in the folder ``doc/build/html/``. The first time you build the docs, it will take quite a while because it has to run all the code examples and build all the generated docstring pages. 
In subsequent @@ -448,7 +448,7 @@ You can also specify to use multiple cores to speed up the documentation build:: Open the following file in a web browser to see the full documentation you just built:: - pandas/docs/build/html/index.html + doc/build/html/index.html And you'll have the satisfaction of seeing your new and improved documentation! diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 24d41b5101a77..ada7e6f43125d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -400,8 +400,10 @@ def read_json( .. versionadded:: 0.23.0 'table' as an allowed value for the ``orient`` argument - typ : type of object to recover (series or frame), default 'frame' - dtype : boolean or dict, default None + typ : {'frame', 'series'}, default 'frame' + The type of object to recover. + + dtype : bool or dict, default None If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. @@ -411,7 +413,7 @@ def read_json( Not applicable for ``orient='table'``. - convert_axes : boolean, default None + convert_axes : bool, default None Try to convert the axes to the proper dtypes. For all ``orient`` values except ``'table'``, default is True. @@ -420,9 +422,9 @@ def read_json( Not applicable for ``orient='table'``. - convert_dates : boolean, default True - List of columns to parse for dates; If True, then try to parse - datelike columns default is True; a column label is datelike if + convert_dates : bool or list of str, default True + List of columns to parse for dates. If True, then try to parse + datelike columns. A column label is datelike if * it ends with ``'_at'``, @@ -432,34 +434,38 @@ def read_json( * it is ``'modified'``, or - * it is ``'date'`` + * it is ``'date'``. + + keep_default_dates : bool, default True + If parsing dates, then parse the default datelike columns. 
- keep_default_dates : boolean, default True - If parsing dates, then parse the default datelike columns - numpy : boolean, default False + numpy : bool, default False Direct decoding to numpy arrays. Supports numeric data only, but non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. - precise_float : boolean, default False + + precise_float : bool, default False Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (False) is to use fast but - less precise builtin functionality - date_unit : string, default None + less precise builtin functionality. + + date_unit : str, default None The timestamp unit to detect if converting dates. The default behaviour is to try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds respectively. + encoding : str, default is 'utf-8' The encoding to use to decode py3 bytes. .. versionadded:: 0.19.0 - lines : boolean, default False + lines : bool, default False Read the file as a json object per line. .. versionadded:: 0.19.0 - chunksize : integer, default None + chunksize : int, optional Return JsonReader object for iteration. See the `line-delimited json docs `_ @@ -480,11 +486,13 @@ def read_json( Returns ------- - result : Series or DataFrame, depending on the value of `typ`. + Series or DataFrame + The type returned depends on the value of `typ`. See Also -------- - DataFrame.to_json + DataFrame.to_json : Convert a DataFrame to a JSON string. + Series.to_json : Convert a Series to a JSON string. 
Notes ----- From 7e27254f67a1ae8331520b01c1230ee769f5a5f7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 17 Jul 2019 17:16:31 -0500 Subject: [PATCH 36/38] BUG: Fix insertion of wrong-dtypes NaT into Series[m8ns] (#27323) --- pandas/_libs/index.pyx | 4 +++ pandas/core/series.py | 7 +++-- pandas/tests/series/indexing/test_indexing.py | 30 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ba2838d59f814..7000c07b1f5a6 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -544,6 +544,10 @@ cpdef convert_scalar(ndarray arr, object value): pass elif isinstance(value, timedelta): return Timedelta(value).value + elif util.is_datetime64_object(value): + # exclude np.datetime64("NaT") which would otherwise be picked up + # by the `value != value check below + pass elif value is None or value != value: return NPY_NAT elif isinstance(value, str): diff --git a/pandas/core/series.py b/pandas/core/series.py index 46b96c1ece77c..0f0914a4f74aa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -46,6 +46,7 @@ ABCSparseSeries, ) from pandas.core.dtypes.missing import ( + is_valid_nat_for_dtype, isna, na_value_for_dtype, notna, @@ -1198,13 +1199,15 @@ def setitem(key, value): pass elif is_timedelta64_dtype(self.dtype): # reassign a null value to iNaT - if isna(value): + if is_valid_nat_for_dtype(value, self.dtype): + # exclude np.datetime64("NaT") value = iNaT try: self.index._engine.set_value(self._values, key, value) return - except TypeError: + except (TypeError, ValueError): + # ValueError appears in only some builds in CI pass self.loc[key] = value diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 6ff878f07da84..d73be76795c88 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -654,6 +654,36 @@ def test_timedelta_assignment(): 
tm.assert_series_equal(s, expected) +@pytest.mark.parametrize( + "nat_val,should_cast", + [ + (pd.NaT, True), + (np.timedelta64("NaT", "ns"), True), + (np.datetime64("NaT", "ns"), False), + ], +) +def test_td64_series_assign_nat(nat_val, should_cast): + # some nat-like values should be cast to timedelta64 when inserting + # into a timedelta64 series. Others should coerce to object + # and retain their dtypes. + base = pd.Series([0, 1, 2], dtype="m8[ns]") + expected = pd.Series([pd.NaT, 1, 2], dtype="m8[ns]") + if not should_cast: + expected = expected.astype(object) + + ser = base.copy(deep=True) + ser[0] = nat_val + tm.assert_series_equal(ser, expected) + + ser = base.copy(deep=True) + ser.loc[0] = nat_val + tm.assert_series_equal(ser, expected) + + ser = base.copy(deep=True) + ser.iloc[0] = nat_val + tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize( "td", [ From a101f9e4d6287caba1534c3f26473464605c512e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 17 Jul 2019 17:06:00 -0700 Subject: [PATCH 37/38] TYPING: pandas/core/window.py (#27391) * TYPING: pandas/core/window.py * Address comments * Remove unused import * Sort imports --- pandas/_typing.py | 1 + pandas/core/window.py | 119 ++++++++++++++------------ pandas/tests/frame/test_arithmetic.py | 15 ++++ 3 files changed, 78 insertions(+), 57 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 46b1b4685ec9f..a1224a609579e 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -31,3 +31,4 @@ FrameOrSeries = TypeVar("FrameOrSeries", ABCSeries, ABCDataFrame) Scalar = Union[str, int, float] +Axis = Union[str, int] diff --git a/pandas/core/window.py b/pandas/core/window.py index 0c1f6a1a6dace..86574208a3fc0 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -5,7 +5,7 @@ from collections import defaultdict from datetime import timedelta from textwrap import dedent -from typing import Set +from typing import List, Optional, Set import warnings import numpy as 
np @@ -35,6 +35,7 @@ ABCTimedeltaIndex, ) +from pandas._typing import Axis, FrameOrSeries from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.generic import _shared_docs @@ -63,24 +64,23 @@ class _Window(PandasObject, SelectionMixin): "axis", "on", "closed", - ] + ] # type: List[str] exclusions = set() # type: Set[str] def __init__( self, obj, window=None, - min_periods=None, - center=False, - win_type=None, - axis=0, - on=None, - closed=None, + min_periods: Optional[int] = None, + center: Optional[bool] = False, + win_type: Optional[str] = None, + axis: Axis = 0, + on: Optional[str] = None, + closed: Optional[str] = None, **kwargs ): self.__dict__.update(kwargs) - self.blocks = [] self.obj = obj self.on = on self.closed = closed @@ -97,7 +97,7 @@ def _constructor(self): return Window @property - def is_datetimelike(self): + def is_datetimelike(self) -> Optional[bool]: return None @property @@ -105,7 +105,7 @@ def _on(self): return None @property - def is_freq_type(self): + def is_freq_type(self) -> bool: return self.win_type == "freq" def validate(self): @@ -121,22 +121,12 @@ def validate(self): ]: raise ValueError("closed must be 'right', 'left', 'both' or " "'neither'") - def _convert_freq(self): - """ - Resample according to the how, return a new object. - """ - obj = self._selected_obj - index = None - return obj, index - def _create_blocks(self): """ Split data into blocks & return conformed data. 
""" - obj, index = self._convert_freq() - if index is not None: - index = self._on + obj = self._selected_obj # filter out the on from the object if self.on is not None: @@ -144,7 +134,7 @@ def _create_blocks(self): obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) blocks = obj._to_dict_of_blocks(copy=False).values() - return blocks, obj, index + return blocks, obj def _gotitem(self, key, ndim, subset=None): """ @@ -186,10 +176,10 @@ def _get_window(self, other=None): return self.window @property - def _window_type(self): + def _window_type(self) -> str: return self.__class__.__name__ - def __repr__(self): + def __repr__(self) -> str: """ Provide a nice str repr of our rolling object. """ @@ -207,23 +197,21 @@ def __iter__(self): url = "https://github.com/pandas-dev/pandas/issues/11704" raise NotImplementedError("See issue #11704 {url}".format(url=url)) - def _get_index(self, index=None): + def _get_index(self) -> Optional[np.ndarray]: """ - Return index as ndarrays. + Return index as an ndarray. Returns ------- - tuple of (index, index_as_ndarray) + None or ndarray """ if self.is_freq_type: - if index is None: - index = self._on - return index, index.asi8 - return index, index - - def _prep_values(self, values=None, kill_inf=True): + return self._on.asi8 + return None + def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: + """Convert input to numpy arrays for Cython routines""" if values is None: values = getattr(self._selected_obj, "values", self._selected_obj) @@ -247,13 +235,12 @@ def _prep_values(self, values=None, kill_inf=True): "cannot handle this type -> {0}" "".format(values.dtype) ) - if kill_inf: - values = values.copy() - values[np.isinf(values)] = np.NaN + # Always convert inf to nan + values[np.isinf(values)] = np.NaN return values - def _wrap_result(self, result, block=None, obj=None): + def _wrap_result(self, result, block=None, obj=None) -> FrameOrSeries: """ Wrap a single result. 
""" @@ -281,7 +268,7 @@ def _wrap_result(self, result, block=None, obj=None): return type(obj)(result, index=index, columns=block.columns) return result - def _wrap_results(self, results, blocks, obj, exclude=None): + def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: """ Wrap the results. @@ -335,7 +322,7 @@ def _wrap_results(self, results, blocks, obj, exclude=None): return obj.astype("float64") return concat(final, axis=1).reindex(columns=columns, copy=False) - def _center_window(self, result, window): + def _center_window(self, result, window) -> np.ndarray: """ Center the result in the window. """ @@ -724,7 +711,7 @@ def _apply_window(self, mean=True, **kwargs): window = self._prep_window(**kwargs) center = self.center - blocks, obj, index = self._create_blocks() + blocks, obj = self._create_blocks() block_list = list(blocks) results = [] @@ -912,9 +899,9 @@ def _apply( if check_minp is None: check_minp = _use_window - blocks, obj, index = self._create_blocks() + blocks, obj = self._create_blocks() block_list = list(blocks) - index, indexi = self._get_index(index=index) + index_as_array = self._get_index() results = [] exclude = [] @@ -947,7 +934,7 @@ def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, window) # ensure we are only rolling on floats arg = ensure_float64(arg) - return cfunc(arg, window, minp, indexi, closed, **kwargs) + return cfunc(arg, window, minp, index_as_array, closed, **kwargs) # calculation function if center: @@ -1027,9 +1014,9 @@ class _Rolling_and_Expanding(_Rolling): def count(self): - blocks, obj, index = self._create_blocks() + blocks, obj = self._create_blocks() # Validate the index - self._get_index(index=index) + self._get_index() window = self._get_window() window = min(window, len(obj)) if not self.center else window @@ -1088,11 +1075,10 @@ def count(self): def apply(self, func, raw=None, args=(), kwargs={}): from pandas import Series - # TODO: _level is unused? 
- _level = kwargs.pop("_level", None) # noqa + kwargs.pop("_level", None) window = self._get_window() offset = _offset(window, self.center) - index, indexi = self._get_index() + index_as_array = self._get_index() # TODO: default is for backward compat # change to False in the future @@ -1113,7 +1099,16 @@ def f(arg, window, min_periods, closed): if not raw: arg = Series(arg, index=self.obj.index) return libwindow.roll_generic( - arg, window, minp, indexi, closed, offset, func, raw, args, kwargs + arg, + window, + minp, + index_as_array, + closed, + offset, + func, + raw, + args, + kwargs, ) return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw) @@ -1285,12 +1280,12 @@ def median(self, **kwargs): def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) window = self._get_window() - index, indexi = self._get_index() + index_as_array = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) return _zsqrt( - libwindow.roll_var(arg, window, minp, indexi, self.closed, ddof) + libwindow.roll_var(arg, window, minp, index_as_array, self.closed, ddof) ) return self._apply( @@ -1474,17 +1469,27 @@ def kurt(self, **kwargs): def quantile(self, quantile, interpolation="linear", **kwargs): window = self._get_window() - index, indexi = self._get_index() + index_as_array = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) if quantile == 1.0: - return libwindow.roll_max(arg, window, minp, indexi, self.closed) + return libwindow.roll_max( + arg, window, minp, index_as_array, self.closed + ) elif quantile == 0.0: - return libwindow.roll_min(arg, window, minp, indexi, self.closed) + return libwindow.roll_min( + arg, window, minp, index_as_array, self.closed + ) else: return libwindow.roll_quantile( - arg, window, minp, indexi, self.closed, quantile, interpolation + arg, + window, + minp, + index_as_array, + self.closed, + quantile, + interpolation, 
) return self._apply(f, "quantile", quantile=quantile, **kwargs) @@ -2450,7 +2455,7 @@ def _apply(self, func, **kwargs): ------- y : same type as input argument """ - blocks, obj, index = self._create_blocks() + blocks, obj = self._create_blocks() block_list = list(blocks) results = [] diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 7c022106c9104..05c91bd8be945 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -642,3 +642,18 @@ def test_arith_non_pandas_object(self): val3 = np.random.rand(*df.shape) added = pd.DataFrame(df.values + val3, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val3), added) + + def test_operations_with_interval_categories_index(self, all_arithmetic_operators): + # GH#27415 + op = all_arithmetic_operators + ind = pd.CategoricalIndex(pd.interval_range(start=0.0, end=2.0)) + + df = pd.DataFrame([[1, 2]], columns=ind) + num = 100 + try: + getattr(df, op)(num) + except TypeError: + pytest.fail( + "Unexpected TypeError for operations on DataFrame\ + with interval categories as index" + ) From 7a5dac46acd7d209753e963105506b02e329a62d Mon Sep 17 00:00:00 2001 From: yanglinlee Date: Sat, 20 Jul 2019 18:21:37 -0400 Subject: [PATCH 38/38] TST: Add test for operations on DataFrame with Interval CategoricalIndex --- pandas/tests/frame/test_arithmetic.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 05c91bd8be945..706bc122c6d9e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -647,13 +647,9 @@ def test_operations_with_interval_categories_index(self, all_arithmetic_operator # GH#27415 op = all_arithmetic_operators ind = pd.CategoricalIndex(pd.interval_range(start=0.0, end=2.0)) - - df = pd.DataFrame([[1, 2]], columns=ind) - num = 100 - try: - getattr(df, op)(num) - except 
TypeError: - pytest.fail( - "Unexpected TypeError for operations on DataFrame\ - with interval categories as index" - ) + data = [1, 2] + df = pd.DataFrame([data], columns=ind) + num = 10 + result = getattr(df, op)(num) + expected = pd.DataFrame([[getattr(n, op)(num) for n in data]], columns=ind) + tm.assert_frame_equal(result, expected)