From 32091728753419061c053d2257a6b75ae2207268 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Sat, 26 May 2018 12:40:55 +0530 Subject: [PATCH 01/13] Revert "22May" --- doc/source/advanced.rst | 49 ----------- doc/source/api.rst | 11 +-- doc/source/timedeltas.rst | 42 +-------- doc/source/timeseries.rst | 12 --- doc/source/whatsnew/v0.23.1.txt | 16 +--- pandas/_libs/tslibs/timedeltas.pyx | 43 ++-------- pandas/core/accessor.py | 3 +- pandas/core/base.py | 2 +- pandas/core/common.py | 7 +- pandas/core/frame.py | 13 +-- pandas/core/indexes/base.py | 54 ------------ pandas/core/indexes/multi.py | 46 ++++++++++ pandas/core/series.py | 10 +-- pandas/core/strings.py | 4 +- pandas/tests/frame/test_indexing.py | 10 +-- pandas/tests/frame/test_reshape.py | 17 ---- pandas/tests/frame/test_sorting.py | 34 ++------ pandas/tests/indexes/datetimes/test_tools.py | 8 -- pandas/tests/indexes/test_base.py | 30 ------- pandas/tests/indexes/test_multi.py | 18 +--- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/io/parser/common.py | 17 ++-- pandas/tests/io/parser/compression.py | 15 ++-- pandas/tests/io/parser/test_textreader.py | 12 ++- pandas/tests/io/sas/test_sas7bdat.py | 2 - pandas/tests/io/test_packers.py | 5 +- pandas/tests/reshape/test_concat.py | 11 --- .../tests/scalar/timedelta/test_timedelta.py | 10 --- .../tests/scalar/timestamp/test_timestamp.py | 85 +++++++++---------- pandas/tests/series/test_io.py | 5 +- pandas/tests/series/test_sorting.py | 9 +- pandas/tests/util/test_testing.py | 38 --------- pandas/util/testing.py | 29 +++---- 33 files changed, 182 insertions(+), 487 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index ec517d3e07bdf..c81842d3d9212 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -924,55 +924,6 @@ bins, with ``NaN`` representing a missing value similar to other dtypes. 
pd.cut([0, 3, 5, 1], bins=c.categories) - -Generating Ranges of Intervals -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If we need intervals on a regular frequency, we can use the :func:`interval_range` function -to create an ``IntervalIndex`` using various combinations of ``start``, ``end``, and ``periods``. -The default frequency for ``interval_range`` is a 1 for numeric intervals, and calendar day for -datetime-like intervals: - -.. ipython:: python - - pd.interval_range(start=0, end=5) - - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4) - - pd.interval_range(end=pd.Timedelta('3 days'), periods=3) - -The ``freq`` parameter can used to specify non-default frequencies, and can utilize a variety -of :ref:`frequency aliases ` with datetime-like intervals: - -.. ipython:: python - - pd.interval_range(start=0, periods=5, freq=1.5) - - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W') - - pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H') - -Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals -are closed on. Intervals are closed on the right side by default. - -.. ipython:: python - - pd.interval_range(start=0, end=4, closed='both') - - pd.interval_range(start=0, end=4, closed='neither') - -.. versionadded:: 0.23.0 - -Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced -intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements -in the resulting ``IntervalIndex``: - -.. 
ipython:: python - - pd.interval_range(start=0, end=6, periods=4) - - pd.interval_range(pd.Timestamp('2018-01-01'), pd.Timestamp('2018-02-28'), periods=3) - Miscellaneous indexing FAQ -------------------------- diff --git a/doc/source/api.rst b/doc/source/api.rst index 4faec93490fde..d00e5511f1100 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1459,6 +1459,7 @@ Modifying and Computations Index.is_floating Index.is_integer Index.is_interval + Index.is_lexsorted_for_tuple Index.is_mixed Index.is_numeric Index.is_object @@ -1470,19 +1471,11 @@ Modifying and Computations Index.where Index.take Index.putmask + Index.set_names Index.unique Index.nunique Index.value_counts -Compatibility with MultiIndex -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autosummary:: - :toctree: generated/ - - Index.set_names - Index.is_lexsorted_for_tuple - Index.droplevel - Missing Values ~~~~~~~~~~~~~~ .. autosummary:: diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 745810704f665..5f3a01f0725d4 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -352,8 +352,8 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the TimedeltaIndex -------------- -To generate an index with time delta, you can use either the :class:`TimedeltaIndex` or -the :func:`timedelta_range` constructor. +To generate an index with time delta, you can use either the ``TimedeltaIndex`` or +the ``timedelta_range`` constructor. Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``, or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent missing values. @@ -363,47 +363,13 @@ or ``np.timedelta64`` objects. 
Passing ``np.nan/pd.NaT/nat`` will represent miss pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2,'D'), datetime.timedelta(days=2,seconds=2)]) -Generating Ranges of Time Deltas -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Similar to :func:`date_range`, you can construct regular ranges of a ``TimedeltaIndex`` -using :func:`timedelta_range`. The default frequency for ``timedelta_range`` is -calendar day: - -.. ipython:: python - - pd.timedelta_range(start='1 days', periods=5) - -Various combinations of ``start``, ``end``, and ``periods`` can be used with -``timedelta_range``: - -.. ipython:: python - - pd.timedelta_range(start='1 days', end='5 days') - - pd.timedelta_range(end='10 days', periods=4) - -The ``freq`` parameter can passed a variety of :ref:`frequency aliases `: +Similarly to ``date_range``, you can construct regular ranges of a ``TimedeltaIndex``: .. ipython:: python + pd.timedelta_range(start='1 days', periods=5, freq='D') pd.timedelta_range(start='1 days', end='2 days', freq='30T') - pd.timedelta_range(start='1 days', periods=5, freq='2D5H') - - -.. versionadded:: 0.23.0 - -Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced -timedeltas from ``start`` to ``end`` inclusively, with ``periods`` number of elements -in the resulting ``TimedeltaIndex``: - -.. ipython:: python - - pd.timedelta_range('0 days', '4 days', periods=5) - - pd.timedelta_range('0 days', '4 days', periods=10) - Using the TimedeltaIndex ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 1b0cf86995a39..73e3e721aad71 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -393,18 +393,6 @@ of those specified will not be generated: pd.bdate_range(start=start, periods=20) -.. 
versionadded:: 0.23.0 - -Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced -dates from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the -resulting ``DatetimeIndex``: - -.. ipython:: python - - pd.date_range('2018-01-01', '2018-01-05', periods=5) - - pd.date_range('2018-01-01', '2018-01-05', periods=10) - .. _timeseries.custom-freq-ranges: Custom Frequency Ranges diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 44f7280d5535f..9382d74f95295 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -15,8 +15,6 @@ and bug fixes. We recommend that all users upgrade to this version. New features ~~~~~~~~~~~~ -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`) - .. _whatsnew_0231.deprecations: @@ -46,8 +44,6 @@ Documentation Changes Bug Fixes ~~~~~~~~~ -- tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) - Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -57,15 +53,7 @@ Strings ^^^^^^^ - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - -Timedelta -^^^^^^^^^ -- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - -Categorical -^^^^^^^^^^^ - -- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) +- Conversion ^^^^^^^^^^ @@ -94,7 +82,7 @@ Plotting Reshaping ^^^^^^^^^ -- Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) +- - Categorical diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4c237da7b6d0e..f7bb6c1dbb304 100644 --- 
a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -202,22 +202,22 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: if unit == 'D' or unit == 'd': m = 1000000000L * 86400 - p = 9 + p = 6 elif unit == 'h': m = 1000000000L * 3600 - p = 9 + p = 6 elif unit == 'm': m = 1000000000L * 60 - p = 9 + p = 6 elif unit == 's': m = 1000000000L - p = 9 + p = 6 elif unit == 'ms': m = 1000000L - p = 6 + p = 3 elif unit == 'us': m = 1000L - p = 3 + p = 0 elif unit == 'ns' or unit is None: m = 1L p = 0 @@ -231,10 +231,10 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int base = ts - frac = ts - base + frac = ts -base if p: frac = round(frac, p) - return (base * m) + (frac * m) + return (base *m) + (frac *m) cdef inline _decode_if_necessary(object ts): @@ -760,32 +760,7 @@ cdef class _Timedelta(timedelta): @property def delta(self): - """ - Return the timedelta in nanoseconds (ns), for internal compatibility. - - Returns - ------- - int - Timedelta in nanoseconds. 
- - Examples - -------- - >>> td = pd.Timedelta('1 days 42 ns') - >>> td.delta - 86400000000042 - - >>> td = pd.Timedelta('3 s') - >>> td.delta - 3000000000 - - >>> td = pd.Timedelta('3 ms 5 us') - >>> td.delta - 3005000 - - >>> td = pd.Timedelta(42, unit='ns') - >>> td.delta - 42 - """ + """ return out delta in ns (for internal compat) """ return self.value @property diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 7a853d575aa69..c638b9e4ea117 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -12,8 +12,7 @@ class DirNamesMixin(object): _accessors = frozenset([]) - _deprecations = frozenset( - ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides']) + _deprecations = frozenset(['asobject']) def _dir_deletions(self): """ delete unwanted __dir__ for this object """ diff --git a/pandas/core/base.py b/pandas/core/base.py index c331ead8d2fef..aa051c6f5eaef 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -114,7 +114,7 @@ def _reset_cache(self, key=None): def __sizeof__(self): """ - Generates the total memory usage for an object that returns + Generates the total memory usage for a object that returns either a value or Series of values """ if hasattr(self, 'memory_usage'): diff --git a/pandas/core/common.py b/pandas/core/common.py index 1de8269c9a0c6..b9182bfd2cbe2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -55,11 +55,8 @@ def flatten(l): def _consensus_name_attr(objs): name = objs[0].name for obj in objs[1:]: - try: - if obj.name != name: - name = None - except ValueError: - name = None + if obj.name != name: + return None return name diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a07e85401638..77a67c048a48d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4096,8 +4096,9 @@ def _maybe_casted_values(index, labels=None): if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] - if len(level) < 
self.index.nlevels: - new_index = self.index.droplevel(level) + if isinstance(self.index, MultiIndex): + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) if not drop: if isinstance(self.index, MultiIndex): @@ -4453,10 +4454,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) labels = self._get_axis(axis) - # make sure that the axis is lexsorted to start - # if not we need to reconstruct to get the correct indexer - labels = labels._sort_levels_monotonic() - if level is not None: + if level: new_axis, indexer = labels.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) @@ -4464,6 +4462,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer + # make sure that the axis is lexsorted to start + # if not we need to reconstruct to get the correct indexer + labels = labels._sort_levels_monotonic() indexer = lexsort_indexer(labels._get_labels_for_sorting(), orders=ascending, na_position=na_position) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f79288c167356..df39eb5fd8312 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3158,60 +3158,6 @@ def _get_level_values(self, level): get_level_values = _get_level_values - def droplevel(self, level=0): - """ - Return index with requested level(s) removed. If resulting index has - only 1 level left, the result will be of Index type, not MultiIndex. - - .. versionadded:: 0.23.1 (support for non-MultiIndex) - - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. 
- - Returns - ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] - - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex - - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) - - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) - - if len(new_levels) == 1: - - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) - - result.name = new_names[0] - return result - else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) - _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea0fab7e17648..fbcf06a28c1e5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1761,6 +1761,52 @@ def _drop_from_level(self, labels, level): return self[mask] + def droplevel(self, level=0): + """ + Return Index with requested level removed. If MultiIndex has only 2 + levels, the result will be of Index type not MultiIndex. 
+ + Parameters + ---------- + level : int/level name or list thereof + + Notes + ----- + Does not check if result index is unique or not + + Returns + ------- + index : Index or MultiIndex + """ + levels = level + if not isinstance(levels, (tuple, list)): + levels = [level] + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1] + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + def swaplevel(self, i=-2, j=-1): """ Swap level i with level j. diff --git a/pandas/core/series.py b/pandas/core/series.py index c9329e8b9e572..6d396e845219e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1199,8 +1199,9 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] - if len(level) < self.index.nlevels: - new_index = self.index.droplevel(level) + if isinstance(self.index, MultiIndex): + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) if inplace: self.index = new_index @@ -2616,7 +2617,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) index = self.index - if level is not None: + if level: new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): @@ -3176,8 +3177,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): # handle ufuncs and lambdas if kwds or args and not 
isinstance(func, np.ufunc): - def f(x): - return func(x, *args, **kwds) + f = lambda x: func(x, *args, **kwds) else: f = func diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5d50c45fe7eca..81d775157cf62 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -241,7 +241,7 @@ def str_count(arr, pat, flags=0): Escape ``'$'`` to find the literal dollar sign. >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') + >>> s.str.count('\$') 0 1 1 0 2 1 @@ -358,7 +358,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): Returning any digit using regular expression. - >>> s1.str.contains('\\d', regex=True) + >>> s1.str.contains('\d', regex=True) 0 False 1 False 2 False diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 5459b6910e11a..6d74ce54faa94 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -397,13 +397,11 @@ def test_getitem_setitem_ix_negative_integers(self): df = DataFrame(np.random.randn(8, 4)) # ix does label-based indexing when having an integer index - with catch_warnings(record=True): - with pytest.raises(KeyError): - df.ix[[-1]] + with pytest.raises(KeyError): + df.ix[[-1]] - with catch_warnings(record=True): - with pytest.raises(KeyError): - df.ix[:, [-1]] + with pytest.raises(KeyError): + df.ix[:, [-1]] # #1942 a = DataFrame(randn(20, 2), index=[chr(x + 65) for x in range(20)]) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d05321abefca6..d89731dc09044 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -861,23 +861,6 @@ def test_stack_preserve_categorical_dtype(self): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("level", [0, 'baz']) - def test_unstack_swaplevel_sortlevel(self, level): - # GH 20994 - mi = pd.MultiIndex.from_product([[0], ['d', 'c']], - names=['bar', 'baz']) - df = 
pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A']) - df.columns.name = 'foo' - - expected = pd.DataFrame([ - [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([ - ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[ - 'baz', 'foo'])) - expected.index.name = 'bar' - - result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) - tm.assert_frame_equal(result, expected) - def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 599ae683f914b..b60eb89e87da5 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -550,36 +550,18 @@ def test_sort_index(self): expected = frame.iloc[:, ::-1] assert_frame_equal(result, expected) - @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 - def test_sort_index_multiindex(self, level): + def test_sort_index_multiindex(self): # GH13496 # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples([ - [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) - - expected_mi = MultiIndex.from_tuples([ - [1, 1, 1], - [2, 1, 2], - [2, 1, 3]], names=list('ABC')) - expected = pd.DataFrame([ - [5, 6], - [3, 4], - [1, 2]], index=expected_mi) - result = df.sort_index(level=level) - assert_frame_equal(result, expected) + mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4]], mi) - # sort_remaining=False - expected_mi = MultiIndex.from_tuples([ - [1, 1, 1], - [2, 1, 3], - [2, 1, 2]], names=list('ABC')) - expected = pd.DataFrame([ - [5, 6], - [1, 2], - [3, 4]], index=expected_mi) - result = df.sort_index(level=level, sort_remaining=False) + # MI sort, but no level: sort_level has no effect + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4]], mi) + result = df.sort_index(sort_remaining=False) + expected = 
df.sort_index() assert_frame_equal(result, expected) def test_sort_index_intervalindex(self): diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 8b0514764b0c0..45be3974dad63 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -650,14 +650,6 @@ def test_unit_mixed(self, cache): with pytest.raises(ValueError): pd.to_datetime(arr, errors='raise', cache=cache) - @pytest.mark.parametrize('cache', [True, False]) - def test_unit_rounding(self, cache): - # GH 14156: argument will incur floating point errors but no - # premature rounding - result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache) - expected = pd.Timestamp('2015-06-19 19:55:31.877000093') - assert result == expected - @pytest.mark.parametrize('cache', [True, False]) def test_dataframe(self, cache): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f9f16dc0ce8b7..f4fa547574b9e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -245,25 +245,6 @@ def test_constructor_int_dtype_nan(self): result = Index(data, dtype='float') tm.assert_index_equal(result, expected) - def test_droplevel(self, indices): - # GH 21115 - if isinstance(indices, MultiIndex): - # Tested separately in test_multi.py - return - - assert indices.droplevel([]).equals(indices) - - for level in indices.name, [indices.name]: - if isinstance(indices.name, tuple) and level is indices.name: - # GH 21121 : droplevel with tuple name - continue - with pytest.raises(ValueError): - indices.droplevel(level) - - for level in 'wrong', ['wrong']: - with pytest.raises(KeyError): - indices.droplevel(level) - @pytest.mark.parametrize("dtype", ['int64', 'uint64']) def test_constructor_int_dtype_nan_raises(self, dtype): # see gh-15187 @@ -2107,17 +2088,6 @@ def test_get_duplicates_deprecated(self): with tm.assert_produces_warning(FutureWarning): 
index.get_duplicates() - def test_tab_complete_warning(self, ip): - # https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") - from IPython.core.completer import provisionalcompleter - - code = "import pandas as pd; idx = pd.Index([1, 2])" - ip.run_code(code) - with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('idx.', 4)) - class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c9f6bc9151d00..37f70090c179f 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -99,8 +99,7 @@ def test_where_array_like(self): cond = [False, True] for klass in klasses: - def f(): - return i.where(klass(cond)) + f = lambda: i.where(klass(cond)) pytest.raises(NotImplementedError, f) def test_repeat(self): @@ -2079,7 +2078,7 @@ def test_droplevel_with_names(self): expected = index.droplevel(1) assert dropped.equals(expected) - def test_droplevel_list(self): + def test_droplevel_multiple(self): index = MultiIndex( levels=[Index(lrange(4)), Index(lrange(4)), Index(lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( @@ -2090,16 +2089,6 @@ def test_droplevel_list(self): expected = index[:2].droplevel(2).droplevel(0) assert dropped.equals(expected) - dropped = index[:2].droplevel([]) - expected = index[:2] - assert dropped.equals(expected) - - with pytest.raises(ValueError): - index[:2].droplevel(['one', 'two', 'three']) - - with pytest.raises(KeyError): - index[:2].droplevel(['one', 'four']) - def test_drop_not_lexsorted(self): # GH 12078 @@ -2416,8 +2405,7 @@ def check(nlevels, with_nulls): # with a dup if with_nulls: - def f(a): - return np.insert(a, 1000, a[0]) + f = lambda a: np.insert(a, 1000, a[0]) labels = list(map(f, labels)) index = MultiIndex(levels=levels, labels=labels) else: diff --git 
a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 9c992770fc64c..7a17408d4468f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -197,7 +197,7 @@ def test_dups_fancy_indexing(self): # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with pytest.raises(KeyError): - dfnu.loc[['E']] + dfnu.ix[['E']] # ToDo: check_index_type can be True after GH 11497 diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 2b7ff1f5a9879..2423ddcd9a1a0 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -54,21 +54,20 @@ def test_bad_stream_exception(self): # and C engine will raise UnicodeDecodeError instead of # c engine raising ParserError and swallowing exception # that caused read to fail. + handle = open(self.csv_shiftjs, "rb") codec = codecs.lookup("utf-8") utf8 = codecs.lookup('utf-8') - + # stream must be binary UTF8 + stream = codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) if compat.PY3: msg = "'utf-8' codec can't decode byte" else: msg = "'utf8' codec can't decode byte" - - # stream must be binary UTF8 - with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, - codec.streamwriter) as stream: - - with tm.assert_raises_regex(UnicodeDecodeError, msg): - self.read_csv(stream) + with tm.assert_raises_regex(UnicodeDecodeError, msg): + self.read_csv(stream) + stream.close() def test_read_csv(self): if not compat.PY3: diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index e84db66561c49..01c6620e50d37 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -110,15 +110,16 @@ def test_read_csv_infer_compression(self): # see gh-9770 expected = self.read_csv(self.csv1, index_col=0, 
parse_dates=True) - with open(self.csv1) as f: - inputs = [self.csv1, self.csv1 + '.gz', - self.csv1 + '.bz2', f] + inputs = [self.csv1, self.csv1 + '.gz', + self.csv1 + '.bz2', open(self.csv1)] - for inp in inputs: - df = self.read_csv(inp, index_col=0, parse_dates=True, - compression='infer') + for f in inputs: + df = self.read_csv(f, index_col=0, parse_dates=True, + compression='infer') + + tm.assert_frame_equal(expected, df) - tm.assert_frame_equal(expected, df) + inputs[3].close() def test_read_csv_compressed_utf16_example(self): # GH18071 diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index e8d9d8b52164b..ab4c14034cd20 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -35,18 +35,24 @@ def setup_method(self, method): self.xls1 = os.path.join(self.dirpath, 'test.xls') def test_file_handle(self): - with open(self.csv1, 'rb') as f: + try: + f = open(self.csv1, 'rb') reader = TextReader(f) - reader.read() + result = reader.read() # noqa + finally: + f.close() def test_string_filename(self): reader = TextReader(self.csv1, header=None) reader.read() def test_file_handle_mmap(self): - with open(self.csv1, 'rb') as f: + try: + f = open(self.csv1, 'rb') reader = TextReader(f, memory_map=True, header=None) reader.read() + finally: + f.close() def test_StringIO(self): with open(self.csv1, 'rb') as f: diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b80263021c269..5da347e47957c 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -182,8 +182,6 @@ def test_date_time(): fname = os.path.join(dirpath, "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) - # GH 19732: Timestamps imported from sas will incur floating point errors - df.iloc[:, 3] = df.iloc[:, 3].dt.round('us') tm.assert_frame_equal(df, df0) diff --git 
a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 0b1c1ca178762..cfac77291803d 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -128,8 +128,9 @@ def test_string_io(self): with ensure_clean(self.path) as p: s = df.to_msgpack() - with open(p, 'wb') as fh: - fh.write(s) + fh = open(p, 'wb') + fh.write(s) + fh.close() result = read_msgpack(p) tm.assert_frame_equal(result, df) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index dea305d4b3fee..f5e58fa70e1c4 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2487,14 +2487,3 @@ def test_concat_aligned_sort_does_not_raise(): columns=[1, 'a']) result = pd.concat([df, df], ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("s1name,s2name", [ - (np.int64(190), (43, 0)), (190, (43, 0))]) -def test_concat_series_name_npscalar_tuple(s1name, s2name): - # GH21015 - s1 = pd.Series({'a': 1, 'b': 2}, name=s1name) - s2 = pd.Series({'c': 5, 'd': 6}, name=s2name) - result = pd.concat([s1, s2]) - expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6}) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 205fdf49d3e91..3fdc2aa71bfc0 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -106,16 +106,6 @@ def test_compare_timedelta_ndarray(self): class TestTimedeltas(object): - @pytest.mark.parametrize("unit, value, expected", [ - ('us', 9.999, 9999), ('ms', 9.999999, 9999999), - ('s', 9.999999999, 9999999999)]) - def test_rounding_on_int_unit_construction(self, unit, value, expected): - # GH 12690 - result = Timedelta(value, unit=unit) - assert result.value == expected - result = Timedelta(str(value) + unit) - assert result.value == expected - def test_total_seconds_scalar(self): # 
see gh-10939 rng = Timedelta('1 days, 10:11:12.100123456') diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index ab87d98fca8eb..b022b327de57c 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -621,51 +621,10 @@ def test_basics_nanos(self): assert stamp.microsecond == 145224 assert stamp.nanosecond == 192 - @pytest.mark.parametrize('value, check_kwargs', [ - [946688461000000000, {}], - [946688461000000000 / long(1000), dict(unit='us')], - [946688461000000000 / long(1000000), dict(unit='ms')], - [946688461000000000 / long(1000000000), dict(unit='s')], - [10957, dict(unit='D', h=0)], - pytest.param((946688461000000000 + 500000) / long(1000000000), - dict(unit='s', us=499, ns=964), - marks=pytest.mark.skipif(not PY3, - reason='using truediv, so these' - ' are like floats')), - pytest.param((946688461000000000 + 500000000) / long(1000000000), - dict(unit='s', us=500000), - marks=pytest.mark.skipif(not PY3, - reason='using truediv, so these' - ' are like floats')), - pytest.param((946688461000000000 + 500000) / long(1000000), - dict(unit='ms', us=500), - marks=pytest.mark.skipif(not PY3, - reason='using truediv, so these' - ' are like floats')), - pytest.param((946688461000000000 + 500000) / long(1000000000), - dict(unit='s'), - marks=pytest.mark.skipif(PY3, - reason='get chopped in py2')), - pytest.param((946688461000000000 + 500000000) / long(1000000000), - dict(unit='s'), - marks=pytest.mark.skipif(PY3, - reason='get chopped in py2')), - pytest.param((946688461000000000 + 500000) / long(1000000), - dict(unit='ms'), - marks=pytest.mark.skipif(PY3, - reason='get chopped in py2')), - [(946688461000000000 + 500000) / long(1000), dict(unit='us', us=500)], - [(946688461000000000 + 500000000) / long(1000000), - dict(unit='ms', us=500000)], - [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)], - [946688461000000000 / 1000.0 + 5000, 
dict(unit='us', us=5000)], - [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)], - [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)], - [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)], - [10957 + 0.5, dict(unit='D', h=12)]]) - def test_unit(self, value, check_kwargs): - def check(value, unit=None, h=1, s=1, us=0, ns=0): - stamp = Timestamp(value, unit=unit) + def test_unit(self): + + def check(val, unit=None, h=1, s=1, us=0): + stamp = Timestamp(val, unit=unit) assert stamp.year == 2000 assert stamp.month == 1 assert stamp.day == 1 @@ -678,9 +637,41 @@ def check(value, unit=None, h=1, s=1, us=0, ns=0): assert stamp.minute == 0 assert stamp.second == 0 assert stamp.microsecond == 0 - assert stamp.nanosecond == ns + assert stamp.nanosecond == 0 + + ts = Timestamp('20000101 01:01:01') + val = ts.value + days = (ts - Timestamp('1970-01-01')).days + + check(val) + check(val / long(1000), unit='us') + check(val / long(1000000), unit='ms') + check(val / long(1000000000), unit='s') + check(days, unit='D', h=0) - check(value, **check_kwargs) + # using truediv, so these are like floats + if PY3: + check((val + 500000) / long(1000000000), unit='s', us=500) + check((val + 500000000) / long(1000000000), unit='s', us=500000) + check((val + 500000) / long(1000000), unit='ms', us=500) + + # get chopped in py2 + else: + check((val + 500000) / long(1000000000), unit='s') + check((val + 500000000) / long(1000000000), unit='s') + check((val + 500000) / long(1000000), unit='ms') + + # ok + check((val + 500000) / long(1000), unit='us', us=500) + check((val + 500000000) / long(1000000), unit='ms', us=500000) + + # floats + check(val / 1000.0 + 5, unit='us', us=5) + check(val / 1000.0 + 5000, unit='us', us=5000) + check(val / 1000000.0 + 0.5, unit='ms', us=500) + check(val / 1000000.0 + 0.005, unit='ms', us=5) + check(val / 1000000000.0 + 0.5, unit='s', us=500000) + check(days + 0.5, unit='D', h=12) def test_roundtrip(self): diff --git 
a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index e369dfda6deac..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -75,8 +75,9 @@ def test_from_csv(self): series_h = self.read_csv(path, header=0) assert series_h.name == "series" - with open(path, "w") as outfile: - outfile.write("1998-01-01|1.0\n1999-01-01|2.0") + outfile = open(path, "w") + outfile.write("1998-01-01|1.0\n1999-01-01|2.0") + outfile.close() series = self.read_csv(path, sep="|") check_series = Series({datetime(1998, 1, 1): 1.0, diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 13e0d1b12c372..01b4ea6eaa238 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -141,20 +141,19 @@ def test_sort_index_inplace(self): assert result is None tm.assert_series_equal(random_order, self.ts) - @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 - def test_sort_index_multiindex(self, level): + def test_sort_index_multiindex(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] # implicit sort_remaining=True - res = s.sort_index(level=level) + res = s.sort_index(level='A') assert_series_equal(backwards, res) # GH13496 - # sort has no effect without remaining lvls - res = s.sort_index(level=level, sort_remaining=False) + # rows share same level='A': sort has no effect without remaining lvls + res = s.sort_index(level='A', sort_remaining=False) assert_series_equal(s, res) def test_sort_index_kind(self): diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index ab7c4fb528452..d6f58d16bcf64 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -503,25 +503,6 @@ def test_index_equal_metadata_message(self): with tm.assert_raises_regex(AssertionError, expected): assert_index_equal(idx1, idx2) - def 
test_categorical_index_equality(self): - expected = """Index are different - -Attribute "dtype" are different -\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) -\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ -ordered=False\\)""" - - with tm.assert_raises_regex(AssertionError, expected): - assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])), - pd.Index(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c']))) - - def test_categorical_index_equality_relax_categories_check(self): - assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])), - pd.Index(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c'])), - check_categorical=False) - class TestAssertSeriesEqual(object): @@ -619,25 +600,6 @@ def test_series_equal_message(self): assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4]), check_less_precise=True) - def test_categorical_series_equality(self): - expected = """Attributes are different - -Attribute "dtype" are different -\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) -\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ -ordered=False\\)""" - - with tm.assert_raises_regex(AssertionError, expected): - assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])), - pd.Series(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c']))) - - def test_categorical_series_equality_relax_categories_check(self): - assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])), - pd.Series(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c'])), - check_categorical=False) - class TestAssertFrameEqual(object): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 233eba6490937..e1484a9c1b390 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -778,12 +778,8 @@ def assert_index_equal(left, right, exact='equiv', check_names=True, def _check_types(l, r, obj='Index'): if exact: - assert_class_equal(l, r, exact=exact, obj=obj) - - # Skip 
exact dtype checking when `check_categorical` is False - if check_categorical: - assert_attr_equal('dtype', l, r, obj=obj) - + assert_class_equal(left, right, exact=exact, obj=obj) + assert_attr_equal('dtype', l, r, obj=obj) # allow string-like to have different inferred_types if l.inferred_type in ('string', 'unicode'): assert r.inferred_type in ('string', 'unicode') @@ -833,8 +829,7 @@ def _get_ilevel_values(index, level): # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) - # skip exact index checking when `check_categorical` is False - if check_exact and check_categorical: + if check_exact: if not left.equals(right): diff = np.sum((left.values != right.values) .astype(int)) * 100.0 / len(left) @@ -955,23 +950,23 @@ def is_sorted(seq): def assert_categorical_equal(left, right, check_dtype=True, - check_category_order=True, obj='Categorical'): + obj='Categorical', check_category_order=True): """Test that Categoricals are equivalent. Parameters ---------- - left : Categorical - right : Categorical + left, right : Categorical + Categoricals to compare check_dtype : bool, default True Check that integer dtype of the codes are the same + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message check_category_order : bool, default True Whether the order of the categories should be compared, which implies identical integer codes. If False, only the resulting values are compared. The ordered attribute is checked regardless. 
- obj : str, default 'Categorical' - Specify object name being compared, internally used to show appropriate - assertion message """ _check_isinstance(left, right, Categorical) @@ -1025,7 +1020,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): def assert_numpy_array_equal(left, right, strict_nan=False, check_dtype=True, err_msg=None, - check_same=None, obj='numpy array'): + obj='numpy array', check_same=None): """ Checks that 'np.ndarray' is equivalent Parameters @@ -1038,11 +1033,11 @@ def assert_numpy_array_equal(left, right, strict_nan=False, check dtype if both a and b are np.ndarray err_msg : str, default None If provided, used as assertion message - check_same : None|'copy'|'same', default None - Ensure left and right refer/do not refer to the same memory area obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area """ # instance validation From 3aa8561349984859096f0f8220ef2e52071ff293 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 13 Jul 2018 18:12:21 +0530 Subject: [PATCH 02/13] Update 13Jul18 --- doc/source/advanced.rst | 49 +++++++++++ doc/source/api.rst | 11 ++- doc/source/timedeltas.rst | 20 ++++- doc/source/timeseries.rst | 12 +++ doc/source/whatsnew/v0.23.1.txt | 5 -- pandas/_libs/tslibs/timedeltas.pyx | 43 ++++++++-- pandas/core/accessor.py | 3 +- pandas/core/base.py | 2 +- pandas/core/common.py | 7 +- pandas/core/frame.py | 13 ++- pandas/core/indexes/base.py | 54 ++++++++++++ pandas/core/indexes/multi.py | 46 ---------- pandas/core/series.py | 10 +-- pandas/core/strings.py | 4 +- pandas/tests/frame/test_indexing.py | 10 ++- pandas/tests/frame/test_reshape.py | 17 ++++ pandas/tests/frame/test_sorting.py | 34 ++++++-- pandas/tests/indexes/datetimes/test_tools.py | 8 ++ 
pandas/tests/indexes/test_base.py | 30 +++++++ pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/io/parser/common.py | 17 ++-- pandas/tests/io/parser/compression.py | 15 ++-- pandas/tests/io/parser/test_textreader.py | 12 +-- pandas/tests/io/sas/test_sas7bdat.py | 2 + pandas/tests/io/test_packers.py | 5 +- pandas/tests/reshape/test_concat.py | 11 +++ .../tests/scalar/timedelta/test_timedelta.py | 10 +++ .../tests/scalar/timestamp/test_timestamp.py | 85 ++++++++++--------- pandas/tests/series/test_io.py | 5 +- pandas/tests/series/test_sorting.py | 9 +- pandas/tests/util/test_testing.py | 38 +++++++++ pandas/util/testing.py | 29 ++++--- 32 files changed, 437 insertions(+), 181 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 19d745121ce17..e530ece2e12c5 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -924,6 +924,55 @@ bins, with ``NaN`` representing a missing value similar to other dtypes. pd.cut([0, 3, 5, 1], bins=c.categories) + +Generating Ranges of Intervals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If we need intervals on a regular frequency, we can use the :func:`interval_range` function +to create an ``IntervalIndex`` using various combinations of ``start``, ``end``, and ``periods``. +The default frequency for ``interval_range`` is 1 for numeric intervals, and calendar day for +datetime-like intervals: + +.. ipython:: python + + pd.interval_range(start=0, end=5) + + pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4) + + pd.interval_range(end=pd.Timedelta('3 days'), periods=3) + +The ``freq`` parameter can be used to specify non-default frequencies, and can utilize a variety +of :ref:`frequency aliases <timeseries.offset_aliases>` with datetime-like intervals: + +..
ipython:: python + + pd.interval_range(start=0, periods=5, freq=1.5) + + pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W') + + pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H') + +Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals +are closed on. Intervals are closed on the right side by default. + +.. ipython:: python + + pd.interval_range(start=0, end=4, closed='both') + + pd.interval_range(start=0, end=4, closed='neither') + +.. versionadded:: 0.23.0 + +Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced +intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements +in the resulting ``IntervalIndex``: + +.. ipython:: python + + pd.interval_range(start=0, end=6, periods=4) + + pd.interval_range(pd.Timestamp('2018-01-01'), pd.Timestamp('2018-02-28'), periods=3) + Miscellaneous indexing FAQ -------------------------- diff --git a/doc/source/api.rst b/doc/source/api.rst index f3b9529f841a8..fff944651588e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1459,7 +1459,6 @@ Modifying and Computations Index.is_floating Index.is_integer Index.is_interval - Index.is_lexsorted_for_tuple Index.is_mixed Index.is_numeric Index.is_object @@ -1471,11 +1470,19 @@ Modifying and Computations Index.where Index.take Index.putmask - Index.set_names Index.unique Index.nunique Index.value_counts +Compatibility with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.set_names + Index.is_lexsorted_for_tuple + Index.droplevel + Missing Values ~~~~~~~~~~~~~~ .. 
autosummary:: diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index d07d5dc2066dd..e602e45784f4a 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -352,8 +352,8 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the TimedeltaIndex -------------- -To generate an index with time delta, you can use either the ``TimedeltaIndex`` or -the ``timedelta_range`` constructor. +To generate an index with time delta, you can use either the :class:`TimedeltaIndex` or +the :func:`timedelta_range` constructor. Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``, or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent missing values. @@ -394,9 +394,23 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases >>>>>> upstream/master - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 5831c94876ce4..b9405b15a0980 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -241,22 +241,22 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? 
-1: if unit == 'D' or unit == 'd': m = 1000000000L * 86400 - p = 6 + p = 9 elif unit == 'h': m = 1000000000L * 3600 - p = 6 + p = 9 elif unit == 'm': m = 1000000000L * 60 - p = 6 + p = 9 elif unit == 's': m = 1000000000L - p = 6 + p = 9 elif unit == 'ms': m = 1000000L - p = 3 + p = 6 elif unit == 'us': m = 1000L - p = 0 + p = 3 elif unit == 'ns' or unit is None: m = 1L p = 0 @@ -270,10 +270,10 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int base = ts - frac = ts -base + frac = ts - base if p: frac = round(frac, p) - return (base *m) + (frac *m) + return (base * m) + (frac * m) cdef inline _decode_if_necessary(object ts): @@ -799,7 +799,32 @@ cdef class _Timedelta(timedelta): @property def delta(self): - """ return out delta in ns (for internal compat) """ + """ + Return the timedelta in nanoseconds (ns), for internal compatibility. + + Returns + ------- + int + Timedelta in nanoseconds. 
+ + Examples + -------- + >>> td = pd.Timedelta('1 days 42 ns') + >>> td.delta + 86400000000042 + + >>> td = pd.Timedelta('3 s') + >>> td.delta + 3000000000 + + >>> td = pd.Timedelta('3 ms 5 us') + >>> td.delta + 3005000 + + >>> td = pd.Timedelta(42, unit='ns') + >>> td.delta + 42 + """ return self.value @property diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index c638b9e4ea117..7a853d575aa69 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -12,7 +12,8 @@ class DirNamesMixin(object): _accessors = frozenset([]) - _deprecations = frozenset(['asobject']) + _deprecations = frozenset( + ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides']) def _dir_deletions(self): """ delete unwanted __dir__ for this object """ diff --git a/pandas/core/base.py b/pandas/core/base.py index 6ec2c2ce1e043..1226662824eb5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -114,7 +114,7 @@ def _reset_cache(self, key=None): def __sizeof__(self): """ - Generates the total memory usage for a object that returns + Generates the total memory usage for an object that returns either a value or Series of values """ if hasattr(self, 'memory_usage'): diff --git a/pandas/core/common.py b/pandas/core/common.py index 970c91913de4d..0a33873630d27 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -55,8 +55,11 @@ def flatten(l): def _consensus_name_attr(objs): name = objs[0].name for obj in objs[1:]: - if obj.name != name: - return None + try: + if obj.name != name: + name = None + except ValueError: + name = None return name diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2ce8fd393dc1..6380944338010 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4193,9 +4193,8 @@ def _maybe_casted_values(index, labels=None): if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] - if isinstance(self.index, MultiIndex): - if len(level) < self.index.nlevels: 
- new_index = self.index.droplevel(level) + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) if not drop: if isinstance(self.index, MultiIndex): @@ -4555,7 +4554,10 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) labels = self._get_axis(axis) - if level: + # make sure that the axis is lexsorted to start + # if not we need to reconstruct to get the correct indexer + labels = labels._sort_levels_monotonic() + if level is not None: new_axis, indexer = labels.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) @@ -4563,9 +4565,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer - # make sure that the axis is lexsorted to start - # if not we need to reconstruct to get the correct indexer - labels = labels._sort_levels_monotonic() indexer = lexsort_indexer(labels._get_labels_for_sorting(), orders=ascending, na_position=na_position) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 28b29d87109da..78fa6f8217157 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3134,6 +3134,60 @@ def _get_level_values(self, level): get_level_values = _get_level_values + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. If resulting index has + only 1 level left, the result will be of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. 
+ + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8c49c0e58bf4a..a791ce1d87264 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1739,52 +1739,6 @@ def _drop_from_level(self, labels, level): return self[mask] - def droplevel(self, level=0): - """ - Return Index with requested level removed. If MultiIndex has only 2 - levels, the result will be of Index type not MultiIndex. 
- - Parameters - ---------- - level : int/level name or list thereof - - Notes - ----- - Does not check if result index is unique or not - - Returns - ------- - index : Index or MultiIndex - """ - levels = level - if not isinstance(levels, (tuple, list)): - levels = [level] - - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) - - levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1] - - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) - - if len(new_levels) == 1: - - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) - - result.name = new_names[0] - return result - else: - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) - def swaplevel(self, i=-2, j=-1): """ Swap level i with level j. diff --git a/pandas/core/series.py b/pandas/core/series.py index 557159bcf3739..0bdb9d9cc23a6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1207,9 +1207,8 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] - if isinstance(self.index, MultiIndex): - if len(level) < self.index.nlevels: - new_index = self.index.droplevel(level) + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) if inplace: self.index = new_index @@ -2653,7 +2652,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) index = self.index - if level: + if level is not None: new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): @@ -3207,7 +3206,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): # handle ufuncs and lambdas if kwds or args and not 
isinstance(func, np.ufunc): - f = lambda x: func(x, *args, **kwds) + def f(x): + return func(x, *args, **kwds) else: f = func diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ac5fa750fb2e6..e4765c00f80fd 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -241,7 +241,7 @@ def str_count(arr, pat, flags=0): Escape ``'$'`` to find the literal dollar sign. >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\$') + >>> s.str.count('\\$') 0 1 1 0 2 1 @@ -358,7 +358,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): Returning any digit using regular expression. - >>> s1.str.contains('\d', regex=True) + >>> s1.str.contains('\\d', regex=True) 0 False 1 False 2 False diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 81805222b8f01..9ca2b7e3c8a6a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -399,11 +399,13 @@ def test_getitem_setitem_ix_negative_integers(self): df = DataFrame(np.random.randn(8, 4)) # ix does label-based indexing when having an integer index - with pytest.raises(KeyError): - df.ix[[-1]] + with catch_warnings(record=True): + with pytest.raises(KeyError): + df.ix[[-1]] - with pytest.raises(KeyError): - df.ix[:, [-1]] + with catch_warnings(record=True): + with pytest.raises(KeyError): + df.ix[:, [-1]] # #1942 a = DataFrame(randn(20, 2), index=[chr(x + 65) for x in range(20)]) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 538a3fb27ffe2..ebf6c5e37b916 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -871,6 +871,23 @@ def test_stack_preserve_categorical_dtype(self): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("level", [0, 'baz']) + def test_unstack_swaplevel_sortlevel(self, level): + # GH 20994 + mi = pd.MultiIndex.from_product([[0], ['d', 'c']], + names=['bar', 'baz']) + df = 
pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A']) + df.columns.name = 'foo' + + expected = pd.DataFrame([ + [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([ + ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[ + 'baz', 'foo'])) + expected.index.name = 'bar' + + result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) + tm.assert_frame_equal(result, expected) + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b60eb89e87da5..599ae683f914b 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -550,18 +550,36 @@ def test_sort_index(self): expected = frame.iloc[:, ::-1] assert_frame_equal(result, expected) - def test_sort_index_multiindex(self): + @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + def test_sort_index_multiindex(self, level): # GH13496 # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4]], mi) + mi = MultiIndex.from_tuples([ + [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) + + expected_mi = MultiIndex.from_tuples([ + [1, 1, 1], + [2, 1, 2], + [2, 1, 3]], names=list('ABC')) + expected = pd.DataFrame([ + [5, 6], + [3, 4], + [1, 2]], index=expected_mi) + result = df.sort_index(level=level) + assert_frame_equal(result, expected) - # MI sort, but no level: sort_level has no effect - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4]], mi) - result = df.sort_index(sort_remaining=False) - expected = df.sort_index() + # sort_remaining=False + expected_mi = MultiIndex.from_tuples([ + [1, 1, 1], + [2, 1, 3], + [2, 1, 2]], names=list('ABC')) + expected = pd.DataFrame([ + [5, 6], + [1, 2], + [3, 4]], index=expected_mi) + result = df.sort_index(level=level, 
sort_remaining=False) assert_frame_equal(result, expected) def test_sort_index_intervalindex(self): diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 9271a79fe0131..fa9f9fc90387a 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -703,6 +703,14 @@ def test_unit_mixed(self, cache): with pytest.raises(ValueError): pd.to_datetime(arr, errors='raise', cache=cache) + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_rounding(self, cache): + # GH 14156: argument will incur floating point errors but no + # premature rounding + result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache) + expected = pd.Timestamp('2015-06-19 19:55:31.877000093') + assert result == expected + @pytest.mark.parametrize('cache', [True, False]) def test_dataframe(self, cache): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f1fb30990d8da..639e51e9361ab 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -245,6 +245,25 @@ def test_constructor_int_dtype_nan(self): result = Index(data, dtype='float') tm.assert_index_equal(result, expected) + def test_droplevel(self, indices): + # GH 21115 + if isinstance(indices, MultiIndex): + # Tested separately in test_multi.py + return + + assert indices.droplevel([]).equals(indices) + + for level in indices.name, [indices.name]: + if isinstance(indices.name, tuple) and level is indices.name: + # GH 21121 : droplevel with tuple name + continue + with pytest.raises(ValueError): + indices.droplevel(level) + + for level in 'wrong', ['wrong']: + with pytest.raises(KeyError): + indices.droplevel(level) + @pytest.mark.parametrize("dtype", ['int64', 'uint64']) def test_constructor_int_dtype_nan_raises(self, dtype): # see gh-15187 @@ -2112,6 +2131,17 @@ def test_get_duplicates_deprecated(self): with tm.assert_produces_warning(FutureWarning): 
index.get_duplicates() + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; idx = pd.Index([1, 2])" + ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('idx.', 4)) + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 7a17408d4468f..9c992770fc64c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -197,7 +197,7 @@ def test_dups_fancy_indexing(self): # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with pytest.raises(KeyError): - dfnu.ix[['E']] + dfnu.loc[['E']] # ToDo: check_index_type can be True after GH 11497 diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 17ef03f6a88e6..9e871d27f0ce8 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -54,20 +54,21 @@ def test_bad_stream_exception(self): # and C engine will raise UnicodeDecodeError instead of # c engine raising ParserError and swallowing exception # that caused read to fail. 
- handle = open(self.csv_shiftjs, "rb") codec = codecs.lookup("utf-8") utf8 = codecs.lookup('utf-8') - # stream must be binary UTF8 - stream = codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, - codec.streamwriter) + if compat.PY3: msg = "'utf-8' codec can't decode byte" else: msg = "'utf8' codec can't decode byte" - with tm.assert_raises_regex(UnicodeDecodeError, msg): - self.read_csv(stream) - stream.close() + + # stream must be binary UTF8 + with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) as stream: + + with tm.assert_raises_regex(UnicodeDecodeError, msg): + self.read_csv(stream) def test_read_csv(self): if not compat.PY3: diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 48b2cedb63811..e4950af19ea95 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -110,16 +110,15 @@ def test_read_csv_infer_compression(self): # see gh-9770 expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) - inputs = [self.csv1, self.csv1 + '.gz', - self.csv1 + '.bz2', open(self.csv1)] + with open(self.csv1) as f: + inputs = [self.csv1, self.csv1 + '.gz', + self.csv1 + '.bz2', f] - for f in inputs: - df = self.read_csv(f, index_col=0, parse_dates=True, - compression='infer') - - tm.assert_frame_equal(expected, df) + for inp in inputs: + df = self.read_csv(inp, index_col=0, parse_dates=True, + compression='infer') - inputs[3].close() + tm.assert_frame_equal(expected, df) def test_read_csv_compressed_utf16_example(self, datapath): # GH18071 diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index c1e0f1dc753e8..c7026e3e0fc88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -36,24 +36,18 @@ def setup_method(self, datapath): self.xls1 = os.path.join(self.dirpath, 
'test.xls') def test_file_handle(self): - try: - f = open(self.csv1, 'rb') + with open(self.csv1, 'rb') as f: reader = TextReader(f) - result = reader.read() # noqa - finally: - f.close() + reader.read() def test_string_filename(self): reader = TextReader(self.csv1, header=None) reader.read() def test_file_handle_mmap(self): - try: - f = open(self.csv1, 'rb') + with open(self.csv1, 'rb') as f: reader = TextReader(f, memory_map=True, header=None) reader.read() - finally: - f.close() def test_StringIO(self): with open(self.csv1, 'rb') as f: diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index ae40653c28f99..101ee3e619f5b 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -178,6 +178,8 @@ def test_date_time(datapath): fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) + # GH 19732: Timestamps imported from sas will incur floating point errors + df.iloc[:, 3] = df.iloc[:, 3].dt.round('us') tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 491d5fe33cc33..412e218f95c6f 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -129,9 +129,8 @@ def test_string_io(self): with ensure_clean(self.path) as p: s = df.to_msgpack() - fh = open(p, 'wb') - fh.write(s) - fh.close() + with open(p, 'wb') as fh: + fh.write(s) result = read_msgpack(p) tm.assert_frame_equal(result, df) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 0d9f24e01fc57..d05fd689ed754 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2497,3 +2497,14 @@ def test_concat_aligned_sort_does_not_raise(): columns=[1, 'a']) result = pd.concat([df, df], ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize("s1name,s2name", [ + (np.int64(190), (43, 0)), (190, (43, 0))]) +def test_concat_series_name_npscalar_tuple(s1name, s2name): + # GH21015 + s1 = pd.Series({'a': 1, 'b': 2}, name=s1name) + s2 = pd.Series({'c': 5, 'd': 6}, name=s2name) + result = pd.concat([s1, s2]) + expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6}) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 466ece500156a..6472bd4245622 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -106,6 +106,16 @@ def test_compare_timedelta_ndarray(self): class TestTimedeltas(object): + @pytest.mark.parametrize("unit, value, expected", [ + ('us', 9.999, 9999), ('ms', 9.999999, 9999999), + ('s', 9.999999999, 9999999999)]) + def test_rounding_on_int_unit_construction(self, unit, value, expected): + # GH 12690 + result = Timedelta(value, unit=unit) + assert result.value == expected + result = Timedelta(str(value) + unit) + assert result.value == expected + def test_total_seconds_scalar(self): # see gh-10939 rng = Timedelta('1 days, 10:11:12.100123456') diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 5b9af7389d630..4172bfd41b9db 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -648,10 +648,51 @@ def test_basics_nanos(self): assert stamp.microsecond == 145224 assert stamp.nanosecond == 192 - def test_unit(self): - - def check(val, unit=None, h=1, s=1, us=0): - stamp = Timestamp(val, unit=unit) + @pytest.mark.parametrize('value, check_kwargs', [ + [946688461000000000, {}], + [946688461000000000 / long(1000), dict(unit='us')], + [946688461000000000 / long(1000000), dict(unit='ms')], + [946688461000000000 / long(1000000000), dict(unit='s')], + [10957, dict(unit='D', h=0)], + 
pytest.param((946688461000000000 + 500000) / long(1000000000), + dict(unit='s', us=499, ns=964), + marks=pytest.mark.skipif(not PY3, + reason='using truediv, so these' + ' are like floats')), + pytest.param((946688461000000000 + 500000000) / long(1000000000), + dict(unit='s', us=500000), + marks=pytest.mark.skipif(not PY3, + reason='using truediv, so these' + ' are like floats')), + pytest.param((946688461000000000 + 500000) / long(1000000), + dict(unit='ms', us=500), + marks=pytest.mark.skipif(not PY3, + reason='using truediv, so these' + ' are like floats')), + pytest.param((946688461000000000 + 500000) / long(1000000000), + dict(unit='s'), + marks=pytest.mark.skipif(PY3, + reason='get chopped in py2')), + pytest.param((946688461000000000 + 500000000) / long(1000000000), + dict(unit='s'), + marks=pytest.mark.skipif(PY3, + reason='get chopped in py2')), + pytest.param((946688461000000000 + 500000) / long(1000000), + dict(unit='ms'), + marks=pytest.mark.skipif(PY3, + reason='get chopped in py2')), + [(946688461000000000 + 500000) / long(1000), dict(unit='us', us=500)], + [(946688461000000000 + 500000000) / long(1000000), + dict(unit='ms', us=500000)], + [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)], + [946688461000000000 / 1000.0 + 5000, dict(unit='us', us=5000)], + [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)], + [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)], + [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)], + [10957 + 0.5, dict(unit='D', h=12)]]) + def test_unit(self, value, check_kwargs): + def check(value, unit=None, h=1, s=1, us=0, ns=0): + stamp = Timestamp(value, unit=unit) assert stamp.year == 2000 assert stamp.month == 1 assert stamp.day == 1 @@ -664,41 +705,9 @@ def check(val, unit=None, h=1, s=1, us=0): assert stamp.minute == 0 assert stamp.second == 0 assert stamp.microsecond == 0 - assert stamp.nanosecond == 0 - - ts = Timestamp('20000101 01:01:01') - val = ts.value - days = (ts 
- Timestamp('1970-01-01')).days - - check(val) - check(val / long(1000), unit='us') - check(val / long(1000000), unit='ms') - check(val / long(1000000000), unit='s') - check(days, unit='D', h=0) + assert stamp.nanosecond == ns - # using truediv, so these are like floats - if PY3: - check((val + 500000) / long(1000000000), unit='s', us=500) - check((val + 500000000) / long(1000000000), unit='s', us=500000) - check((val + 500000) / long(1000000), unit='ms', us=500) - - # get chopped in py2 - else: - check((val + 500000) / long(1000000000), unit='s') - check((val + 500000000) / long(1000000000), unit='s') - check((val + 500000) / long(1000000), unit='ms') - - # ok - check((val + 500000) / long(1000), unit='us', us=500) - check((val + 500000000) / long(1000000), unit='ms', us=500000) - - # floats - check(val / 1000.0 + 5, unit='us', us=5) - check(val / 1000.0 + 5000, unit='us', us=5000) - check(val / 1000000.0 + 0.5, unit='ms', us=500) - check(val / 1000000.0 + 0.005, unit='ms', us=5) - check(val / 1000000000.0 + 0.5, unit='s', us=500000) - check(days + 0.5, unit='D', h=12) + check(value, **check_kwargs) def test_roundtrip(self): diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 90f37053ce17e..814d794d45c18 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -76,9 +76,8 @@ def test_from_csv(self): series_h = self.read_csv(path, header=0) assert series_h.name == "series" - outfile = open(path, "w") - outfile.write("1998-01-01|1.0\n1999-01-01|2.0") - outfile.close() + with open(path, "w") as outfile: + outfile.write("1998-01-01|1.0\n1999-01-01|2.0") series = self.read_csv(path, sep="|") check_series = Series({datetime(1998, 1, 1): 1.0, diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 01b4ea6eaa238..13e0d1b12c372 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -141,19 +141,20 @@ def test_sort_index_inplace(self): assert 
result is None tm.assert_series_equal(random_order, self.ts) - def test_sort_index_multiindex(self): + @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + def test_sort_index_multiindex(self, level): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] # implicit sort_remaining=True - res = s.sort_index(level='A') + res = s.sort_index(level=level) assert_series_equal(backwards, res) # GH13496 - # rows share same level='A': sort has no effect without remaining lvls - res = s.sort_index(level='A', sort_remaining=False) + # sort has no effect without remaining lvls + res = s.sort_index(level=level, sort_remaining=False) assert_series_equal(s, res) def test_sort_index_kind(self): diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index c0e8b8b627686..95ea4658212e9 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -504,6 +504,25 @@ def test_index_equal_metadata_message(self): with tm.assert_raises_regex(AssertionError, expected): assert_index_equal(idx1, idx2) + def test_categorical_index_equality(self): + expected = """Index are different + +Attribute "dtype" are different +\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) +\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ +ordered=False\\)""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])), + pd.Index(pd.Categorical(['a', 'b'], + categories=['a', 'b', 'c']))) + + def test_categorical_index_equality_relax_categories_check(self): + assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])), + pd.Index(pd.Categorical(['a', 'b'], + categories=['a', 'b', 'c'])), + check_categorical=False) + class TestAssertSeriesEqual(object): @@ -601,6 +620,25 @@ def test_series_equal_message(self): assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4]), 
check_less_precise=True) + def test_categorical_series_equality(self): + expected = """Attributes are different + +Attribute "dtype" are different +\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) +\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ +ordered=False\\)""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])), + pd.Series(pd.Categorical(['a', 'b'], + categories=['a', 'b', 'c']))) + + def test_categorical_series_equality_relax_categories_check(self): + assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])), + pd.Series(pd.Categorical(['a', 'b'], + categories=['a', 'b', 'c'])), + check_categorical=False) + class TestAssertFrameEqual(object): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 85fc1b16c73fa..54ae8cfb3d39e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -809,8 +809,12 @@ def assert_index_equal(left, right, exact='equiv', check_names=True, def _check_types(l, r, obj='Index'): if exact: - assert_class_equal(left, right, exact=exact, obj=obj) - assert_attr_equal('dtype', l, r, obj=obj) + assert_class_equal(l, r, exact=exact, obj=obj) + + # Skip exact dtype checking when `check_categorical` is False + if check_categorical: + assert_attr_equal('dtype', l, r, obj=obj) + # allow string-like to have different inferred_types if l.inferred_type in ('string', 'unicode'): assert r.inferred_type in ('string', 'unicode') @@ -860,7 +864,8 @@ def _get_ilevel_values(index, level): # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) - if check_exact: + # skip exact index checking when `check_categorical` is False + if check_exact and check_categorical: if not left.equals(right): diff = np.sum((left.values != right.values) .astype(int)) * 100.0 / len(left) @@ -981,23 +986,23 @@ def is_sorted(seq): def assert_categorical_equal(left, right, check_dtype=True, - 
obj='Categorical', check_category_order=True): + check_category_order=True, obj='Categorical'): """Test that Categoricals are equivalent. Parameters ---------- - left, right : Categorical - Categoricals to compare + left : Categorical + right : Categorical check_dtype : bool, default True Check that integer dtype of the codes are the same - obj : str, default 'Categorical' - Specify object name being compared, internally used to show appropriate - assertion message check_category_order : bool, default True Whether the order of the categories should be compared, which implies identical integer codes. If False, only the resulting values are compared. The ordered attribute is checked regardless. + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message """ _check_isinstance(left, right, Categorical) @@ -1051,7 +1056,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): def assert_numpy_array_equal(left, right, strict_nan=False, check_dtype=True, err_msg=None, - obj='numpy array', check_same=None): + check_same=None, obj='numpy array'): """ Checks that 'np.ndarray' is equivalent Parameters @@ -1064,11 +1069,11 @@ def assert_numpy_array_equal(left, right, strict_nan=False, check dtype if both a and b are np.ndarray err_msg : str, default None If provided, used as assertion message + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message - check_same : None|'copy'|'same', default None - Ensure left and right refer/do not refer to the same memory area """ # instance validation From ee66578f4053f69ed81987c196625a37978bfbda Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 13 Jul 2018 18:29:01 +0530 Subject: [PATCH 03/13] Initial commit --- 
doc/source/whatsnew/v0.23.4.txt | 2 +- pandas/core/algorithms.py | 52 ++++++++++++++++++-------------- pandas/tests/test_algos.py | 53 +++++++++++---------------------- 3 files changed, 48 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index a88c22e3d01f7..b0ac44dc91d37 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -31,7 +31,7 @@ Bug Fixes **Conversion** -- +- Unwanted casting of float to int in :func:`isin` (:issue:`21804`) - **Indexing** diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6e49e8044ff25..989fdd3098f68 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,7 +23,7 @@ is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, - is_datetimetz, + is_datetimetz, is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_datetimelike, is_interval_dtype, is_scalar, is_list_like, @@ -39,6 +39,8 @@ from pandas.util._decorators import (Appender, Substitution, deprecate_kwarg) +from pandas._libs.tslibs.timestamps import Timestamp + _shared_docs = {} @@ -413,35 +415,41 @@ def isin(comps, values): return comps._values.isin(values) comps = com._values_from_object(comps) + comps, dtype_comps, _ = _ensure_data(comps) - comps, dtype, _ = _ensure_data(comps) - values, _, _ = _ensure_data(values, dtype=dtype) + is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x) + or isinstance(x, Timestamp)) - # faster for larger cases to use np.in1d - f = lambda x, y: htable.ismember_object(x, values) + is_int = lambda x: ((x == np.int64) or (x == int)) + + is_float = lambda x: ((x == np.float64) or (x == float)) + + f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception + # faster for larger cases to use np.in1d if len(comps) > 1000000 and not 
is_object_dtype(comps): f = lambda x, y: np.in1d(x, y) - elif is_integer_dtype(comps): - try: - values = values.astype('int64', copy=False) - comps = comps.astype('int64', copy=False) - f = lambda x, y: htable.ismember_int64(x, y) - except (TypeError, ValueError): - values = values.astype(object) - comps = comps.astype(object) - elif is_float_dtype(comps): - try: - values = values.astype('float64', copy=False) - comps = comps.astype('float64', copy=False) - checknull = isna(values).any() - f = lambda x, y: htable.ismember_float64(x, y, checknull) - except (TypeError, ValueError): - values = values.astype(object) - comps = comps.astype(object) + if is_time_like(dtype_comps): + values, _, _ = _ensure_data(values, dtype=dtype_comps) + else: + values, dtype_values, _ = _ensure_data(values) + comps_types = set([type(v) for v in comps]) + values_types = set([type(v) for v in values]) + if len(comps_types) == len(values_types) == 1: + comps_types = comps_types.pop() + values_types = values_types.pop() + if (is_int(comps_types) and is_int(values_types)): + values = values.astype('int64', copy=False) + comps = comps.astype('int64', copy=False) + f = lambda x, y: htable.ismember_int64(x, y) + elif (is_float(comps_types) and is_float(values_types)): + values = values.astype('float64', copy=False) + comps = comps.astype('float64', copy=False) + checknull = isna(values).any() + f = lambda x, y: htable.ismember_float64(x, y, checknull) return f(comps, values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 25e64aa82cc36..8ab907a9723bd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -509,42 +509,23 @@ def test_invalid(self): pytest.raises(TypeError, lambda: algos.isin(1, [1])) pytest.raises(TypeError, lambda: algos.isin([1], 1)) - def test_basic(self): - - result = algos.isin([1, 2], [1]) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(np.array([1, 2]), [1]) - 
expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(Series([1, 2]), [1]) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(Series([1, 2]), Series([1])) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(Series([1, 2]), set([1])) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(['a', 'b'], ['a']) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(Series(['a', 'b']), Series(['a'])) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(Series(['a', 'b']), set(['a'])) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - - result = algos.isin(['a', 'b'], [1]) - expected = np.array([False, False]) + @pytest.mark.parametrize("comps,values,expected", [ + ([1, 2], [1], [True, False]), + ([1, 0], [1, 0.5], [True, False]), + ([1.0, 0], [1, 0.5], [True, False]), + ([1.0, 0.0], [1, 0], [True, True]), + (np.array([1, 2]), [1], [True, False]), + (Series([1, 2]), [1], [True, False]), + (Series([1, 2]), Series([1]), [True, False]), + (Series([1, 2]), set([1]), [True, False]), + (['a', 'b'], ['a'], [True, False]), + (Series(['a', 'b']), Series(['a']), [True, False]), + (Series(['a', 'b']), set(['a']), [True, False]), + (['a', 'b'], [1], [False, False]) + ]) + def test_basic(self, comps, values, expected): + result = algos.isin(comps, values) + expected = np.array(expected) tm.assert_numpy_array_equal(result, expected) def test_i8(self): From 84be606b42a24539b4f8f7cf3445e318f5e496e7 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 13 Jul 2018 21:40:23 +0530 Subject: [PATCH 04/13] Replaced list with generator --- pandas/core/algorithms.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 989fdd3098f68..21398df60b091 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -436,8 +436,8 @@ def isin(comps, values): values, _, _ = _ensure_data(values, dtype=dtype_comps) else: values, dtype_values, _ = _ensure_data(values) - comps_types = set([type(v) for v in comps]) - values_types = set([type(v) for v in values]) + comps_types = set(type(v) for v in comps) + values_types = set(type(v) for v in values) if len(comps_types) == len(values_types) == 1: comps_types = comps_types.pop() values_types = values_types.pop() From f8cc271be991ef2892a80b37091d37d6cb3af9a2 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Sun, 15 Jul 2018 08:40:09 +0530 Subject: [PATCH 05/13] Code restructure --- pandas/core/algorithms.py | 55 +++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 21398df60b091..3ae5dc262da65 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -415,41 +415,50 @@ def isin(comps, values): return comps._values.isin(values) comps = com._values_from_object(comps) - comps, dtype_comps, _ = _ensure_data(comps) + + comps, dtype, _ = _ensure_data(comps) is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x) or isinstance(x, Timestamp)) + if is_time_like(dtype): + values, _, _ = _ensure_data(values, dtype=dtype) + else: + values, _, _ = _ensure_data(values) - is_int = lambda x: ((x == np.int64) or (x == int)) + # faster for larger cases to use np.in1d + f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) + + # This block checks if comps and values + # are all int or all float + int_flg = False + float_flg = False + is_int = lambda x: ((x == np.int64) or (x == int)) is_float = lambda x: ((x == np.float64) or (x == float)) - f 
= lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) + comps_types = set(type(v) for v in comps) + values_types = set(type(v) for v in values) + + if len(comps_types) == len(values_types) == 1: + if (is_int(comps_types) and is_int(values_types)): + int_flg = True + elif (is_float(comps_types) and is_float(values_types)): + float_flg = True # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - # faster for larger cases to use np.in1d if len(comps) > 1000000 and not is_object_dtype(comps): f = lambda x, y: np.in1d(x, y) - - if is_time_like(dtype_comps): - values, _, _ = _ensure_data(values, dtype=dtype_comps) - else: - values, dtype_values, _ = _ensure_data(values) - comps_types = set(type(v) for v in comps) - values_types = set(type(v) for v in values) - if len(comps_types) == len(values_types) == 1: - comps_types = comps_types.pop() - values_types = values_types.pop() - if (is_int(comps_types) and is_int(values_types)): - values = values.astype('int64', copy=False) - comps = comps.astype('int64', copy=False) - f = lambda x, y: htable.ismember_int64(x, y) - elif (is_float(comps_types) and is_float(values_types)): - values = values.astype('float64', copy=False) - comps = comps.astype('float64', copy=False) - checknull = isna(values).any() - f = lambda x, y: htable.ismember_float64(x, y, checknull) + elif int_flg: + values = values.astype('int64', copy=False) + comps = comps.astype('int64', copy=False) + f = lambda x, y: htable.ismember_int64(x, y) + + elif float_flg: + values = values.astype('float64', copy=False) + comps = comps.astype('float64', copy=False) + checknull = isna(values).any() + f = lambda x, y: htable.ismember_float64(x, y, checknull) return f(comps, values) From b71dad65f1fb0e1a3e82d040e3e206368e8cbb70 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Sun, 15 Jul 2018 09:34:10 +0530 Subject: [PATCH 06/13] Code restructing, included missing element 
--- pandas/core/algorithms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3ae5dc262da65..9ba83080daed4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -440,6 +440,8 @@ def isin(comps, values): values_types = set(type(v) for v in values) if len(comps_types) == len(values_types) == 1: + comps_types = comps_types.pop() + values_types = values_types.pop() if (is_int(comps_types) and is_int(values_types)): int_flg = True elif (is_float(comps_types) and is_float(values_types)): From 54167114a201a5edee13d519fc92ec329f35b341 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Sun, 15 Jul 2018 14:59:30 +0530 Subject: [PATCH 07/13] Cleaned removed ifs for int_flg and float_flg --- pandas/core/algorithms.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9ba83080daed4..f7a5d9d91d894 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -420,6 +420,7 @@ def isin(comps, values): is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x) or isinstance(x, Timestamp)) + if is_time_like(dtype): values, _, _ = _ensure_data(values, dtype=dtype) else: @@ -428,24 +429,20 @@ def isin(comps, values): # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) - # This block checks if comps and values - # are all int or all float - int_flg = False - float_flg = False + comps_types = set(type(v) for v in comps) + values_types = set(type(v) for v in values) is_int = lambda x: ((x == np.int64) or (x == int)) is_float = lambda x: ((x == np.float64) or (x == float)) - comps_types = set(type(v) for v in comps) - values_types = set(type(v) for v in values) + int_flg = False + float_flg = False if len(comps_types) == len(values_types) == 1: comps_types = comps_types.pop() values_types 
= values_types.pop() - if (is_int(comps_types) and is_int(values_types)): - int_flg = True - elif (is_float(comps_types) and is_float(values_types)): - float_flg = True + int_flg = (is_int(comps_types) and is_int(values_types)) + float_flg = (is_float(comps_types) and is_float(values_types)) # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception From 9fca52c57b349b87298b28f3c1cb66d1627e11bf Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Tue, 17 Jul 2018 21:16:11 +0530 Subject: [PATCH 08/13] Using existing type check functions --- pandas/core/algorithms.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f7a5d9d91d894..424d65e72934d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -16,12 +16,12 @@ from pandas.core.dtypes.common import ( is_array_like, is_unsigned_integer_dtype, is_signed_integer_dtype, - is_integer_dtype, is_complex_dtype, + is_integer, is_integer_dtype, is_complex_dtype, is_object_dtype, is_extension_array_dtype, is_categorical_dtype, is_sparse, is_period_dtype, - is_numeric_dtype, is_float_dtype, + is_numeric_dtype, is_float, is_float_dtype, is_bool_dtype, needs_i8_conversion, is_datetimetz, is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, @@ -432,16 +432,13 @@ def isin(comps, values): comps_types = set(type(v) for v in comps) values_types = set(type(v) for v in values) - is_int = lambda x: ((x == np.int64) or (x == int)) - is_float = lambda x: ((x == np.float64) or (x == float)) - int_flg = False float_flg = False if len(comps_types) == len(values_types) == 1: comps_types = comps_types.pop() values_types = values_types.pop() - int_flg = (is_int(comps_types) and is_int(values_types)) + int_flg = (is_integer(comps_types) and is_integer(values_types)) float_flg = (is_float(comps_types) and is_float(values_types)) # GH16012 From 
dd37f9ca59ab7667b6be7babe09e671f61502cbf Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Tue, 17 Jul 2018 21:36:00 +0530 Subject: [PATCH 09/13] Revert "Using existing type check functions" This reverts commit 9fca52c57b349b87298b28f3c1cb66d1627e11bf. --- pandas/core/algorithms.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 424d65e72934d..f7a5d9d91d894 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -16,12 +16,12 @@ from pandas.core.dtypes.common import ( is_array_like, is_unsigned_integer_dtype, is_signed_integer_dtype, - is_integer, is_integer_dtype, is_complex_dtype, + is_integer_dtype, is_complex_dtype, is_object_dtype, is_extension_array_dtype, is_categorical_dtype, is_sparse, is_period_dtype, - is_numeric_dtype, is_float, is_float_dtype, + is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, is_datetimetz, is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, @@ -432,13 +432,16 @@ def isin(comps, values): comps_types = set(type(v) for v in comps) values_types = set(type(v) for v in values) + is_int = lambda x: ((x == np.int64) or (x == int)) + is_float = lambda x: ((x == np.float64) or (x == float)) + int_flg = False float_flg = False if len(comps_types) == len(values_types) == 1: comps_types = comps_types.pop() values_types = values_types.pop() - int_flg = (is_integer(comps_types) and is_integer(values_types)) + int_flg = (is_int(comps_types) and is_int(values_types)) float_flg = (is_float(comps_types) and is_float(values_types)) # GH16012 From 52f813116b7fe5a3701c3c1a4f5a430ed90487c4 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 20 Jul 2018 21:35:40 +0530 Subject: [PATCH 10/13] Remove flgs and retain try-except --- pandas/core/algorithms.py | 41 +++++++++++++++++++++------------------ 1 
file changed, 22 insertions(+), 19 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f7a5d9d91d894..2996b11d74ad9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -427,37 +427,40 @@ def isin(comps, values): values, _, _ = _ensure_data(values) # faster for larger cases to use np.in1d - f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) + f = lambda x, y: htable.ismember_object(x, y) comps_types = set(type(v) for v in comps) values_types = set(type(v) for v in values) - is_int = lambda x: ((x == np.int64) or (x == int)) - is_float = lambda x: ((x == np.float64) or (x == float)) - - int_flg = False - float_flg = False - if len(comps_types) == len(values_types) == 1: comps_types = comps_types.pop() values_types = values_types.pop() - int_flg = (is_int(comps_types) and is_int(values_types)) - float_flg = (is_float(comps_types) and is_float(values_types)) + + is_int = lambda x: ((x == np.int64) or (x == int)) + is_float = lambda x: ((x == np.float64) or (x == float)) # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1000000 and not is_object_dtype(comps): f = lambda x, y: np.in1d(x, y) - elif int_flg: - values = values.astype('int64', copy=False) - comps = comps.astype('int64', copy=False) - f = lambda x, y: htable.ismember_int64(x, y) - - elif float_flg: - values = values.astype('float64', copy=False) - comps = comps.astype('float64', copy=False) - checknull = isna(values).any() - f = lambda x, y: htable.ismember_float64(x, y, checknull) + elif (is_int(comps_types) and is_int(values_types)): + try: + values = values.astype('int64', copy=False) + comps = comps.astype('int64', copy=False) + f = lambda x, y: htable.ismember_int64(x, y) + except (TypeError, ValueError): + values = values.astype(object) + comps = comps.astype(object) + + elif (is_float(comps_types) and is_float(values_types)): + try: + values = values.astype('float64', 
copy=False) + comps = comps.astype('float64', copy=False) + checknull = isna(values).any() + f = lambda x, y: htable.ismember_float64(x, y, checknull) + except (TypeError, ValueError): + values = values.astype(object) + comps = comps.astype(object) return f(comps, values) From 63a5f1549b03ae517001fee7338f860ef498f922 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 20 Jul 2018 21:48:13 +0530 Subject: [PATCH 11/13] Remove flgs --- pandas/core/algorithms.py | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2996b11d74ad9..3833917f2e419 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -421,46 +421,36 @@ def isin(comps, values): is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x) or isinstance(x, Timestamp)) + is_int = lambda x: ((x == np.int64) or (x == int)) + is_float = lambda x: ((x == np.float64) or (x == float)) + if is_time_like(dtype): values, _, _ = _ensure_data(values, dtype=dtype) else: values, _, _ = _ensure_data(values) - # faster for larger cases to use np.in1d - f = lambda x, y: htable.ismember_object(x, y) - comps_types = set(type(v) for v in comps) values_types = set(type(v) for v in values) - if len(comps_types) == len(values_types) == 1: - comps_types = comps_types.pop() - values_types = values_types.pop() - - is_int = lambda x: ((x == np.int64) or (x == int)) - is_float = lambda x: ((x == np.float64) or (x == float)) + # faster for larger cases to use np.in1d + f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1000000 and not is_object_dtype(comps): f = lambda x, y: np.in1d(x, y) - elif (is_int(comps_types) and is_int(values_types)): - try: + elif len(comps_types) == len(values_types) == 1: + comps_types = comps_types.pop() + 
values_types = values_types.pop() + if (is_int(comps_types) and is_int(values_types)): values = values.astype('int64', copy=False) comps = comps.astype('int64', copy=False) f = lambda x, y: htable.ismember_int64(x, y) - except (TypeError, ValueError): - values = values.astype(object) - comps = comps.astype(object) - - elif (is_float(comps_types) and is_float(values_types)): - try: + elif (is_float(comps_types) and is_float(values_types)): values = values.astype('float64', copy=False) comps = comps.astype('float64', copy=False) checknull = isna(values).any() f = lambda x, y: htable.ismember_float64(x, y, checknull) - except (TypeError, ValueError): - values = values.astype(object) - comps = comps.astype(object) return f(comps, values) From 2e0bb49691a83ccfb360441be2647f29a9c0db78 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 20 Jul 2018 22:15:50 +0530 Subject: [PATCH 12/13] Remove flg, fix whatsnew --- doc/source/whatsnew/v0.23.4.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index b0ac44dc91d37..c85a9aabf7dbd 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -31,7 +31,7 @@ Bug Fixes **Conversion** -- Unwanted casting of float to int in :func:`isin` (:issue:`21804`) +- Bug in :func:`isin` which led to unwanted casting of float to int (:issue:`21804`) - **Indexing** From 82386c3199a1139cbf0655ea54051ccfff050aa4 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Fri, 20 Jul 2018 22:23:55 +0530 Subject: [PATCH 13/13] Whatsnew updated --- doc/source/whatsnew/v0.23.4.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index c85a9aabf7dbd..9c2ef7ecb601c 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -31,7 +31,7 @@ Bug 
Fixes **Conversion** -- Bug in :func:`isin` which led to unwanted casting of float to int (:issue:`21804`) +- Bug where unwanted casting of float to int in :func:`isin` led to incorrect comparison outcome (:issue:`21804`) - **Indexing**