diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b8b1e106fcf9..4cdd495fe0c31 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -21,12 +21,12 @@ jobs: - template: ci/azure/posix.yml parameters: name: macOS - vmImage: macOS-10.14 + vmImage: macOS-10.15 - template: ci/azure/windows.yml parameters: name: Windows - vmImage: vs2017-win2016 + vmImage: windows-2019 - job: py38_32bit pool: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index e812aaa760a8f..721b1af126709 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -812,7 +812,21 @@ Changes should be reflected in the release notes located in ``doc/source/whatsne This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the -issue/pull request number). +issue/pull request number). Your entry should be written using full sentences and proper +grammar. + +When mentioning parts of the API, use a Sphinx ``:func:``, ``:meth:``, or ``:class:`` +role as appropriate. Not all public API functions and methods have a +documentation page; ideally links would only be added if they resolve. You can +usually find similar examples by checking the release notes for one of the previous +versions. + +If your code is a bugfix, add your entry to the relevant bugfix section. Avoid +adding to the ``Other`` section; only in rare cases should entries go there. +Being as concise as possible, the description of the bug should include how the +user may encounter it and an indication of the bug itself, e.g. +"produces incorrect results" or "incorrectly raises". It may be necessary to also +indicate the new behavior. 
If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b87274307431b..03dfe475475a1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -338,19 +338,20 @@ maps labels to their new names along the default axis, is allowed to be passed b *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df = pd.DataFrame([[1]]) - >>> df.rename({0: 1}, {0: 2}) + In [1]: df = pd.DataFrame([[1]]) + In [2]: df.rename({0: 1}, {0: 2}) + Out[2]: FutureWarning: ...Use named arguments to resolve ambiguity... 2 1 1 *pandas 1.0.0* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, {0: 2}) + In [3]: df.rename({0: 1}, {0: 2}) Traceback (most recent call last): ... TypeError: rename() takes from 1 to 2 positional arguments but 3 were given @@ -359,26 +360,28 @@ Note that errors will now be raised when conflicting or potentially ambiguous ar *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, index={0: 2}) + In [4]: df.rename({0: 1}, index={0: 2}) + Out[4]: 0 1 1 - >>> df.rename(mapper={0: 1}, index={0: 2}) + In [5]: df.rename(mapper={0: 1}, index={0: 2}) + Out[5]: 0 2 1 *pandas 1.0.0* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, index={0: 2}) + In [6]: df.rename({0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' - >>> df.rename(mapper={0: 1}, index={0: 2}) + In [7]: df.rename(mapper={0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' @@ -405,12 +408,12 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: python +.. 
code-block:: ipython - >>> df = pd.DataFrame({"int_col": [1, 2, 3], + In [1]: df = pd.DataFrame({"int_col": [1, 2, 3], ... "text_col": ["a", "b", "c"], ... "float_col": [0.0, 0.1, 0.2]}) - >>> df.info(verbose=True) + In [2]: df.info(verbose=True) RangeIndex: 3 entries, 0 to 2 Data columns (total 3 columns): @@ -440,14 +443,16 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.array(["a", None]) + In [1]: pd.array(["a", None]) + Out[1]: ['a', None] Length: 2, dtype: object - >>> pd.array([1, None]) + In [2]: pd.array([1, None]) + Out[2]: [1, None] Length: 2, dtype: object @@ -470,15 +475,17 @@ As a reminder, you can specify the ``dtype`` to disable all inference. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> a = pd.array([1, 2, None], dtype="Int64") - >>> a + In [1]: a = pd.array([1, 2, None], dtype="Int64") + In [2]: a + Out[2]: [1, 2, NaN] Length: 3, dtype: Int64 - >>> a[2] + In [3]: a[2] + Out[3]: nan *pandas 1.0.0* @@ -499,9 +506,10 @@ will now raise. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> np.asarray(a, dtype="float") + In [1]: np.asarray(a, dtype="float") + Out[1]: array([ 1., 2., nan]) *pandas 1.0.0* @@ -525,9 +533,10 @@ will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.Series(a).sum(skipna=False) + In [1]: pd.Series(a).sum(skipna=False) + Out[1]: nan *pandas 1.0.0* @@ -543,9 +552,10 @@ integer dtype for the values. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + In [1]: pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + Out[1]: dtype('int64') *pandas 1.0.0* @@ -565,15 +575,17 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a *pandas 0.25.x* -.. code-block:: python +.. 
code-block:: ipython - >>> a = pd.array([1, 2, None], dtype="Int64") - >>> a + In [1]: a = pd.array([1, 2, None], dtype="Int64") + In [2]: a + Out[2]: [1, 2, NaN] Length: 3, dtype: Int64 - >>> a > 1 + In [3]: a > 1 + Out[3]: array([False, True, False]) *pandas 1.0.0* @@ -640,9 +652,10 @@ scalar values in the result are instances of the extension dtype's scalar type. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.resample("2D").agg(lambda x: 'a').A.dtype + In [1]: df.resample("2D").agg(lambda x: 'a').A.dtype + Out[1]: CategoricalDtype(categories=['a', 'b'], ordered=False) *pandas 1.0.0* @@ -657,9 +670,10 @@ depending on how the results are cast back to the original dtype. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.resample("2D").agg(lambda x: 'c') + In [1]: df.resample("2D").agg(lambda x: 'c') + Out[1]: A 0 NaN @@ -871,10 +885,10 @@ matplotlib directly rather than :meth:`~DataFrame.plot`. To use pandas formatters with a matplotlib plot, specify -.. code-block:: python +.. code-block:: ipython - >>> import pandas as pd - >>> pd.options.plotting.matplotlib.register_converters = True + In [1]: import pandas as pd + In [2]: pd.options.plotting.matplotlib.register_converters = True Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters automatically. The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot`` diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index bfe30d52e2aff..34e28eab6d4bf 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -52,20 +52,23 @@ DataFrame / Series combination) would ignore the indices, only match the inputs by shape, and use the index/columns of the first DataFrame for the result: -.. code-block:: python +.. code-block:: ipython - >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) - ... 
df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) - >>> df1 + In [1]: df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) + In [2]: df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) + In [3]: df1 + Out[3]: a b 0 1 3 1 2 4 - >>> df2 + In [4]: df2 + Out[4]: a b 1 1 3 2 2 4 - >>> np.add(df1, df2) + In [5]: np.add(df1, df2) + Out[5]: a b 0 2 6 1 4 8 @@ -73,9 +76,10 @@ the result: This contrasts with how other pandas operations work, which first align the inputs: -.. code-block:: python +.. code-block:: ipython - >>> df1 + df2 + In [6]: df1 + df2 + Out[6]: a b 0 NaN NaN 1 3.0 7.0 @@ -94,9 +98,10 @@ objects (eg ``np.add(s1, s2)``) already aligns and continues to do so. To avoid the warning and keep the current behaviour of ignoring the indices, convert one of the arguments to a NumPy array: -.. code-block:: python +.. code-block:: ipython - >>> np.add(df1, np.asarray(df2)) + In [7]: np.add(df1, np.asarray(df2)) + Out[7]: a b 0 2 6 1 4 8 @@ -104,10 +109,11 @@ convert one of the arguments to a NumPy array: To obtain the future behaviour and silence the warning, you can align manually before passing the arguments to the ufunc: -.. code-block:: python +.. code-block:: ipython - >>> df1, df2 = df1.align(df2) - >>> np.add(df1, df2) + In [8]: df1, df2 = df1.align(df2) + In [9]: np.add(df1, df2) + Out[9]: a b 0 NaN NaN 1 3.0 7.0 diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 9c17a22bf6d52..03aa87193ab9d 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Pandas could not be built on PyPy (:issue:`42355`) - :class:`DataFrame` constructed with with an older version of pandas could not be unpickled (:issue:`42345`) +- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42338`) - .. 
--------------------------------------------------------------------------- @@ -24,7 +25,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- + +- Bug in :func:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :class:`pandas.CategoricalIndex` (:issue:`42425`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5b09b62fa9e88..3f557b6a7c424 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -150,6 +150,7 @@ Deprecations - Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`) - Deprecated ``method`` argument in :meth:`Index.get_loc`, use ``index.get_indexer([label], method=...)`` instead (:issue:`42269`) - Deprecated treating integer keys in :meth:`Series.__setitem__` as positional when the index is a :class:`Float64Index` not containing the key, a :class:`IntervalIndex` with no entries containing the key, or a :class:`MultiIndex` with leading :class:`Float64Index` level not containing the key (:issue:`33469`) +- Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`) - .. 
--------------------------------------------------------------------------- @@ -212,7 +213,7 @@ Interval Indexing ^^^^^^^^ - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`) -- +- Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` when the object's Index has a length greater than one but only one unique value (:issue:`42365`) Missing ^^^^^^^ diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index d39c5ac5d2967..5be50f16af003 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -51,6 +51,7 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ... def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... +def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c48b8ed5aec9c..4ab2497be94d5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2955,6 +2955,28 @@ def to_object_array_tuples(rows: object) -> np.ndarray: return result +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + for i in range(n): + val = keys[i] + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) + + def is_bool_list(obj: list) -> bool: """ Check if this list contains only bool or np.bool_ objects. diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 88a2706a35aa2..e4e9df5176459 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1329,6 +1329,19 @@ class Timestamp(_Timestamp): "the tz parameter. Use tz_convert instead.") tzobj = maybe_get_tz(tz) + if tzobj is not None and is_datetime64_object(ts_input): + # GH#24559, GH#42288 In the future we will treat datetime64 as + # wall-time (consistent with DatetimeIndex) + warnings.warn( + "In a future version, when passing a np.datetime64 object and " + "a timezone to Timestamp, the datetime64 will be interpreted " + "as a wall time, not a UTC time. 
To interpret as a UTC time, " + "use `Timestamp(dt64).tz_localize('UTC').tz_convert(tz)`", + FutureWarning, + stacklevel=1, + ) + # Once this deprecation is enforced, we can do + # return Timestamp(ts_input).tz_localize(tzobj) ts = convert_to_tsobject(ts_input, tzobj, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 0a517b22d1bb3..8513bbb044e83 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -509,6 +509,11 @@ def _check_compatible_with(self, other, setitem: bool = False): # Descriptive Properties def _box_func(self, x) -> Timestamp | NaTType: + if isinstance(x, np.datetime64): + # GH#42228 + # Argument 1 to "signedinteger" has incompatible type "datetime64"; + # expected "Union[SupportsInt, Union[str, bytes], SupportsIndex]" + x = np.int64(x) # type: ignore[arg-type] ts = Timestamp(x, tz=self.tz) # Non-overlapping identity check (left operand type: "Timestamp", # right operand type: "NaTType") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f0fefc9a10d0..15a18d5027274 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -781,6 +781,21 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val +def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. 
+ + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c15aa3cf697da..3b8458fac2942 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3765,18 +3765,12 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): - try: - loc, new_index = index._get_loc_level(key, level=0) - except TypeError as err: - raise TypeError( - f"Expected label or tuple of labels, got {key}" - ) from err - else: - if not drop_level: - if lib.is_integer(loc): - new_index = index[loc : loc + 1] - else: - new_index = index[loc] + loc, new_index = index._get_loc_level(key, level=0) + if not drop_level: + if lib.is_integer(loc): + new_index = index[loc : loc + 1] + else: + new_index = index[loc] else: loc = index.get_loc(key) @@ -9388,7 +9382,7 @@ def truncate( if before is not None and after is not None and before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if len(ax) > 1 and ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 04592e319f7ed..8903d29782610 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2988,6 +2988,7 @@ def maybe_mi_droplevels(indexer, levels): if isinstance(key, tuple) and level == 0: try: + # Check if this tuple is a single key in our first level if key in self.levels[0]: indexer = self._get_level_indexer(key, level=level) new_index = maybe_mi_droplevels(indexer, [0]) @@ -3021,19 +3022,29 @@ def maybe_mi_droplevels(indexer, levels): indexer = None for i, k in enumerate(key): if not isinstance(k, slice): - k = 
self._get_level_indexer(k, level=i) - if isinstance(k, slice): - # everything - if k.start == 0 and k.stop == len(self): - k = slice(None, None) - else: - k_index = k + loc_level = self._get_level_indexer(k, level=i) + if isinstance(loc_level, slice): + if com.is_null_slice(loc_level) or com.is_full_slice( + loc_level, len(self) + ): + # everything + continue + else: + # e.g. test_xs_IndexSlice_argument_not_implemented + k_index = np.zeros(len(self), dtype=bool) + k_index[loc_level] = True - if isinstance(k, slice): - if k == slice(None, None): - continue else: - raise TypeError(key) + k_index = loc_level + + elif com.is_null_slice(k): + # taking everything, does not affect `indexer` below + continue + + else: + # FIXME: this message can be inaccurate, e.g. + # test_series_varied_multiindex_alignment + raise TypeError(f"Expected label or tuple of labels, got {key}") if indexer is None: indexer = k_index diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e362213c52bd5..387dcca6897b7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -882,17 +882,21 @@ def _getitem_nested_tuple(self, tup: tuple): if self.name != "loc": # This should never be reached, but lets be explicit about it raise ValueError("Too many indices") - if isinstance(self.obj, ABCSeries) and any( - isinstance(k, tuple) for k in tup - ): - # GH#35349 Raise if tuple in tuple for series - raise ValueError("Too many indices") if all(is_hashable(x) or com.is_null_slice(x) for x in tup): # GH#10521 Series should reduce MultiIndex dimensions instead of # DataFrame, IndexingError is not raised when slice(None,None,None) # with one row. 
with suppress(IndexingError): return self._handle_lowerdim_multi_index_axis0(tup) + elif isinstance(self.obj, ABCSeries) and any( + isinstance(k, tuple) for k in tup + ): + # GH#35349 Raise if tuple in tuple for series + # Do this after the all-hashable-or-null-slice check so that + # we are only getting non-hashable tuples, in particular ones + # that themselves contain a slice entry + # See test_loc_series_getitem_too_many_dimensions + raise ValueError("Too many indices") # this is a series with a multi-index specified a tuple of # selectors @@ -1127,6 +1131,9 @@ def _handle_lowerdim_multi_index_axis0(self, tup: tuple): return self._get_label(tup, axis=axis) except TypeError as err: # slices are unhashable + # FIXME: this raises when we have a DatetimeIndex first level and a + # string for the first tup entry + # see test_partial_slicing_with_multiindex raise IndexingError("No label returned") from err except KeyError as ek: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7bef7ae9b39d7..5e327adfb8905 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -59,7 +60,6 @@ TimedeltaArray, ) from pandas.core.construction import ( - create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, range_to_ndarray, @@ -67,7 +67,9 @@ ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( + DatetimeIndex, Index, + TimedeltaIndex, ensure_index, get_objs_combined_axis, union_indexes, @@ -556,6 +558,7 @@ def convert(v): def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: + oindex = None homogenized = [] for val in data: @@ -570,9 +573,18 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = 
val._values else: if isinstance(val, dict): - # see test_constructor_subclass_dict - # test_constructor_dict_datetime64_index - val = create_series_with_explicit_dtype(val, index=index)._values + # GH#41785 this _should_ be equivalent to (but faster than) + # val = create_series_with_explicit_dtype(val, index=index)._values + if oindex is None: + oindex = index.astype("O") + + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + # see test_constructor_dict_datetime64_index + val = dict_compat(val) + else: + # see test_constructor_subclass_dict + val = dict(val) + val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7db30dc1ba9b9..bf0664e233499 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -421,7 +421,7 @@ def _bins_to_cuts( ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + ids[np.asarray(x) == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() diff --git a/pandas/core/series.py b/pandas/core/series.py index 6e6e8b6d8b178..9eab934d7db83 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -201,7 +201,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): methods from ndarray have been overridden to automatically exclude missing data (currently represented as NaN). - Operations between Series (+, -, /, *, **) align values based on their + Operations between Series (+, -, /, \\*, \\*\\*) align values based on their associated index values-- they need not be the same length. The result index will be the sorted union of the two indexes. 
diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py new file mode 100644 index 0000000000000..13dc82d779f95 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_dict_compat.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas.core.dtypes.cast import dict_compat + +from pandas import Timestamp + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert dict_compat(data_datetime64) == expected + assert dict_compat(expected) == expected + assert dict_compat(data_unchanged) == data_unchanged diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index a76aec9ebda44..d2704876c31c5 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -318,12 +318,13 @@ def test_xs_IndexSlice_argument_not_implemented(self, klass): if klass is Series: obj = obj[0] - msg = ( - "Expected label or tuple of labels, got " - r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)" - ) - with pytest.raises(TypeError, match=msg): - obj.xs(IndexSlice[("foo", "qux", 0), :]) + expected = obj.iloc[-2:].droplevel(0) + + result = obj.xs(IndexSlice[("foo", "qux", 0), :]) + tm.assert_equal(result, expected) + + result = obj.loc[IndexSlice[("foo", "qux", 0), :]] + tm.assert_equal(result, expected) @pytest.mark.parametrize("klass", [DataFrame, Series]) def test_xs_levels_raises(self, klass): diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index 210e86067566a..f2d60240009c5 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -148,3 +148,13 @@ def test_truncate_multiindex(self, frame_or_series): expected = expected["col"] tm.assert_equal(result, expected) + + def 
test_truncate_index_only_one_unique_value(self, frame_or_series): + # GH 42365 + obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) + if frame_or_series is DataFrame: + obj = obj.to_frame(name="a") + + truncated = obj.truncate("2021-06-28", "2021-07-01") + + tm.assert_equal(truncated, obj) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e96b25418d408..2e0f48849530a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1663,6 +1663,18 @@ def test_loc_multiindex_levels_contain_values_not_in_index_anymore(self, lt_valu with pytest.raises(KeyError, match=r"\['b'\] not in index"): df.loc[df["a"] < lt_value, :].loc[["b"], :] + def test_loc_drops_level(self): + # Based on test_series_varied_multiindex_alignment, where + # this used to fail to drop the first level + mi = MultiIndex.from_product( + [list("ab"), list("xy"), [1, 2]], names=["ab", "xy", "num"] + ) + ser = Series(range(8), index=mi) + + loc_result = ser.loc["a", :, :] + expected = ser.index.droplevel(0)[:4] + tm.assert_index_equal(loc_result.index, expected) + class TestLocSetitemWithExpansion: @pytest.mark.slow diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 944205c66c3e6..7c491536de04a 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -691,3 +691,49 @@ def test_cut_no_warnings(): labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] with tm.assert_produces_warning(False): df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels) + + +def test_cut_with_duplicated_index_lowest_included(): + # GH 42185 + expected = Series( + [Interval(-0.001, 2, closed="right")] * 3 + + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], + index=[0, 1, 2, 3, 0], + dtype="category", + ).cat.as_ordered() + + s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = cut(s, bins=[0, 2, 4], include_lowest=True) + 
tm.assert_series_equal(result, expected) + + +def test_cut_with_nonexact_categorical_indices(): + + # GH 42424 + + ser = Series(range(0, 100)) + ser1 = cut(ser, 10).value_counts().head(5) + ser2 = cut(ser, 10).value_counts().tail(5) + result = DataFrame({"1": ser1, "2": ser2}) + + index = pd.CategoricalIndex( + [ + Interval(-0.099, 9.9, closed="right"), + Interval(9.9, 19.8, closed="right"), + Interval(19.8, 29.7, closed="right"), + Interval(29.7, 39.6, closed="right"), + Interval(39.6, 49.5, closed="right"), + Interval(49.5, 59.4, closed="right"), + Interval(59.4, 69.3, closed="right"), + Interval(69.3, 79.2, closed="right"), + Interval(79.2, 89.1, closed="right"), + Interval(89.1, 99, closed="right"), + ], + ordered=True, + ) + + expected = DataFrame( + {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index + ) + + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index c6c475f5b87a2..b3deb1a57e5c3 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -24,6 +24,24 @@ class TestTimestampConstructors: + def test_constructor_datetime64_with_tz(self): + # GH#42288, GH#24559 + dt = np.datetime64("1970-01-01 05:00:00") + tzstr = "UTC+05:00" + + msg = "interpreted as a wall time" + with tm.assert_produces_warning(FutureWarning, match=msg): + ts = Timestamp(dt, tz=tzstr) + + # Check that we match the old behavior + alt = Timestamp(dt).tz_localize("UTC").tz_convert(tzstr) + assert ts == alt + + # Check that we *don't* match the future behavior + assert ts.hour != 5 + expected_future = Timestamp(dt).tz_localize(tzstr) + assert ts != expected_future + def test_constructor(self): base_str = "2014-07-01 09:00" base_dt = datetime(2014, 7, 1, 9) diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index ca5c3e2639097..a3a27a744b180 
100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -55,3 +55,11 @@ def test_truncate_one_element_series(self): # the input Series and the expected Series are the same tm.assert_series_equal(result, series) + + def test_truncate_index_only_one_unique_value(self): + # GH 42365 + obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) + + truncated = obj.truncate("2021-06-28", "2021-07-01") + + tm.assert_series_equal(truncated, obj) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 6b7c2c698c211..4d1c75da72399 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -924,7 +924,7 @@ def test_series_varied_multiindex_alignment(): [1000 * i for i in range(1, 5)], index=pd.MultiIndex.from_product([list("xy"), [1, 2]], names=["xy", "num"]), ) - result = s1.loc[pd.IndexSlice["a", :, :]] + s2 + result = s1.loc[pd.IndexSlice[["a"], :, :]] + s2 expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 9b65204403612..7562895d9db3e 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -17,37 +17,26 @@ import argparse import doctest -import glob import importlib +import io import json -import os +import pathlib import subprocess import sys import tempfile -try: - from io import StringIO -except ImportError: - from cStringIO import StringIO +import matplotlib +import matplotlib.pyplot as plt +import numpy +from numpydoc.validate import ( + Docstring, + validate, +) -# Template backend makes matplotlib to not plot anything. This is useful -# to avoid that plot windows are open from the doctests while running the -# script. Setting here before matplotlib is loaded. 
-# We don't warn for the number of open plots, as none is actually being opened -os.environ["MPLBACKEND"] = "Template" -import matplotlib # isort:skip +import pandas -matplotlib.rc("figure", max_open_warning=10000) - -import numpy # isort:skip - -BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas # isort:skip - -sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.validate import validate, Docstring # isort:skip +# With template backend, matplotlib plots nothing +matplotlib.use("template") PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] @@ -157,7 +146,7 @@ def examples_errors(self): context = {"np": numpy, "pd": pandas} error_msgs = "" for test in finder.find(self.raw_doc, self.name, globs=context): - f = StringIO() + f = io.StringIO() runner.run(test, out=f.write) error_msgs += f.getvalue() return error_msgs @@ -263,6 +252,7 @@ def pandas_validate(func_name: str): if doc.non_hyphenated_array_like(): result["errors"].append(pandas_error("GL05")) + plt.close("all") return result @@ -288,13 +278,14 @@ def validate_all(prefix, ignore_deprecated=False): result = {} seen = {} - api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst") + base_path = pathlib.Path(__file__).parent.parent + api_doc_fnames = pathlib.Path(base_path, "doc", "source", "reference") api_items = [] - for api_doc_fname in glob.glob(api_doc_fnames): + for api_doc_fname in api_doc_fnames.glob("*.rst"): with open(api_doc_fname) as f: api_items += list(get_api_items(f)) - for func_name, func_obj, section, subsection in api_items: + for func_name, _, section, subsection in api_items: if prefix and not func_name.startswith(prefix): continue doc_info = pandas_validate(func_name)