diff --git a/README.md b/README.md
index 1130eb30954dc..5342eda4390eb 100644
--- a/README.md
+++ b/README.md
@@ -5,82 +5,16 @@
 -----------------

 # pandas: powerful Python data analysis toolkit
-[removed: HTML <table> of project badges (Latest Release on PyPI and conda, Package Status, License, Build Status on Travis and Azure Pipelines, Coverage, Downloads, Gitter); replaced by the markdown badges below]
+[![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/)
+[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/)
+[![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/)
+[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/master/LICENSE)
+[![Travis Build Status](https://travis-ci.org/pandas-dev/pandas.svg?branch=master)](https://travis-ci.org/pandas-dev/pandas)
+[![Azure Build Status](https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master)](https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master)
+[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=master)](https://codecov.io/gh/pandas-dev/pandas)
+[![Downloads](https://anaconda.org/conda-forge/pandas/badges/downloads.svg)](https://pandas.pydata.org)
+[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas)
+[![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org)

 ## What is it?
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 0cc42be42d61e..b46989894ae12 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -113,7 +113,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then

     # Imports - Check formatting using isort see setup.cfg for settings
     MSG='Check import format using isort' ; echo $MSG
-    ISORT_CMD="isort --recursive --check-only pandas asv_bench"
+    ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench"
     if [[ "$GITHUB_ACTIONS" == "true" ]]; then
         eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]}))
     else
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
index 869d2ab683f0c..6883301a63a9b 100644
--- a/ci/deps/travis-36-cov.yaml
+++ b/ci/deps/travis-36-cov.yaml
@@ -27,8 +27,7 @@ dependencies:
   - numexpr
   - numpy=1.15.*
   - odfpy
-  - openpyxl<=3.0.1
-  # https://github.com/pandas-dev/pandas/pull/30009 openpyxl 3.0.2 broke
+  - openpyxl
   - pandas-gbq
   - psycopg2
   - pyarrow>=0.13.0
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
index 73e2c20b31438..682b1016ff3a2 100644
--- a/ci/deps/travis-37.yaml
+++ b/ci/deps/travis-37.yaml
@@ -2,7 +2,6 @@ name: pandas-dev
 channels:
   - defaults
   - conda-forge
-  - c3i_test

 dependencies:
   - python=3.7.*
diff --git a/doc/redirects.csv b/doc/redirects.csv
index 0a71f037d23c3..3a990b09e7f7d 100644
--- a/doc/redirects.csv
+++ b/doc/redirects.csv
@@ -46,7 +46,10 @@ developer,development/developer
 extending,development/extending
 internals,development/internals

-# api
+# api moved function
+reference/api/pandas.io.json.json_normalize,pandas.json_normalize
+
+# api rename
 api,reference/index
 generated/pandas.api.extensions.ExtensionArray.argsort,../reference/api/pandas.api.extensions.ExtensionArray.argsort
 generated/pandas.api.extensions.ExtensionArray.astype,../reference/api/pandas.api.extensions.ExtensionArray.astype
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 28df08a8607b9..c12c148d0f10d 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -209,6 +209,7 @@
     "external_links": [],
     "github_url": "https://github.com/pandas-dev/pandas",
     "twitter_url": "https://twitter.com/pandas_dev",
+    "google_analytics_id": "UA-27880019-2",
 }

 # Add any paths that contain custom themes here, relative
 to this directory.
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 4fef5efbd1551..277080006cb3c 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1973,7 +1973,7 @@ Pandas has two ways to store strings.
 1. ``object`` dtype, which can hold any Python object, including strings.
 2. :class:`StringDtype`, which is dedicated to strings.

-Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` fore more.
+Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` for more.

 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 6680ba854cb6f..756dd06aced7f 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -1641,3 +1641,46 @@ when plotting a large number of points.
    :suppress:

    plt.close('all')
+
+Plotting backends
+-----------------
+
+Starting in version 0.25, pandas can be extended with third-party plotting backends. The
+main idea is letting users select a plotting backend different from the provided
+one based on Matplotlib.
+
+This can be done by passing 'backend.module' as the ``backend`` argument in the ``plot``
+function. For example:
+
+.. code-block:: python
+
+    >>> pd.Series([1, 2, 3]).plot(backend='backend.module')
+
+Alternatively, you can also set this option globally, so you don't need to specify
+the keyword in each ``plot`` call. For example:
+
+.. code-block:: python
+
+    >>> pd.set_option('plotting.backend', 'backend.module')
+    >>> pd.Series([1, 2, 3]).plot()
+
+Or:
+
+.. code-block:: python
+
+    >>> pd.options.plotting.backend = 'backend.module'
+    >>> pd.Series([1, 2, 3]).plot()
+
+This would be more or less equivalent to:
+
+.. code-block:: python
+
+    >>> import backend.module
+    >>> backend.module.plot(pd.Series([1, 2, 3]))
+
+The backend module can then use other visualization tools (Bokeh, Altair, hvplot, ...)
+to generate the plots. Some libraries implementing a backend for pandas are listed
+on the ecosystem :ref:`ecosystem.visualization` page.
+
+The developers guide can be found at
+https://dev.pandas.io/docs/development/extending.html#plotting-backends
\ No newline at end of file
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 111caa81f7169..68aabfe76d8de 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -26,6 +26,7 @@ Version 1.0

    v1.0.0
    v1.0.1
+   v1.0.2

 Version 0.25
 ------------
diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst
index 801d97b777e00..ef3bb8161d13f 100644
--- a/doc/source/whatsnew/v1.0.1.rst
+++ b/doc/source/whatsnew/v1.0.1.rst
@@ -1,7 +1,7 @@
 .. _whatsnew_101:

-What's new in 1.0.1 (??)
------------------------
+What's new in 1.0.1 (February 5, 2020)
+--------------------------------------

 These are the changes in pandas 1.0.1. See :ref:`release` for a full changelog
 including other versions of pandas.
@@ -10,126 +10,64 @@ including other versions of pandas.

 .. ---------------------------------------------------------------------------

+.. _whatsnew_101.regressions:

-.. _whatsnew_101.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-- Bug in :meth:`GroupBy.apply` was raising ``TypeError`` if called with function which returned a non-pandas non-scalar object (e.g. a list) (:issue:`31441`)
-
-Categorical
-^^^^^^^^^^^
-
--
--
-
-Datetimelike
-^^^^^^^^^^^^
-- Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
-- Fixed bug in :meth:`to_datetime` raising when ``cache=True`` and out-of-bound values are present (:issue:`31491`)
-
-Timedelta
-^^^^^^^^^
-
--
--
-
-Timezones
-^^^^^^^^^
-
--
--
-
-
-Numeric
-^^^^^^^
-- Bug in dtypes being lost in ``DataFrame.__invert__`` (``~`` operator) with mixed dtypes (:issue:`31183`)
-- Bug in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31467`)
--
-
-Conversion
-^^^^^^^^^^
-
--
--
-
-Strings
-^^^^^^^
-
--
--
-
+Fixed regressions
+~~~~~~~~~~~~~~~~~

-Interval
-^^^^^^^^
-
--
--
-
-Indexing
-^^^^^^^^
-
-- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`)
 - Fixed regression in :class:`DataFrame` setting values with a slice (e.g. ``df[-4:] = 1``) indexing by label instead of position (:issue:`31469`)
--
--
-- Bug where assigning to a :class:`Series` using a IntegerArray / BooleanArray as a mask would raise ``TypeError`` (:issue:`31446`)
-
-Missing
-^^^^^^^
-
--
--
-
-MultiIndex
-^^^^^^^^^^
-
--
--
+- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`)
+- Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`)
+- Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`)
+- Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`)
+- Fixed regression in ``.groupby()`` aggregations with categorical dtype using Cythonized reduction functions (e.g. ``first``) (:issue:`31450`)
+- Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
+- Fixed regression in :meth:`DataFrame.groupby` whereby taking the minimum or maximum of a column with period dtype would raise a ``TypeError``. (:issue:`31471`)
+- Fixed regression in :meth:`DataFrame.groupby` with an empty DataFrame grouping by a level of a MultiIndex (:issue:`31670`).
+- Fixed regression in :meth:`DataFrame.apply` with object dtype and non-reducing function (:issue:`31505`)
+- Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
+- Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`)
+- Fixed regression in :class:`Categorical` construction with ``numpy.str_`` categories (:issue:`31499`)
+- Fixed regression in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` when selecting a row containing a single ``datetime64`` or ``timedelta64`` column (:issue:`31649`)
+- Fixed regression where setting :attr:`pd.options.display.max_colwidth` was not accepting a negative integer. In addition, this behavior has been deprecated in favor of using ``None`` (:issue:`31532`)
+- Fixed a return-type warning in ``objToJSON.c`` (:issue:`31463`)
+- Fixed regression in :meth:`qcut` when passed a nullable integer. (:issue:`31389`)
+- Fixed regression in assigning to a :class:`Series` using a nullable integer dtype (:issue:`31446`)
+- Fixed performance regression when indexing a ``DataFrame`` or ``Series`` with a :class:`MultiIndex` for the index using a list of labels (:issue:`31648`)
+- Fixed regression in :meth:`read_csv` where the ``encoding`` option was not recognized for certain file-like objects (e.g. ``RawIOBase``) (:issue:`31575`)

-I/O
-^^^
+.. ---------------------------------------------------------------------------

-- Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`)
--
--
+.. _whatsnew_101.deprecations:

-Plotting
-^^^^^^^^
+Deprecations
+~~~~~~~~~~~~

--
--
+- Support for negative integers for :attr:`pd.options.display.max_colwidth` is deprecated in favor of using ``None`` (:issue:`31532`)

-Groupby/resample/rolling
-^^^^^^^^^^^^^^^^^^^^^^^^
+.. ---------------------------------------------------------------------------

--
--
+.. _whatsnew_101.bug_fixes:

+Bug fixes
+~~~~~~~~~

-Reshaping
-^^^^^^^^^
+**Datetimelike**

--
--
+- Fixed bug in :meth:`to_datetime` raising when ``cache=True`` and out-of-bound values are present (:issue:`31491`)

-Sparse
-^^^^^^
+**Numeric**

--
--
+- Bug in dtypes being lost in ``DataFrame.__invert__`` (``~`` operator) with mixed dtypes (:issue:`31183`)
+  and for extension-array backed ``Series`` and ``DataFrame`` (:issue:`23087`)

-ExtensionArray
-^^^^^^^^^^^^^^
+**Plotting**

-- Bug in dtype being lost in ``__invert__`` (``~`` operator) for extension-array backed ``Series`` and ``DataFrame`` (:issue:`23087`)
--
+- Plotting tz-aware timeseries no longer gives a UserWarning (:issue:`31205`)

+**Interval**

-Other
-^^^^^
--
--
+- Bug in :meth:`Series.shift` with ``interval`` dtype raising a ``TypeError`` when shifting an interval array of integers or datetimes (:issue:`34195`)

 .. ---------------------------------------------------------------------------

@@ -137,3 +75,5 @@ Other

 Contributors
 ~~~~~~~~~~~~
+
+.. contributors:: v1.0.0..v1.0.1|HEAD
diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
new file mode 100644
index 0000000000000..70aaaa6d0a60d
--- /dev/null
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -0,0 +1,39 @@
+.. _whatsnew_102:
+
+What's new in 1.0.2 (February ??, 2020)
+---------------------------------------
+
+These are the changes in pandas 1.0.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_102.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+
+- Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`)
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_102.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+**I/O**
+
+- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_102.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.0.1..v1.0.2|HEAD
\ No newline at end of file
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index e07a8fa0469f4..aea5695a96388 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -60,7 +60,11 @@ Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 - :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously a ``AttributeError`` was raised (:issue:`31126`)
-
+- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`)
+  now raise a ``TypeError`` if an unsupported keyword argument is passed.
+  Previously an ``UnsupportedFunctionCall`` was raised (an ``AssertionError`` if ``min_count`` was passed to :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
+- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
+-

 .. ---------------------------------------------------------------------------

@@ -105,11 +109,13 @@ Datetimelike
 - Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`)
 - :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`)
 - Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
+- Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`)
+- Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)

 Timedelta
 ^^^^^^^^^

--
+- Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`)
 -

 Timezones
@@ -150,7 +156,8 @@ Indexing
 - Bug in :meth:`PeriodIndex.get_loc` treating higher-resolution strings differently from :meth:`PeriodIndex.get_value` (:issue:`31172`)
 - Bug in :meth:`Series.at` and :meth:`DataFrame.at` not matching ``.loc`` behavior when looking up an integer in a :class:`Float64Index` (:issue:`31329`)
 - Bug in :meth:`PeriodIndex.is_monotonic` incorrectly returning ``True`` when containing leading ``NaT`` entries (:issue:`31437`)
--
+- Bug in :meth:`DatetimeIndex.get_loc` raising ``KeyError`` with converted-integer key instead of the user-passed key (:issue:`31425`)
+- Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`)

 Missing
 ^^^^^^^
@@ -160,15 +167,24 @@ Missing
 MultiIndex
 ^^^^^^^^^^

+- Bug in :meth:`DataFrame.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`)
--
+.. ipython:: python
+
+   df = pd.DataFrame(np.arange(4),
+                     index=[["a", "a", "b", "b"], [1, 2, 1, 2]])
+   # Rows are now ordered as the requested keys
+   df.loc[(['b', 'a'], [2, 1]), :]
-

 I/O
 ^^^
 - Bug in :meth:`read_json` where integer overflow was occuring when json contains big number strings. (:issue:`30320`)
--
--
+- ``read_csv`` will now raise a ``ValueError`` when the arguments ``header`` and ``prefix`` are both not ``None`` (:issue:`27394`)
+- Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`)
+- Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for
+  ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
+  timestamps with ``version="2.0"`` (:issue:`31652`).

 Plotting
 ^^^^^^^^
@@ -210,7 +226,7 @@ Other
 ^^^^^
 - Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
--
+- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)

 .. ---------------------------------------------------------------------------
diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
index f394aac5c545b..e4859157f73de 100755
--- a/doc/sphinxext/announce.py
+++ b/doc/sphinxext/announce.py
@@ -57,6 +57,16 @@ def get_authors(revision_range):
     pat = "^.*\\t(.*)$"
     lst_release, cur_release = [r.strip() for r in revision_range.split("..")]

+    if "|" in cur_release:
+        # e.g. v1.0.1|HEAD
+        maybe_tag, head = cur_release.split("|")
+        assert head == "HEAD"
+        if maybe_tag in this_repo.tags:
+            cur_release = maybe_tag
+        else:
+            cur_release = head
+        revision_range = f"{lst_release}..{cur_release}"
+
     # authors, in current release and previous to current release.
     cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M))
     pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M))
diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py
index d9ba2bb2cfb07..c2b21e40cadad 100644
--- a/doc/sphinxext/contributors.py
+++ b/doc/sphinxext/contributors.py
@@ -6,7 +6,13 @@

 This will be replaced with a message indicating the number of code
 contributors and commits, and then list each contributor
-individually.
+individually. For development versions (before a tag is available)
+use::
+
+    .. contributors:: v0.23.0..v0.23.1|HEAD
+
+While the v0.23.1 tag does not exist, that will use the HEAD of the
+branch as the end of the revision range.
""" from announce import build_components from docutils import nodes diff --git a/pandas/_config/config.py b/pandas/_config/config.py index cacd6f5454de7..8b6116d3abd60 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -155,9 +155,7 @@ def _describe_option(pat: str = "", _print_desc: bool = True): if len(keys) == 0: raise OptionError("No such keys(s)") - s = "" - for k in keys: # filter by pat - s += _build_option_description(k) + s = "\n".join([_build_option_description(k) for k in keys]) if _print_desc: print(s) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 7d57c67e70b58..6671375f628e7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -670,7 +670,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - v = get_c_string(val) + # GH#31499 if we have a np.str_ get_c_string wont recognize + # it as a str, even though isinstance does. + v = get_c_string(val) else: v = get_c_string(self.na_string_sentinel) vecs[i] = v @@ -703,7 +705,9 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - v = get_c_string(val) + # GH#31499 if we have a np.str_ get_c_string wont recognize + # it as a str, even though isinstance does. + v = get_c_string(val) else: v = get_c_string(self.na_string_sentinel) vecs[i] = v diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1915eaf6e07dd..4185cc2084469 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,17 +1,12 @@ -from datetime import datetime, timedelta, date import warnings -import cython - import numpy as np cimport numpy as cnp from numpy cimport (ndarray, intp_t, float64_t, float32_t, int64_t, int32_t, int16_t, int8_t, - uint64_t, uint32_t, uint16_t, uint8_t, - # Note: NPY_DATETIME, NPY_TIMEDELTA are only available - # for cimport in cython>=0.27.3 - NPY_DATETIME, NPY_TIMEDELTA) + uint64_t, uint32_t, uint16_t, uint8_t +) cnp.import_array() @@ -23,7 +18,7 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash -from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib +from pandas._libs.tslibs import Timedelta, period as periodlib from pandas._libs.missing import checknull @@ -35,16 +30,6 @@ cdef inline bint is_definitely_invalid_key(object val): return False -cpdef get_value_at(ndarray arr, object loc, object tz=None): - obj = util.get_value_at(arr, loc) - - if arr.descr.type_num == NPY_DATETIME: - return Timestamp(obj, tz=tz) - elif arr.descr.type_num == NPY_TIMEDELTA: - return Timedelta(obj) - return obj - - # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -72,35 +57,6 @@ cdef class IndexEngine: self._ensure_mapping_populated() return val in self.mapping - cpdef get_value(self, ndarray arr, object key, object tz=None): - """ - Parameters - ---------- - arr : 1-dimensional ndarray - """ - cdef: - object loc - - loc = self.get_loc(key) - if isinstance(loc, slice) or util.is_array(loc): - return arr[loc] - else: - return get_value_at(arr, loc, tz=tz) - - cpdef set_value(self, ndarray arr, object key, object value): - """ - Parameters - ---------- - arr : 1-dimensional ndarray - """ - cdef: - object loc - - loc = self.get_loc(key) - value = convert_scalar(arr, value) - - arr[loc] = value - cpdef get_loc(self, object val): cdef: Py_ssize_t loc @@ -549,54 +505,6 @@ cdef class 
PeriodEngine(Int64Engine): return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array) -cpdef convert_scalar(ndarray arr, object value): - # we don't turn integers - # into datetimes/timedeltas - - # we don't turn bools into int/float/complex - - if arr.descr.type_num == NPY_DATETIME: - if util.is_array(value): - pass - elif isinstance(value, (datetime, np.datetime64, date)): - return Timestamp(value).to_datetime64() - elif util.is_timedelta64_object(value): - # exclude np.timedelta64("NaT") from value != value below - pass - elif value is None or value != value: - return np.datetime64("NaT", "ns") - raise ValueError("cannot set a Timestamp with a non-timestamp " - f"{type(value).__name__}") - - elif arr.descr.type_num == NPY_TIMEDELTA: - if util.is_array(value): - pass - elif isinstance(value, timedelta) or util.is_timedelta64_object(value): - value = Timedelta(value) - if value is NaT: - return np.timedelta64("NaT", "ns") - return value.to_timedelta64() - elif util.is_datetime64_object(value): - # exclude np.datetime64("NaT") which would otherwise be picked up - # by the `value != value check below - pass - elif value is None or value != value: - return np.timedelta64("NaT", "ns") - raise ValueError("cannot set a Timedelta with a non-timedelta " - f"{type(value).__name__}") - - if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and - not issubclass(arr.dtype.type, np.bool_)): - if util.is_bool_object(value): - raise ValueError("Cannot assign bool to float/integer series") - - if issubclass(arr.dtype.type, (np.integer, np.bool_)): - if util.is_float_object(value) and value != value: - raise ValueError("Cannot assign nan to integer series") - - return value - - cdef class BaseMultiIndexCodesEngine: """ Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 377d49f2bbd29..3077f73a8d1a4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -638,7 +638,7 @@ cdef class TextReader: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') - if self.encoding and isinstance(source, io.BufferedIOBase): + if self.encoding and isinstance(source, (io.BufferedIOBase, io.RawIOBase)): source = io.TextIOWrapper( source, self.encoding.decode('utf-8'), newline='') diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 89164c527002a..43d253f632f0f 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -114,7 +114,8 @@ cdef class Reducer: if self.typ is not None: # In this case, we also have self.index name = labels[i] - cached_typ = self.typ(chunk, index=self.index, name=name) + cached_typ = self.typ( + chunk, index=self.index, name=name, dtype=arr.dtype) # use the cached_typ if possible if cached_typ is not None: diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 62c2870c198c4..8cfc20ffd2c1c 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -53,6 +53,7 @@ static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; +static PyTypeObject *cls_na; PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -127,7 +128,6 @@ typedef struct __PyObjectEncoder { // pass-through to encode numpy data directly int npyType; void *npyValue; - TypeContext basicTypeContext; int datetimeIso; NPY_DATETIMEUNIT datetimeUnit; @@ -150,6 +150,7 
@@ int PdBlock_iterNext(JSOBJ, JSONTypeContext *); void *initObjToJSON(void) { PyObject *mod_pandas; PyObject *mod_nattype; + PyObject *mod_natype; PyObject *mod_decimal = PyImport_ImportModule("decimal"); type_decimal = (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); @@ -175,8 +176,16 @@ void *initObjToJSON(void) { Py_DECREF(mod_nattype); } + mod_natype = PyImport_ImportModule("pandas._libs.missing"); + if (mod_natype) { + cls_na = (PyTypeObject *)PyObject_GetAttrString(mod_natype, "NAType"); + Py_DECREF(mod_natype); + } + /* Initialise numpy API */ import_array(); + // GH 31463 + return NULL; } static TypeContext *createTypeContext(void) { @@ -925,15 +934,15 @@ char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), } //============================================================================= -// Iterator iteration functions +// Set iteration functions // itemValue is borrowed reference, no ref counting //============================================================================= -void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = NULL; GET_TC(tc)->iterator = PyObject_GetIter(obj); } -int Iter_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { +int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObject *item; if (GET_TC(tc)->itemValue) { @@ -951,7 +960,7 @@ int Iter_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return 1; } -void Iter_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { +void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = NULL; @@ -963,11 +972,11 @@ void Iter_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } } -JSOBJ Iter_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { +JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Iter_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), +char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), size_t *Py_UNUSED(outLen)) { return NULL; } @@ -1788,6 +1797,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { "%R (0d array) is not JSON serializable at the moment", obj); goto INVALID; + } else if (PyObject_TypeCheck(obj, cls_na)) { + PRINTMARK(); + tc->type = JT_NULL; + return; } ISITERABLE: @@ -2040,11 +2053,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } else if (PyAnySet_Check(obj)) { PRINTMARK(); tc->type = JT_ARRAY; - pc->iterBegin = Iter_iterBegin; - pc->iterEnd = Iter_iterEnd; - pc->iterNext = Iter_iterNext; - pc->iterGetValue = Iter_iterGetValue; - pc->iterGetName = Iter_iterGetName; + pc->iterBegin = Set_iterBegin; + pc->iterEnd = Set_iterEnd; + pc->iterNext = Set_iterNext; + pc->iterGetValue = Set_iterGetValue; + pc->iterGetName = Set_iterGetName; return; } @@ -2115,10 +2128,7 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; - if (tc->prv != - &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT - PyObject_Free(tc->prv); - } + PyObject_Free(tc->prv); tc->prv = NULL; } } @@ -2216,16 +2226,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, pyEncoder.datetimeUnit = NPY_FR_ms; pyEncoder.outputFormat = COLUMNS; pyEncoder.defaultHandler = 0; - pyEncoder.basicTypeContext.newObj = NULL; - pyEncoder.basicTypeContext.dictObj = NULL; - 
pyEncoder.basicTypeContext.itemValue = NULL; - pyEncoder.basicTypeContext.itemName = NULL; - pyEncoder.basicTypeContext.attrList = NULL; - pyEncoder.basicTypeContext.iterator = NULL; - pyEncoder.basicTypeContext.cStr = NULL; - pyEncoder.basicTypeContext.npyarr = NULL; - pyEncoder.basicTypeContext.rowLabels = NULL; - pyEncoder.basicTypeContext.columnLabels = NULL; PRINTMARK(); diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index e0862b9250045..bf38fcfb6103c 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -29,7 +29,7 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.timedeltas cimport cast_from_unit from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - get_timezone, maybe_get_tz, tz_compare, treat_tz_as_dateutil) + get_timezone, maybe_get_tz, tz_compare) from pandas._libs.tslibs.timezones import UTC from pandas._libs.tslibs.parsing import parse_datetime_string @@ -341,14 +341,6 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, obj.tzinfo = tz else: obj.value = pydatetime_to_dt64(ts, &obj.dts) - # GH 24329 When datetime is ambiguous, - # pydatetime_to_dt64 doesn't take DST into account - # but with dateutil timezone, get_utcoffset does - # so we need to correct for it - if treat_tz_as_dateutil(ts.tzinfo): - if ts.tzinfo.is_ambiguous(ts): - dst_offset = ts.tzinfo.dst(ts) - obj.value += int(dst_offset.total_seconds() * 1e9) obj.tzinfo = ts.tzinfo if obj.tzinfo is not None and not is_utc(obj.tzinfo): diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 357f183b3a845..9f6f401a1a5f5 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -2,7 +2,7 @@ from cpython.object cimport ( PyObject_RichCompare, Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE) -from cpython.datetime cimport (datetime, +from cpython.datetime cimport (datetime, timedelta, PyDateTime_Check, PyDelta_Check, PyDateTime_IMPORT) @@ -276,13 +276,6 @@ cdef class _NaT(datetime): def __long__(self): return NPY_NAT - def total_seconds(self): - """ - Total duration of timedelta in seconds (to microsecond precision). - """ - # GH#10939 - return np.nan - @property def is_leap_year(self): return False @@ -386,6 +379,7 @@ class NaTType(_NaT): # nan methods weekday = _make_nan_func('weekday', datetime.weekday.__doc__) isoweekday = _make_nan_func('isoweekday', datetime.isoweekday.__doc__) + total_seconds = _make_nan_func('total_seconds', timedelta.total_seconds.__doc__) month_name = _make_nan_func('month_name', # noqa:E128 """ Return the month name of the Timestamp with specified locale. 
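
The ``nattype.pyx`` hunk above swaps the hand-written ``_NaT.total_seconds`` (which returned ``np.nan``) for a ``_make_nan_func`` entry, so ``NaT.total_seconds()`` still returns nan while its docstring is now borrowed from ``datetime.timedelta.total_seconds``. A minimal sketch of that behavior, assuming a build that includes this patch:

.. code-block:: python

   >>> import numpy as np
   >>> import pandas as pd
   >>> np.isnan(pd.NaT.total_seconds())  # unchanged behavior (GH#10939)
   True
   >>> pd.NaT.total_seconds.__doc__ is not None  # docstring now from timedelta
   True
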
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 3dd560ece188d..9419f0eba39aa 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -22,7 +22,7 @@ PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct, pandas_datetime_to_datetimestruct, check_dts_bounds, - NPY_DATETIMEUNIT, NPY_FR_D) + NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us) cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, @@ -272,6 +272,8 @@ cdef int64_t DtoB_weekday(int64_t unix_date) nogil: cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back, int64_t unix_date) nogil: + # calculate the current week (counting from 1970-01-01) treating + # sunday as last day of a week cdef: int day_of_week = dayofweek(dts.year, dts.month, dts.day) @@ -473,9 +475,6 @@ cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year) nogil: int quarter pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts) - # TODO: Another version of this function used - # date_info_from_days_and_time(&dts, unix_date, 0) - # instead of pandas_datetime_to_datetimestruct; is one more performant? if af_info.to_end != 12: dts.month -= af_info.to_end if dts.month <= 0: @@ -509,14 +508,18 @@ cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info) nogil: cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info) nogil: ordinal = downsample_daytime(ordinal, af_info) - return (ordinal + 3 - af_info.to_end) // 7 + 1 + return unix_date_to_week(ordinal, af_info.to_end) + + +cdef int64_t unix_date_to_week(int64_t unix_date, int to_end) nogil: + return (unix_date + 3 - to_end) // 7 + 1 # -------------------------------------------------------------------- # Conversion _from_ BusinessDay Freq cdef int64_t asfreq_BtoDT(int64_t ordinal, asfreq_info *af_info) nogil: - ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 -3 + ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 - 3 return upsample_daytime(ordinal, af_info) @@ -753,14 +756,7 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil: if fmonth == 0: fmonth = 12 - mdiff = dts.month - fmonth - # TODO: Aren't the next two conditions equivalent to - # unconditional incrementing? - if mdiff < 0: - mdiff += 12 - if dts.month >= fmonth: - mdiff += 12 - + mdiff = dts.month - fmonth + 12 return (dts.year - 1970) * 4 + (mdiff - 1) // 3 elif freq == FR_MTH: @@ -797,23 +793,10 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil: return unix_date elif freq == FR_BUS: - # calculate the current week (counting from 1970-01-01) treating - # sunday as last day of a week - weeks = (unix_date + 3) // 7 - # calculate the current weekday (in range 1 .. 
7) - delta = (unix_date + 3) % 7 + 1 - # return the number of business days in full weeks plus the business - # days in the last - possible partial - week - if delta <= 5: - return (5 * weeks) + delta - 4 - else: - return (5 * weeks) + (5 + 1) - 4 + return DtoB(dts, 0, unix_date) elif freq_group == FR_WK: - day_adj = freq - FR_WK - return (unix_date + 3 - day_adj) // 7 + 1 - - # raise ValueError + return unix_date_to_week(unix_date, freq - FR_WK) cdef void get_date_info(int64_t ordinal, int freq, @@ -983,7 +966,7 @@ cdef inline int month_to_quarter(int month) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def dt64arr_to_periodarr(int64_t[:] dtarr, int freq, tz=None): +def dt64arr_to_periodarr(const int64_t[:] dtarr, int freq, tz=None): """ Convert array of datetime64 values (passed in as 'i8' dtype) to a set of periods corresponding to desired frequency, per period convention. @@ -1186,7 +1169,12 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: if ordinal == NPY_NAT: return NPY_NAT - get_date_info(ordinal, freq, &dts) + if freq == 11000: + # Microsecond, avoid get_date_info to prevent floating point errors + pandas_datetime_to_datetimestruct(ordinal, NPY_FR_us, &dts) + else: + get_date_info(ordinal, freq, &dts) + check_dts_bounds(&dts) return dtstruct_to_dt64(&dts) @@ -1383,7 +1371,7 @@ cdef int pdays_in_month(int64_t ordinal, int freq): @cython.wraparound(False) @cython.boundscheck(False) -def get_period_field_arr(int code, int64_t[:] arr, int freq): +def get_period_field_arr(int code, const int64_t[:] arr, int freq): cdef: Py_ssize_t i, sz int64_t[:] out @@ -1496,7 +1484,7 @@ def extract_freq(ndarray[object] values): @cython.wraparound(False) @cython.boundscheck(False) -cdef int64_t[:] localize_dt64arr_to_period(int64_t[:] stamps, +cdef int64_t[:] localize_dt64arr_to_period(const int64_t[:] stamps, int freq, object tz): cdef: Py_ssize_t n = len(stamps) @@ -1584,7 +1572,7 @@ cdef class _Period: return freq @classmethod - def _from_ordinal(cls, ordinal, freq): + def _from_ordinal(cls, ordinal: int, freq) -> "Period": """ Fast creation from an ordinal and freq that are already validated! """ @@ -1704,7 +1692,7 @@ cdef class _Period: else: return NotImplemented - def asfreq(self, freq, how='E'): + def asfreq(self, freq, how='E') -> "Period": """ Convert Period to desired frequency, at the start or end of the interval. @@ -1735,7 +1723,7 @@ cdef class _Period: return Period(ordinal=ordinal, freq=freq) @property - def start_time(self): + def start_time(self) -> Timestamp: """ Get the Timestamp for the start of the period. @@ -1765,13 +1753,13 @@ cdef class _Period: return self.to_timestamp(how='S') @property - def end_time(self): + def end_time(self) -> Timestamp: # freq.n can't be negative or 0 # ordinal = (self + self.freq.n).start_time.value - 1 ordinal = (self + self.freq).start_time.value - 1 return Timestamp(ordinal) - def to_timestamp(self, freq=None, how='start', tz=None): + def to_timestamp(self, freq=None, how='start', tz=None) -> Timestamp: """ Return the Timestamp representation of the Period. @@ -1811,17 +1799,17 @@ cdef class _Period: return Timestamp(dt64, tz=tz) @property - def year(self): + def year(self) -> int: base, mult = get_freq_code(self.freq) return pyear(self.ordinal, base) @property - def month(self): + def month(self) -> int: base, mult = get_freq_code(self.freq) return pmonth(self.ordinal, base) @property - def day(self): + def day(self) -> int: """ Get day of the month that a Period falls on. 
@@ -1844,7 +1832,7 @@ cdef class _Period: return pday(self.ordinal, base) @property - def hour(self): + def hour(self) -> int: """ Get the hour of the day component of the Period. @@ -1874,7 +1862,7 @@ cdef class _Period: return phour(self.ordinal, base) @property - def minute(self): + def minute(self) -> int: """ Get minute of the hour component of the Period. @@ -1898,7 +1886,7 @@ cdef class _Period: return pminute(self.ordinal, base) @property - def second(self): + def second(self) -> int: """ Get the second component of the Period. @@ -1922,12 +1910,12 @@ cdef class _Period: return psecond(self.ordinal, base) @property - def weekofyear(self): + def weekofyear(self) -> int: base, mult = get_freq_code(self.freq) return pweek(self.ordinal, base) @property - def week(self): + def week(self) -> int: """ Get the week of the year on the given Period. @@ -1957,7 +1945,7 @@ cdef class _Period: return self.weekofyear @property - def dayofweek(self): + def dayofweek(self) -> int: """ Day of the week the period lies in, with Monday=0 and Sunday=6. @@ -2008,7 +1996,7 @@ cdef class _Period: return pweekday(self.ordinal, base) @property - def weekday(self): + def weekday(self) -> int: """ Day of the week the period lies in, with Monday=0 and Sunday=6. @@ -2061,7 +2049,7 @@ cdef class _Period: return self.dayofweek @property - def dayofyear(self): + def dayofyear(self) -> int: """ Return the day of the year. @@ -2096,12 +2084,12 @@ cdef class _Period: return pday_of_year(self.ordinal, base) @property - def quarter(self): + def quarter(self) -> int: base, mult = get_freq_code(self.freq) return pquarter(self.ordinal, base) @property - def qyear(self): + def qyear(self) -> int: """ Fiscal year the Period lies in according to its starting-quarter. @@ -2145,7 +2133,7 @@ cdef class _Period: return pqyear(self.ordinal, base) @property - def days_in_month(self): + def days_in_month(self) -> int: """ Get the total number of days in the month that this period falls on. @@ -2179,7 +2167,7 @@ cdef class _Period: return pdays_in_month(self.ordinal, base) @property - def daysinmonth(self): + def daysinmonth(self) -> int: """ Get the total number of days of the month that the Period falls in. @@ -2209,7 +2197,7 @@ cdef class _Period: return Period(datetime.now(), freq=freq) @property - def freqstr(self): + def freqstr(self) -> str: return self.freq.freqstr def __repr__(self) -> str: diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index c0b20c14e9920..1e0eb7f97ec54 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -27,7 +27,7 @@ cdef: # ---------------------------------------------------------------------- -cpdef resolution(int64_t[:] stamps, tz=None): +cpdef resolution(const int64_t[:] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) npy_datetimestruct dts @@ -38,7 +38,7 @@ cpdef resolution(int64_t[:] stamps, tz=None): return _reso_local(stamps, tz) -cdef _reso_local(int64_t[:] stamps, object tz): +cdef _reso_local(const int64_t[:] stamps, object tz): cdef: Py_ssize_t i, n = len(stamps) int reso = RESO_DAY, curr_reso @@ -106,7 +106,7 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): return RESO_DAY -def get_freq_group(freq): +def get_freq_group(freq) -> int: """ Return frequency code group of given frequency str or offset. 
@@ -189,7 +189,7 @@ class Resolution: _freq_reso_map = {v: k for k, v in _reso_freq_map.items()} @classmethod - def get_str(cls, reso): + def get_str(cls, reso: int) -> str: """ Return resolution str against resolution code. @@ -201,7 +201,7 @@ class Resolution: return cls._reso_str_map.get(reso, 'day') @classmethod - def get_reso(cls, resostr): + def get_reso(cls, resostr: str) -> int: """ Return resolution str against resolution code. @@ -216,7 +216,7 @@ class Resolution: return cls._str_reso_map.get(resostr, cls.RESO_DAY) @classmethod - def get_freq_group(cls, resostr): + def get_freq_group(cls, resostr: str) -> int: """ Return frequency str against resolution str. @@ -228,7 +228,7 @@ class Resolution: return get_freq_group(cls.get_freq(resostr)) @classmethod - def get_freq(cls, resostr): + def get_freq(cls, resostr: str) -> str: """ Return frequency str against resolution str. @@ -240,7 +240,7 @@ class Resolution: return cls._reso_freq_map[resostr] @classmethod - def get_str_from_freq(cls, freq): + def get_str_from_freq(cls, freq: str) -> str: """ Return resolution str against frequency str. @@ -252,7 +252,7 @@ class Resolution: return cls._freq_reso_map.get(freq, 'day') @classmethod - def get_reso_from_freq(cls, freq): + def get_reso_from_freq(cls, freq: str) -> int: """ Return resolution code against frequency str. diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 9c031baf70a77..3742506a7f8af 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,5 +1,4 @@ import collections -import textwrap import cython @@ -859,14 +858,6 @@ cdef class _Timedelta(timedelta): """ return self.to_timedelta64() - def total_seconds(self): - """ - Total duration of timedelta in seconds (to microsecond precision). - """ - # GH 31043 - # Microseconds precision to avoid confusing tzinfo.utcoffset - return (self.value - self.value % 1000) / 1e9 - def view(self, dtype): """ Array view compatibility. @@ -1250,7 +1241,7 @@ class Timedelta(_Timedelta): return NaT # make timedelta happy - td_base = _Timedelta.__new__(cls, microseconds=int(value) / 1000) + td_base = _Timedelta.__new__(cls, microseconds=int(value) // 1000) td_base.value = value td_base.is_populated = 0 return td_base diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 4915671aa6512..b8c462abe35f1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1,4 +1,3 @@ -import sys import warnings import numpy as np diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index 15fedbb20beec..828bccf7d5641 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -1,7 +1,5 @@ from pandas._libs.tslibs.util cimport * -from cython cimport Py_ssize_t - cimport numpy as cnp from numpy cimport ndarray @@ -51,49 +49,3 @@ cdef inline void set_array_not_contiguous(ndarray ao) nogil: PyArray_CLEARFLAGS(ao, (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) - -cdef inline Py_ssize_t validate_indexer(ndarray arr, object loc) except -1: - """ - Cast the given indexer `loc` to an integer. If it is negative, i.e. a - python-style indexing-from-the-end indexer, translate it to a - from-the-front indexer. Raise if this is not possible. 
- - Parameters - ---------- - arr : ndarray - loc : object - - Returns - ------- - idx : Py_ssize_t - - Raises - ------ - IndexError - """ - cdef: - Py_ssize_t idx, size - int casted - - if is_float_object(loc): - casted = int(loc) - if casted == loc: - loc = casted - - idx = loc - size = cnp.PyArray_SIZE(arr) - - if idx < 0 and size > 0: - idx += size - if idx >= size or size == 0 or idx < 0: - raise IndexError('index out of bounds') - - return idx - - -cdef inline object get_value_at(ndarray arr, object loc): - cdef: - Py_ssize_t i - - i = validate_indexer(arr, loc) - return arr[i] diff --git a/pandas/_testing.py b/pandas/_testing.py index 631d550c60534..13af8703cef93 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -8,7 +8,7 @@ from shutil import rmtree import string import tempfile -from typing import Any, List, Optional, Union, cast +from typing import Any, Callable, List, Optional, Type, Union, cast import warnings import zipfile @@ -2757,3 +2757,24 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): sep = os.linesep expected = sep.join(rows_list) + sep return expected + + +def external_error_raised( + expected_exception: Type[Exception], +) -> Callable[[Type[Exception], None], None]: + """ + Helper function to mark pytest.raises that have an external error message. + + Parameters + ---------- + expected_exception : Exception + Expected error to raise. + + Returns + ------- + Callable + Regular `pytest.raises` function with `match` equal to `None`. + """ + import pytest + + return pytest.raises(expected_exception, match=None) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3a6662d3e3ae2..d26ff7490e714 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2504,10 +2504,6 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): >>> s.cat.as_unordered() """ - _deprecations = PandasObject._deprecations | frozenset( - ["categorical", "index", "name"] - ) - def __init__(self, data): self._validate(data) self._parent = data.values diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 48ad659b771f6..4bfd5f5770b69 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -25,6 +25,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops +import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer @@ -586,9 +587,8 @@ def _reduce(self, name, skipna=True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible elif name in ["sum", "min", "max", "prod"]: - int_result = int(result) - if int_result == result: - result = int_result + # GH#31409 more performant than casting-then-checking + result = com.cast_scalar_indexer(result) return result diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 398ed75c060ca..0b35a031bc53f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -27,6 +27,7 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( ABCDatetimeIndex, + ABCExtensionArray, ABCIndexClass, ABCInterval, ABCIntervalIndex, @@ -789,6 +790,33 @@ def size(self) -> int: # Avoid materializing self.values return self.left.size + def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: + if not len(self) 
or periods == 0: + return self.copy() + + if isna(fill_value): + fill_value = self.dtype.na_value + + # ExtensionArray.shift doesn't work for two reasons + # 1. IntervalArray.dtype.na_value may not be correct for the dtype. + # 2. IntervalArray._from_sequence only accepts NaN for missing values, + # not other values like NaT + + empty_len = min(abs(periods), len(self)) + if isna(fill_value): + fill_value = self.left._na_value + empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + else: + empty = self._from_sequence([fill_value] * empty_len) + + if periods > 0: + a = empty + b = self[:-periods] + else: + a = self[abs(periods) :] + b = empty + return self._concat_same_type([a, b]) + def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b476a019c66cc..8008805ddcf87 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -9,7 +9,7 @@ import numpy as np -from pandas._libs import index as libindex, lib +from pandas._libs import lib import pandas._libs.sparse as splib from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex from pandas._libs.tslibs import NaT @@ -794,7 +794,9 @@ def _get_val_at(self, loc): if sp_loc == -1: return self.fill_value else: - return libindex.get_value_at(self.sp_values, sp_loc) + val = self.sp_values[sp_loc] + val = com.maybe_box_datetimelike(val, self.sp_values.dtype) + return val def take(self, indices, allow_fill=False, fill_value=None): if is_scalar(indices): diff --git a/pandas/core/base.py b/pandas/core/base.py index 9fe1af776dd2b..f3c8b50e774af 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1027,12 +1027,10 @@ def tolist(self): -------- numpy.ndarray.tolist """ - if self.dtype.kind in ["m", "M"]: - return [com.maybe_box_datetimelike(x) for x in self._values] - elif is_extension_array_dtype(self._values): + if not isinstance(self._values, np.ndarray): + # check for ndarray instead of dtype to catch DTA/TDA return list(self._values) - else: - return self._values.tolist() + return self._values.tolist() to_list = tolist @@ -1049,9 +1047,8 @@ def __iter__(self): iterator """ # We are explicitly making element iterators. 
- if self.dtype.kind in ["m", "M"]: - return map(com.maybe_box_datetimelike, self._values) - elif is_extension_array_dtype(self._values): + if not isinstance(self._values, np.ndarray): + # Check type instead of dtype to catch DTA/TDA return iter(self._values) else: return map(self._values.item, range(self._values.size)) diff --git a/pandas/core/common.py b/pandas/core/common.py index a76119da2707a..00c7a41477017 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -72,8 +72,12 @@ def consensus_name_attr(objs): return name -def maybe_box_datetimelike(value): +def maybe_box_datetimelike(value, dtype=None): # turn a datetime like into a Timestamp/timedelta as needed + if dtype == object: + # If we dont have datetime64/timedelta64 dtype, we dont want to + # box datetimelike scalars + return value if isinstance(value, (np.datetime64, datetime)): value = tslibs.Timestamp(value) @@ -156,7 +160,7 @@ def cast_scalar_indexer(val): outval : scalar """ # assumes lib.is_scalar(val) - if lib.is_float(val) and val == int(val): + if lib.is_float(val) and val.is_integer(): return int(val) return val diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3776c6f816d96..b0410e31c6de7 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -9,6 +9,8 @@ module is imported, register them here rather than in the module. """ +import warnings + import pandas._config.config as cf from pandas._config.config import ( is_bool, @@ -341,8 +343,26 @@ def is_terminal() -> bool: validator=is_instance_factory([type(None), int]), ) cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int) + + def _deprecate_negative_int_max_colwidth(key): + value = cf.get_option(key) + if value is not None and value < 0: + warnings.warn( + "Passing a negative integer is deprecated in version 1.0 and " + "will not be supported in future version. Instead, use None " + "to not limit the column width.", + FutureWarning, + stacklevel=4, + ) + cf.register_option( - "max_colwidth", 50, max_colwidth_doc, validator=is_nonnegative_int + # FIXME: change `validator=is_nonnegative_int` + # in version 1.2 + "max_colwidth", + 50, + max_colwidth_doc, + validator=is_instance_factory([type(None), int]), + cb=_deprecate_negative_int_max_colwidth, ) if is_terminal(): max_cols = 0 # automatically determine optimal number of columns diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 52c569793e499..0719b8ce6010b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1,11 +1,18 @@ """ routings for casting """ -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta import numpy as np from pandas._libs import lib, tslib, tslibs -from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas._libs.tslibs import ( + NaT, + OutOfBoundsDatetime, + Period, + Timedelta, + Timestamp, + iNaT, +) from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import Dtype from pandas.util._validators import validate_bool_kwarg @@ -1599,3 +1606,59 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") + + +def convert_scalar_for_putitemlike(scalar, dtype: np.dtype): + """ + Convert datetimelike scalar if we are setting into a datetime64 + or timedelta64 ndarray. 
+ + Parameters + ---------- + scalar : scalar + dtype : np.dtpye + + Returns + ------- + scalar + """ + if dtype.kind == "m": + if isinstance(scalar, (timedelta, np.timedelta64)): + # We have to cast after asm8 in case we have NaT + return Timedelta(scalar).asm8.view("timedelta64[ns]") + elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)): + return np.timedelta64("NaT", "ns") + if dtype.kind == "M": + if isinstance(scalar, (date, np.datetime64)): + # Note: we include date, not just datetime + return Timestamp(scalar).to_datetime64() + elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)): + return np.datetime64("NaT", "ns") + else: + validate_numeric_casting(dtype, scalar) + return scalar + + +def validate_numeric_casting(dtype: np.dtype, value): + """ + Check that we can losslessly insert the given value into an array + with the given dtype. + + Parameters + ---------- + dtype : np.dtype + value : scalar + + Raises + ------ + ValueError + """ + if issubclass(dtype.type, (np.integer, np.bool_)): + if is_float(value) and np.isnan(value): + raise ValueError("Cannot assign nan to integer series") + + if issubclass(dtype.type, (np.integer, np.floating, np.complex)) and not issubclass( + dtype.type, np.bool_ + ): + if is_bool(value): + raise ValueError("Cannot assign bool to float/integer series") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0dea8235e9d3f..e0efa93379bca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -69,6 +69,7 @@ maybe_infer_to_datetimelike, maybe_upcast, maybe_upcast_putmask, + validate_numeric_casting, ) from pandas.core.dtypes.common import ( ensure_float64, @@ -2900,12 +2901,8 @@ def _get_value(self, index, col, takeable: bool = False): engine = self.index._engine try: - if isinstance(series._values, np.ndarray): - # i.e. not EA, we can use engine - return engine.get_value(series._values, index) - else: - loc = series.index.get_loc(index) - return series._values[loc] + loc = engine.get_loc(index) + return series._values[loc] except KeyError: # GH 20629 if self.index.nlevels > 1: @@ -3028,10 +3025,14 @@ def _set_value(self, index, col, value, takeable: bool = False): series = self._get_item_cache(col) engine = self.index._engine - engine.set_value(series._values, index, value) + loc = engine.get_loc(index) + validate_numeric_casting(series.dtype, value) + + series._values[loc] = value + # Note: trying to use series._set_value breaks tests in + # tests.frame.indexing.test_indexing and tests.indexing.test_partial return self except (KeyError, TypeError): - # set using a non-recursive method & reset the cache if takeable: self.iloc[index, col] = value @@ -6556,7 +6557,9 @@ def unstack(self, level=-1, fill_value=None): @Appender( _shared_docs["melt"] % dict( - caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt" + caller="df.melt(", + versionadded="\n .. versionadded:: 0.20.0\n", + other="melt", ) ) def melt( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3b1d7e4c50be5..313d40b575629 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1922,10 +1922,8 @@ def _repr_data_resource_(self): Parameters ---------- - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. 
        mode : str, optional
            Mode in which file is opened.
        **kwargs
@@ -3444,15 +3442,14 @@ class animal locomotion
         new_index = self.index[loc]
 
         if is_scalar(loc):
-            new_values = self._data.fast_xs(loc)
+            # In this case loc should be an integer
+            if self.ndim == 1:
+                # if we encounter an array-like and we only have 1 dim
+                # that means that there are list/ndarrays inside the Series!
+                # so just return them (GH 6394)
+                return self._values[loc]
 
-            # may need to box a datelike-scalar
-            #
-            # if we encounter an array-like and we only have 1 dim
-            # that means that their are list/ndarrays inside the Series!
-            # so just return them (GH 6394)
-            if not is_list_like(new_values) or self.ndim == 1:
-                return com.maybe_box_datetimelike(new_values)
+            new_values = self._data.fast_xs(loc)
 
             result = self._constructor_sliced(
                 new_values,
@@ -3501,7 +3498,9 @@ def _iget_item_cache(self, item):
     def _box_item_values(self, key, values):
         raise AbstractMethodError(self)
 
-    def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries:
+    def _slice(
+        self: FrameOrSeries, slobj: slice, axis=0, kind: str = "getitem"
+    ) -> FrameOrSeries:
         """
         Construct a slice of this container.
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 27dd6e953c219..f194c774cf329 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1022,6 +1022,10 @@ def _cython_agg_blocks(
         agg_blocks: List[Block] = []
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
+        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
+        split_items: List[np.ndarray] = []
+        split_frames: List[DataFrame] = []
+
         no_result = object()
         for block in data.blocks:
             # Avoid inheriting result from earlier in the loop
@@ -1061,40 +1065,56 @@ def _cython_agg_blocks(
                 else:
                     result = cast(DataFrame, result)
                     # unwrap DataFrame to get array
+                    if len(result._data.blocks) != 1:
+                        # We've split an object block! Everything we've assumed
+                        # about a single block input returning a single block output
+                        # is a lie. To keep the code-path for the typical non-split case
+                        # clean, we choose to clean up this mess later on.
+                        split_items.append(locs)
+                        split_frames.append(result)
+                        continue
+
                     assert len(result._data.blocks) == 1
                     result = result._data.blocks[0].values
                     if isinstance(result, np.ndarray) and result.ndim == 1:
                         result = result.reshape(1, -1)
-            finally:
-                assert not isinstance(result, DataFrame)
-
-                if result is not no_result:
-                    # see if we can cast the block back to the original dtype
-                    result = maybe_downcast_numeric(result, block.dtype)
-
-                    if block.is_extension and isinstance(result, np.ndarray):
-                        # e.g. block.values was an IntegerArray
-                        # (1, N) case can occur if block.values was Categorical
-                        # and result is ndarray[object]
-                        assert result.ndim == 1 or result.shape[0] == 1
-                        try:
-                            # Cast back if feasible
-                            result = type(block.values)._from_sequence(
-                                result.ravel(), dtype=block.values.dtype
-                            )
-                        except ValueError:
-                            # reshape to be valid for non-Extension Block
-                            result = result.reshape(1, -1)
+            assert not isinstance(result, DataFrame)
+
+            if result is not no_result:
+                # see if we can cast the block back to the original dtype
+                result = maybe_downcast_numeric(result, block.dtype)
+
+                if block.is_extension and isinstance(result, np.ndarray):
+                    # e.g.
block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + assert result.ndim == 1 or result.shape[0] == 1 + try: + # Cast back if feasible + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype + ) + except ValueError: + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) - agg_block: Block = block.make_block(result) + agg_block: Block = block.make_block(result) new_items.append(locs) agg_blocks.append(agg_block) - if not agg_blocks: + if not (agg_blocks or split_frames): raise DataError("No numeric types to aggregate") + if split_items: + # Clean up the mess left over from split blocks. + for locs, result in zip(split_items, split_frames): + assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + new_items.append(np.array([loc], dtype=locs.dtype)) + agg_blocks.append(result.iloc[:, [i]]._data.blocks[0]) + # reset the locs in the blocks to correspond to our # current ordering indexer = np.concatenate(new_items) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 54275dc52bb56..0245b9f74d944 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1180,10 +1180,16 @@ def count(self): @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def mean(self, *args, **kwargs): + def mean(self, numeric_only: bool = True): """ Compute mean of groups, excluding missing values. + Parameters + ---------- + numeric_only : bool, default True + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. + Returns ------- pandas.Series or pandas.DataFrame @@ -1222,19 +1228,26 @@ def mean(self, *args, **kwargs): 2 4.0 Name: B, dtype: float64 """ - nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"]) return self._cython_agg_general( - "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs + "mean", + alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only), + numeric_only=numeric_only, ) @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, **kwargs): + def median(self, numeric_only=True): """ Compute median of groups, excluding missing values. For multiple groupings, the result index will be a MultiIndex + Parameters + ---------- + numeric_only : bool, default True + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. + Returns ------- Series or DataFrame @@ -1242,13 +1255,13 @@ def median(self, **kwargs): """ return self._cython_agg_general( "median", - alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), - **kwargs, + alt=lambda x, axis: Series(x).median(axis=axis, numeric_only=numeric_only), + numeric_only=numeric_only, ) @Substitution(name="groupby") @Appender(_common_see_also) - def std(self, ddof: int = 1, *args, **kwargs): + def std(self, ddof: int = 1): """ Compute standard deviation of groups, excluding missing values. @@ -1266,12 +1279,11 @@ def std(self, ddof: int = 1, *args, **kwargs): """ # TODO: implement at Cython level? - nv.validate_groupby_func("std", args, kwargs) - return np.sqrt(self.var(ddof=ddof, **kwargs)) + return np.sqrt(self.var(ddof=ddof)) @Substitution(name="groupby") @Appender(_common_see_also) - def var(self, ddof: int = 1, *args, **kwargs): + def var(self, ddof: int = 1): """ Compute variance of groups, excluding missing values. 
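# Illustrative usage sketch (not part of the patch; it assumes only the
# public GroupBy API shown above): with an explicit ``numeric_only``
# keyword, stray or misspelled arguments now fail loudly instead of being
# swallowed by *args/**kwargs.
# >>> import pandas as pd
# >>> df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 3.0, 5.0]})
# >>> df.groupby("key").mean(numeric_only=True)
#        x
# key
# a    2.0
# b    5.0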
@@ -1287,15 +1299,14 @@ def var(self, ddof: int = 1, *args, **kwargs): Series or DataFrame Variance of values within each group. """ - nv.validate_groupby_func("var", args, kwargs) if ddof == 1: return self._cython_agg_general( - "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), **kwargs + "var", alt=lambda x, axis: Series(x).var(ddof=ddof) ) else: - f = lambda x: x.var(ddof=ddof, **kwargs) + func = lambda x: x.var(ddof=ddof) with _group_selection_context(self): - return self._python_agg_general(f) + return self._python_agg_general(func) @Substitution(name="groupby") @Appender(_common_see_also) @@ -1383,7 +1394,9 @@ def func(self, numeric_only=numeric_only, min_count=min_count): except DataError: pass except NotImplementedError as err: - if "function is not implemented for this dtype" in str(err): + if "function is not implemented for this dtype" in str( + err + ) or "category dtype not supported" in str(err): # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 77c54ec736aaa..761353ca5a6ca 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -31,6 +31,7 @@ is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, + is_period_dtype, is_sparse, is_timedelta64_dtype, needs_i8_conversion, @@ -567,7 +568,12 @@ def _cython_operation( if swapped: result = result.swapaxes(0, axis) - if is_datetime64tz_dtype(orig_values.dtype): + if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( + orig_values.dtype + ): + # We need to use the constructors directly for these dtypes + # since numpy won't recognize them + # https://github.com/pandas-dev/pandas/issues/31471 result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 95cfab4c96af3..e8ad2bef099a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Any, FrozenSet, Hashable, Optional, Union +from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -18,7 +18,10 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes import concat as _concat -from pandas.core.dtypes.cast import maybe_cast_to_integer_array +from pandas.core.dtypes.cast import ( + maybe_cast_to_integer_array, + validate_numeric_casting, +) from pandas.core.dtypes.common import ( ensure_categorical, ensure_int64, @@ -68,7 +71,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com -from pandas.core.indexers import deprecate_ndim_indexing, maybe_convert_indices +from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -83,6 +86,10 @@ pprint_thing, ) +if TYPE_CHECKING: + from pandas import Series + + __all__ = ["Index"] _unsortable_types = frozenset(("mixed", "mixed-integer")) @@ -522,6 +529,7 @@ def _shallow_copy(self, values=None, **kwargs): values = self.values attributes = self._get_attributes_dict() + attributes.update(kwargs) return self._simple_new(values, 
**attributes) @@ -2566,6 +2574,7 @@ def _union(self, other, sort): # worth making this faster? a very unusual case value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) + result = Index(result)._values # do type inference here else: # find indexes of things in "other" that are not in "self" if self.is_unique: @@ -2595,7 +2604,8 @@ def _union(self, other, sort): return self._wrap_setop_result(other, result) def _wrap_setop_result(self, other, result): - return self._constructor(result, name=get_op_result_name(self, other)) + name = get_op_result_name(self, other) + return self._shallow_copy(result, name=name) # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): @@ -2652,9 +2662,10 @@ def intersection(self, other, sort=False): if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_setop_result(other, result) except TypeError: pass + else: + return self._wrap_setop_result(other, result) try: indexer = Index(rvals).get_indexer(lvals) @@ -2880,10 +2891,15 @@ def get_loc(self, key, method=None, tolerance=None): "tolerance argument only valid if using pad, " "backfill or nearest lookups" ) + casted_key = self._maybe_cast_indexer(key) try: - return self._engine.get_loc(key) + return self._engine.get_loc(casted_key) except KeyError: - return self._engine.get_loc(self._maybe_cast_indexer(key)) + raise KeyError(key) + + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, np.asarray(key)) + indexer = self.get_indexer([key], method=method, tolerance=tolerance) if indexer.ndim > 1 or indexer.size > 1: raise TypeError("get_loc requires scalar valued input") @@ -3061,9 +3077,8 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) - target = np.asarray(target) - left_distances = abs(self.values[left_indexer] - target) - right_distances = abs(self.values[right_indexer] - target) + left_distances = np.abs(self[left_indexer] - target) + right_distances = np.abs(self[right_indexer] - target) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3085,20 +3100,16 @@ def _filter_indexer_tolerance( # -------------------------------------------------------------------- # Indexer Conversion Methods - def _convert_scalar_indexer(self, key, kind=None): + def _convert_scalar_indexer(self, key, kind: str_t): """ Convert a scalar indexer. 
Parameters ---------- key : label of the slice bound - kind : {'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem'} """ - assert kind in ["loc", "getitem", "iloc", None] - - if kind == "iloc": - self._validate_indexer("positional", key, "iloc") - return key + assert kind in ["loc", "getitem"] if len(self) and not isinstance(self, ABCMultiIndex): @@ -3147,9 +3158,9 @@ def _convert_slice_indexer(self, key: slice, kind=None): # validate iloc if kind == "iloc": - self._validate_indexer("slice", key.start, "iloc") - self._validate_indexer("slice", key.stop, "iloc") - self._validate_indexer("slice", key.step, "iloc") + self._validate_indexer("positional", key.start, "iloc") + self._validate_indexer("positional", key.stop, "iloc") + self._validate_indexer("positional", key.step, "iloc") return key # potentially cast the bounds to integers @@ -3200,7 +3211,7 @@ def is_int(v): return indexer - def _convert_listlike_indexer(self, keyarr, kind=None): + def _convert_listlike_indexer(self, keyarr): """ Parameters ---------- @@ -3219,7 +3230,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None): else: keyarr = self._convert_arr_indexer(keyarr) - indexer = self._convert_list_indexer(keyarr, kind=kind) + indexer = self._convert_list_indexer(keyarr) return indexer, keyarr def _convert_arr_indexer(self, keyarr): @@ -3253,7 +3264,7 @@ def _convert_index_indexer(self, keyarr): """ return keyarr - def _convert_list_indexer(self, keyarr, kind=None): + def _convert_list_indexer(self, keyarr): """ Convert a list-like indexer to the appropriate dtype. @@ -3267,29 +3278,6 @@ def _convert_list_indexer(self, keyarr, kind=None): ------- positional indexer or None """ - if ( - kind in [None, "iloc"] - and is_integer_dtype(keyarr) - and not self.is_floating() - ): - - if self.inferred_type == "mixed-integer": - indexer = self.get_indexer(keyarr) - if (indexer >= 0).all(): - return indexer - # missing values are flagged as -1 by get_indexer and negative - # indices are already converted to positive indices in the - # above if-statement, so the negative flags are changed to - # values outside the range of indices so as to trigger an - # IndexError in maybe_convert_indices - indexer[indexer < 0] = len(self) - - return maybe_convert_indices(indexer, len(self)) - - elif not self.inferred_type == "integer": - keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) - return keyarr - return None def _invalid_indexer(self, form: str_t, key): @@ -3297,8 +3285,8 @@ def _invalid_indexer(self, form: str_t, key): Consistent invalid indexer message. """ raise TypeError( - f"cannot do {form} indexing on {type(self)} with these " - f"indexers [{key}] of {type(key)}" + f"cannot do {form} indexing on {type(self).__name__} with these " + f"indexers [{key}] of type {type(key).__name__}" ) # -------------------------------------------------------------------- @@ -4096,6 +4084,11 @@ def __contains__(self, key: Any) -> bool: bool Whether the key search is in the index. + Raises + ------ + TypeError + If the key is not hashable. + See Also -------- Index.isin : Returns an ndarray of boolean dtype indicating whether the @@ -4573,21 +4566,15 @@ def argsort(self, *args, **kwargs) -> np.ndarray: result = np.array(self) return result.argsort(*args, **kwargs) - _index_shared_docs[ - "get_value" - ] = """ + def get_value(self, series: "Series", key): + """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing. 
Returns ------- - scalar - A value in the Series with the index of the key value in self. + scalar or Series """ - - @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) - def get_value(self, series, key): - if not is_scalar(key): # if key is not a scalar, directly raise an error (the code below # would convert to numpy arrays and raise later any way) - GH29926 @@ -4599,9 +4586,9 @@ def get_value(self, series, key): # If that fails, raise a KeyError if an integer # index, otherwise, see if key is an integer, and # try that - loc = self._engine.get_loc(key) + loc = self.get_loc(key) except KeyError: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + if not self._should_fallback_to_positional(): raise elif is_integer(key): # If the Index cannot hold integer, then this is unambiguously @@ -4612,7 +4599,15 @@ def get_value(self, series, key): return self._get_values_for_loc(series, loc) - def _get_values_for_loc(self, series, loc): + def _should_fallback_to_positional(self) -> bool: + """ + If an integer key is not found, should we fall back to positional indexing? + """ + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + return False + return True + + def _get_values_for_loc(self, series: "Series", loc): """ Do a positional lookup on the given Series, returning either a scalar or a Series. @@ -4620,10 +4615,6 @@ def _get_values_for_loc(self, series, loc): Assumes that `series.index is self` """ if is_integer(loc): - if isinstance(series._values, np.ndarray): - # Since we have an ndarray and not DatetimeArray, we dont - # have to worry about a tz. - return libindex.get_value_at(series._values, loc, tz=None) return series._values[loc] return series.iloc[loc] @@ -4646,9 +4637,9 @@ def set_value(self, arr, key, value): FutureWarning, stacklevel=2, ) - self._engine.set_value( - com.values_from_object(arr), com.values_from_object(key), value - ) + loc = self._engine.get_loc(key) + validate_numeric_casting(arr.dtype, value) + arr[loc] = value _index_shared_docs[ "get_indexer_non_unique" @@ -4929,13 +4920,8 @@ def _maybe_cast_indexer(self, key): to an int if equivalent. 
""" - if is_float(key) and not self.is_floating(): - try: - ckey = int(key) - if ckey == key: - key = ckey - except (OverflowError, ValueError, TypeError): - pass + if not self.is_floating(): + return com.cast_scalar_indexer(key) return key def _validate_indexer(self, form: str_t, key, kind: str_t): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d556c014467cf..85229c728848f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,4 +1,4 @@ -from typing import Any, List +from typing import TYPE_CHECKING, Any, List import warnings import numpy as np @@ -7,7 +7,6 @@ from pandas._libs import index as libindex from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( @@ -29,7 +28,9 @@ from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.extension import ExtensionIndex, inherit_names import pandas.core.missing as missing -from pandas.core.ops import get_op_result_name + +if TYPE_CHECKING: + from pandas import Series _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @@ -159,17 +160,6 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): _typ = "categoricalindex" - _raw_inherit = { - "argsort", - "_internal_get_values", - "tolist", - "codes", - "categories", - "ordered", - "_reverse_indexer", - "searchsorted", - } - codes: np.ndarray categories: Index _data: Categorical @@ -386,12 +376,6 @@ def _has_complex_internals(self) -> bool: # used to avoid libreduction code paths, which raise or require conversion return True - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - # We use _shallow_copy rather than the Index implementation - # (which uses _constructor) in order to preserve dtype. - return self._shallow_copy(result, name=name) - @Appender(Index.__contains__.__doc__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. @@ -455,53 +439,19 @@ def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.astype("object") - def get_loc(self, key, method=None): - """ - Get integer location, slice or boolean mask for requested label. - - Parameters - ---------- - key : label - method : {None} - * default: exact matches only. - - Returns - ------- - loc : int if unique index, slice if monotonic index, else mask - - Raises - ------ - KeyError : if the key is not in the index - - Examples - -------- - >>> unique_index = pd.CategoricalIndex(list('abc')) - >>> unique_index.get_loc('b') - 1 - - >>> monotonic_index = pd.CategoricalIndex(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - - >>> non_monotonic_index = pd.CategoricalIndex(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True], dtype=bool) - """ + def _maybe_cast_indexer(self, key): code = self.categories.get_loc(key) code = self.codes.dtype.type(code) - try: - return self._engine.get_loc(code) - except KeyError: - raise KeyError(key) + return code - def get_value(self, series: AnyArrayLike, key: Any): + def get_value(self, series: "Series", key: Any): """ Fast lookup of value from 1-dimensional ndarray. 
Only use this if you know what you're doing Parameters ---------- - series : Series, ExtensionArray, Index, or ndarray + series : Series 1-dimensional array to take values from key: : scalar The value of this index at the position of the desired value, @@ -521,7 +471,7 @@ def get_value(self, series: AnyArrayLike, key: Any): pass # we might be a positional inexer - return super().get_value(series, key) + return Index.get_value(self, series, key) @Appender(Index.where.__doc__) def where(self, cond, other=None): @@ -674,21 +624,22 @@ def get_indexer_non_unique(self, target): return ensure_platform_int(indexer), missing @Appender(Index._convert_scalar_indexer.__doc__) - def _convert_scalar_indexer(self, key, kind=None): + def _convert_scalar_indexer(self, key, kind: str): + assert kind in ["loc", "getitem"] if kind == "loc": try: - return self.categories._convert_scalar_indexer(key, kind=kind) + return self.categories._convert_scalar_indexer(key, kind="loc") except TypeError: self._invalid_indexer("label", key) return super()._convert_scalar_indexer(key, kind=kind) @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr, kind=None): + def _convert_list_indexer(self, keyarr): # Return our indexer or raise if all of the values are not included in # the categories if self.categories._defer_to_indexing: - indexer = self.categories._convert_list_indexer(keyarr, kind=kind) + indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) indexer = self.categories.get_indexer(np.asarray(keyarr)) @@ -852,18 +803,13 @@ def _concat_same_dtype(self, to_concat, name): result.name = name return result - def _delegate_property_get(self, name: str, *args, **kwargs): - """ method delegation to the ._values """ - prop = getattr(self._values, name) - return prop # no wrapping for now - def _delegate_method(self, name: str, *args, **kwargs): """ method delegation to the ._values """ method = getattr(self._values, name) if "inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if is_scalar(res) or name in self._raw_inherit: + if is_scalar(res): return res return CategoricalIndex(res, name=self.name) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e3eeca2c45e76..d06d0d499ef47 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -80,7 +80,16 @@ def wrapper(left, right): cache=True, ) @inherit_names( - ["__iter__", "mean", "freq", "freqstr", "_ndarray_values", "asi8", "_box_values"], + [ + "__iter__", + "mean", + "freq", + "freqstr", + "_ndarray_values", + "asi8", + "_box_values", + "_box_func", + ], DatetimeLikeArrayMixin, ) class DatetimeIndexOpsMixin(ExtensionIndex): @@ -191,7 +200,7 @@ def sort_values(self, return_indexer=False, ascending=True): arr = type(self._data)._simple_new( sorted_values, dtype=self.dtype, freq=freq ) - return self._simple_new(arr, name=self.name) + return type(self)._simple_new(arr, name=self.name) @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): @@ -374,8 +383,9 @@ def _format_attrs(self): return attrs # -------------------------------------------------------------------- + # Indexing Methods - def _convert_scalar_indexer(self, key, kind=None): + def _convert_scalar_indexer(self, key, kind: str): """ We don't allow integer or float indexing on datetime-like when using loc. 
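# A behavior sketch of the rule above (illustrative, not taken from the
# patch itself):
# >>> import pandas as pd
# >>> ser = pd.Series([10, 20], index=pd.date_range("2020-01-01", periods=2))
# >>> ser.iloc[0]    # positional access is unaffected
# 10
# >>> ser.loc[0]     # integer treated as a label -> rejected
# TypeError: cannot do label indexing on DatetimeIndex with these
# indexers [0] of type int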
@@ -383,23 +393,27 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem'} """ - assert kind in ["loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem"] + + if not is_scalar(key): + raise TypeError(key) # we don't allow integer/float indexing for loc - # we don't allow float indexing for ix/getitem - if is_scalar(key): - is_int = is_integer(key) - is_flt = is_float(key) - if kind in ["loc"] and (is_int or is_flt): - self._invalid_indexer("index", key) - elif kind in ["getitem"] and is_flt: - self._invalid_indexer("index", key) + # we don't allow float indexing for getitem + is_int = is_integer(key) + is_flt = is_float(key) + if kind == "loc" and (is_int or is_flt): + self._invalid_indexer("label", key) + elif kind == "getitem" and is_flt: + self._invalid_indexer("label", key) return super()._convert_scalar_indexer(key, kind=kind) + # -------------------------------------------------------------------- + __add__ = make_wrapped_arith_op("__add__") __radd__ = make_wrapped_arith_op("__radd__") __sub__ = make_wrapped_arith_op("__sub__") @@ -514,7 +528,7 @@ def _concat_same_dtype(self, to_concat, name): if is_diff_evenly_spaced: new_data._freq = self.freq - return self._simple_new(new_data, name=name) + return type(self)._simple_new(new_data, name=name) def shift(self, periods=1, freq=None): """ @@ -617,7 +631,7 @@ def _shallow_copy(self, values=None, **kwargs): del attributes["freq"] attributes.update(kwargs) - return self._simple_new(values, **attributes) + return type(self)._simple_new(values, **attributes) # -------------------------------------------------------------------- # Set Operation Methods @@ -789,11 +803,10 @@ def _union(self, other, sort): if this._can_fast_union(other): return this._fast_union(other, sort=sort) else: - result = Index._union(this, other, sort=sort) - if isinstance(result, type(self)): - assert result._data.dtype == this.dtype - if result.freq is None: - result._set_freq("infer") + i8self = Int64Index._simple_new(self.asi8, name=self.name) + i8other = Int64Index._simple_new(other.asi8, name=other.name) + i8result = i8self._union(i8other, sort=sort) + result = type(self)(i8result, dtype=self.dtype, freq="infer") return result # -------------------------------------------------------------------- @@ -875,7 +888,7 @@ def _wrap_joined_index(self, joined, other): kwargs = {} if hasattr(self, "tz"): kwargs["tz"] = getattr(other, "tz", None) - return self._simple_new(joined, name, **kwargs) + return type(self)._simple_new(joined, name, **kwargs) # -------------------------------------------------------------------- # List-Like Methods diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 416c3d0701a85..3d57f0944b318 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -5,15 +5,8 @@ import numpy as np -from pandas._libs import ( - NaT, - Timedelta, - Timestamp, - index as libindex, - lib, - tslib as libts, -) -from pandas._libs.tslibs import ccalendar, fields, parsing, timezones +from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts +from pandas._libs.tslibs import fields, parsing, timezones from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar @@ -29,7 +22,6 @@ from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name from 
pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin from pandas.core.indexes.extension import inherit_names -from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools from pandas.tseries.frequencies import Resolution, to_offset @@ -70,7 +62,6 @@ def _new_DatetimeIndex(cls, d): "_field_ops", "_datetimelike_ops", "_datetimelike_methods", - "_box_func", "tz", "tzinfo", "dtype", @@ -348,18 +339,9 @@ def union_many(self, others): if this._can_fast_union(other): this = this._fast_union(other) else: - dtype = this.dtype this = Index.union(this, other) - if isinstance(this, DatetimeIndex): - # TODO: we shouldn't be setting attributes like this; - # in all the tests this equality already holds - this._data._dtype = dtype return this - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - return self._shallow_copy(result, name=name, freq=None) - # -------------------------------------------------------------------- def _get_time_micros(self): @@ -476,7 +458,7 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): Parameters ---------- - reso : Resolution + reso : str Resolution provided by parsed string. parsed : datetime Datetime from parsed string. @@ -484,7 +466,6 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): Returns ------- lower, upper: pd.Timestamp - """ valid_resos = { "year", @@ -500,50 +481,11 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): } if reso not in valid_resos: raise KeyError - if reso == "year": - start = Timestamp(parsed.year, 1, 1) - end = Timestamp(parsed.year + 1, 1, 1) - Timedelta(nanoseconds=1) - elif reso == "month": - d = ccalendar.get_days_in_month(parsed.year, parsed.month) - start = Timestamp(parsed.year, parsed.month, 1) - end = start + Timedelta(days=d, nanoseconds=-1) - elif reso == "quarter": - qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead - d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month - start = Timestamp(parsed.year, parsed.month, 1) - end = Timestamp(parsed.year, qe, 1) + Timedelta(days=d, nanoseconds=-1) - elif reso == "day": - start = Timestamp(parsed.year, parsed.month, parsed.day) - end = start + Timedelta(days=1, nanoseconds=-1) - elif reso == "hour": - start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour) - end = start + Timedelta(hours=1, nanoseconds=-1) - elif reso == "minute": - start = Timestamp( - parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute - ) - end = start + Timedelta(minutes=1, nanoseconds=-1) - elif reso == "second": - start = Timestamp( - parsed.year, - parsed.month, - parsed.day, - parsed.hour, - parsed.minute, - parsed.second, - ) - end = start + Timedelta(seconds=1, nanoseconds=-1) - elif reso == "microsecond": - start = Timestamp( - parsed.year, - parsed.month, - parsed.day, - parsed.hour, - parsed.minute, - parsed.second, - parsed.microsecond, - ) - end = start + Timedelta(microseconds=1, nanoseconds=-1) + + grp = Resolution.get_freq_group(reso) + per = Period(parsed, freq=(grp, 1)) + start, end = per.start_time, per.end_time + # GH 24076 # If an incoming date string contained a UTC offset, need to localize # the parsed date to this offset first before aligning with the index's @@ -601,6 +543,7 @@ def _partial_date_slice( raise KeyError # a monotonic (sorted) series can be sliced + # Use asi8.searchsorted to avoid re-validating left = stamps.searchsorted(t1.value, side="left") if use_lhs else None right = stamps.searchsorted(t2.value, 
side="right") if use_rhs else None @@ -617,17 +560,6 @@ def _maybe_promote(self, other): other = DatetimeIndex(other) return self, other - def get_value(self, series, key): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing - """ - if is_integer(key): - loc = key - else: - loc = self.get_loc(key) - return self._get_values_for_loc(series, loc) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -639,18 +571,13 @@ def get_loc(self, key, method=None, tolerance=None): if not is_scalar(key): raise InvalidIndexError(key) + orig_key = key if is_valid_nat_for_dtype(key, self.dtype): key = NaT - if tolerance is not None: - # try converting tolerance now, so errors don't get swallowed by - # the try/except clauses below - tolerance = self._convert_tolerance(tolerance, np.asarray(key)) - - if isinstance(key, (datetime, np.datetime64)): + if isinstance(key, self._data._recognized_scalars): # needed to localize naive datetimes key = self._maybe_cast_for_get_loc(key) - return Index.get_loc(self, key, method, tolerance) elif isinstance(key, str): try: @@ -659,9 +586,8 @@ def get_loc(self, key, method=None, tolerance=None): pass try: - stamp = self._maybe_cast_for_get_loc(key) - return Index.get_loc(self, stamp, method, tolerance) - except (KeyError, ValueError): + key = self._maybe_cast_for_get_loc(key) + except ValueError: raise KeyError(key) elif isinstance(key, timedelta): @@ -670,14 +596,21 @@ def get_loc(self, key, method=None, tolerance=None): f"Cannot index {type(self).__name__} with {type(key).__name__}" ) - if isinstance(key, time): + elif isinstance(key, time): if method is not None: raise NotImplementedError( "cannot yet lookup inexact labels when key is a time object" ) return self.indexer_at_time(key) - return Index.get_loc(self, key, method, tolerance) + else: + # unrecognized type + raise KeyError(key) + + try: + return Index.get_loc(self, key, method, tolerance) + except KeyError: + raise KeyError(orig_key) def _maybe_cast_for_get_loc(self, key) -> Timestamp: # needed to localize naive datetimes diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6a3e808ab9821..03fb8db2e1e1e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,7 +1,7 @@ """ define the IntervalIndex """ from operator import le, lt import textwrap -from typing import TYPE_CHECKING, Any, Optional, Tuple, Union +from typing import Any, Optional, Tuple, Union import numpy as np @@ -57,10 +57,6 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset -if TYPE_CHECKING: - from pandas import Series - - _VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -527,17 +523,22 @@ def is_overlapping(self) -> bool: # GH 23309 return self._engine.is_overlapping + def _should_fallback_to_positional(self): + # integer lookups in Series.__getitem__ are unambiguously + # positional in this case + return self.dtype.subtype.kind in ["m", "M"] + @Appender(Index._convert_scalar_indexer.__doc__) - def _convert_scalar_indexer(self, key, kind=None): - if kind == "iloc": - return super()._convert_scalar_indexer(key, kind=kind) + def _convert_scalar_indexer(self, key, kind: str): + assert kind in ["getitem", "loc"] + # never iloc, so no-op return key def _maybe_cast_slice_bound(self, label, side, kind): return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) 
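# Sketch of the fallback rule above (illustrative): an integer key can only
# be treated positionally when the intervals hold datetimes/timedeltas,
# because an integer can never match such an interval label.
# >>> import pandas as pd
# >>> idx = pd.interval_range(pd.Timestamp("2020-01-01"), periods=3)
# >>> pd.Series([1, 2, 3], index=idx)[0]    # unambiguously positional
# 1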
@Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr, kind=None): + def _convert_list_indexer(self, keyarr): """ we are passed a list-like indexer. Return the indexer for matching intervals. @@ -884,11 +885,6 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: return self.get_indexer_non_unique(target)[0] return self.get_indexer(target, **kwargs) - @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) - def get_value(self, series: "Series", key): - loc = self.get_loc(key) - return series.iloc[loc] - def _convert_slice_indexer(self, key: slice, kind=None): if not (key.step is None or key.step == 1): raise ValueError("cannot support not-default step in a slice") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 889622f44bbb7..708bea7d132a2 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,14 +1,14 @@ -import datetime from sys import getsizeof -from typing import Any, Hashable, List, Optional, Sequence, Union +from typing import Any, Hashable, Iterable, List, Optional, Sequence, Tuple, Union import warnings import numpy as np from pandas._config import get_option -from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs +from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 +from pandas._typing import AnyArrayLike, ArrayLike, Scalar from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly @@ -234,6 +234,8 @@ class MultiIndex(Index): _comparables = ["names"] rename = Index.set_names + _tuples = None + # -------------------------------------------------------------------- # Constructors @@ -620,29 +622,29 @@ def from_frame(cls, df, sortorder=None, names=None): # -------------------------------------------------------------------- - @property - def levels(self): - result = [ - x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) - ] - for level in result: - # disallow midx.levels[0].name = "foo" - level._no_setting_name = True - return FrozenList(result) - @property def _values(self): # We override here, since our parent uses _data, which we don't use. return self.values @property - def shape(self): - """ - Return a tuple of the shape of the underlying data. - """ - # overriding the base Index.shape definition to avoid materializing - # the values (GH-27384, GH-27775) - return (len(self),) + def values(self): + if self._tuples is not None: + return self._tuples + + values = [] + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals): + vals = vals._internal_get_values() + if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"): + vals = vals.astype(object) + vals = np.array(vals, copy=False) + values.append(vals) + + self._tuples = lib.fast_zip(values) + return self._tuples @property def array(self): @@ -659,6 +661,34 @@ def array(self): "'MultiIndex.to_numpy()' to get a NumPy array of tuples." ) + @property + def shape(self): + """ + Return a tuple of the shape of the underlying data. 
+ """ + # overriding the base Index.shape definition to avoid materializing + # the values (GH-27384, GH-27775) + return (len(self),) + + def __len__(self) -> int: + return len(self.codes[0]) + + # -------------------------------------------------------------------- + # Levels Methods + + @cache_readonly + def levels(self): + # Use cache_readonly to ensure that self.get_locs doesn't repeatedly + # create new IndexEngine + # https://github.com/pandas-dev/pandas/issues/31648 + result = [ + x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) + ] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True + return FrozenList(result) + def _set_levels( self, levels, level=None, copy=False, validate=True, verify_integrity=False ): @@ -785,6 +815,23 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): if not inplace: return idx + @property + def nlevels(self) -> int: + """ + Integer number of levels in this MultiIndex. + """ + return len(self._levels) + + @property + def levshape(self): + """ + A tuple with the length of each level. + """ + return tuple(len(x) for x in self.levels) + + # -------------------------------------------------------------------- + # Codes Methods + @property def codes(self): return self._codes @@ -895,6 +942,57 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): if not inplace: return idx + # -------------------------------------------------------------------- + # Index Internals + + @cache_readonly + def _engine(self): + # Calculate the number of bits needed to represent labels in each + # level, as log2 of their sizes (including -1 for NaN): + sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) + + # Sum bit counts, starting from the _right_.... + lev_bits = np.cumsum(sizes[::-1])[::-1] + + # ... in order to obtain offsets such that sorting the combination of + # shifted codes (one for each level, resulting in a unique integer) is + # equivalent to sorting lexicographically the codes themselves. Notice + # that each level needs to be shifted by the number of bits needed to + # represent the _previous_ ones: + offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + + # Check the total number of bits needed for our representation: + if lev_bits[0] > 64: + # The levels would overflow a 64 bit uint - use Python integers: + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) + + @property + def _constructor(self): + return MultiIndex.from_tuples + + @Appender(Index._shallow_copy.__doc__) + def _shallow_copy(self, values=None, **kwargs): + if values is not None: + names = kwargs.pop("names", kwargs.pop("name", self.names)) + # discards freq + kwargs.pop("freq", None) + return MultiIndex.from_tuples(values, names=names, **kwargs) + return self.copy(**kwargs) + + def _shallow_copy_with_infer(self, values, **kwargs): + # On equal MultiIndexes the difference is empty. 
+ # Therefore, an empty MultiIndex is returned GH13490 + if len(values) == 0: + return MultiIndex( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + **kwargs, + ) + return self._shallow_copy(values, **kwargs) + + # -------------------------------------------------------------------- + def copy( self, names=None, @@ -961,17 +1059,6 @@ def view(self, cls=None): result._id = self._id return result - def _shallow_copy_with_infer(self, values, **kwargs): - # On equal MultiIndexes the difference is empty. - # Therefore, an empty MultiIndex is returned GH13490 - if len(values) == 0: - return MultiIndex( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - **kwargs, - ) - return self._shallow_copy(values, **kwargs) - @Appender(Index.__contains__.__doc__) def __contains__(self, key: Any) -> bool: hash(key) @@ -981,15 +1068,6 @@ def __contains__(self, key: Any) -> bool: except (LookupError, TypeError, ValueError): return False - @Appender(Index._shallow_copy.__doc__) - def _shallow_copy(self, values=None, **kwargs): - if values is not None: - names = kwargs.pop("names", kwargs.pop("name", self.names)) - # discards freq - kwargs.pop("freq", None) - return MultiIndex.from_tuples(values, names=names, **kwargs) - return self.copy(**kwargs) - @cache_readonly def dtype(self) -> np.dtype: return np.dtype("O") @@ -1039,6 +1117,7 @@ def _nbytes(self, deep: bool = False) -> int: # -------------------------------------------------------------------- # Rendering Methods + def _formatter_func(self, tup): """ Formats each item in tup according to its level's formatter function. @@ -1165,9 +1244,7 @@ def format( return result_levels # -------------------------------------------------------------------- - - def __len__(self) -> int: - return len(self.codes[0]) + # Names Methods def _get_names(self): return FrozenList(self._names) @@ -1227,10 +1304,15 @@ def _set_names(self, names, level=None, validate=True): ) self._names[lev] = name + # If .levels has been accessed, the names in our cache will be stale. + self._reset_cache() + names = property( fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" ) + # -------------------------------------------------------------------- + @Appender(Index._get_grouper_for_level.__doc__) def _get_grouper_for_level(self, mapper, level): indexer = self.codes[level] @@ -1268,10 +1350,6 @@ def _get_grouper_for_level(self, mapper, level): return grouper, codes, level_index - @property - def _constructor(self): - return MultiIndex.from_tuples - @cache_readonly def inferred_type(self) -> str: return "mixed" @@ -1303,49 +1381,6 @@ def _get_level_number(self, level) -> int: ) return level - _tuples = None - - @cache_readonly - def _engine(self): - # Calculate the number of bits needed to represent labels in each - # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) - - # Sum bit counts, starting from the _right_.... - lev_bits = np.cumsum(sizes[::-1])[::-1] - - # ... in order to obtain offsets such that sorting the combination of - # shifted codes (one for each level, resulting in a unique integer) is - # equivalent to sorting lexicographically the codes themselves. 
Notice - # that each level needs to be shifted by the number of bits needed to - # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") - - # Check the total number of bits needed for our representation: - if lev_bits[0] > 64: - # The levels would overflow a 64 bit uint - use Python integers: - return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) - - @property - def values(self): - if self._tuples is not None: - return self._tuples - - values = [] - - for i in range(self.nlevels): - vals = self._get_level_values(i) - if is_categorical_dtype(vals): - vals = vals._internal_get_values() - if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"): - vals = vals.astype(object) - vals = np.array(vals, copy=False) - values.append(vals) - - self._tuples = lib.fast_zip(values) - return self._tuples - @property def _has_complex_internals(self) -> bool: # used to avoid libreduction code paths, which raise or require conversion @@ -1461,68 +1496,6 @@ def dropna(self, how="any"): new_codes = [level_codes[~indexer] for level_codes in self.codes] return self.copy(codes=new_codes, deep=True) - def get_value(self, series, key): - # Label-based - s = com.values_from_object(series) - k = com.values_from_object(key) - - def _try_mi(k): - # TODO: what if a level contains tuples?? - loc = self.get_loc(k) - new_values = series._values[loc] - new_index = self[loc] - new_index = maybe_droplevels(new_index, k) - return series._constructor( - new_values, index=new_index, name=series.name - ).__finalize__(self) - - try: - return self._engine.get_value(s, k) - except KeyError as e1: - try: - return _try_mi(key) - except KeyError: - pass - - try: - return libindex.get_value_at(s, k) - except IndexError: - raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - - # a Timestamp will raise a TypeError in a multi-index - # rather than a KeyError, try it here - # note that a string that 'looks' like a Timestamp will raise - # a KeyError! (GH5725) - if isinstance(key, (datetime.datetime, np.datetime64, str)): - try: - return _try_mi(key) - except KeyError: - raise - except (IndexError, ValueError, TypeError): - pass - - try: - return _try_mi(Timestamp(key)) - except ( - KeyError, - TypeError, - IndexError, - ValueError, - tslibs.OutOfBoundsDatetime, - ): - pass - - raise InvalidIndexError(key) - def _get_level_values(self, level, unique=False): """ Return vector of label values for requested level, @@ -1869,19 +1842,8 @@ def remove_unused_levels(self): return result - @property - def nlevels(self) -> int: - """ - Integer number of levels in this MultiIndex. - """ - return len(self._levels) - - @property - def levshape(self): - """ - A tuple with the length of each level. 
- """ - return tuple(len(x) for x in self.levels) + # -------------------------------------------------------------------- + # Pickling Methods def __reduce__(self): """Necessary for making this object picklable""" @@ -1915,6 +1877,8 @@ def __setstate__(self, state): self.sortorder = sortorder self._reset_identity() + # -------------------------------------------------------------------- + def __getitem__(self, key): if is_scalar(key): key = com.cast_scalar_indexer(key) @@ -2287,7 +2251,104 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): return new_index, indexer - def _convert_listlike_indexer(self, keyarr, kind=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.MultiIndex + Resulting index + indexer : np.ndarray or None + Indices of output values in original index. + + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, "names") + + if level is not None: + if method is not None: + raise TypeError("Fill method not supported if level passed") + + # GH7774: preserve dtype/tz if target is empty and not an Index. + # target may be an iterator + target = ibase._ensure_has_len(target) + if len(target) == 0 and not isinstance(target, Index): + idx = self.levels[level] + attrs = idx._get_attributes_dict() + attrs.pop("freq", None) # don't preserve freq + target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs) + else: + target = ensure_index(target) + target, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True, keep_order=False + ) + else: + target = ensure_index(target) + if self.equals(target): + indexer = None + else: + if self.is_unique: + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + else: + raise ValueError("cannot handle a non-unique multi-index!") + + if not isinstance(target, MultiIndex): + if indexer is None: + target = self + elif (indexer >= 0).all(): + target = self.take(indexer) + else: + # hopefully? + target = MultiIndex.from_tuples(target) + + if ( + preserve_names + and target.nlevels == self.nlevels + and target.names != self.names + ): + target = target.copy(deep=False) + target.names = self.names + + return target, indexer + + # -------------------------------------------------------------------- + # Indexing Methods + + def get_value(self, series, key): + # Label-based + if not is_hashable(key) or is_iterator(key): + # We allow tuples if they are hashable, whereas other Index + # subclasses require scalar. + # We have to explicitly exclude generators, as these are hashable. + raise InvalidIndexError(key) + + def _try_mi(k): + # TODO: what if a level contains tuples?? 
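+            # get_loc may return an integer position, a slice, or a
+            # boolean mask; a scalar loc yields the value directly, while
+            # a partial-tuple match yields a Series with the matched
+            # levels dropped (handled below)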
+ loc = self.get_loc(k) + + new_values = series._values[loc] + if is_scalar(loc): + return new_values + + new_index = self[loc] + new_index = maybe_droplevels(new_index, k) + return series._constructor( + new_values, index=new_index, name=series.name + ).__finalize__(self) + + try: + return _try_mi(key) + except KeyError: + if is_integer(key): + return series._values[key] + else: + raise + + def _convert_listlike_indexer(self, keyarr): """ Parameters ---------- @@ -2300,7 +2361,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None): indexer is an ndarray or None if cannot convert keyarr are tuple-safe keys """ - indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind) + indexer, keyarr = super()._convert_listlike_indexer(keyarr) # are we indexing a specific level if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple): @@ -2361,70 +2422,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): return super().get_indexer_non_unique(target) - def reindex(self, target, method=None, level=None, limit=None, tolerance=None): - """ - Create index with target's values (move/add/delete values as necessary) - - Returns - ------- - new_index : pd.MultiIndex - Resulting index - indexer : np.ndarray or None - Indices of output values in original index. - - """ - # GH6552: preserve names when reindexing to non-named target - # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, "names") - - if level is not None: - if method is not None: - raise TypeError("Fill method not supported if level passed") - - # GH7774: preserve dtype/tz if target is empty and not an Index. - # target may be an iterator - target = ibase._ensure_has_len(target) - if len(target) == 0 and not isinstance(target, Index): - idx = self.levels[level] - attrs = idx._get_attributes_dict() - attrs.pop("freq", None) # don't preserve freq - target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs) - else: - target = ensure_index(target) - target, indexer, _ = self._join_level( - target, level, how="right", return_indexers=True, keep_order=False - ) - else: - target = ensure_index(target) - if self.equals(target): - indexer = None - else: - if self.is_unique: - indexer = self.get_indexer( - target, method=method, limit=limit, tolerance=tolerance - ) - else: - raise ValueError("cannot handle a non-unique multi-index!") - - if not isinstance(target, MultiIndex): - if indexer is None: - target = self - elif (indexer >= 0).all(): - target = self.take(indexer) - else: - # hopefully? 
-                target = MultiIndex.from_tuples(target)
-
-        if (
-            preserve_names
-            and target.nlevels == self.nlevels
-            and target.names != self.names
-        ):
-            target = target.copy(deep=False)
-            target.names = self.names
-
-        return target, indexer
-
     def get_slice_bound(
         self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str
     ) -> int:
@@ -3058,8 +3055,70 @@ def _update_indexer(idxr, indexer=indexer):
         # empty indexer
         if indexer is None:
             return Int64Index([])._ndarray_values
+
+        indexer = self._reorder_indexer(seq, indexer)
+        return indexer._ndarray_values
+
+    def _reorder_indexer(
+        self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike
+    ) -> ArrayLike:
+        """
+        Reorder an indexer of a MultiIndex (self) so that the labels are in the
+        same order as given in seq
+
+        Parameters
+        ----------
+        seq : label/slice/list/mask or a sequence of such
+        indexer : an Int64Index indexer of self
+
+        Returns
+        -------
+        indexer : a sorted Int64Index indexer of self ordered as seq
+        """
+        # If the index is lexsorted and the list-like labels in seq are sorted
+        # then we do not need to sort
+        if self.is_lexsorted():
+            need_sort = False
+            for i, k in enumerate(seq):
+                if is_list_like(k):
+                    if not need_sort:
+                        k_codes = self.levels[i].get_indexer(k)
+                        k_codes = k_codes[k_codes >= 0]  # Filter absent keys
+                        # True if the given codes are not ordered
+                        need_sort = (k_codes[:-1] > k_codes[1:]).any()
+            # Bail out if both index and seq are sorted
+            if not need_sort:
+                return indexer
+
+        n = len(self)
+        keys: Tuple[np.ndarray, ...] = tuple()
+        # For each level of the sequence in seq, map the level codes with the
+        # order in which they appear in a list-like sequence
+        # This mapping is then used to reorder the indexer
+        for i, k in enumerate(seq):
+            if com.is_bool_indexer(k):
+                new_order = np.arange(n)[indexer]
+            elif is_list_like(k):
+                # Generate a map with all level codes as sorted initially
+                key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
+                    self.levels[i]
+                )
+                # Set order as given in the indexer list
+                level_indexer = self.levels[i].get_indexer(k)
+                level_indexer = level_indexer[level_indexer >= 0]  # Filter absent keys
+                key_order_map[level_indexer] = np.arange(len(level_indexer))
+
+                new_order = key_order_map[self.codes[i][indexer]]
+            else:
+                # For all other cases, use the same order as the level
+                new_order = np.arange(n)[indexer]
+            keys = (new_order,) + keys
+
+        # Find the reordering using lexsort on the keys mapping
+        ind = np.lexsort(keys)
+        return indexer[ind]
+
     def truncate(self, before=None, after=None):
         """
         Slice index between two labels / tuples, return new MultiIndex
@@ -3158,6 +3217,9 @@ def equal_levels(self, other) -> bool:
                 return False
         return True
 
+    # --------------------------------------------------------------------
+    # Set Methods
+
     def union(self, other, sort=None):
         """
         Form the union of two MultiIndex objects
@@ -3310,21 +3372,6 @@ def difference(self, other, sort=None):
         else:
             return MultiIndex.from_tuples(difference, sortorder=0, names=result_names)
 
-    @Appender(Index.astype.__doc__)
-    def astype(self, dtype, copy=True):
-        dtype = pandas_dtype(dtype)
-        if is_categorical_dtype(dtype):
-            msg = "> 1 ndim Categorical are not supported at this time"
-            raise NotImplementedError(msg)
-        elif not is_object_dtype(dtype):
-            raise TypeError(
-                f"Setting {type(self)} dtype to anything other "
-                "than object is not supported"
-            )
-        elif copy is True:
-            return self._shallow_copy()
-        return self
-
     def _convert_can_do_setop(self, other):
         result_names = self.names
 
@@ -3345,6
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 4d3d560aaa688..d67c40a78d807 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import numpy as np
 
@@ -32,12 +32,9 @@
 from pandas.core import algorithms
 import pandas.core.common as com
-from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name
+from pandas.core.indexes.base import Index, maybe_extract_name
 from pandas.core.ops import get_op_result_name
 
-if TYPE_CHECKING:
-    from pandas import Series
-
 _num_index_shared_docs = dict()
 
@@ -253,12 +250,11 @@ def asi8(self) -> np.ndarray:
         return self.values.view(self._default_dtype)
 
     @Appender(Index._convert_scalar_indexer.__doc__)
-    def _convert_scalar_indexer(self, key, kind=None):
-        assert kind in ["loc", "getitem", "iloc", None]
+    def _convert_scalar_indexer(self, key, kind: str):
+        assert kind in ["loc", "getitem"]
 
-        # don't coerce ilocs to integers
-        if kind != "iloc":
-            key = self._maybe_cast_indexer(key)
+        # kind is never "iloc" here, so we can always coerce the key
+        key = self._maybe_cast_indexer(key)
         return super()._convert_scalar_indexer(key, kind=kind)
 
@@ -383,13 +379,17 @@ def astype(self, dtype, copy=True):
             return Int64Index(arr)
         return super().astype(dtype, copy=copy)
 
-    @Appender(Index._convert_scalar_indexer.__doc__)
-    def _convert_scalar_indexer(self, key, kind=None):
-        assert kind in ["loc", "getitem", "iloc", None]
+    # ----------------------------------------------------------------
+    # Indexing Methods
 
-        if kind == "iloc":
-            self._validate_indexer("positional", key, "iloc")
+    @Appender(Index._should_fallback_to_positional.__doc__)
+    def _should_fallback_to_positional(self):
+        return False
 
+    @Appender(Index._convert_scalar_indexer.__doc__)
+    def _convert_scalar_indexer(self, key, kind: str):
+        assert kind in ["loc", "getitem"]
+        # no-op; keys are labels here and need no coercion
        return key
 
     @Appender(Index._convert_slice_indexer.__doc__)
@@ -401,6 +401,8 @@ def _convert_slice_indexer(self, key: slice, kind=None):
         # translate to locations
         return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
 
+    # ----------------------------------------------------------------
+
     def _format_native_types(
         self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs
     ):
@@ -416,16 +418,6 @@ def _format_native_types(
         )
         return formatter.get_result_as_array()
 
-    def get_value(self, series: "Series", key):
-        """
-        We always want to get an index value, never a value.
-        """
-        if not is_scalar(key):
-            raise InvalidIndexError
-
-        loc = self.get_loc(key)
-        return self._get_values_for_loc(series, loc)
-
     def equals(self, other) -> bool:
         """
         Determines if two Index objects contain the same elements.
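With `_should_fallback_to_positional` returning False, a scalar key on a Float64Index is always resolved as a label. A rough illustration, assuming pandas at this changeset (the series and values are made up):

    import pandas as pd

    s = pd.Series([10, 20, 30], index=pd.Float64Index([1.0, 2.0, 3.0]))

    # The integer key is cast to the float label 1.0, so this returns 10
    # (the value at label 1.0), not 20 (the value at position 1).
    assert s[1] == 10

Positional access on a float index stays available explicitly through `.iloc`, which no longer routes through `_convert_scalar_indexer` at all.
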
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 75c100c9d2c08..42f0a012902a3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,11 +1,11 @@ from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Any +from typing import Any import weakref import numpy as np from pandas._libs import index as libindex -from pandas._libs.tslibs import NaT, frequencies as libfrequencies, resolution +from pandas._libs.tslibs import frequencies as libfrequencies, resolution from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.period import Period from pandas.util._decorators import Appender, cache_readonly @@ -18,7 +18,6 @@ is_float, is_integer, is_integer_dtype, - is_list_like, is_object_dtype, is_scalar, pandas_dtype, @@ -51,9 +50,6 @@ _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) -if TYPE_CHECKING: - from pandas import Series - # --- Period index sketch @@ -280,22 +276,6 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): """ we always want to return a PeriodIndex """ return self._shallow_copy(values=values, **kwargs) - @property - def _box_func(self): - """Maybe box an ordinal or Period""" - # TODO(DatetimeArray): Avoid double-boxing - # PeriodArray takes care of boxing already, so we need to check - # whether we're given an ordinal or a Period. It seems like some - # places outside of indexes/period.py are calling this _box_func, - # but passing data that's already boxed. - def func(x): - if isinstance(x, Period) or x is NaT: - return x - else: - return Period._from_ordinal(ordinal=x, freq=self.freq) - - return func - def _maybe_convert_timedelta(self, other): """ Convert timedelta-like input to an integer multiple of self.freq @@ -471,17 +451,6 @@ def inferred_type(self) -> str: # indexing return "period" - def get_value(self, series: "Series", key): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing - """ - if is_integer(key): - loc = key - else: - loc = self.get_loc(key) - return self._get_values_for_loc(series, loc) - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) @@ -576,12 +545,9 @@ def get_loc(self, key, method=None, tolerance=None): key = Period(key, freq=self.freq) except ValueError: # we cannot construct the Period - # as we have an invalid type - if is_list_like(key): - raise TypeError(f"'{key}' is an invalid key") raise KeyError(key) - ordinal = key.ordinal if key is not NaT else key.value + ordinal = self._data._unbox_scalar(key) try: return self._engine.get_loc(ordinal) except KeyError: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 08a07e8d30348..ec0414adc1376 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,7 +1,5 @@ """ implement the TimedeltaIndex """ -import numpy as np - from pandas._libs import NaT, Timedelta, index as libindex from pandas.util._decorators import Appender @@ -53,7 +51,6 @@ "_datetimelike_methods", "_other_ops", "components", - "_box_func", "to_pytimedelta", "sum", "std", @@ -225,17 +222,6 @@ def _maybe_promote(self, other): other = TimedeltaIndex(other) return self, other - def get_value(self, series, key): - """ - Fast lookup of value from 1-dimensional ndarray. 
Only use this if you - know what you're doing - """ - if is_integer(key): - loc = key - else: - loc = self.get_loc(key) - return self._get_values_for_loc(series, loc) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -262,11 +248,6 @@ def get_loc(self, key, method=None, tolerance=None): else: raise KeyError(key) - if tolerance is not None: - # try converting tolerance now, so errors don't get swallowed by - # the try/except clauses below - tolerance = self._convert_tolerance(tolerance, np.asarray(key)) - return Index.get_loc(self, key, method, tolerance) def _maybe_cast_slice_bound(self, label, side: str, kind): @@ -297,12 +278,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): return label - def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): - # TODO: Check for non-True use_lhs/use_rhs - assert isinstance(key, str), type(key) - # given a key, try to figure out a location for a partial slice - raise NotImplementedError - def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7e56148b7569e..5c0f893554957 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -566,7 +566,7 @@ def iat(self) -> "_iAtIndexer": return _iAtIndexer("iat", self) -class _NDFrameIndexer(_NDFrameIndexerBase): +class _LocationIndexer(_NDFrameIndexerBase): _valid_types: str axis = None @@ -591,15 +591,9 @@ def _get_label(self, label, axis: int): return self.obj._xs(label, axis=axis) - def _get_loc(self, key: int, axis: int): - return self.obj._ixs(key, axis=axis) - - def _slice(self, obj, axis: int, kind=None): - return self.obj._slice(obj, axis=axis, kind=kind) - def _get_setitem_indexer(self, key): if self.axis is not None: - return self._convert_tuple(key) + return self._convert_tuple(key, is_setter=True) ax = self.obj._get_axis(0) @@ -612,7 +606,7 @@ def _get_setitem_indexer(self, key): if isinstance(key, tuple): try: - return self._convert_tuple(key) + return self._convert_tuple(key, is_setter=True) except IndexingError: pass @@ -620,7 +614,7 @@ def _get_setitem_indexer(self, key): return list(key) try: - return self._convert_to_indexer(key, axis=0) + return self._convert_to_indexer(key, axis=0, is_setter=True) except TypeError as e: # invalid indexer type vs 'other' indexing errors @@ -683,68 +677,25 @@ def _is_nested_tuple_indexer(self, tup: Tuple) -> bool: return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False - def _convert_tuple(self, key): + def _convert_tuple(self, key, is_setter: bool = False): keyidx = [] if self.axis is not None: axis = self.obj._get_axis_number(self.axis) for i in range(self.ndim): if i == axis: - keyidx.append(self._convert_to_indexer(key, axis=axis)) + keyidx.append( + self._convert_to_indexer(key, axis=axis, is_setter=is_setter) + ) else: keyidx.append(slice(None)) else: for i, k in enumerate(key): if i >= self.ndim: raise IndexingError("Too many indexers") - idx = self._convert_to_indexer(k, axis=i) + idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) keyidx.append(idx) return tuple(keyidx) - def _convert_scalar_indexer(self, key, axis: int): - # if we are accessing via lowered dim, use the last dim - ax = self.obj._get_axis(min(axis, self.ndim - 1)) - # a scalar - return ax._convert_scalar_indexer(key, kind=self.name) - - def _convert_slice_indexer(self, key: slice, axis: int): - # if we are accessing via lowered dim, use the last 
dim - ax = self.obj._get_axis(min(axis, self.ndim - 1)) - return ax._convert_slice_indexer(key, kind=self.name) - - def _has_valid_setitem_indexer(self, indexer) -> bool: - return True - - def _has_valid_positional_setitem_indexer(self, indexer) -> bool: - """ - Validate that a positional indexer cannot enlarge its target - will raise if needed, does not modify the indexer externally. - - Returns - ------- - bool - """ - if isinstance(indexer, dict): - raise IndexError(f"{self.name} cannot enlarge its target object") - else: - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) - for ax, i in zip(self.obj.axes, indexer): - if isinstance(i, slice): - # should check the stop slice? - pass - elif is_list_like_indexer(i): - # should check the elements? - pass - elif is_integer(i): - if i >= len(ax): - raise IndexError( - f"{self.name} cannot enlarge its target object" - ) - elif isinstance(i, dict): - raise IndexError(f"{self.name} cannot enlarge its target object") - - return True - def _setitem_with_indexer(self, indexer, value): self._has_valid_setitem_indexer(indexer) @@ -893,7 +844,8 @@ def _setitem_with_indexer(self, indexer, value): # we can directly set the series here # as we select a slice indexer on the mi - idx = index._convert_slice_indexer(idx) + if isinstance(idx, slice): + idx = index._convert_slice_indexer(idx) obj._consolidate_inplace() obj = obj.copy() obj._data = obj._data.setitem(indexer=tuple([idx]), value=value) @@ -1232,80 +1184,6 @@ def _align_frame(self, indexer, df: ABCDataFrame): raise ValueError("Incompatible indexer with DataFrame") - def _getitem_tuple(self, tup: Tuple): - try: - return self._getitem_lowerdim(tup) - except IndexingError: - pass - - # no multi-index, so validate all of the indexers - self._has_valid_tuple(tup) - - # ugly hack for GH #836 - if self._multi_take_opportunity(tup): - return self._multi_take(tup) - - # no shortcut needed - retval = self.obj - for i, key in enumerate(tup): - if com.is_null_slice(key): - continue - - retval = getattr(retval, self.name)._getitem_axis(key, axis=i) - - return retval - - def _multi_take_opportunity(self, tup: Tuple) -> bool: - """ - Check whether there is the possibility to use ``_multi_take``. - - Currently the limit is that all axes being indexed, must be indexed with - list-likes. - - Parameters - ---------- - tup : tuple - Tuple of indexers, one per axis. - - Returns - ------- - bool - Whether the current indexing, - can be passed through `_multi_take`. - """ - if not all(is_list_like_indexer(x) for x in tup): - return False - - # just too complicated - if any(com.is_bool_indexer(x) for x in tup): - return False - - return True - - def _multi_take(self, tup: Tuple): - """ - Create the indexers for the passed tuple of keys, and - executes the take operation. This allows the take operation to be - executed all at once, rather than once for each dimension. - Improving efficiency. - - Parameters - ---------- - tup : tuple - Tuple of indexers, one per axis. 
- - Returns - ------- - values: same type as the object being indexed - """ - # GH 836 - o = self.obj - d = { - axis: self._get_listlike_indexer(key, axis) - for (key, axis) in zip(tup, o._AXIS_ORDERS) - } - return o._reindex_with_indexers(d, copy=True, allow_dups=True) - def _handle_lowerdim_multi_index_axis0(self, tup: Tuple): # we have an axis0 multi-index, handle or raise axis = self.axis or 0 @@ -1426,97 +1304,6 @@ def _getitem_nested_tuple(self, tup: Tuple): return obj - def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): - """ - Transform a list-like of keys into a new index and an indexer. - - Parameters - ---------- - key : list-like - Targeted labels. - axis: int - Dimension on which the indexing is being made. - raise_missing: bool, default False - Whether to raise a KeyError if some labels were not found. - Will be removed in the future, and then this method will always behave as - if ``raise_missing=True``. - - Raises - ------ - KeyError - If at least one key was requested but none was found, and - raise_missing=True. - - Returns - ------- - keyarr: Index - New index (coinciding with 'key' if the axis is unique). - values : array-like - Indexer for the return object, -1 denotes keys not found. - """ - o = self.obj - ax = o._get_axis(axis) - - # Have the index compute an indexer or return None - # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name) - # We only act on all found values: - if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing) - return ax[indexer], indexer - - if ax.is_unique and not getattr(ax, "is_overlapping", False): - indexer = ax.get_indexer_for(key) - keyarr = ax.reindex(keyarr)[0] - else: - keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - - self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing) - return keyarr, indexer - - def _getitem_iterable(self, key, axis: int): - """ - Index current object with an an iterable key. - - The iterable key can be a boolean indexer or a collection of keys. - - Parameters - ---------- - key : iterable - Targeted labels or boolean indexer. - axis: int - Dimension on which the indexing is being made. - - Raises - ------ - KeyError - If no key was found. Will change in the future to raise if not all - keys were found. - IndexingError - If the boolean indexer is unalignable with the object being - indexed. - - Returns - ------- - scalar, DataFrame, or Series: indexed value(s). - """ - # caller is responsible for ensuring non-None axis - self._validate_key(key, axis) - - labels = self.obj._get_axis(axis) - - if com.is_bool_indexer(key): - # A boolean indexer - key = check_bool_indexer(labels, key) - (inds,) = key.nonzero() - return self.obj._take_with_is_copy(inds, axis=axis) - else: - # A collection of keys - keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) - return self.obj._reindex_with_indexers( - {axis: [keyarr, indexer]}, copy=True, allow_dups=True - ) - def _validate_read_indexer( self, key, indexer, axis: int, raise_missing: bool = False ): @@ -1577,135 +1364,59 @@ def _validate_read_indexer( "https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 ) - def _convert_to_indexer(self, key, axis: int): - """ - Convert indexing key into something we can use to do actual fancy - indexing on a ndarray. 
- - Examples - ix[:5] -> slice(0, 5) - ix[[1,2,3]] -> [1,2,3] - ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) - - Going by Zen of Python? - 'In the face of ambiguity, refuse the temptation to guess.' - raise AmbiguousIndexError with integer labels? - - No, prefer label-based indexing - """ - labels = self.obj._get_axis(axis) - - if isinstance(key, slice): - return self._convert_slice_indexer(key, axis) + def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): + raise AbstractMethodError(self) - # try to find out correct indexer, if not type correct raise - try: - key = self._convert_scalar_indexer(key, axis) - except TypeError: - # but we will allow setting - pass + def __getitem__(self, key): + if type(key) is tuple: + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + if self._is_scalar_access(key): + try: + return self.obj._get_value(*key, takeable=self._takeable) + except (KeyError, IndexError, AttributeError): + # AttributeError for IntervalTree get_value + pass + return self._getitem_tuple(key) + else: + # we by definition only have the 0th axis + axis = self.axis or 0 - # see if we are positional in nature - is_int_index = labels.is_integer() - is_int_positional = is_integer(key) and not is_int_index + maybe_callable = com.apply_if_callable(key, self.obj) + return self._getitem_axis(maybe_callable, axis=axis) - if is_scalar(key) or isinstance(labels, ABCMultiIndex): - # Otherwise get_loc will raise InvalidIndexError + def _is_scalar_access(self, key: Tuple): + raise NotImplementedError() - # if we are a label return me - try: - return labels.get_loc(key) - except LookupError: - if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex): - if len(key) == labels.nlevels: - return {"key": key} - raise - except TypeError: - pass - except ValueError: - if not is_int_positional: - raise + def _getitem_tuple(self, tup: Tuple): + raise AbstractMethodError(self) - # a positional - if is_int_positional: + def _getitem_axis(self, key, axis: int): + raise NotImplementedError() - # if we are setting and its not a valid location - # its an insert which fails by definition + def _has_valid_setitem_indexer(self, indexer) -> bool: + raise AbstractMethodError(self) - if self.name == "loc": - # always valid - return {"key": key} - - if key >= self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex): - # a positional - raise ValueError("cannot set by positional indexing with enlargement") - - return key - - if is_nested_tuple(key, labels): - return labels.get_locs(key) - - elif is_list_like_indexer(key): - - if com.is_bool_indexer(key): - key = check_bool_indexer(labels, key) - (inds,) = key.nonzero() - return inds - else: - # When setting, missing keys are not allowed, even with .loc: - return self._get_listlike_indexer(key, axis, raise_missing=True)[1] - else: - try: - return labels.get_loc(key) - except LookupError: - # allow a not found key only if we are a setter - if not is_list_like_indexer(key): - return {"key": key} - raise - - -class _LocationIndexer(_NDFrameIndexer): - _takeable: bool = False - - def __getitem__(self, key): - if type(key) is tuple: - key = tuple(com.apply_if_callable(x, self.obj) for x in key) - if self._is_scalar_access(key): - try: - return self.obj._get_value(*key, takeable=self._takeable) - except (KeyError, IndexError, AttributeError): - # AttributeError for IntervalTree get_value - pass - return self._getitem_tuple(key) - else: - # we by definition only have the 0th axis - axis = self.axis or 0 - - 
maybe_callable = com.apply_if_callable(key, self.obj)
-            return self._getitem_axis(maybe_callable, axis=axis)
-
-    def _is_scalar_access(self, key: Tuple):
-        raise NotImplementedError()
-
-    def _getitem_axis(self, key, axis: int):
-        raise NotImplementedError()
-
-    def _getbool_axis(self, key, axis: int):
-        # caller is responsible for ensuring non-None axis
-        labels = self.obj._get_axis(axis)
-        key = check_bool_indexer(labels, key)
-        inds = key.nonzero()[0]
-        return self.obj._take_with_is_copy(inds, axis=axis)
+    def _getbool_axis(self, key, axis: int):
+        # caller is responsible for ensuring non-None axis
+        labels = self.obj._get_axis(axis)
+        key = check_bool_indexer(labels, key)
+        inds = key.nonzero()[0]
+        return self.obj._take_with_is_copy(inds, axis=axis)
 
 
 @Appender(IndexingMixin.loc.__doc__)
 class _LocIndexer(_LocationIndexer):
+    _takeable: bool = False
     _valid_types = (
         "labels (MUST BE IN THE INDEX), slices of labels (BOTH "
         "endpoints included! Can be slices of integers if the "
         "index is integers), listlike of labels, boolean"
     )
 
-    @Appender(_NDFrameIndexer._validate_key.__doc__)
+    # -------------------------------------------------------------------
+    # Key Checks
+
+    @Appender(_LocationIndexer._validate_key.__doc__)
     def _validate_key(self, key, axis: int):
 
         # valid for a collection of labels (we check their presence later)
@@ -1720,7 +1431,11 @@ def _validate_key(self, key, axis: int):
             return
 
         if not is_list_like_indexer(key):
-            self._convert_scalar_indexer(key, axis)
+            labels = self.obj._get_axis(axis)
+            labels._convert_scalar_indexer(key, kind="loc")
+
+    def _has_valid_setitem_indexer(self, indexer) -> bool:
+        return True
 
     def _is_scalar_access(self, key: Tuple) -> bool:
         """
@@ -1753,6 +1468,61 @@ def _is_scalar_access(self, key: Tuple) -> bool:
 
         return True
 
+    # -------------------------------------------------------------------
+    # MultiIndex Handling
+
+    def _multi_take_opportunity(self, tup: Tuple) -> bool:
+        """
+        Check whether there is the possibility to use ``_multi_take``.
+
+        Currently the limit is that all axes being indexed must be indexed with
+        list-likes.
+
+        Parameters
+        ----------
+        tup : tuple
+            Tuple of indexers, one per axis.
+
+        Returns
+        -------
+        bool
+            Whether the current indexing
+            can be passed through `_multi_take`.
+        """
+        if not all(is_list_like_indexer(x) for x in tup):
+            return False
+
+        # just too complicated
+        if any(com.is_bool_indexer(x) for x in tup):
+            return False
+
+        return True
+
+    def _multi_take(self, tup: Tuple):
+        """
+        Create the indexers for the passed tuple of keys, and
+        execute the take operation. This allows the take operation to be
+        executed all at once, rather than once for each dimension,
+        improving efficiency.
+
+        Parameters
+        ----------
+        tup : tuple
+            Tuple of indexers, one per axis.
+
+        Returns
+        -------
+        values: same type as the object being indexed
+        """
+        # GH 836
+        d = {
+            axis: self._get_listlike_indexer(key, axis)
+            for (key, axis) in zip(tup, self.obj._AXIS_ORDERS)
+        }
+        return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True)
+
+    # -------------------------------------------------------------------
+
     def _get_partial_string_timestamp_match_key(self, key, labels):
         """
         Translate any partial string timestamp matches in key, returning the
@@ -1785,6 +1555,60 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
 
         return key
 
+    def _getitem_iterable(self, key, axis: int):
+        """
+        Index current object with an iterable collection of keys.
+
+        Parameters
+        ----------
+        key : iterable
+            Targeted labels.
+        axis: int
+            Dimension on which the indexing is being made.
+
+        Raises
+        ------
+        KeyError
+            If no key was found. Will change in the future to raise if not all
+            keys were found.
+
+        Returns
+        -------
+        scalar, DataFrame, or Series: indexed value(s).
+        """
+        # we assume that not com.is_bool_indexer(key), as that is
+        # handled before we get here.
+        self._validate_key(key, axis)
+
+        # A collection of keys
+        keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
+        return self.obj._reindex_with_indexers(
+            {axis: [keyarr, indexer]}, copy=True, allow_dups=True
+        )
+
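When every axis is indexed with a list-like, `_LocIndexer` can take the `_multi_take` path above and make a single `_reindex_with_indexers` call instead of one take per axis. A small sketch with made-up data, assuming this changeset:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["a", "b", "c"])

    # Both keys are list-likes and neither is a boolean mask, so
    # _multi_take_opportunity is True and one reindex handles both axes.
    sub = df.loc[["a", "c"], ["y"]]
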
+    def _getitem_tuple(self, tup: Tuple):
+        try:
+            return self._getitem_lowerdim(tup)
+        except IndexingError:
+            pass
+
+        # no multi-index, so validate all of the indexers
+        self._has_valid_tuple(tup)
+
+        # ugly hack for GH #836
+        if self._multi_take_opportunity(tup):
+            return self._multi_take(tup)
+
+        # no shortcut needed
+        retval = self.obj
+        for i, key in enumerate(tup):
+            if com.is_null_slice(key):
+                continue
+
+            retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
+
+        return retval
+
     def _getitem_axis(self, key, axis: int):
         key = item_from_zerodim(key)
         if is_iterator(key):
@@ -1865,12 +1689,139 @@ def _get_slice_axis(self, slice_obj: slice, axis: int):
         )
 
         if isinstance(indexer, slice):
-            return self._slice(indexer, axis=axis, kind="iloc")
+            return self.obj._slice(indexer, axis=axis, kind="iloc")
         else:
             # DatetimeIndex overrides Index.slice_indexer and may
             # return a DatetimeIndex instead of a slice object.
             return self.obj.take(indexer, axis=axis)
 
+    def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
+        """
+        Convert indexing key into something we can use to do actual fancy
+        indexing on an ndarray.
+
+        Examples
+        ix[:5] -> slice(0, 5)
+        ix[[1,2,3]] -> [1,2,3]
+        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)
+
+        Going by Zen of Python?
+        'In the face of ambiguity, refuse the temptation to guess.'
+        raise AmbiguousIndexError with integer labels?
+ - No, prefer label-based indexing + """ + labels = self.obj._get_axis(axis) + + if isinstance(key, slice): + return labels._convert_slice_indexer(key, kind="loc") + + if is_scalar(key): + # try to find out correct indexer, if not type correct raise + try: + key = labels._convert_scalar_indexer(key, kind="loc") + except TypeError: + # but we will allow setting + if not is_setter: + raise + + # see if we are positional in nature + is_int_index = labels.is_integer() + is_int_positional = is_integer(key) and not is_int_index + + if is_scalar(key) or isinstance(labels, ABCMultiIndex): + # Otherwise get_loc will raise InvalidIndexError + + # if we are a label return me + try: + return labels.get_loc(key) + except LookupError: + if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex): + if len(key) == labels.nlevels: + return {"key": key} + raise + except TypeError: + pass + except ValueError: + if not is_int_positional: + raise + + # a positional + if is_int_positional: + + # if we are setting and its not a valid location + # its an insert which fails by definition + + # always valid + return {"key": key} + + if is_nested_tuple(key, labels): + return labels.get_locs(key) + + elif is_list_like_indexer(key): + + if com.is_bool_indexer(key): + key = check_bool_indexer(labels, key) + (inds,) = key.nonzero() + return inds + else: + # When setting, missing keys are not allowed, even with .loc: + return self._get_listlike_indexer(key, axis, raise_missing=True)[1] + else: + try: + return labels.get_loc(key) + except LookupError: + # allow a not found key only if we are a setter + if not is_list_like_indexer(key): + return {"key": key} + raise + + def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): + """ + Transform a list-like of keys into a new index and an indexer. + + Parameters + ---------- + key : list-like + Targeted labels. + axis: int + Dimension on which the indexing is being made. + raise_missing: bool, default False + Whether to raise a KeyError if some labels were not found. + Will be removed in the future, and then this method will always behave as + if ``raise_missing=True``. + + Raises + ------ + KeyError + If at least one key was requested but none was found, and + raise_missing=True. + + Returns + ------- + keyarr: Index + New index (coinciding with 'key' if the axis is unique). + values : array-like + Indexer for the return object, -1 denotes keys not found. 
+ """ + ax = self.obj._get_axis(axis) + + # Have the index compute an indexer or return None + # if it cannot handle: + indexer, keyarr = ax._convert_listlike_indexer(key) + # We only act on all found values: + if indexer is not None and (indexer != -1).all(): + self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing) + return ax[indexer], indexer + + if ax.is_unique and not getattr(ax, "is_overlapping", False): + indexer = ax.get_indexer_for(key) + keyarr = ax.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) + + self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing) + return keyarr, indexer + @Appender(IndexingMixin.iloc.__doc__) class _iLocIndexer(_LocationIndexer): @@ -1880,6 +1831,9 @@ class _iLocIndexer(_LocationIndexer): ) _takeable = True + # ------------------------------------------------------------------- + # Key Checks + def _validate_key(self, key, axis: int): if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): @@ -1920,6 +1874,37 @@ def _validate_key(self, key, axis: int): def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) + def _has_valid_positional_setitem_indexer(self, indexer) -> bool: + """ + Validate that a positional indexer cannot enlarge its target + will raise if needed, does not modify the indexer externally. + + Returns + ------- + bool + """ + if isinstance(indexer, dict): + raise IndexError(f"{self.name} cannot enlarge its target object") + else: + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? + pass + elif is_list_like_indexer(i): + # should check the elements? + pass + elif is_integer(i): + if i >= len(ax): + raise IndexError( + f"{self.name} cannot enlarge its target object" + ) + elif isinstance(i, dict): + raise IndexError(f"{self.name} cannot enlarge its target object") + + return True + def _is_scalar_access(self, key: Tuple) -> bool: """ Returns @@ -1963,6 +1948,8 @@ def _validate_integer(self, key: int, axis: int) -> None: if key >= len_axis or key < -len_axis: raise IndexError("single positional indexer is out-of-bounds") + # ------------------------------------------------------------------- + def _getitem_tuple(self, tup: Tuple): self._has_valid_tuple(tup) @@ -2038,7 +2025,7 @@ def _getitem_axis(self, key, axis: int): # validate the location self._validate_integer(key, axis) - return self._get_loc(key, axis=axis) + return self.obj._ixs(key, axis=axis) def _get_slice_axis(self, slice_obj: slice, axis: int): # caller is responsible for ensuring non-None axis @@ -2047,25 +2034,26 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): if not need_slice(slice_obj): return obj.copy(deep=False) - indexer = self._convert_slice_indexer(slice_obj, axis) - return self._slice(indexer, axis=axis, kind="iloc") + labels = obj._get_axis(axis) + indexer = labels._convert_slice_indexer(slice_obj, kind="iloc") + return self.obj._slice(indexer, axis=axis, kind="iloc") - def _convert_to_indexer(self, key, axis: int): + def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): """ Much simpler as we only have to deal with our valid types. 
""" + labels = self.obj._get_axis(axis) + # make need to convert a float key if isinstance(key, slice): - return self._convert_slice_indexer(key, axis) + return labels._convert_slice_indexer(key, kind="iloc") elif is_float(key): - return self._convert_scalar_indexer(key, axis) - - try: - self._validate_key(key, axis) + labels._validate_indexer("positional", key, "iloc") return key - except ValueError: - raise ValueError(f"Can only index by location with a [{self._valid_types}]") + + self._validate_key(key, axis) + return key class _ScalarAccessIndexer(_NDFrameIndexerBase): @@ -2116,21 +2104,11 @@ def _convert_key(self, key, is_setter: bool = False): if is_setter: return list(key) - for ax, i in zip(self.obj.axes, key): - if ax.is_integer(): - if not is_integer(i): - raise ValueError( - "At based indexing on an integer index " - "can only have integer indexers" - ) - else: - if is_integer(i) and not (ax.holds_integer() or ax.is_floating()): - raise ValueError( - "At based indexing on an non-integer " - "index can only have non-integer " - "indexers" - ) - return key + lkey = list(key) + for n, (ax, i) in enumerate(zip(self.obj.axes, key)): + lkey[n] = ax._convert_scalar_indexer(i, kind="loc") + + return tuple(lkey) @Appender(IndexingMixin.iat.__doc__) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9e31ccebd0f1b..85a26179276f5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,8 +7,7 @@ import numpy as np -from pandas._libs import NaT, algos as libalgos, lib, tslib, writers -from pandas._libs.index import convert_scalar +from pandas._libs import NaT, Timestamp, algos as libalgos, lib, tslib, writers import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -16,6 +15,7 @@ from pandas.core.dtypes.cast import ( astype_nansafe, + convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, infer_dtype_from_scalar, @@ -762,7 +762,7 @@ def replace( # The only non-DatetimeLike class that also has a non-trivial # try_coerce_args is ObjectBlock, but that overrides replace, # so does not get here. 
- to_replace = convert_scalar(values, to_replace) + to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype) mask = missing.mask_missing(values, to_replace) if filter is not None: @@ -841,7 +841,7 @@ def setitem(self, indexer, value): # We only get here for non-Extension Blocks, so _try_coerce_args # is only relevant for DatetimeBlock and TimedeltaBlock if lib.is_scalar(value): - value = convert_scalar(values, value) + value = convert_scalar_for_putitemlike(value, values.dtype) else: # current dtype cannot store value, coerce to common dtype @@ -957,7 +957,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) # We only get here for non-Extension Blocks, so _try_coerce_args # is only relevant for DatetimeBlock and TimedeltaBlock if lib.is_scalar(new): - new = convert_scalar(new_values, new) + new = convert_scalar_for_putitemlike(new, new_values.dtype) if transpose: new_values = new_values.T @@ -1200,7 +1200,7 @@ def _interpolate_with_fill( values = self.values if inplace else self.values.copy() # We only get here for non-ExtensionBlock - fill_value = convert_scalar(self.values, fill_value) + fill_value = convert_scalar_for_putitemlike(fill_value, self.values.dtype) values = missing.interpolate_2d( values, @@ -1405,7 +1405,7 @@ def where_func(cond, values, other): raise TypeError if lib.is_scalar(other) and isinstance(values, np.ndarray): # convert datetime to datetime64, timedelta to timedelta64 - other = convert_scalar(values, other) + other = convert_scalar_for_putitemlike(other, values.dtype) # By the time we get here, we should have all Series/Index # args extracted to ndarray @@ -2158,6 +2158,16 @@ def internal_values(self): # Override to return DatetimeArray and TimedeltaArray return self.array_values() + def iget(self, key): + # GH#31649 we need to wrap scalars in Timestamp/Timedelta + # TODO: this can be removed if we ever have 2D EA + result = super().iget(key) + if isinstance(result, np.datetime64): + result = Timestamp(result) + elif isinstance(result, np.timedelta64): + result = Timedelta(result) + return result + class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 526863d2e5ec3..08ae0b02169d4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1565,7 +1565,7 @@ def fast_xs(self, loc): fast path for getting a cross-section return a view of the data """ - return self._block.values[loc] + raise NotImplementedError("Use series._values[loc] instead") def concat(self, to_concat, new_axis) -> "SingleBlockManager": """ diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 8829c242b1129..d9f21f0b274ac 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -352,8 +352,8 @@ def __init__( for obj in objs: if not isinstance(obj, (Series, DataFrame)): msg = ( - "cannot concatenate object of type '{typ}'; " - "only Series and DataFrame objs are valid".format(typ=type(obj)) + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" ) raise TypeError(msg) @@ -403,8 +403,7 @@ def __init__( self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: raise AssertionError( - "axis must be between 0 and {ndim}, input was " - "{axis}".format(ndim=sample.ndim, axis=axis) + f"axis must be between 0 and {sample.ndim}, input was {axis}" ) # if we have mixed ndims, then convert to highest ndim @@ -622,11 
+621,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
                 try:
                     i = level.get_loc(key)
                 except KeyError:
-                    raise ValueError(
-                        "Key {key!s} not in level {level!s}".format(
-                            key=key, level=level
-                        )
-                    )
+                    raise ValueError(f"Key {key} not in level {level}")
 
                 to_concat.append(np.repeat(i, len(index)))
             codes_list.append(np.concatenate(to_concat))
@@ -677,11 +672,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
             mask = mapped == -1
             if mask.any():
-                raise ValueError(
-                    "Values not found in passed level: {hlevel!s}".format(
-                        hlevel=hlevel[mask]
-                    )
-                )
+                raise ValueError(f"Values not found in passed level: {hlevel[mask]!s}")
 
             new_codes.append(np.repeat(mapped, n))
 
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index d04287e1e9088..782b8043430e1 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -88,9 +88,7 @@ def melt(
         if len(frame.columns.names) == len(set(frame.columns.names)):
             var_name = frame.columns.names
         else:
-            var_name = [
-                "variable_{i}".format(i=i) for i in range(len(frame.columns.names))
-            ]
+            var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
     else:
         var_name = [
             frame.columns.name if frame.columns.name is not None else "variable"
@@ -417,9 +415,7 @@ def wide_to_long(
     """
 
     def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]:
-        regex = r"^{stub}{sep}{suffix}$".format(
-            stub=re.escape(stub), sep=re.escape(sep), suffix=suffix
-        )
+        regex = fr"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
         pattern = re.compile(regex)
         return [col for col in df.columns if pattern.match(col)]
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index ceee2f66dba42..480c5279ad3f6 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -611,8 +611,9 @@ def __init__(
         if _left.columns.nlevels != _right.columns.nlevels:
             msg = (
                 "merging between different levels can give an unintended "
-                "result ({left} levels on the left, {right} on the right)"
-            ).format(left=_left.columns.nlevels, right=_right.columns.nlevels)
+                f"result ({_left.columns.nlevels} levels on the left, "
+                f"{_right.columns.nlevels} on the right)"
+            )
             warnings.warn(msg, UserWarning)
 
         self._validate_specification()
@@ -679,7 +680,7 @@ def _indicator_pre_merge(
             if i in columns:
                 raise ValueError(
                     "Cannot use `indicator=True` option when "
-                    "data contains a column named {name}".format(name=i)
+                    f"data contains a column named {i}"
                 )
         if self.indicator_name in columns:
             raise ValueError(
@@ -831,7 +832,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                 else:
                     result.index = Index(key_col, name=name)
             else:
-                result.insert(i, name or "key_{i}".format(i=i), key_col)
+                result.insert(i, name or f"key_{i}", key_col)
 
     def _get_join_indexers(self):
         """ return the join indexers """
@@ -1185,13 +1186,10 @@ def _validate_specification(self):
             if len(common_cols) == 0:
                 raise MergeError(
                    "No common columns to perform merge on. 
" - "Merge options: left_on={lon}, right_on={ron}, " - "left_index={lidx}, right_index={ridx}".format( - lon=self.left_on, - ron=self.right_on, - lidx=self.left_index, - ridx=self.right_index, - ) + f"Merge options: left_on={self.left_on}, " + f"right_on={self.right_on}, " + f"left_index={self.left_index}, " + f"right_index={self.right_index}" ) if not common_cols.is_unique: raise MergeError(f"Data columns not unique: {repr(common_cols)}") @@ -1486,12 +1484,12 @@ def get_result(self): def _asof_function(direction: str): - name = "asof_join_{dir}".format(dir=direction) + name = f"asof_join_{direction}" return getattr(libjoin, name, None) def _asof_by_function(direction: str): - name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) + name = f"asof_join_{direction}_on_X_by_Y" return getattr(libjoin, name, None) @@ -1601,9 +1599,7 @@ def _validate_specification(self): # check 'direction' is valid if self.direction not in ["backward", "forward", "nearest"]: - raise MergeError( - "direction invalid: {direction}".format(direction=self.direction) - ) + raise MergeError(f"direction invalid: {self.direction}") @property def _asof_key(self): @@ -1628,17 +1624,13 @@ def _get_merge_keys(self): # later with a ValueError, so we don't *need* to check # for them here. msg = ( - "incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, both sides category, but not equal ones".format( - i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) - ) + f"incompatible merge keys [{i}] {repr(lk.dtype)} and " + f"{repr(rk.dtype)}, both sides category, but not equal ones" ) else: msg = ( - "incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, must be the same type".format( - i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) - ) + f"incompatible merge keys [{i}] {repr(lk.dtype)} and " + f"{repr(rk.dtype)}, must be the same type" ) raise MergeError(msg) @@ -1651,10 +1643,8 @@ def _get_merge_keys(self): lt = left_join_keys[-1] msg = ( - "incompatible tolerance {tolerance}, must be compat " - "with type {lkdtype}".format( - tolerance=type(self.tolerance), lkdtype=repr(lt.dtype) - ) + f"incompatible tolerance {self.tolerance}, must be compat " + f"with type {repr(lk.dtype)}" ) if needs_i8_conversion(lt): @@ -1680,8 +1670,11 @@ def _get_merge_keys(self): # validate allow_exact_matches if not is_bool(self.allow_exact_matches): - msg = "allow_exact_matches must be boolean, passed {passed}" - raise MergeError(msg.format(passed=self.allow_exact_matches)) + msg = ( + "allow_exact_matches must be boolean, " + f"passed {self.allow_exact_matches}" + ) + raise MergeError(msg) return left_join_keys, right_join_keys, join_names diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index a5a9ec9fb79ba..053fb86836ff8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -200,7 +200,7 @@ def _add_margins( if not isinstance(margins_name, str): raise ValueError("margins_name argument must be a string") - msg = 'Conflicting name "{name}" in margins'.format(name=margins_name) + msg = f'Conflicting name "{margins_name}" in margins' for level in table.index.names: if margins_name in table.index.get_level_values(level): raise ValueError(msg) @@ -650,9 +650,7 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): if (margins_name not in table.iloc[-1, :].name) | ( margins_name != table.iloc[:, -1].name ): - raise ValueError( - "{mname} not in pivoted DataFrame".format(mname=margins_name) - ) + raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = 
table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] @@ -702,7 +700,7 @@ def _get_names(arrs, names, prefix: str = "row"): if isinstance(arr, ABCSeries) and arr.name is not None: names.append(arr.name) else: - names.append("{prefix}_{i}".format(prefix=prefix, i=i)) + names.append(f"{prefix}_{i}") else: if len(names) != len(arrs): raise AssertionError("arrays and names must have the same length") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f00ff0d4ba5ed..359e5b956f8a5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -873,15 +873,13 @@ def get_dummies( # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): - len_msg = ( - "Length of '{name}' ({len_item}) did not match the " - "length of the columns being encoded ({len_enc})." - ) if is_list_like(item): if not len(item) == data_to_encode.shape[1]: - len_msg = len_msg.format( - name=name, len_item=len(item), len_enc=data_to_encode.shape[1] + len_msg = ( + f"Length of '{name}' ({len(item)}) did not match the " + "length of the columns being encoded " + f"({data_to_encode.shape[1]})." ) raise ValueError(len_msg) @@ -990,8 +988,7 @@ def get_empty_frame(data) -> DataFrame: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level) -> str: - fstr = "{prefix}{prefix_sep}{level}" - return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) + return f"{prefix}{prefix_sep}{level}" dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 00a7645d0c7a5..a18b45a077be0 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -202,17 +202,10 @@ def cut( """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 - # for handling the cut for datetime and timedelta objects original = x x = _preprocess_for_cut(x) x, dtype = _coerce_to_type(x) - # To support cut(IntegerArray), we convert to object dtype with NaN - # Will properly support in the future. - # https://github.com/pandas-dev/pandas/pull/31290 - if is_extension_array_dtype(x.dtype) and is_integer_dtype(x.dtype): - x = x.to_numpy(dtype=object, na_value=np.nan) - if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") @@ -434,7 +427,7 @@ def _bins_to_cuts( def _coerce_to_type(x): """ - if the passed data is of datetime/timedelta or bool type, + if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can handle it """ @@ -451,6 +444,12 @@ def _coerce_to_type(x): elif is_bool_dtype(x): # GH 20303 x = x.astype(np.int64) + # To support cut and qcut for IntegerArray we convert to float dtype. + # Will properly support in the future. 
+ # https://github.com/pandas-dev/pandas/pull/31290 + # https://github.com/pandas-dev/pandas/issues/31389 + elif is_extension_array_dtype(x) and is_integer_dtype(x): + x = x.to_numpy(dtype=np.float64, na_value=np.nan) if dtype is not None: # GH 19768: force NaT to NaN during integer conversion diff --git a/pandas/core/series.py b/pandas/core/series.py index e5cea8ebfc914..0786674daf874 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -22,13 +22,13 @@ from pandas._config import get_option -from pandas._libs import index as libindex, lib, properties, reshape, tslibs +from pandas._libs import lib, properties, reshape, tslibs from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile -from pandas.core.dtypes.cast import convert_dtypes +from pandas.core.dtypes.cast import convert_dtypes, validate_numeric_casting from pandas.core.dtypes.common import ( _is_unorderable_exception, ensure_platform_int, @@ -838,16 +838,11 @@ def _ixs(self, i: int, axis: int = 0): ------- scalar (int) or Series (slice, sequence) """ + return self._values[i] - # dispatch to the values if we need - values = self._values - if isinstance(values, np.ndarray): - return libindex.get_value_at(values, i) - else: - return values[i] - - def _slice(self, slobj: slice, axis: int = 0, kind=None) -> "Series": - slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") + def _slice(self, slobj: slice, axis: int = 0, kind: str = "getitem") -> "Series": + assert kind in ["getitem", "iloc"] + slobj = self.index._convert_slice_indexer(slobj, kind=kind) return self._get_values(slobj) def __getitem__(self, key): @@ -856,31 +851,33 @@ def __getitem__(self, key): if key is Ellipsis: return self - try: - result = self.index.get_value(self, key) + key_is_scalar = is_scalar(key) + if key_is_scalar: + key = self.index._convert_scalar_indexer(key, kind="getitem") - return result - except InvalidIndexError: - pass - except (KeyError, ValueError): - if isinstance(key, tuple) and isinstance(self.index, MultiIndex): - # kludge - pass - elif com.is_bool_indexer(key): - pass - else: + if key_is_scalar or isinstance(self.index, MultiIndex): + # Otherwise index.get_value will raise InvalidIndexError + try: + result = self.index.get_value(self, key) - # we can try to coerce the indexer (or this will raise) - new_key = self.index._convert_scalar_indexer(key, kind="getitem") - if type(new_key) != type(key): - return self.__getitem__(new_key) - raise + return result + except InvalidIndexError: + pass + except (KeyError, ValueError): + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # kludge + pass + else: + raise - if is_iterator(key): - key = list(key) + if not key_is_scalar: + # avoid expensive checks if we know we have a scalar + if is_iterator(key): + key = list(key) - if com.is_bool_indexer(key): - key = check_bool_indexer(self.index, key) + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + return self._get_values(key) return self._get_with(key) @@ -913,6 +910,8 @@ def _get_with(self, key): else: key_type = lib.infer_dtype(key, skipna=False) + # Note: The key_type == "boolean" case should be caught by the + # com.is_bool_indexer check in __getitem__ if key_type == "integer": if self.index.is_integer() or self.index.is_floating(): return self.loc[key] @@ -921,8 +920,6 @@ def _get_with(self, key): return 
self.iloc[indexer] else: return self._get_values(key) - elif key_type == "boolean": - return self._get_values(key) if isinstance(key, (list, tuple)): # TODO: de-dup with tuple case handled above? @@ -981,7 +978,7 @@ def _get_value(self, label, takeable: bool = False): scalar value """ if takeable: - return com.maybe_box_datetimelike(self._values[label]) + return self._values[label] return self.index.get_value(self, label) def __setitem__(self, key, value): @@ -1026,17 +1023,10 @@ def __setitem__(self, key, value): self._maybe_update_cacher() def _set_with_engine(self, key, value): - values = self._values - if is_extension_array_dtype(values.dtype): - # The cython indexing engine does not support ExtensionArrays. - values[self.index.get_loc(key)] = value - return - try: - self.index._engine.set_value(values, key, value) - return - except KeyError: - values[self.index.get_loc(key)] = value - return + # fails with AttributeError for IntervalIndex + loc = self.index._engine.get_loc(key) + validate_numeric_casting(self.dtype, value) + self._values[loc] = value def _set_with(self, key, value): # other: fancy integer or otherwise @@ -1116,11 +1106,10 @@ def _set_value(self, label, value, takeable: bool = False): try: if takeable: self._values[label] = value - elif isinstance(self._values, np.ndarray): - # i.e. not EA, so we can use _engine - self.index._engine.set_value(self._values, label, value) else: - self.loc[label] = value + loc = self.index.get_loc(label) + validate_numeric_casting(self.dtype, value) + self._values[loc] = value except KeyError: # set using a non-recursive method diff --git a/pandas/io/common.py b/pandas/io/common.py index 00f2961e41617..c4772895afd1e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -3,11 +3,23 @@ import bz2 from collections import abc import gzip -from io import BufferedIOBase, BytesIO +from io import BufferedIOBase, BytesIO, RawIOBase import mmap import os import pathlib -from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union +from typing import ( + IO, + TYPE_CHECKING, + Any, + AnyStr, + Dict, + List, + Mapping, + Optional, + Tuple, + Type, + Union, +) from urllib.parse import ( # noqa urlencode, urljoin, @@ -37,6 +49,10 @@ _VALID_URLS.discard("") +if TYPE_CHECKING: + from io import IOBase # noqa: F401 + + def is_url(url) -> bool: """ Check to see if a URL has a valid protocol. @@ -356,12 +372,13 @@ def get_handle( handles : list of file-like objects A list of file-like object that were opened in this function. """ + need_text_wrapping: Tuple[Type["IOBase"], ...] 
try: from s3fs import S3File - need_text_wrapping = (BufferedIOBase, S3File) + need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) except ImportError: - need_text_wrapping = BufferedIOBase # type: ignore + need_text_wrapping = (BufferedIOBase, RawIOBase) handles: List[IO] = list() f = path_or_buf @@ -437,7 +454,7 @@ def get_handle( from io import TextIOWrapper g = TextIOWrapper(f, encoding=encoding, newline="") - if not isinstance(f, BufferedIOBase): + if not isinstance(f, (BufferedIOBase, RawIOBase)): handles.append(g) f = g diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 14e79538541af..28a069bc9fc1b 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -403,7 +403,7 @@ def __init__( # Deprecated in GH#17295, enforced in 1.0.0 raise KeyError("Not all names specified in 'columns' are found") - self.df = df + self.df = df.reindex(columns=cols) self.columns = self.df.columns self.float_format = float_format diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 204807b55c877..04fd17a00041b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -19,12 +19,7 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat -from pandas.io.common import ( - get_filepath_or_buffer, - get_handle, - infer_compression, - stringify_path, -) +from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer @@ -56,7 +51,11 @@ def to_json( "'index=False' is only valid when 'orient' is 'split' or 'table'" ) - path_or_buf = stringify_path(path_or_buf) + if path_or_buf is not None: + path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, compression=compression, mode="w" + ) + if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 98f2eb3929b59..926635062d853 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -85,7 +85,6 @@ def write( df: DataFrame, path, compression="snappy", - coerce_timestamps="ms", index: Optional[bool] = None, partition_cols=None, **kwargs, @@ -103,17 +102,12 @@ def write( table, path, compression=compression, - coerce_timestamps=coerce_timestamps, partition_cols=partition_cols, **kwargs, ) else: self.api.parquet.write_table( - table, - path, - compression=compression, - coerce_timestamps=coerce_timestamps, - **kwargs, + table, path, compression=compression, **kwargs, ) def read(self, path, columns=None, **kwargs): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a33d81ff437bf..a7d8c374a9aae 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,8 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, StringIO, TextIOWrapper +from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from itertools import chain import re import sys from textwrap import fill @@ -1399,17 +1400,21 @@ def __init__(self, kwds): "index_col must only contain row numbers " "when specifying a multi-index header" ) - - # GH 16338 - elif self.header is not None and not is_integer(self.header): - raise ValueError("header must be integer or list of integers") - - # GH 27779 - elif self.header is not None and self.header < 0: - raise ValueError( - "Passing 
negative integer to header is invalid. " - "For no header, use header=None instead" - ) + elif self.header is not None: + # GH 27394 + if self.prefix is not None: + raise ValueError( + "Argument prefix must be None if argument header is not None" + ) + # GH 16338 + elif not is_integer(self.header): + raise ValueError("header must be integer or list of integers") + # GH 27779 + elif self.header < 0: + raise ValueError( + "Passing negative integer to header is invalid. " + "For no header, use header=None instead" + ) self._name_processed = False @@ -1419,6 +1424,26 @@ def __init__(self, kwds): # keep references to file handles opened by the parser itself self.handles = [] + def _confirm_parse_dates_presence(self, columns): + """ + if user has provided names for parse_dates, check if those columns + are available. + """ + if isinstance(self.parse_dates, list): + cols_needed = self.parse_dates + elif isinstance(self.parse_dates, dict): + cols_needed = chain(*self.parse_dates.values()) + else: + cols_needed = [] + + missing_cols = ", ".join( + [col for col in cols_needed if isinstance(col, str) and col not in columns] + ) + if missing_cols: + raise ValueError( + f"Missing column provided to 'parse_dates': '{missing_cols}'" + ) + def close(self): for f in self.handles: f.close() @@ -1868,7 +1893,7 @@ def __init__(self, src, **kwds): # Handle the file object with universal line mode enabled. # We will handle the newline character ourselves later on. - if isinstance(src, BufferedIOBase): + if isinstance(src, (BufferedIOBase, RawIOBase)): src = TextIOWrapper(src, encoding=encoding, newline="") kwds["encoding"] = "utf-8" @@ -1938,6 +1963,7 @@ def __init__(self, src, **kwds): if len(self.names) < len(usecols): _validate_usecols_names(usecols, self.names) + self._confirm_parse_dates_presence(self.names) self._set_noconvert_columns() self.orig_names = self.names @@ -2308,6 +2334,7 @@ def __init__(self, f, **kwds): if self.index_names is None: self.index_names = index_names + self._confirm_parse_dates_presence(self.columns) if self.parse_dates: self._no_thousands_columns = self._set_no_thousands_columns() else: @@ -3278,6 +3305,10 @@ def _isindex(colspec): if is_scalar(colspec): if isinstance(colspec, int) and colspec not in data_dict: colspec = orig_names[colspec] + elif colspec not in orig_names: + raise ValueError( + f"Missing column provided to 'parse_dates': '{colspec}'" + ) if _isindex(colspec): continue data_dict[colspec] = converter(data_dict[colspec]) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index dd048114142f3..3abce690cbe6b 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -251,7 +251,7 @@ def _maybe_convert_index(ax, data): freq = frequencies.get_period_alias(freq) if isinstance(data.index, ABCDatetimeIndex): - data = data.to_period(freq=freq) + data = data.tz_localize(None).to_period(freq=freq) elif isinstance(data.index, ABCPeriodIndex): data.index = data.index.asfreq(freq=freq) return data diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index cfba3da354d44..70e1421c8dcf4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -408,6 +408,11 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") + def test_constructor_np_strs(self): + # GH#31499 
Hashtable.map_locations needs to work on np.str_ objects + cat = pd.Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) + assert all(isinstance(x, np.str_) for x in cat.categories) + def test_constructor_from_categorical_with_dtype(self): dtype = CategoricalDtype(["a", "b", "c"], ordered=True) values = Categorical(["a", "b", "d"]) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 35eda4a0ec5bc..7e7762d8973a0 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -81,6 +81,24 @@ def test_where_raises(self, other): with pytest.raises(ValueError, match=match): ser.where([True, False, True], other=other) + def test_shift(self): + # https://github.com/pandas-dev/pandas/issues/31495 + a = IntervalArray.from_breaks([1, 2, 3]) + result = a.shift() + # int -> float + expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)]) + tm.assert_interval_array_equal(result, expected) + + def test_shift_datetime(self): + a = IntervalArray.from_breaks(pd.date_range("2000", periods=4)) + result = a.shift(2) + expected = a.take([-1, -1, 0], allow_fill=True) + tm.assert_interval_array_equal(result, expected) + + result = a.shift(-1) + expected = a.take([1, 2, -1], allow_fill=True) + tm.assert_interval_array_equal(result, expected) + class TestSetitem: def test_set_na(self, left_right_dtypes): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index cc81ae4504dd8..7a0c9300a43a2 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1061,19 +1061,6 @@ def test_value_counts_na(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("bins", [3, [0, 5, 15]]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -def test_cut(bins, right, include_lowest): - a = np.random.randint(0, 10, size=50).astype(object) - a[::2] = np.nan - result = pd.cut( - pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest - ) - expected = pd.cut(a, bins, right=right, include_lowest=include_lowest) - tm.assert_categorical_equal(result, expected) - - def test_array_setitem_nullable_boolean_mask(): # GH 31446 ser = pd.Series([1, 2], dtype="Int64") diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 097e83d93ee71..4c917b9bb42d2 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -675,6 +675,8 @@ def test__get_dtype(input_param, result): ) def test__get_dtype_fails(input_param, expected_error_message): # python objects + # 2020-02-02 npdev changed error message + expected_error_message += f"|Cannot interpret '{input_param}' as a data type" with pytest.raises(TypeError, match=expected_error_message): com._get_dtype(input_param) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a599a086ae92b..dd99b81fb6764 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -32,66 +32,60 @@ class Base: - def setup_method(self, method): - self.dtype = self.create() - - def test_hash(self): - hash(self.dtype) - - def test_equality_invalid(self): - assert not self.dtype == "foo" - assert not is_dtype_equal(self.dtype, np.int64) - - def test_numpy_informed(self): - with pytest.raises(TypeError, match="data type not understood"): - np.dtype(self.dtype) + def test_hash(self, dtype): + hash(dtype) + + 
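# Reviewer note: this hunk replaces Base.setup_method/self.dtype state with a
# pytest fixture that is injected into each test. A minimal, self-contained
# sketch of the pattern; the class and names here are illustrative, not part
# of the patch itself:
import pytest

from pandas import CategoricalDtype


class TestFixtureSketch:
    @pytest.fixture
    def dtype(self):
        # built fresh for every test, replacing the old self.dtype attribute
        return CategoricalDtype()

    def test_hash(self, dtype):
        hash(dtype)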
def test_equality_invalid(self, dtype): + assert not dtype == "foo" + assert not is_dtype_equal(dtype, np.int64) + + def test_numpy_informed(self, dtype): + # npdev 2020-02-02 changed from "data type not understood" to + # "Cannot interpret 'foo' as a data type" + msg = "|".join( + ["data type not understood", "Cannot interpret '.*' as a data type"] + ) + with pytest.raises(TypeError, match=msg): + np.dtype(dtype) - assert not self.dtype == np.str_ - assert not np.str_ == self.dtype + assert not dtype == np.str_ + assert not np.str_ == dtype - def test_pickle(self): + def test_pickle(self, dtype): # make sure our cache is NOT pickled # clear the cache - type(self.dtype).reset_cache() - assert not len(self.dtype._cache) + type(dtype).reset_cache() + assert not len(dtype._cache) # force back to the cache - result = tm.round_trip_pickle(self.dtype) - assert not len(self.dtype._cache) - assert result == self.dtype + result = tm.round_trip_pickle(dtype) + assert not len(dtype._cache) + assert result == dtype class TestCategoricalDtype(Base): - def create(self): + @pytest.fixture + def dtype(self): + """ + Class level fixture of dtype for TestCategoricalDtype + """ return CategoricalDtype() - def test_pickle(self): - # make sure our cache is NOT pickled - - # clear the cache - type(self.dtype).reset_cache() - assert not len(self.dtype._cache) - - # force back to the cache - result = tm.round_trip_pickle(self.dtype) - assert result == self.dtype - - def test_hash_vs_equality(self): - dtype = self.dtype + def test_hash_vs_equality(self, dtype): dtype2 = CategoricalDtype() assert dtype == dtype2 assert dtype2 == dtype assert hash(dtype) == hash(dtype2) - def test_equality(self): - assert is_dtype_equal(self.dtype, "category") - assert is_dtype_equal(self.dtype, CategoricalDtype()) - assert not is_dtype_equal(self.dtype, "foo") + def test_equality(self, dtype): + assert is_dtype_equal(dtype, "category") + assert is_dtype_equal(dtype, CategoricalDtype()) + assert not is_dtype_equal(dtype, "foo") - def test_construction_from_string(self): + def test_construction_from_string(self, dtype): result = CategoricalDtype.construct_from_string("category") - assert is_dtype_equal(self.dtype, result) + assert is_dtype_equal(dtype, result) msg = "Cannot construct a 'CategoricalDtype' from 'foo'" with pytest.raises(TypeError, match=msg): CategoricalDtype.construct_from_string("foo") @@ -133,16 +127,16 @@ def test_from_values_or_dtype_raises(self, values, categories, ordered, dtype): with pytest.raises(ValueError, match=msg): CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype) - def test_is_dtype(self): - assert CategoricalDtype.is_dtype(self.dtype) + def test_is_dtype(self, dtype): + assert CategoricalDtype.is_dtype(dtype) assert CategoricalDtype.is_dtype("category") assert CategoricalDtype.is_dtype(CategoricalDtype()) assert not CategoricalDtype.is_dtype("foo") assert not CategoricalDtype.is_dtype(np.float64) - def test_basic(self): + def test_basic(self, dtype): - assert is_categorical_dtype(self.dtype) + assert is_categorical_dtype(dtype) factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]) @@ -180,7 +174,11 @@ def test_is_boolean(self, categories, expected): class TestDatetimeTZDtype(Base): - def create(self): + @pytest.fixture + def dtype(self): + """ + Class level fixture of dtype for TestDatetimeTZDtype + """ return DatetimeTZDtype("ns", "US/Eastern") def test_alias_to_unit_raises(self): @@ -196,9 +194,8 @@ def test_alias_to_unit_bad_alias_raises(self): with 
pytest.raises(TypeError, match=""): DatetimeTZDtype("datetime64[ns, US/NotATZ]") - def test_hash_vs_equality(self): + def test_hash_vs_equality(self, dtype): # make sure that we satisfy is semantics - dtype = self.dtype dtype2 = DatetimeTZDtype("ns", "US/Eastern") dtype3 = DatetimeTZDtype(dtype2) assert dtype == dtype2 @@ -223,54 +220,54 @@ def test_subclass(self): assert issubclass(type(a), type(a)) assert issubclass(type(a), type(b)) - def test_compat(self): - assert is_datetime64tz_dtype(self.dtype) + def test_compat(self, dtype): + assert is_datetime64tz_dtype(dtype) assert is_datetime64tz_dtype("datetime64[ns, US/Eastern]") - assert is_datetime64_any_dtype(self.dtype) + assert is_datetime64_any_dtype(dtype) assert is_datetime64_any_dtype("datetime64[ns, US/Eastern]") - assert is_datetime64_ns_dtype(self.dtype) + assert is_datetime64_ns_dtype(dtype) assert is_datetime64_ns_dtype("datetime64[ns, US/Eastern]") - assert not is_datetime64_dtype(self.dtype) + assert not is_datetime64_dtype(dtype) assert not is_datetime64_dtype("datetime64[ns, US/Eastern]") - def test_construction_from_string(self): + def test_construction_from_string(self, dtype): result = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]") - assert is_dtype_equal(self.dtype, result) - msg = "Cannot construct a 'DatetimeTZDtype' from 'foo'" - with pytest.raises(TypeError, match=msg): - DatetimeTZDtype.construct_from_string("foo") - - def test_construct_from_string_raises(self): - with pytest.raises(TypeError, match="notatz"): - DatetimeTZDtype.construct_from_string("datetime64[ns, notatz]") + assert is_dtype_equal(dtype, result) - msg = "'construct_from_string' expects a string, got " - with pytest.raises(TypeError, match=re.escape(msg)): - # list instead of string - DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"]) - - msg = "^Cannot construct a 'DatetimeTZDtype'" - with pytest.raises(TypeError, match=msg): + @pytest.mark.parametrize( + "string", + [ + "foo", + "datetime64[ns, notatz]", # non-nano unit - DatetimeTZDtype.construct_from_string("datetime64[ps, UTC]") + "datetime64[ps, UTC]", + # dateutil str that returns None from gettz + "datetime64[ns, dateutil/invalid]", + ], + ) + def test_construct_from_string_invalid_raises(self, string): + msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'" + with pytest.raises(TypeError, match=re.escape(msg)): + DatetimeTZDtype.construct_from_string(string) + def test_construct_from_string_wrong_type_raises(self): + msg = "'construct_from_string' expects a string, got " with pytest.raises(TypeError, match=msg): - # dateutil str that returns None from gettz - DatetimeTZDtype.construct_from_string("datetime64[ns, dateutil/invalid]") + DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"]) - def test_is_dtype(self): + def test_is_dtype(self, dtype): assert not DatetimeTZDtype.is_dtype(None) - assert DatetimeTZDtype.is_dtype(self.dtype) + assert DatetimeTZDtype.is_dtype(dtype) assert DatetimeTZDtype.is_dtype("datetime64[ns, US/Eastern]") assert not DatetimeTZDtype.is_dtype("foo") assert DatetimeTZDtype.is_dtype(DatetimeTZDtype("ns", "US/Pacific")) assert not DatetimeTZDtype.is_dtype(np.float64) - def test_equality(self): - assert is_dtype_equal(self.dtype, "datetime64[ns, US/Eastern]") - assert is_dtype_equal(self.dtype, DatetimeTZDtype("ns", "US/Eastern")) - assert not is_dtype_equal(self.dtype, "foo") - assert not is_dtype_equal(self.dtype, DatetimeTZDtype("ns", "CET")) + def test_equality(self, dtype): + assert is_dtype_equal(dtype, 
"datetime64[ns, US/Eastern]") + assert is_dtype_equal(dtype, DatetimeTZDtype("ns", "US/Eastern")) + assert not is_dtype_equal(dtype, "foo") + assert not is_dtype_equal(dtype, DatetimeTZDtype("ns", "CET")) assert not is_dtype_equal( DatetimeTZDtype("ns", "US/Eastern"), DatetimeTZDtype("ns", "US/Pacific") ) @@ -278,9 +275,9 @@ def test_equality(self): # numpy compat assert is_dtype_equal(np.dtype("M8[ns]"), "datetime64[ns]") - def test_basic(self): + def test_basic(self, dtype): - assert is_datetime64tz_dtype(self.dtype) + assert is_datetime64tz_dtype(dtype) dr = date_range("20130101", periods=3, tz="US/Eastern") s = Series(dr, name="A") @@ -326,12 +323,15 @@ def test_tz_standardize(self): class TestPeriodDtype(Base): - def create(self): + @pytest.fixture + def dtype(self): + """ + Class level fixture of dtype for TestPeriodDtype + """ return PeriodDtype("D") - def test_hash_vs_equality(self): + def test_hash_vs_equality(self, dtype): # make sure that we satisfy is semantics - dtype = self.dtype dtype2 = PeriodDtype("D") dtype3 = PeriodDtype(dtype2) assert dtype == dtype2 @@ -386,17 +386,17 @@ def test_identity(self): assert PeriodDtype("period[1S1U]") == PeriodDtype("period[1000001U]") assert PeriodDtype("period[1S1U]") is PeriodDtype("period[1000001U]") - def test_compat(self): - assert not is_datetime64_ns_dtype(self.dtype) + def test_compat(self, dtype): + assert not is_datetime64_ns_dtype(dtype) assert not is_datetime64_ns_dtype("period[D]") - assert not is_datetime64_dtype(self.dtype) + assert not is_datetime64_dtype(dtype) assert not is_datetime64_dtype("period[D]") - def test_construction_from_string(self): + def test_construction_from_string(self, dtype): result = PeriodDtype("period[D]") - assert is_dtype_equal(self.dtype, result) + assert is_dtype_equal(dtype, result) result = PeriodDtype.construct_from_string("period[D]") - assert is_dtype_equal(self.dtype, result) + assert is_dtype_equal(dtype, result) with pytest.raises(TypeError): PeriodDtype.construct_from_string("foo") with pytest.raises(TypeError): @@ -412,8 +412,8 @@ def test_construction_from_string(self): with pytest.raises(TypeError, match="list"): PeriodDtype.construct_from_string([1, 2, 3]) - def test_is_dtype(self): - assert PeriodDtype.is_dtype(self.dtype) + def test_is_dtype(self, dtype): + assert PeriodDtype.is_dtype(dtype) assert PeriodDtype.is_dtype("period[D]") assert PeriodDtype.is_dtype("period[3D]") assert PeriodDtype.is_dtype(PeriodDtype("3D")) @@ -431,17 +431,17 @@ def test_is_dtype(self): assert not PeriodDtype.is_dtype(np.int64) assert not PeriodDtype.is_dtype(np.float64) - def test_equality(self): - assert is_dtype_equal(self.dtype, "period[D]") - assert is_dtype_equal(self.dtype, PeriodDtype("D")) - assert is_dtype_equal(self.dtype, PeriodDtype("D")) + def test_equality(self, dtype): + assert is_dtype_equal(dtype, "period[D]") + assert is_dtype_equal(dtype, PeriodDtype("D")) + assert is_dtype_equal(dtype, PeriodDtype("D")) assert is_dtype_equal(PeriodDtype("D"), PeriodDtype("D")) - assert not is_dtype_equal(self.dtype, "D") + assert not is_dtype_equal(dtype, "D") assert not is_dtype_equal(PeriodDtype("D"), PeriodDtype("2D")) - def test_basic(self): - assert is_period_dtype(self.dtype) + def test_basic(self, dtype): + assert is_period_dtype(dtype) pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H") @@ -467,12 +467,15 @@ def test_not_string(self): class TestIntervalDtype(Base): - def create(self): + @pytest.fixture + def dtype(self): + """ + Class level fixture of dtype for TestIntervalDtype 
+ """ return IntervalDtype("int64") - def test_hash_vs_equality(self): + def test_hash_vs_equality(self, dtype): # make sure that we satisfy is semantics - dtype = self.dtype dtype2 = IntervalDtype("int64") dtype3 = IntervalDtype(dtype2) assert dtype == dtype2 @@ -539,11 +542,11 @@ def test_construction_errors(self, subtype): with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) - def test_construction_from_string(self): + def test_construction_from_string(self, dtype): result = IntervalDtype("interval[int64]") - assert is_dtype_equal(self.dtype, result) + assert is_dtype_equal(dtype, result) result = IntervalDtype.construct_from_string("interval[int64]") - assert is_dtype_equal(self.dtype, result) + assert is_dtype_equal(dtype, result) @pytest.mark.parametrize("string", [0, 3.14, ("a", "b"), None]) def test_construction_from_string_errors(self, string): @@ -572,8 +575,8 @@ def test_subclass(self): assert issubclass(type(a), type(a)) assert issubclass(type(a), type(b)) - def test_is_dtype(self): - assert IntervalDtype.is_dtype(self.dtype) + def test_is_dtype(self, dtype): + assert IntervalDtype.is_dtype(dtype) assert IntervalDtype.is_dtype("interval") assert IntervalDtype.is_dtype(IntervalDtype("float64")) assert IntervalDtype.is_dtype(IntervalDtype("int64")) @@ -589,12 +592,12 @@ def test_is_dtype(self): assert not IntervalDtype.is_dtype(np.int64) assert not IntervalDtype.is_dtype(np.float64) - def test_equality(self): - assert is_dtype_equal(self.dtype, "interval[int64]") - assert is_dtype_equal(self.dtype, IntervalDtype("int64")) + def test_equality(self, dtype): + assert is_dtype_equal(dtype, "interval[int64]") + assert is_dtype_equal(dtype, IntervalDtype("int64")) assert is_dtype_equal(IntervalDtype("int64"), IntervalDtype("int64")) - assert not is_dtype_equal(self.dtype, "int64") + assert not is_dtype_equal(dtype, "int64") assert not is_dtype_equal(IntervalDtype("int64"), IntervalDtype("float64")) # invalid subtype comparisons do not raise when directly compared @@ -650,8 +653,8 @@ def test_name_repr_generic(self, subtype): assert str(dtype) == "interval" assert dtype.name == "interval" - def test_basic(self): - assert is_interval_dtype(self.dtype) + def test_basic(self, dtype): + assert is_interval_dtype(dtype) ii = IntervalIndex.from_breaks(range(3)) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4a84a21084de2..22e53dbc89f01 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -280,6 +280,13 @@ def test_shift_empty_array(self, data, periods): expected = empty self.assert_extension_array_equal(result, expected) + def test_shift_zero_copies(self, data): + result = data.shift(0) + assert result is not data + + result = data[:0].shift(2) + assert result is not data + def test_shift_fill_value(self, data): arr = data[:4] fill_value = data[0] diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 9e741bb7f267c..1ba1b872fa5e2 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -16,7 +16,7 @@ import random import string import sys -from typing import Type +from typing import Any, Mapping, Type import numpy as np @@ -27,7 +27,7 @@ class JSONDtype(ExtensionDtype): type = abc.Mapping name = "json" - na_value = UserDict() + na_value: Mapping[str, Any] = UserDict() @classmethod def construct_array_type(cls) -> Type["JSONArray"]: diff --git a/pandas/tests/frame/indexing/test_datetime.py 
b/pandas/tests/frame/indexing/test_datetime.py index a1c12be2b0180..6bfcac3793584 100644 --- a/pandas/tests/frame/indexing/test_datetime.py +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -45,13 +45,6 @@ def test_set_reset(self): df = result.set_index("foo") tm.assert_index_equal(df.index, idx) - def test_transpose(self, timezone_frame): - - result = timezone_frame.T - expected = DataFrame(timezone_frame.values.T) - expected.index = ["A", "B", "C"] - tm.assert_frame_equal(result, expected) - def test_scalar_assignment(self): # issue #19843 df = pd.DataFrame(index=(0, 1, 2)) diff --git a/pandas/tests/frame/indexing/test_iat.py b/pandas/tests/frame/indexing/test_iat.py new file mode 100644 index 0000000000000..23e3392251a3a --- /dev/null +++ b/pandas/tests/frame/indexing/test_iat.py @@ -0,0 +1,7 @@ +def test_iat(float_frame): + + for i, row in enumerate(float_frame.index): + for j, col in enumerate(float_frame.columns): + result = float_frame.iat[i, j] + expected = float_frame.at[row, col] + assert result == expected diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 64d0f9ee2b062..6fc8c0e9ad459 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -28,6 +28,29 @@ from pandas.tseries.offsets import BDay +class TestGet: + def test_get(self, float_frame): + b = float_frame.get("B") + tm.assert_series_equal(b, float_frame["B"]) + + assert float_frame.get("foo") is None + tm.assert_series_equal( + float_frame.get("foo", float_frame["B"]), float_frame["B"] + ) + + @pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=list("AB")), + DataFrame(columns=list("AB"), index=range(3)), + ], + ) + def test_get_none(self, df): + # see gh-5652 + assert df.get(None) is None + + class TestDataFrameIndexing: def test_getitem(self, float_frame): # Slicing @@ -64,27 +87,6 @@ def test_getitem_dupe_cols(self): with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] - def test_get(self, float_frame): - b = float_frame.get("B") - tm.assert_series_equal(b, float_frame["B"]) - - assert float_frame.get("foo") is None - tm.assert_series_equal( - float_frame.get("foo", float_frame["B"]), float_frame["B"] - ) - - @pytest.mark.parametrize( - "df", - [ - DataFrame(), - DataFrame(columns=list("AB")), - DataFrame(columns=list("AB"), index=range(3)), - ], - ) - def test_get_none(self, df): - # see gh-5652 - assert df.get(None) is None - @pytest.mark.parametrize("key_type", [iter, np.array, Series, Index]) def test_loc_iterable(self, float_frame, key_type): idx = key_type(["A", "B", "C"]) @@ -1048,9 +1050,8 @@ def test_getitem_setitem_float_labels(self): # positional slicing only via iloc! 
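# Note on the reworded error message just below: a Float64Index supports
# label-based slicing through .loc, while .iloc is strictly positional, so
# float slice bounds raise TypeError. A small sketch of the behaviour the new
# message describes (frame invented for illustration):
import pandas as pd

df = pd.DataFrame({"x": [10, 20, 30]}, index=[1.5, 2.5, 3.5])
df.loc[1.5:2.5]  # label slicing on the float index is fine
try:
    df.iloc[1.0:5]  # float positional indexer
except TypeError:
    pass  # "cannot do positional indexing ... these indexers [1.0] of type float"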
msg = ( - "cannot do slice indexing on " - r" with " - r"these indexers \[1.0\] of " + "cannot do positional indexing on Float64Index with " + r"these indexers \[1.0\] of type float" ) with pytest.raises(TypeError, match=msg): df.iloc[1.0:5] @@ -1547,14 +1548,6 @@ def test_loc_duplicates(self): df.loc[trange[bool_idx], "A"] += 6 tm.assert_frame_equal(df, expected) - def test_iat(self, float_frame): - - for i, row in enumerate(float_frame.index): - for j, col in enumerate(float_frame.columns): - result = float_frame.iat[i, j] - expected = float_frame.at[row, col] - assert result == expected - @pytest.mark.parametrize( "method,expected_values", [ @@ -1608,6 +1601,16 @@ def test_reindex_methods_nearest_special(self): actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) tm.assert_frame_equal(expected, actual) + def test_reindex_nearest_tz(self, tz_aware_fixture): + # GH26683 + tz = tz_aware_fixture + idx = pd.date_range("2019-01-01", periods=5, tz=tz) + df = pd.DataFrame({"x": list(range(5))}, index=idx) + + expected = df.head(3) + actual = df.reindex(idx[:3], method="nearest") + tm.assert_frame_equal(expected, actual) + def test_reindex_frame_add_nat(self): rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) @@ -1916,89 +1919,6 @@ def test_at_time_between_time_datetimeindex(self): result.loc[bkey] = df.iloc[binds] tm.assert_frame_equal(result, df) - def test_xs(self, float_frame, datetime_frame): - idx = float_frame.index[5] - xs = float_frame.xs(idx) - for item, value in xs.items(): - if np.isnan(value): - assert np.isnan(float_frame[item][idx]) - else: - assert value == float_frame[item][idx] - - # mixed-type xs - test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} - frame = DataFrame(test_data) - xs = frame.xs("1") - assert xs.dtype == np.object_ - assert xs["A"] == 1 - assert xs["B"] == "1" - - with pytest.raises( - KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00', freq='B')") - ): - datetime_frame.xs(datetime_frame.index[0] - BDay()) - - # xs get column - series = float_frame.xs("A", axis=1) - expected = float_frame["A"] - tm.assert_series_equal(series, expected) - - # view is returned if possible - series = float_frame.xs("A", axis=1) - series[:] = 5 - assert (expected == 5).all() - - def test_xs_corner(self): - # pathological mixed-type reordering case - df = DataFrame(index=[0]) - df["A"] = 1.0 - df["B"] = "foo" - df["C"] = 2.0 - df["D"] = "bar" - df["E"] = 3.0 - - xs = df.xs(0) - exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) - tm.assert_series_equal(xs, exp) - - # no columns but Index(dtype=object) - df = DataFrame(index=["a", "b", "c"]) - result = df.xs("a") - expected = Series([], name="a", index=pd.Index([]), dtype=np.float64) - tm.assert_series_equal(result, expected) - - def test_xs_duplicates(self): - df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"]) - - cross = df.xs("c") - exp = df.iloc[2] - tm.assert_series_equal(cross, exp) - - def test_xs_keep_level(self): - df = DataFrame( - { - "day": {0: "sat", 1: "sun"}, - "flavour": {0: "strawberry", 1: "strawberry"}, - "sales": {0: 10, 1: 12}, - "year": {0: 2008, 1: 2008}, - } - ).set_index(["year", "flavour", "day"]) - result = df.xs("sat", level="day", drop_level=False) - expected = df[:1] - tm.assert_frame_equal(result, expected) - - result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) - tm.assert_frame_equal(result, expected) - - def 
test_xs_view(self): - # in 0.14 this will return a view if possible a copy otherwise, but - # this is numpy dependent - - dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) - - dm.xs(2)[:] = 10 - assert (dm.xs(2) == 10).all() - def test_index_namedtuple(self): from collections import namedtuple @@ -2154,31 +2074,6 @@ def test_mask_callable(self): tm.assert_frame_equal(result, exp) tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) - def test_head_tail(self, float_frame): - tm.assert_frame_equal(float_frame.head(), float_frame[:5]) - tm.assert_frame_equal(float_frame.tail(), float_frame[-5:]) - - tm.assert_frame_equal(float_frame.head(0), float_frame[0:0]) - tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0]) - - tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1]) - tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:]) - tm.assert_frame_equal(float_frame.head(1), float_frame[:1]) - tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:]) - # with a float index - df = float_frame.copy() - df.index = np.arange(len(float_frame)) + 0.1 - tm.assert_frame_equal(df.head(), df.iloc[:5]) - tm.assert_frame_equal(df.tail(), df.iloc[-5:]) - tm.assert_frame_equal(df.head(0), df[0:0]) - tm.assert_frame_equal(df.tail(0), df[0:0]) - tm.assert_frame_equal(df.head(-1), df.iloc[:-1]) - tm.assert_frame_equal(df.tail(-1), df.iloc[1:]) - # test empty dataframe - empty_df = DataFrame() - tm.assert_frame_equal(empty_df.tail(), empty_df) - tm.assert_frame_equal(empty_df.head(), empty_df) - def test_type_error_multiindex(self): # See gh-12218 df = DataFrame( @@ -2270,9 +2165,40 @@ def test_set_reset(self): df = result.set_index("foo") tm.assert_index_equal(df.index, idx) - def test_transpose(self, uint64_frame): - result = uint64_frame.T - expected = DataFrame(uint64_frame.values.T) - expected.index = ["A", "B"] - tm.assert_frame_equal(result, expected) +def test_object_casting_indexing_wraps_datetimelike(): + # GH#31649, check the indexing methods all the way down the stack + df = pd.DataFrame( + { + "A": [1, 2], + "B": pd.date_range("2000", periods=2), + "C": pd.timedelta_range("1 Day", periods=2), + } + ) + + ser = df.loc[0] + assert isinstance(ser.values[1], pd.Timestamp) + assert isinstance(ser.values[2], pd.Timedelta) + + ser = df.iloc[0] + assert isinstance(ser.values[1], pd.Timestamp) + assert isinstance(ser.values[2], pd.Timedelta) + + ser = df.xs(0, axis=0) + assert isinstance(ser.values[1], pd.Timestamp) + assert isinstance(ser.values[2], pd.Timedelta) + + mgr = df._data + arr = mgr.fast_xs(0) + assert isinstance(arr[1], pd.Timestamp) + assert isinstance(arr[2], pd.Timedelta) + + blk = mgr.blocks[mgr._blknos[1]] + assert blk.dtype == "M8[ns]" # we got the right block + val = blk.iget((0, 0)) + assert isinstance(val, pd.Timestamp) + + blk = mgr.blocks[mgr._blknos[2]] + assert blk.dtype == "m8[ns]" # we got the right block + val = blk.iget((0, 0)) + assert isinstance(val, pd.Timedelta) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index df1b128dcd227..507b2e9cd237b 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -10,22 +10,30 @@ import pandas._testing as tm -class TestDataFrameIndexingWhere: - def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): - default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) - - def _safe_add(df): - # only add to the numeric items - def is_ok(s): - 
return ( - issubclass(s.dtype.type, (np.integer, np.floating)) - and s.dtype != "uint8" - ) - - return DataFrame( - dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) - ) +@pytest.fixture(params=["default", "float_string", "mixed_float", "mixed_int"]) +def where_frame(request, float_string_frame, mixed_float_frame, mixed_int_frame): + if request.param == "default": + return DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) + if request.param == "float_string": + return float_string_frame + if request.param == "mixed_float": + return mixed_float_frame + if request.param == "mixed_int": + return mixed_int_frame + + +def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) and s.dtype != "uint8" + ) + + return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items())) + +class TestDataFrameIndexingWhere: + def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) rs = df.where(cond, other1) @@ -40,19 +48,15 @@ def _check_get(df, cond, check_dtypes=True): assert (rs.dtypes == df.dtypes).all() # check getting - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - cond = df > 0 - _check_get(df, cond) - + df = where_frame + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + return + cond = df > 0 + _check_get(df, cond) + + def test_where_upcasting(self): # upcasting case (GH # 2794) df = DataFrame( { @@ -78,6 +82,7 @@ def _check_get(df, cond, check_dtypes=True): tm.assert_series_equal(result, expected) + def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): rs = df.where(cond, other) @@ -107,27 +112,30 @@ def _check_align(df, cond, other, check_dtypes=True): if check_dtypes and not isinstance(other, np.ndarray): assert (rs.dtypes == df.dtypes).all() - for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue + df = where_frame + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + return - # other is a frame - cond = (df > 0)[1:] - _check_align(df, cond, _safe_add(df)) + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) - # check other is ndarray - cond = df > 0 - _check_align(df, cond, (_safe_add(df).values)) + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) - # integers are upcast, so don't check the dtypes - cond = df > 0 - check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) - _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) + _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + def test_where_invalid(self): # invalid conditions - df = default_frame + df = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) + cond = df > 0 + err1 = (df + 1).values[0:2, :] msg = "other must be the same shape as self when an ndarray" with pytest.raises(ValueError, match=msg): @@ -144,7 +152,9 @@ def _check_align(df, cond, other, check_dtypes=True): with pytest.raises(ValueError, match=msg): df.mask(0) + def test_where_set(self, where_frame, 
float_string_frame): # where inplace + def _check_set(df, cond, check_dtypes=True): dfi = df.copy() econd = cond.reindex_like(df).fillna(True) @@ -160,27 +170,23 @@ def _check_set(df, cond, check_dtypes=True): v = np.dtype("float64") assert dfi[k].dtype == v - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue + df = where_frame + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + return - cond = df > 0 - _check_set(df, cond) + cond = df > 0 + _check_set(df, cond) - cond = df >= 0 - _check_set(df, cond) + cond = df >= 0 + _check_set(df, cond) - # aligning - cond = (df >= 0)[1:] - _check_set(df, cond) + # aligning + cond = (df >= 0)[1:] + _check_set(df, cond) + def test_where_series_slicing(self): # GH 10218 # test DataFrame.where with Series slicing df = DataFrame({"a": range(3), "b": range(4, 7)}) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py new file mode 100644 index 0000000000000..71b40585f0c2f --- /dev/null +++ b/pandas/tests/frame/indexing/test_xs.py @@ -0,0 +1,95 @@ +import re + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +class TestXS: + def test_xs(self, float_frame, datetime_frame): + idx = float_frame.index[5] + xs = float_frame.xs(idx) + for item, value in xs.items(): + if np.isnan(value): + assert np.isnan(float_frame[item][idx]) + else: + assert value == float_frame[item][idx] + + # mixed-type xs + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + frame = DataFrame(test_data) + xs = frame.xs("1") + assert xs.dtype == np.object_ + assert xs["A"] == 1 + assert xs["B"] == "1" + + with pytest.raises( + KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00', freq='B')") + ): + datetime_frame.xs(datetime_frame.index[0] - BDay()) + + # xs get column + series = float_frame.xs("A", axis=1) + expected = float_frame["A"] + tm.assert_series_equal(series, expected) + + # view is returned if possible + series = float_frame.xs("A", axis=1) + series[:] = 5 + assert (expected == 5).all() + + def test_xs_corner(self): + # pathological mixed-type reordering case + df = DataFrame(index=[0]) + df["A"] = 1.0 + df["B"] = "foo" + df["C"] = 2.0 + df["D"] = "bar" + df["E"] = 3.0 + + xs = df.xs(0) + exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) + tm.assert_series_equal(xs, exp) + + # no columns but Index(dtype=object) + df = DataFrame(index=["a", "b", "c"]) + result = df.xs("a") + expected = Series([], name="a", index=pd.Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_xs_duplicates(self): + df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"]) + + cross = df.xs("c") + exp = df.iloc[2] + tm.assert_series_equal(cross, exp) + + def test_xs_keep_level(self): + df = DataFrame( + { + "day": {0: "sat", 1: "sun"}, + "flavour": {0: "strawberry", 1: "strawberry"}, + "sales": {0: 10, 1: 12}, + "year": {0: 2008, 1: 2008}, + } + ).set_index(["year", "flavour", "day"]) + result = df.xs("sat", level="day", drop_level=False) + expected = df[:1] + tm.assert_frame_equal(result, expected) + + result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) + tm.assert_frame_equal(result, expected) + + def test_xs_view(self): + # in 0.14 this will return a view if possible a copy 
otherwise, but + # this is numpy dependent + + dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) + + dm.xs(2)[:] = 10 + assert (dm.xs(2) == 10).all() diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py new file mode 100644 index 0000000000000..7715cb1cb6eec --- /dev/null +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -0,0 +1,349 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +class TestDataFrameCombineFirst: + def test_combine_first_mixed(self): + a = Series(["a", "b"], index=range(2)) + b = Series(range(2), index=range(2)) + f = DataFrame({"A": a, "B": b}) + + a = Series(["a", "b"], index=range(5, 7)) + b = Series(range(2), index=range(5, 7)) + g = DataFrame({"A": a, "B": b}) + + exp = pd.DataFrame( + {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] + ) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + def test_combine_first(self, float_frame): + # disjoint + head, tail = float_frame[:5], float_frame[5:] + + combined = head.combine_first(tail) + reordered_frame = float_frame.reindex(combined.index) + tm.assert_frame_equal(combined, reordered_frame) + assert tm.equalContents(combined.columns, float_frame.columns) + tm.assert_series_equal(combined["A"], reordered_frame["A"]) + + # same index + fcopy = float_frame.copy() + fcopy["A"] = 1 + del fcopy["C"] + + fcopy2 = float_frame.copy() + fcopy2["B"] = 0 + del fcopy2["D"] + + combined = fcopy.combine_first(fcopy2) + + assert (combined["A"] == 1).all() + tm.assert_series_equal(combined["B"], fcopy["B"]) + tm.assert_series_equal(combined["C"], fcopy2["C"]) + tm.assert_series_equal(combined["D"], fcopy["D"]) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head["A"] = 1 + + combined = head.combine_first(tail) + assert (combined["A"][:10] == 1).all() + + # reverse overlap + tail["A"][:10] = 0 + combined = tail.combine_first(head) + assert (combined["A"][:10] == 0).all() + + # no overlap + f = float_frame[:10] + g = float_frame[10:] + combined = f.combine_first(g) + tm.assert_series_equal(combined["A"].reindex(f.index), f["A"]) + tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) + + # corner cases + comb = float_frame.combine_first(DataFrame()) + tm.assert_frame_equal(comb, float_frame) + + comb = DataFrame().combine_first(float_frame) + tm.assert_frame_equal(comb, float_frame) + + comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) + assert "faz" in comb.index + + # #2525 + df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame(columns=["b"]) + result = df.combine_first(df2) + assert "b" in result + + def test_combine_first_mixed_bug(self): + idx = Index(["a", "b", "c", "e"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "e"], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3}) + + idx = Index(["a", "b", "c", "f"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "f"], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3}) + + combined = frame1.combine_first(frame2) + assert len(combined.columns) == 5 + + # gh 3016 (same as in update) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, 
False]], + columns=["A", "B", "bool1", "bool2"], + ) + + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + result = df.combine_first(other) + tm.assert_frame_equal(result, df) + + df.loc[0, "A"] = np.nan + result = df.combine_first(other) + df.loc[0, "A"] = 45 + tm.assert_frame_equal(result, df) + + # doc example + df1 = DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) + + df2 = DataFrame( + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } + ) + + result = df1.combine_first(df2) + expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) + tm.assert_frame_equal(result, expected) + + # GH3552, return object dtype with bools + df1 = DataFrame( + [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] + ) + df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) + + result = df1.combine_first(df2)[2] + expected = Series([True, True, False], name=2) + tm.assert_series_equal(result, expected) + + # GH 3593, converting datetime64[ns] incorrectly + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) + df1 = DataFrame({"a": [None, None, None]}) + df2 = df1.combine_first(df0) + tm.assert_frame_equal(df2, df0) + + df2 = df0.combine_first(df1) + tm.assert_frame_equal(df2, df0) + + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) + df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0, :] = df1.iloc[0, :] + tm.assert_frame_equal(df2, result) + + df2 = df0.combine_first(df1) + tm.assert_frame_equal(df2, df0) + + def test_combine_first_align_nan(self): + # GH 7509 (not fixed) + dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = pd.DataFrame([[4], [5]], columns=["b"]) + assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["b"].dtype == "int64" + + res = dfa.combine_first(dfb) + exp = pd.DataFrame( + {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, + columns=["a", "b"], + ) + tm.assert_frame_equal(res, exp) + assert res["a"].dtype == "datetime64[ns]" + # ToDo: this must be int64 + assert res["b"].dtype == "float64" + + res = dfa.iloc[:0].combine_first(dfb) + exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + tm.assert_frame_equal(res, exp) + # ToDo: this must be datetime64 + assert res["a"].dtype == "float64" + # ToDo: this must be int64 + assert res["b"].dtype == "int64" + + def test_combine_first_timezone(self): + # see gh-7630 + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") + df1 = pd.DataFrame( + columns=["UTCdatetime", "abc"], + data=data1, + index=pd.date_range("20140627", periods=1), + ) + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") + df2 = pd.DataFrame( + columns=["UTCdatetime", "xyz"], + data=data2, + index=pd.date_range("20140628", periods=1), + ) + res = df2[["UTCdatetime"]].combine_first(df1) + exp = pd.DataFrame( + { + "UTCdatetime": [ + pd.Timestamp("2010-01-01 01:01", tz="UTC"), + pd.Timestamp("2012-12-12 12:12", tz="UTC"), + ], + "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], + }, + columns=["UTCdatetime", "abc"], + index=pd.date_range("20140627", periods=2, freq="D"), + ) + tm.assert_frame_equal(res, exp) + assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" + assert res["abc"].dtype == "datetime64[ns, UTC]" + + # see gh-10567 + dts1 = 
pd.date_range("2015-01-01", "2015-01-05", tz="UTC") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") + df2 = pd.DataFrame({"DATE": dts2}) + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["DATE"].dtype == "datetime64[ns, UTC]" + + dts1 = pd.DatetimeIndex( + ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" + ) + df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex( + ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" + ) + df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.DatetimeIndex( + [ + "2011-01-01", + "2012-01-01", + "NaT", + "2012-01-02", + "2011-01-03", + "2011-01-04", + ], + tz="US/Eastern", + ) + exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + + # different tz + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05") + df2 = pd.DataFrame({"DATE": dts2}) + + # if df1 doesn't have NaN, keep its dtype + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" + + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-01", "2015-01-03") + df2 = pd.DataFrame({"DATE": dts2}) + + res = df1.combine_first(df2) + exp_dts = [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-03"), + ] + exp = pd.DataFrame({"DATE": exp_dts}) + tm.assert_frame_equal(res, exp) + assert res["DATE"].dtype == "object" + + def test_combine_first_timedelta(self): + data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) + df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) + df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.TimedeltaIndex( + ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] + ) + exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["TD"].dtype == "timedelta64[ns]" + + def test_combine_first_period(self): + data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") + df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") + df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.PeriodIndex( + ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" + ) + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["P"].dtype == data1.dtype + + # different freq + dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") + df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = [ + pd.Period("2011-01", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.NaT, + pd.Period("2012-01-02", freq="D"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["P"].dtype == "object" + + def test_combine_first_int(self): + # GH14687 - integer series that do no align exactly + + df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + 
df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["a"].dtype == "int64" + + @pytest.mark.parametrize("val", [1, 1.0]) + def test_combine_first_with_asymmetric_other(self, val): + # see gh-20699 + df1 = pd.DataFrame({"isNum": [val]}) + df2 = pd.DataFrame({"isBool": [True]}) + + res = df1.combine_first(df2) + exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) + + tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/frame/methods/test_head_tail.py b/pandas/tests/frame/methods/test_head_tail.py new file mode 100644 index 0000000000000..93763bc12ce0d --- /dev/null +++ b/pandas/tests/frame/methods/test_head_tail.py @@ -0,0 +1,30 @@ +import numpy as np + +from pandas import DataFrame +import pandas._testing as tm + + +def test_head_tail(float_frame): + tm.assert_frame_equal(float_frame.head(), float_frame[:5]) + tm.assert_frame_equal(float_frame.tail(), float_frame[-5:]) + + tm.assert_frame_equal(float_frame.head(0), float_frame[0:0]) + tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0]) + + tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1]) + tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:]) + tm.assert_frame_equal(float_frame.head(1), float_frame[:1]) + tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:]) + # with a float index + df = float_frame.copy() + df.index = np.arange(len(float_frame)) + 0.1 + tm.assert_frame_equal(df.head(), df.iloc[:5]) + tm.assert_frame_equal(df.tail(), df.iloc[-5:]) + tm.assert_frame_equal(df.head(0), df[0:0]) + tm.assert_frame_equal(df.tail(0), df[0:0]) + tm.assert_frame_equal(df.head(-1), df.iloc[:-1]) + tm.assert_frame_equal(df.tail(-1), df.iloc[1:]) + # test empty dataframe + empty_df = DataFrame() + tm.assert_frame_equal(empty_df.tail(), empty_df) + tm.assert_frame_equal(empty_df.head(), empty_df) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index aa91e7a489356..92b74c4409d7d 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1356,3 +1356,10 @@ def test_replace_replacer_dtype(self, replacer): result = df.replace({"a": replacer, "b": replacer}) expected = pd.DataFrame([replacer]) tm.assert_frame_equal(result, expected) + + def test_replace_after_convert_dtypes(self): + # GH31517 + df = pd.DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64") + result = df.replace(1, 10) + expected = pd.DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 428b9e5068407..a5fe5f3a6d5e4 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas as pd import pandas._testing as tm @@ -41,3 +43,34 @@ def test_transpose_object_to_tzaware_mixed_tz(self): assert (df2.dtypes == object).all() res2 = df2.T assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() + + def test_transpose_uint64(self, uint64_frame): + + result = uint64_frame.T + expected = pd.DataFrame(uint64_frame.values.T) + expected.index = ["A", "B"] + tm.assert_frame_equal(result, expected) + + def test_transpose_float(self, float_frame): + frame = float_frame + dft = frame.T + for idx, series in dft.items(): + for col, value in series.items(): + if np.isnan(value): + assert np.isnan(frame[col][idx]) + else: + assert value == frame[col][idx] + 
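# Aside on the new test_head_tail.py above: negative counts act as
# complements, so head(-1) drops the last row and tail(-1) drops the first.
# Minimal sketch (frame invented for illustration):
import pandas as pd

df = pd.DataFrame({"a": range(5)})
assert df.head(-1).equals(df.iloc[:-1])
assert df.tail(-1).equals(df.iloc[1:])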
+ # mixed type + index, data = tm.getMixedTypeDict() + mixed = pd.DataFrame(data, index=index) + + mixed_T = mixed.T + for col, s in mixed_T.items(): + assert s.dtype == np.object_ + + def test_transpose_get_view(self, float_frame): + dft = float_frame.T + dft.values[:, 5:10] = 5 + + assert (float_frame.values[5:10] == 5).all() diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py new file mode 100644 index 0000000000000..d9de026dbf4e9 --- /dev/null +++ b/pandas/tests/frame/methods/test_update.py @@ -0,0 +1,135 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, date_range +import pandas._testing as tm + + +class TestDataFrameUpdate: + def test_update_nan(self): + # #15593 #15617 + # test 1 + df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = df1.copy() + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + # test 2 + df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + def test_update(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other) + + expected = DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_dtypes(self): + + # gh 3016 + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) + + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + df.update(other) + + expected = DataFrame( + [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) + tm.assert_frame_equal(df, expected) + + def test_update_nooverwrite(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, overwrite=False) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_filtered(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, filter_func=lambda x: x > 2) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "bad_kwarg, exception, msg", + [ + # errors must be 'ignore' or 'raise' + ({"errors": "something"}, ValueError, "The parameter errors must.*"), + ({"join": "inner"}, NotImplementedError, "Only left join is supported"), + ], + ) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + df = DataFrame([[1.5, 1, 3.0]]) + with pytest.raises(exception, match=msg): + df.update(df, **bad_kwarg) + + def test_update_raise_on_overlap(self): + df = DataFrame( + [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) + with 
pytest.raises(ValueError, match="Data overlaps"): + df.update(other, errors="raise") + + def test_update_from_non_df(self): + d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} + df = DataFrame(d) + + d["a"] = Series([5, 6, 7, 8]) + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} + df = DataFrame(d) + + d["a"] = [5, 6, 7, 8] + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + def test_update_datetime_tz(self): + # GH 25807 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + result.update(result) + expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 9de5d6fe16a0d..17cc50661e3cb 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -14,15 +14,15 @@ class TestDataFrameMisc: - def test_copy_index_name_checking(self, float_frame): + @pytest.mark.parametrize("attr", ["index", "columns"]) + def test_copy_index_name_checking(self, float_frame, attr): # don't want to be able to modify the index stored elsewhere after # making a copy - for attr in ("index", "columns"): - ind = getattr(float_frame, attr) - ind.name = None - cp = float_frame.copy() - getattr(cp, attr).name = "foo" - assert getattr(float_frame, attr).name is None + ind = getattr(float_frame, attr) + ind.name = None + cp = float_frame.copy() + getattr(cp, attr).name = "foo" + assert getattr(float_frame, attr).name is None def test_getitem_pop_assign_name(self, float_frame): s = float_frame["A"] @@ -358,24 +358,6 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is None - def test_transpose(self, float_frame): - frame = float_frame - dft = frame.T - for idx, series in dft.items(): - for col, value in series.items(): - if np.isnan(value): - assert np.isnan(frame[col][idx]) - else: - assert value == frame[col][idx] - - # mixed type - index, data = tm.getMixedTypeDict() - mixed = DataFrame(data, index=index) - - mixed_T = mixed.T - for col, s in mixed_T.items(): - assert s.dtype == np.object_ - def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5)) tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) @@ -470,12 +452,6 @@ def test_deepcopy(self, float_frame): for idx, value in series.items(): assert float_frame["A"][idx] != value - def test_transpose_get_view(self, float_frame): - dft = float_frame.T - dft.values[:, 5:10] = 5 - - assert (float_frame.values[5:10] == 5).all() - def test_inplace_return_self(self): # GH 1893 diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index e98f74e133ea9..fe6abef97acc4 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -703,6 +703,14 @@ def apply_list(row): ) tm.assert_series_equal(result, expected) + def test_apply_noreduction_tzaware_object(self): + # https://github.com/pandas-dev/pandas/issues/31505 + df = pd.DataFrame({"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="object") + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) + result = df.apply(lambda x: x.copy()) + tm.assert_frame_equal(result, df) + class TestInferOutputShape: # the user has supplied an opaque UDF where diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 9bad54b051d6c..36a476d195fe5 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ 
b/pandas/tests/frame/test_combine_concat.py @@ -128,115 +128,6 @@ def test_concat_tuple_keys(self): ) tm.assert_frame_equal(results, expected) - def test_update(self): - df = DataFrame( - [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] - ) - - other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) - - df.update(other) - - expected = DataFrame( - [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] - ) - tm.assert_frame_equal(df, expected) - - def test_update_dtypes(self): - - # gh 3016 - df = DataFrame( - [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], - ) - - other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) - df.update(other) - - expected = DataFrame( - [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], - ) - tm.assert_frame_equal(df, expected) - - def test_update_nooverwrite(self): - df = DataFrame( - [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] - ) - - other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) - - df.update(other, overwrite=False) - - expected = DataFrame( - [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] - ) - tm.assert_frame_equal(df, expected) - - def test_update_filtered(self): - df = DataFrame( - [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] - ) - - other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) - - df.update(other, filter_func=lambda x: x > 2) - - expected = DataFrame( - [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] - ) - tm.assert_frame_equal(df, expected) - - @pytest.mark.parametrize( - "bad_kwarg, exception, msg", - [ - # errors must be 'ignore' or 'raise' - ({"errors": "something"}, ValueError, "The parameter errors must.*"), - ({"join": "inner"}, NotImplementedError, "Only left join is supported"), - ], - ) - def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): - df = DataFrame([[1.5, 1, 3.0]]) - with pytest.raises(exception, match=msg): - df.update(df, **bad_kwarg) - - def test_update_raise_on_overlap(self): - df = DataFrame( - [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] - ) - - other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) - with pytest.raises(ValueError, match="Data overlaps"): - df.update(other, errors="raise") - - def test_update_from_non_df(self): - d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} - df = DataFrame(d) - - d["a"] = Series([5, 6, 7, 8]) - df.update(d) - - expected = DataFrame(d) - - tm.assert_frame_equal(df, expected) - - d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} - df = DataFrame(d) - - d["a"] = [5, 6, 7, 8] - df.update(d) - - expected = DataFrame(d) - - tm.assert_frame_equal(df, expected) - - def test_update_datetime_tz(self): - # GH 25807 - result = DataFrame([pd.Timestamp("2019", tz="UTC")]) - result.update(result) - expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) - tm.assert_frame_equal(result, expected) - def test_join_str_datetime(self): str_dates = ["20120209", "20120222"] dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] @@ -422,347 +313,6 @@ def test_concat_astype_dup_col(self): ).astype("category") tm.assert_frame_equal(result, expected) - -class TestDataFrameCombineFirst: - def test_combine_first_mixed(self): - a = Series(["a", "b"], index=range(2)) - b = Series(range(2), index=range(2)) - f = DataFrame({"A": a, 
"B": b}) - - a = Series(["a", "b"], index=range(5, 7)) - b = Series(range(2), index=range(5, 7)) - g = DataFrame({"A": a, "B": b}) - - exp = pd.DataFrame( - {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] - ) - combined = f.combine_first(g) - tm.assert_frame_equal(combined, exp) - - def test_combine_first(self, float_frame): - # disjoint - head, tail = float_frame[:5], float_frame[5:] - - combined = head.combine_first(tail) - reordered_frame = float_frame.reindex(combined.index) - tm.assert_frame_equal(combined, reordered_frame) - assert tm.equalContents(combined.columns, float_frame.columns) - tm.assert_series_equal(combined["A"], reordered_frame["A"]) - - # same index - fcopy = float_frame.copy() - fcopy["A"] = 1 - del fcopy["C"] - - fcopy2 = float_frame.copy() - fcopy2["B"] = 0 - del fcopy2["D"] - - combined = fcopy.combine_first(fcopy2) - - assert (combined["A"] == 1).all() - tm.assert_series_equal(combined["B"], fcopy["B"]) - tm.assert_series_equal(combined["C"], fcopy2["C"]) - tm.assert_series_equal(combined["D"], fcopy["D"]) - - # overlap - head, tail = reordered_frame[:10].copy(), reordered_frame - head["A"] = 1 - - combined = head.combine_first(tail) - assert (combined["A"][:10] == 1).all() - - # reverse overlap - tail["A"][:10] = 0 - combined = tail.combine_first(head) - assert (combined["A"][:10] == 0).all() - - # no overlap - f = float_frame[:10] - g = float_frame[10:] - combined = f.combine_first(g) - tm.assert_series_equal(combined["A"].reindex(f.index), f["A"]) - tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) - - # corner cases - comb = float_frame.combine_first(DataFrame()) - tm.assert_frame_equal(comb, float_frame) - - comb = DataFrame().combine_first(float_frame) - tm.assert_frame_equal(comb, float_frame) - - comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) - assert "faz" in comb.index - - # #2525 - df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)]) - df2 = DataFrame(columns=["b"]) - result = df.combine_first(df2) - assert "b" in result - - def test_combine_first_mixed_bug(self): - idx = Index(["a", "b", "c", "e"]) - ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) - ser2 = Series(["a", "b", "c", "e"], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3}) - - idx = Index(["a", "b", "c", "f"]) - ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) - ser2 = Series(["a", "b", "c", "f"], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3}) - - combined = frame1.combine_first(frame2) - assert len(combined.columns) == 5 - - # gh 3016 (same as in update) - df = DataFrame( - [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], - ) - - other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) - result = df.combine_first(other) - tm.assert_frame_equal(result, df) - - df.loc[0, "A"] = np.nan - result = df.combine_first(other) - df.loc[0, "A"] = 45 - tm.assert_frame_equal(result, df) - - # doc example - df1 = DataFrame( - {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} - ) - - df2 = DataFrame( - { - "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], - "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], - } - ) - - result = df1.combine_first(df2) - expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) - tm.assert_frame_equal(result, expected) - - # GH3552, return object dtype with bools - df1 = DataFrame( - [[np.nan, 3.0, 
True], [-4.6, np.nan, True], [np.nan, 7.0, False]] - ) - df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - - result = df1.combine_first(df2)[2] - expected = Series([True, True, False], name=2) - tm.assert_series_equal(result, expected) - - # GH 3593, converting datetime64[ns] incorrectly - df0 = DataFrame( - {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} - ) - df1 = DataFrame({"a": [None, None, None]}) - df2 = df1.combine_first(df0) - tm.assert_frame_equal(df2, df0) - - df2 = df0.combine_first(df1) - tm.assert_frame_equal(df2, df0) - - df0 = DataFrame( - {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} - ) - df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) - df2 = df1.combine_first(df0) - result = df0.copy() - result.iloc[0, :] = df1.iloc[0, :] - tm.assert_frame_equal(df2, result) - - df2 = df0.combine_first(df1) - tm.assert_frame_equal(df2, df0) - - def test_combine_first_align_nan(self): - # GH 7509 (not fixed) - dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) - dfb = pd.DataFrame([[4], [5]], columns=["b"]) - assert dfa["a"].dtype == "datetime64[ns]" - assert dfa["b"].dtype == "int64" - - res = dfa.combine_first(dfb) - exp = pd.DataFrame( - {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, - columns=["a", "b"], - ) - tm.assert_frame_equal(res, exp) - assert res["a"].dtype == "datetime64[ns]" - # ToDo: this must be int64 - assert res["b"].dtype == "float64" - - res = dfa.iloc[:0].combine_first(dfb) - exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) - tm.assert_frame_equal(res, exp) - # ToDo: this must be datetime64 - assert res["a"].dtype == "float64" - # ToDo: this must be int64 - assert res["b"].dtype == "int64" - - def test_combine_first_timezone(self): - # see gh-7630 - data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") - df1 = pd.DataFrame( - columns=["UTCdatetime", "abc"], - data=data1, - index=pd.date_range("20140627", periods=1), - ) - data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") - df2 = pd.DataFrame( - columns=["UTCdatetime", "xyz"], - data=data2, - index=pd.date_range("20140628", periods=1), - ) - res = df2[["UTCdatetime"]].combine_first(df1) - exp = pd.DataFrame( - { - "UTCdatetime": [ - pd.Timestamp("2010-01-01 01:01", tz="UTC"), - pd.Timestamp("2012-12-12 12:12", tz="UTC"), - ], - "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], - }, - columns=["UTCdatetime", "abc"], - index=pd.date_range("20140627", periods=2, freq="D"), - ) - tm.assert_frame_equal(res, exp) - assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" - assert res["abc"].dtype == "datetime64[ns, UTC]" - - # see gh-10567 - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") - df1 = pd.DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") - df2 = pd.DataFrame({"DATE": dts2}) - - res = df1.combine_first(df2) - tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, UTC]" - - dts1 = pd.DatetimeIndex( - ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" - ) - df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) - dts2 = pd.DatetimeIndex( - ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" - ) - df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) - - res = df1.combine_first(df2) - exp_dts = pd.DatetimeIndex( - [ - "2011-01-01", - "2012-01-01", - "NaT", - "2012-01-02", - "2011-01-03", - "2011-01-04", - ], - tz="US/Eastern", - ) - exp = 
pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) - tm.assert_frame_equal(res, exp) - - # different tz - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") - df1 = pd.DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05") - df2 = pd.DataFrame({"DATE": dts2}) - - # if df1 doesn't have NaN, keep its dtype - res = df1.combine_first(df2) - tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" - - dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") - df1 = pd.DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-01", "2015-01-03") - df2 = pd.DataFrame({"DATE": dts2}) - - res = df1.combine_first(df2) - exp_dts = [ - pd.Timestamp("2015-01-01", tz="US/Eastern"), - pd.Timestamp("2015-01-02", tz="US/Eastern"), - pd.Timestamp("2015-01-03"), - ] - exp = pd.DataFrame({"DATE": exp_dts}) - tm.assert_frame_equal(res, exp) - assert res["DATE"].dtype == "object" - - def test_combine_first_timedelta(self): - data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) - df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) - data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) - df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) - - res = df1.combine_first(df2) - exp_dts = pd.TimedeltaIndex( - ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] - ) - exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) - tm.assert_frame_equal(res, exp) - assert res["TD"].dtype == "timedelta64[ns]" - - def test_combine_first_period(self): - data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") - df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) - data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") - df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) - - res = df1.combine_first(df2) - exp_dts = pd.PeriodIndex( - ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" - ) - exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) - tm.assert_frame_equal(res, exp) - assert res["P"].dtype == data1.dtype - - # different freq - dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") - df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) - - res = df1.combine_first(df2) - exp_dts = [ - pd.Period("2011-01", freq="M"), - pd.Period("2012-01-01", freq="D"), - pd.NaT, - pd.Period("2012-01-02", freq="D"), - pd.Period("2011-03", freq="M"), - pd.Period("2011-04", freq="M"), - ] - exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) - tm.assert_frame_equal(res, exp) - assert res["P"].dtype == "object" - - def test_combine_first_int(self): - # GH14687 - integer series that do no align exactly - - df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") - df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - - res = df1.combine_first(df2) - tm.assert_frame_equal(res, df1) - assert res["a"].dtype == "int64" - - @pytest.mark.parametrize("val", [1, 1.0]) - def test_combine_first_with_asymmetric_other(self, val): - # see gh-20699 - df1 = pd.DataFrame({"isNum": [val]}) - df2 = pd.DataFrame({"isBool": [True]}) - - res = df1.combine_first(df2) - exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) - - tm.assert_frame_equal(res, exp) - def test_concat_datetime_datetime64_frame(self): # #2624 rows = [] @@ -776,23 +326,3 @@ def test_concat_datetime_datetime64_frame(self): # it works! 
pd.concat([df1, df2_obj]) - - -class TestDataFrameUpdate: - def test_update_nan(self): - # #15593 #15617 - # test 1 - df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) - df2 = DataFrame({"A": [None, 2, 3]}) - expected = df1.copy() - df1.update(df2, overwrite=False) - - tm.assert_frame_equal(df1, expected) - - # test 2 - df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) - df2 = DataFrame({"A": [None, 2, 3]}) - expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) - df1.update(df2, overwrite=False) - - tm.assert_frame_equal(df1, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7b1a9d8ff6ae3..5f4c78449f71d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1860,9 +1860,8 @@ def check(df): # No NaN found -> error if len(indexer) == 0: msg = ( - "cannot do label indexing on " - r"<class 'pandas\.core\.indexes\.range\.RangeIndex'> " - r"with these indexers \[nan\] of " + "cannot do label indexing on RangeIndex " + r"with these indexers \[nan\] of type float" ) with pytest.raises(TypeError, match=msg): df.loc[:, np.nan] diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 49e6fe4940e18..a7e01d8f1fd6d 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -223,8 +223,7 @@ def test_info_verbose(self): for i, line in enumerate(lines): if i >= start and i < start + size: - index = i - start - line_nr = " {} ".format(index) + line_nr = f" {i - start} " assert line.startswith(line_nr) def test_info_memory(self): @@ -236,7 +235,7 @@ def test_info_memory(self): bytes = float(df.memory_usage().sum()) expected = textwrap.dedent( - """\ + f"""\ RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): @@ -244,10 +243,8 @@ def test_info_memory(self): --- ------ -------------- ----- 0 a 2 non-null int64 dtypes: int64(1) - memory usage: {} bytes - """.format( - bytes - ) + memory usage: {bytes} bytes + """ ) assert result == expected @@ -313,9 +310,7 @@ def test_info_shows_column_dtypes(self): ) assert header in res for i, dtype in enumerate(dtypes): - name = " {i:d} {i:d} {n:d} non-null {dtype}".format( - i=i, n=n, dtype=dtype - ) + name = f" {i:d} {i:d} {n:d} non-null {dtype}" assert name in res def test_info_max_cols(self): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2d31996a8a964..ff99081521ffb 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -377,6 +377,65 @@ def test_agg_index_has_complex_internals(index): tm.assert_frame_equal(result, expected) + +def test_agg_split_block(): + # https://github.com/pandas-dev/pandas/issues/31522 + df = pd.DataFrame( + { + "key1": ["a", "a", "b", "b", "a"], + "key2": ["one", "two", "one", "two", "one"], + "key3": ["three", "three", "three", "six", "six"], + } + ) + result = df.groupby("key1").min() + expected = pd.DataFrame( + {"key2": ["one", "one"], "key3": ["six", "six"]}, + index=pd.Index(["a", "b"], name="key1"), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_split_object_part_datetime(): + # https://github.com/pandas-dev/pandas/pull/31616 + df = pd.DataFrame( + { + "A": pd.date_range("2000", periods=4), + "B": ["a", "b", "c", "d"], + "C": [1, 2, 3, 4], + "D": ["b", "c", "d", "e"], + "E": pd.date_range("2000", periods=4), + "F": [1, 2, 3, 4], + } + ).astype(object) + 
result = df.groupby([0, 0, 0, 0]).min() + expected = pd.DataFrame( + { + "A": [pd.Timestamp("2000")], + "B": ["a"], + "C": [1], + "D": ["b"], + "E": [pd.Timestamp("2000")], + "F": [1], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_cython_category_not_implemented_fallback(): + # https://github.com/pandas-dev/pandas/issues/31450 + df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) + df["col_cat"] = df["col_num"].astype("category") + + result = df.groupby("col_num").col_cat.first() + expected = pd.Series( + [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" + ) + tm.assert_series_equal(result, expected) + + result = df.groupby("col_num").agg({"col_cat": "first"}) + expected = expected.to_frame() + tm.assert_frame_equal(result, expected) + + class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) @@ -684,6 +743,34 @@ def aggfunc(x): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_column(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = pd.DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a")["b"], func)() + idx = pd.Int64Index([1, 2], name="a") + expected = pd.Series(periods, index=idx, name="b") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_frame(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = pd.DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a"), func)() + idx = pd.Int64Index([1, 2], name="a") + expected = pd.DataFrame({"b": periods}, index=idx) + + tm.assert_frame_equal(result, expected) + + class TestLambdaMangling: def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 9c2b045079622..41ec70468aaeb 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -851,3 +851,17 @@ def test_apply_function_returns_non_pandas_non_scalar(function, expected_values) result = df.groupby("groups").apply(function) expected = pd.Series(expected_values, index=pd.Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) + + +def test_apply_function_returns_numpy_array(): + # GH 31605 + def fct(group): + return group["B"].values.flatten() + + df = pd.DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) + + result = df.groupby("A").apply(fct) + expected = pd.Series( + [[1.0, 2.0], [3.0], [np.nan]], index=pd.Index(["a", "b", "none"], name="A") + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 97cf1af1d2e9e..73e36cb5e6c84 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -671,7 +671,7 @@ def test_nsmallest(): tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) -@pytest.mark.parametrize("func", ["mean", "var", "std", "cumprod", "cumsum"]) +@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) def test_numpy_compat(func): # see gh-12811 df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4273139b32828..efcd22f9c0c82 100644 --- a/pandas/tests/groupby/test_grouping.py +++ 
b/pandas/tests/groupby/test_grouping.py @@ -676,6 +676,19 @@ def test_groupby_level_index_value_all_na(self): ) tm.assert_frame_equal(result, expected) + def test_groupby_multiindex_level_empty(self): + # https://github.com/pandas-dev/pandas/issues/31670 + df = pd.DataFrame( + [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"] + ) + df = df.set_index(["id", "category"]) + empty = df[df.value < 0] + result = empty.groupby("id").sum() + expected = pd.DataFrame( + dtype="float64", columns=["value"], index=pd.Int64Index([], name="id") + ) + tm.assert_frame_equal(result, expected) + # get_group # -------------------------------- diff --git a/pandas/tests/indexes/base_class/__init__.py b/pandas/tests/indexes/base_class/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py new file mode 100644 index 0000000000000..9e6a8f34c135d --- /dev/null +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -0,0 +1,36 @@ +import pytest + +from pandas import Index, MultiIndex + + +class TestIndexConstructor: + # Tests for the Index constructor, specifically for cases that do + # not return a subclass + + def test_constructor_corner(self): + # corner case + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + "kind, 0 was passed" + ) + with pytest.raises(TypeError, match=msg): + Index(0) + + @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]]) + def test_construction_list_mixed_tuples(self, index_vals): + # see gh-10697: if we are constructing from a mixed list of tuples, + # make sure that we are independent of the sorting order. + index = Index(index_vals) + assert isinstance(index, Index) + assert not isinstance(index, MultiIndex) + + def test_constructor_wrong_kwargs(self): + # GH #19348 + with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"): + Index([], foo="bar") + + @pytest.mark.xfail(reason="see GH#21311: Index doesn't enforce dtype argument") + def test_constructor_cast(self): + msg = "could not convert string to float" + with pytest.raises(ValueError, match=msg): + Index(["a", "b", "c"], dtype=float) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py new file mode 100644 index 0000000000000..e7d5e21d0ba47 --- /dev/null +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +from pandas import Index, Series +import pandas._testing as tm +from pandas.core.algorithms import safe_sort + + +class TestIndexSetOps: + def test_union_base(self): + index = Index([0, "a", 1, "b", 2, "c"]) + first = index[3:] + second = index[:5] + + result = first.union(second) + + expected = Index([0, 1, 2, "a", "b", "c"]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("klass", [np.array, Series, list]) + def test_union_different_type_base(self, klass): + # GH 10149 + index = Index([0, "a", 1, "b", 2, "c"]) + first = index[3:] + second = index[:5] + + result = first.union(klass(second.values)) + + assert tm.equalContents(result, index) + + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_base(self, sort): + # (same results for py2 and py3 but sortedness not tested elsewhere) + index = Index([0, "a", 1, "b", 2, "c"]) + first = index[:5] + second = index[:3] + + expected = Index([0, 1, "a"]) if sort is None else Index([0, "a", 1]) + result = 
first.intersection(second, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("klass", [np.array, Series, list]) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_different_type_base(self, klass, sort): + # GH 10149 + index = Index([0, "a", 1, "b", 2, "c"]) + first = index[:5] + second = index[:3] + + result = first.intersection(klass(second.values), sort=sort) + assert tm.equalContents(result, second) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_base(self, sort): + # (same results for py2 and py3 but sortedness not tested elsewhere) + index = Index([0, "a", 1, "b", 2, "c"]) + first = index[:4] + second = index[3:] + + result = first.difference(second, sort) + expected = Index([0, "a", 1]) + if sort is None: + expected = Index(safe_sort(expected)) + tm.assert_index_equal(result, expected) + + def test_symmetric_difference(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + index = Index([0, "a", 1, "b", 2, "c"]) + first = index[:4] + second = index[3:] + + result = first.symmetric_difference(second) + expected = Index([0, 1, 2, "a", "c"]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index d870259c2539b..c18cd1f252c83 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -146,76 +146,6 @@ def test_contains_list(self): with pytest.raises(TypeError, match="unhashable type"): ["a", "b"] in idx - def test_map(self): - ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) - result = ci.map(lambda x: x.lower()) - exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) - tm.assert_index_equal(result, exp) - - ci = pd.CategoricalIndex( - list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" - ) - result = ci.map(lambda x: x.lower()) - exp = pd.CategoricalIndex( - list("ababc"), categories=list("bac"), ordered=False, name="XXX" - ) - tm.assert_index_equal(result, exp) - - # GH 12766: Return an index not an array - tm.assert_index_equal( - ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX") - ) - - # change categories dtype - ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) - - def f(x): - return {"A": 10, "B": 20, "C": 30}.get(x) - - result = ci.map(f) - exp = pd.CategoricalIndex( - [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False - ) - tm.assert_index_equal(result, exp) - - result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) - tm.assert_index_equal(result, exp) - - result = ci.map({"A": 10, "B": 20, "C": 30}) - tm.assert_index_equal(result, exp) - - def test_map_with_categorical_series(self): - # GH 12756 - a = pd.Index([1, 2, 3, 4]) - b = pd.Series(["even", "odd", "even", "odd"], dtype="category") - c = pd.Series(["even", "odd", "even", "odd"]) - - exp = CategoricalIndex(["odd", "even", "odd", np.nan]) - tm.assert_index_equal(a.map(b), exp) - exp = pd.Index(["odd", "even", "odd", np.nan]) - tm.assert_index_equal(a.map(c), exp) - - @pytest.mark.parametrize( - ("data", "f"), - ( - ([1, 1, np.nan], pd.isna), - ([1, 2, np.nan], pd.isna), - ([1, 1, np.nan], {1: False}), - ([1, 2, np.nan], {1: False, 2: False}), - ([1, 1, np.nan], pd.Series([False, False])), - ([1, 2, np.nan], pd.Series([False, False, False])), - ), - ) - def test_map_with_nan(self, data, f): # GH 24241 - values = 
pd.Categorical(data) - result = values.map(f) - if data[1] == 1: - expected = pd.Categorical([False, False, np.nan]) - tm.assert_categorical_equal(result, expected) - else: - expected = pd.Index([False, False, np.nan]) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) def test_where(self, klass): i = self.create_index() @@ -384,89 +314,6 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): expected = index tm.assert_index_equal(result, expected) - def test_reindex_base(self): - # Determined by cat ordering. - idx = CategoricalIndex(list("cab"), categories=list("cab")) - expected = np.arange(len(idx), dtype=np.intp) - - actual = idx.get_indexer(idx) - tm.assert_numpy_array_equal(expected, actual) - - with pytest.raises(ValueError, match="Invalid fill method"): - idx.get_indexer(idx, method="invalid") - - def test_reindexing(self): - np.random.seed(123456789) - - ci = self.create_index() - oidx = Index(np.array(ci)) - - for n in [1, 2, 5, len(ci)]: - finder = oidx[np.random.randint(0, len(ci), size=n)] - expected = oidx.get_indexer_non_unique(finder)[0] - - actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal(expected, actual) - - # see gh-17323 - # - # Even when indexer is equal to the - # members in the index, we should - # respect duplicates instead of taking - # the fast-track path. - for finder in [list("aabbca"), list("aababca")]: - expected = oidx.get_indexer_non_unique(finder)[0] - - actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal(expected, actual) - - def test_reindex_dtype(self): - c = CategoricalIndex(["a", "b", "c", "a"]) - res, indexer = c.reindex(["a", "c"]) - tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - - c = CategoricalIndex(["a", "b", "c", "a"]) - res, indexer = c.reindex(Categorical(["a", "c"])) - - exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) - tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - res, indexer = c.reindex(["a", "c"]) - exp = Index(["a", "a", "c"], dtype="object") - tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) - tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - - def test_reindex_duplicate_target(self): - # See GH25459 - cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) - res, indexer = cat.reindex(["a", "c", "c"]) - exp = Index(["a", "c", "c"], dtype="object") - tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) - - res, indexer = cat.reindex( - CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) - ) - exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) - tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) - - def test_reindex_empty_index(self): - # See GH16770 - c = CategoricalIndex([]) - res, indexer = c.reindex(["a", "b"]) - tm.assert_index_equal(res, Index(["a", "b"]), 
exact=True) - tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) - @pytest.mark.parametrize( "data, non_lexsorted_data", [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], @@ -518,75 +365,6 @@ def test_drop_duplicates(self): tm.assert_index_equal(idx.drop_duplicates(), expected) tm.assert_index_equal(idx.unique(), expected) - def test_get_indexer(self): - - idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) - idx2 = CategoricalIndex(list("abf")) - - for indexer in [idx2, list("abf"), Index(list("abf"))]: - r1 = idx1.get_indexer(idx2) - tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) - - msg = ( - "method='pad' and method='backfill' not implemented yet for " - "CategoricalIndex" - ) - with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method="pad") - with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method="backfill") - - msg = "method='nearest' not implemented yet for CategoricalIndex" - with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method="nearest") - - def test_get_loc(self): - # GH 12531 - cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) - idx1 = Index(list("abcde")) - assert cidx1.get_loc("a") == idx1.get_loc("a") - assert cidx1.get_loc("e") == idx1.get_loc("e") - - for i in [cidx1, idx1]: - with pytest.raises(KeyError, match="'NOT-EXIST'"): - i.get_loc("NOT-EXIST") - - # non-unique - cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) - idx2 = Index(list("aacded")) - - # results in bool array - res = cidx2.get_loc("d") - tm.assert_numpy_array_equal(res, idx2.get_loc("d")) - tm.assert_numpy_array_equal( - res, np.array([False, False, False, True, False, True]) - ) - # unique element results in scalar - res = cidx2.get_loc("e") - assert res == idx2.get_loc("e") - assert res == 4 - - for i in [cidx2, idx2]: - with pytest.raises(KeyError, match="'NOT-EXIST'"): - i.get_loc("NOT-EXIST") - - # non-unique, sliceable - cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) - idx3 = Index(list("aabbb")) - - # results in slice - res = cidx3.get_loc("a") - assert res == idx3.get_loc("a") - assert res == slice(0, 2, None) - - res = cidx3.get_loc("b") - assert res == idx3.get_loc("b") - assert res == slice(2, 5, None) - - for i in [cidx3, idx3]: - with pytest.raises(KeyError, match="'c'"): - i.get_loc("c") - def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) @@ -837,122 +615,6 @@ def test_fillna_categorical(self): with pytest.raises(ValueError, match=msg): idx.fillna(2.0) - def test_take_fill_value(self): - # GH 12631 - - # numeric category - idx = pd.CategoricalIndex([1, 2, 3], name="xxx") - result = idx.take(np.array([1, 0, -1])) - expected = pd.CategoricalIndex([2, 1, 3], name="xxx") - tm.assert_index_equal(result, expected) - tm.assert_categorical_equal(result.values, expected.values) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") - tm.assert_index_equal(result, expected) - tm.assert_categorical_equal(result.values, expected.values) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = pd.CategoricalIndex([2, 1, 3], name="xxx") - tm.assert_index_equal(result, expected) - tm.assert_categorical_equal(result.values, expected.values) - - # object category - idx = pd.CategoricalIndex( - 
list("CBA"), categories=list("ABC"), ordered=True, name="xxx" - ) - result = idx.take(np.array([1, 0, -1])) - expected = pd.CategoricalIndex( - list("BCA"), categories=list("ABC"), ordered=True, name="xxx" - ) - tm.assert_index_equal(result, expected) - tm.assert_categorical_equal(result.values, expected.values) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.CategoricalIndex( - ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" - ) - tm.assert_index_equal(result, expected) - tm.assert_categorical_equal(result.values, expected.values) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = pd.CategoricalIndex( - list("BCA"), categories=list("ABC"), ordered=True, name="xxx" - ) - tm.assert_index_equal(result, expected) - tm.assert_categorical_equal(result.values, expected.values) - - msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" - ) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - - def test_take_fill_value_datetime(self): - - # datetime category - idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") - idx = pd.CategoricalIndex(idx) - result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex( - ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" - ) - expected = pd.CategoricalIndex(expected) - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") - exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) - expected = pd.CategoricalIndex(expected, categories=exp_cats) - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = pd.DatetimeIndex( - ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" - ) - expected = pd.CategoricalIndex(expected) - tm.assert_index_equal(result, expected) - - msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" - ) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - - def test_take_invalid_kwargs(self): - idx = pd.CategoricalIndex([1, 2, 3], name="foo") - indices = [1, 0, -1] - - msg = r"take\(\) got an unexpected keyword argument 'foo'" - with pytest.raises(TypeError, match=msg): - idx.take(indices, foo=2) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - idx.take(indices, out=indices) - - msg = "the 'mode' parameter is not supported" - with pytest.raises(ValueError, match=msg): - idx.take(indices, mode="clip") - @pytest.mark.parametrize( "dtype, engine_type", [ @@ -976,19 +638,10 @@ def test_engine_type(self, dtype, engine_type): assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) - @pytest.mark.parametrize( - "data, categories", - [ - (list("abcbca"), list("cab")), - (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)), - ], - ids=["string", "interval"], - ) - def 
test_map_str(self, data, categories, ordered_fixture): - # GH 31202 - override base class since we want to maintain categorical/ordered - index = CategoricalIndex(data, categories=categories, ordered=ordered_fixture) - result = index.map(str) - expected = CategoricalIndex( - map(str, data), categories=map(str, categories), ordered=ordered_fixture - ) - tm.assert_index_equal(result, expected) + def test_reindex_base(self): + # See test_reindex.py + pass + + def test_map_str(self): + # See test_map.py + pass diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py new file mode 100644 index 0000000000000..507e38d9acac2 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -0,0 +1,250 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import CategoricalIndex, Index +import pandas._testing as tm + + +class TestTake: + def test_take_fill_value(self): + # GH 12631 + + # numeric category + idx = pd.CategoricalIndex([1, 2, 3], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # object category + idx = pd.CategoricalIndex( + list("CBA"), categories=list("ABC"), ordered=True, name="xxx" + ) + result = idx.take(np.array([1, 0, -1])) + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.CategoricalIndex( + ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_fill_value_datetime(self): + + # datetime category + idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") + idx = pd.CategoricalIndex(idx) + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) + expected = pd.CategoricalIndex(expected) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), 
fill_value=True) + expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") + exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) + expected = pd.CategoricalIndex(expected, categories=exp_cats) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) + expected = pd.CategoricalIndex(expected) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_invalid_kwargs(self): + idx = pd.CategoricalIndex([1, 2, 3], name="foo") + indices = [1, 0, -1] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + +class TestGetLoc: + def test_get_loc(self): + # GH 12531 + cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) + idx1 = Index(list("abcde")) + assert cidx1.get_loc("a") == idx1.get_loc("a") + assert cidx1.get_loc("e") == idx1.get_loc("e") + + for i in [cidx1, idx1]: + with pytest.raises(KeyError, match="'NOT-EXIST'"): + i.get_loc("NOT-EXIST") + + # non-unique + cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) + idx2 = Index(list("aacded")) + + # results in bool array + res = cidx2.get_loc("d") + tm.assert_numpy_array_equal(res, idx2.get_loc("d")) + tm.assert_numpy_array_equal( + res, np.array([False, False, False, True, False, True]) + ) + # unique element results in scalar + res = cidx2.get_loc("e") + assert res == idx2.get_loc("e") + assert res == 4 + + for i in [cidx2, idx2]: + with pytest.raises(KeyError, match="'NOT-EXIST'"): + i.get_loc("NOT-EXIST") + + # non-unique, sliceable + cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) + idx3 = Index(list("aabbb")) + + # results in slice + res = cidx3.get_loc("a") + assert res == idx3.get_loc("a") + assert res == slice(0, 2, None) + + res = cidx3.get_loc("b") + assert res == idx3.get_loc("b") + assert res == slice(2, 5, None) + + for i in [cidx3, idx3]: + with pytest.raises(KeyError, match="'c'"): + i.get_loc("c") + + def test_get_loc_unique(self): + cidx = pd.CategoricalIndex(list("abc")) + result = cidx.get_loc("b") + assert result == 1 + + def test_get_loc_monotonic_nonunique(self): + cidx = pd.CategoricalIndex(list("abbc")) + result = cidx.get_loc("b") + expected = slice(1, 3, None) + assert result == expected + + def test_get_loc_nonmonotonic_nonunique(self): + cidx = pd.CategoricalIndex(list("abcb")) + result = cidx.get_loc("b") + expected = np.array([False, True, False, True], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + +class TestGetIndexer: + def test_get_indexer_base(self): + # Determined by cat ordering. 
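# ----------------------------------------------------------------------
# [Editor's aside -- not part of the patch. The TestGetLoc cases above
# encode the return-type contract CategoricalIndex.get_loc shares with a
# plain Index: an integer position for a unique label, a slice for
# monotonic duplicates, and a boolean mask for non-monotonic duplicates.
# A condensed, self-contained restatement:]
import numpy as np
import pandas as pd

assert pd.CategoricalIndex(list("abc")).get_loc("b") == 1
assert pd.CategoricalIndex(list("abbc")).get_loc("b") == slice(1, 3, None)
mask = pd.CategoricalIndex(list("abcb")).get_loc("b")
assert (mask == np.array([False, True, False, True])).all()
# ----------------------------------------------------------------------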
+ idx = CategoricalIndex(list("cab"), categories=list("cab")) + expected = np.arange(len(idx), dtype=np.intp) + + actual = idx.get_indexer(idx) + tm.assert_numpy_array_equal(expected, actual) + + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") + + def test_get_indexer_non_unique(self): + np.random.seed(123456789) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + oidx = Index(np.array(ci)) + + for n in [1, 2, 5, len(ci)]: + finder = oidx[np.random.randint(0, len(ci), size=n)] + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + tm.assert_numpy_array_equal(expected, actual) + + # see gh-17323 + # + # Even when indexer is equal to the + # members in the index, we should + # respect duplicates instead of taking + # the fast-track path. + for finder in [list("aabbca"), list("aababca")]: + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + tm.assert_numpy_array_equal(expected, actual) + + def test_get_indexer(self): + + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) + + for indexer in [idx2, list("abf"), Index(list("abf"))]: + r1 = idx1.get_indexer(idx2) + tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) + + msg = ( + "method='pad' and method='backfill' not implemented yet for " + "CategoricalIndex" + ) + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="pad") + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="backfill") + + msg = "method='nearest' not implemented yet for CategoricalIndex" + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="nearest") diff --git a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py new file mode 100644 index 0000000000000..943359a72e971 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_map.py @@ -0,0 +1,95 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import CategoricalIndex, Index +import pandas._testing as tm + + +class TestMap: + @pytest.mark.parametrize( + "data, categories", + [ + (list("abcbca"), list("cab")), + (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)), + ], + ids=["string", "interval"], + ) + def test_map_str(self, data, categories, ordered_fixture): + # GH 31202 - override base class since we want to maintain categorical/ordered + index = CategoricalIndex(data, categories=categories, ordered=ordered_fixture) + result = index.map(str) + expected = CategoricalIndex( + map(str, data), categories=map(str, categories), ordered=ordered_fixture + ) + tm.assert_index_equal(result, expected) + + def test_map(self): + ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) + result = ci.map(lambda x: x.lower()) + exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) + tm.assert_index_equal(result, exp) + + ci = pd.CategoricalIndex( + list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" + ) + result = ci.map(lambda x: x.lower()) + exp = pd.CategoricalIndex( + list("ababc"), categories=list("bac"), ordered=False, name="XXX" + ) + tm.assert_index_equal(result, exp) + + # GH 12766: Return an index not an array + tm.assert_index_equal( + ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX") + ) + + # change categories dtype + ci = 
pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) + + def f(x): + return {"A": 10, "B": 20, "C": 30}.get(x) + + result = ci.map(f) + exp = pd.CategoricalIndex( + [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False + ) + tm.assert_index_equal(result, exp) + + result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) + tm.assert_index_equal(result, exp) + + result = ci.map({"A": 10, "B": 20, "C": 30}) + tm.assert_index_equal(result, exp) + + def test_map_with_categorical_series(self): + # GH 12756 + a = pd.Index([1, 2, 3, 4]) + b = pd.Series(["even", "odd", "even", "odd"], dtype="category") + c = pd.Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(b), exp) + exp = pd.Index(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(c), exp) + + @pytest.mark.parametrize( + ("data", "f"), + ( + ([1, 1, np.nan], pd.isna), + ([1, 2, np.nan], pd.isna), + ([1, 1, np.nan], {1: False}), + ([1, 2, np.nan], {1: False, 2: False}), + ([1, 1, np.nan], pd.Series([False, False])), + ([1, 2, np.nan], pd.Series([False, False, False])), + ), + ) + def test_map_with_nan(self, data, f): # GH 24241 + values = pd.Categorical(data) + result = values.map(f) + if data[1] == 1: + expected = pd.Categorical([False, False, np.nan]) + tm.assert_categorical_equal(result, expected) + else: + expected = pd.Index([False, False, np.nan]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py new file mode 100644 index 0000000000000..f59ddc42ce4e4 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_reindex.py @@ -0,0 +1,53 @@ +import numpy as np + +from pandas import Categorical, CategoricalIndex, Index +import pandas._testing as tm + + +class TestReindex: + def test_reindex_dtype(self): + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(["a", "c"]) + tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(Categorical(["a", "c"])) + + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(["a", "c"]) + exp = Index(["a", "a", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(Categorical(["a", "c"])) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + def test_reindex_duplicate_target(self): + # See GH25459 + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) + exp = Index(["a", "c", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + res, indexer = cat.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + 
tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + def test_reindex_empty_index(self): + # See GH16770 + c = CategoricalIndex([]) + res, indexer = c.reindex(["a", "b"]) + tm.assert_index_equal(res, Index(["a", "b"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 26d120619defc..da27057a783ab 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -167,6 +167,10 @@ def test_create_index_existing_name(self): def test_numeric_compat(self): idx = self.create_index() + # Check that this doesn't cover MultiIndex case, if/when it does, + # we can remove multi.test_compat.test_numeric_compat + assert not isinstance(idx, MultiIndex) + with pytest.raises(TypeError, match="cannot perform __mul__"): idx * 1 with pytest.raises(TypeError, match="cannot perform __rmul__"): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 68285d41bda70..1d1d371fcec1e 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -951,16 +951,11 @@ def test_datetimeindex_constructor_misc(self): assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq + def test_pass_datetimeindex_to_index(self): + # Bugs in #1396 + rng = date_range("1/1/2000", "3/1/2000") + idx = Index(rng, dtype=object) -def test_timedelta_constructor_identity(): - # Test for #30543 - expected = pd.Timedelta(np.timedelta64(1, "s")) - result = pd.Timedelta(expected) - assert result is expected + expected = Index(rng.to_pydatetime(), dtype=object) - -def test_timestamp_constructor_identity(): - # Test for #30543 - expected = pd.Timestamp("2017-01-01T12") - result = pd.Timestamp(expected) - assert result is expected + tm.assert_numpy_array_equal(idx.values, expected.values) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 2f954117f48d7..c358e72538788 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -344,6 +344,115 @@ def test_take_fill_value_with_timezone(self): idx.take(np.array([1, -5])) +class TestGetLoc: + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) + def test_get_loc_method_exact_match(self, method): + idx = pd.date_range("2000-01-01", periods=3) + assert idx.get_loc(idx[1], method) == 1 + assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 + + if method is not None: + assert idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1 + + def test_get_loc(self): + idx = pd.date_range("2000-01-01", periods=3) + + assert idx.get_loc("2000-01-01", method="nearest") == 0 + assert idx.get_loc("2000-01-01T12", method="nearest") == 1 + + assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo") + with pytest.raises(KeyError, 
match="'2000-01-01T03'"): + idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours") + with pytest.raises( + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-01", + method="nearest", + tolerance=[ + pd.Timedelta("1day").to_timedelta64(), + pd.Timedelta("1day").to_timedelta64(), + ], + ) + + assert idx.get_loc("2000", method="nearest") == slice(0, 3) + assert idx.get_loc("2000-01", method="nearest") == slice(0, 3) + + assert idx.get_loc("1999", method="nearest") == 0 + assert idx.get_loc("2001", method="nearest") == 2 + + with pytest.raises(KeyError, match="'1999'"): + idx.get_loc("1999", method="pad") + with pytest.raises(KeyError, match="'2001'"): + idx.get_loc("2001", method="backfill") + + with pytest.raises(KeyError, match="'foobar'"): + idx.get_loc("foobar") + with pytest.raises(InvalidIndexError, match=r"slice\(None, 2, None\)"): + idx.get_loc(slice(2)) + + idx = pd.to_datetime(["2000-01-01", "2000-01-04"]) + assert idx.get_loc("2000-01-02", method="nearest") == 0 + assert idx.get_loc("2000-01-03", method="nearest") == 1 + assert idx.get_loc("2000-01", method="nearest") == slice(0, 2) + + # time indexing + idx = pd.date_range("2000-01-01", periods=24, freq="H") + tm.assert_numpy_array_equal( + idx.get_loc(time(12)), np.array([12]), check_dtype=False + ) + tm.assert_numpy_array_equal( + idx.get_loc(time(12, 30)), np.array([]), check_dtype=False + ) + with pytest.raises(NotImplementedError): + idx.get_loc(time(12, 30), method="pad") + + def test_get_loc_nat(self): + # GH#20464 + index = DatetimeIndex(["1/3/2000", "NaT"]) + assert index.get_loc(pd.NaT) == 1 + + assert index.get_loc(None) == 1 + + assert index.get_loc(np.nan) == 1 + + assert index.get_loc(pd.NA) == 1 + + assert index.get_loc(np.datetime64("NaT")) == 1 + + with pytest.raises(KeyError, match="NaT"): + index.get_loc(np.timedelta64("NaT")) + + @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)]) + def test_get_loc_timedelta_invalid_key(self, key): + # GH#20464 + dti = pd.date_range("1970-01-01", periods=10) + with pytest.raises(TypeError): + dti.get_loc(key) + + def test_get_loc_reasonable_key_error(self): + # GH#1062 + index = DatetimeIndex(["1/3/2000"]) + with pytest.raises(KeyError, match="2000"): + index.get_loc("1/1/2000") + + class TestDatetimeIndex: @pytest.mark.parametrize( "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA] @@ -639,84 +748,6 @@ def test_get_value(self): result = dti.get_value(ser, key.to_datetime64()) assert result == 7 - def test_get_loc(self): - idx = pd.date_range("2000-01-01", periods=3) - - for method in [None, "pad", "backfill", "nearest"]: - assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 - assert idx.get_loc(str(idx[1]), method) == 1 - - if method is not None: - assert ( - idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1 - ) - - assert idx.get_loc("2000-01-01", method="nearest") == 0 - assert idx.get_loc("2000-01-01T12", method="nearest") == 1 - - assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1 - assert ( - idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D")) - == 1 - ) - assert ( - idx.get_loc( - "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D") - ) - == 1 - ) - assert ( - idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1 - ) - with pytest.raises(ValueError, match="unit abbreviation w/o a number"): - idx.get_loc("2000-01-01T12", 
method="nearest", tolerance="foo") - with pytest.raises(KeyError, match="'2000-01-01T03'"): - idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours") - with pytest.raises( - ValueError, match="tolerance size must match target index size" - ): - idx.get_loc( - "2000-01-01", - method="nearest", - tolerance=[ - pd.Timedelta("1day").to_timedelta64(), - pd.Timedelta("1day").to_timedelta64(), - ], - ) - - assert idx.get_loc("2000", method="nearest") == slice(0, 3) - assert idx.get_loc("2000-01", method="nearest") == slice(0, 3) - - assert idx.get_loc("1999", method="nearest") == 0 - assert idx.get_loc("2001", method="nearest") == 2 - - with pytest.raises(KeyError, match="'1999'"): - idx.get_loc("1999", method="pad") - with pytest.raises(KeyError, match="'2001'"): - idx.get_loc("2001", method="backfill") - - with pytest.raises(KeyError, match="'foobar'"): - idx.get_loc("foobar") - with pytest.raises(InvalidIndexError, match=r"slice\(None, 2, None\)"): - idx.get_loc(slice(2)) - - idx = pd.to_datetime(["2000-01-01", "2000-01-04"]) - assert idx.get_loc("2000-01-02", method="nearest") == 0 - assert idx.get_loc("2000-01-03", method="nearest") == 1 - assert idx.get_loc("2000-01", method="nearest") == slice(0, 2) - - # time indexing - idx = pd.date_range("2000-01-01", periods=24, freq="H") - tm.assert_numpy_array_equal( - idx.get_loc(time(12)), np.array([12]), check_dtype=False - ) - tm.assert_numpy_array_equal( - idx.get_loc(time(12, 30)), np.array([]), check_dtype=False - ) - with pytest.raises(NotImplementedError): - idx.get_loc(time(12, 30), method="pad") - def test_get_indexer(self): idx = pd.date_range("2000-01-01", periods=3) exp = np.array([0, 1, 2], dtype=np.intp) @@ -756,32 +787,3 @@ def test_get_indexer(self): idx.get_indexer(target, "nearest", tolerance=tol_bad) with pytest.raises(ValueError): idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") - - def test_reasonable_key_error(self): - # GH#1062 - index = DatetimeIndex(["1/3/2000"]) - with pytest.raises(KeyError, match="2000"): - index.get_loc("1/1/2000") - - @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)]) - def test_timedelta_invalid_key(self, key): - # GH#20464 - dti = pd.date_range("1970-01-01", periods=10) - with pytest.raises(TypeError): - dti.get_loc(key) - - def test_get_loc_nat(self): - # GH#20464 - index = DatetimeIndex(["1/3/2000", "NaT"]) - assert index.get_loc(pd.NaT) == 1 - - assert index.get_loc(None) == 1 - - assert index.get_loc(np.nan) == 1 - - assert index.get_loc(pd.NA) == 1 - - assert index.get_loc(np.datetime64("NaT")) == 1 - - with pytest.raises(KeyError, match="NaT"): - index.get_loc(np.timedelta64("NaT")) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 340f53b2868bd..d0464698e3f24 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -12,15 +12,6 @@ class TestTimeSeries: - def test_pass_datetimeindex_to_index(self): - # Bugs in #1396 - rng = date_range("1/1/2000", "3/1/2000") - idx = Index(rng, dtype=object) - - expected = Index(rng.to_pydatetime(), dtype=object) - - tm.assert_numpy_array_equal(idx.values, expected.values) - def test_range_edges(self): # GH#13672 idx = pd.date_range( diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index acaea4ff96ff5..67ebfcddf6c2d 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -49,12 +49,6 @@ def index_names(): return 
["first", "second"] -@pytest.fixture -def holder(): - # the MultiIndex constructor used to base compatibility with pickle - return MultiIndex - - @pytest.fixture def compat_props(): # a MultiIndex must have these properties associated with it diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index e64511efd7ffb..a9e02934f27ab 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -146,83 +146,6 @@ def test_append_mixed_dtypes(): tm.assert_index_equal(res, exp) -def test_take(idx): - indexer = [4, 3, 0, 2] - result = idx.take(indexer) - expected = idx[indexer] - assert result.equals(expected) - - # TODO: Remove Commented Code - # if not isinstance(idx, - # (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - # GH 10791 - msg = "'MultiIndex' object has no attribute 'freq'" - with pytest.raises(AttributeError, match=msg): - idx.freq - - -def test_take_invalid_kwargs(idx): - idx = idx - indices = [1, 2] - - msg = r"take\(\) got an unexpected keyword argument 'foo'" - with pytest.raises(TypeError, match=msg): - idx.take(indices, foo=2) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - idx.take(indices, out=indices) - - msg = "the 'mode' parameter is not supported" - with pytest.raises(ValueError, match=msg): - idx.take(indices, mode="clip") - - -def test_take_fill_value(): - # GH 12631 - vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] - idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) - - result = idx.take(np.array([1, 0, -1])) - exp_vals = [ - ("A", pd.Timestamp("2011-01-02")), - ("A", pd.Timestamp("2011-01-01")), - ("B", pd.Timestamp("2011-01-02")), - ] - expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - exp_vals = [ - ("A", pd.Timestamp("2011-01-02")), - ("A", pd.Timestamp("2011-01-01")), - (np.nan, pd.NaT), - ] - expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - exp_vals = [ - ("A", pd.Timestamp("2011-01-02")), - ("A", pd.Timestamp("2011-01-01")), - ("B", pd.Timestamp("2011-01-02")), - ] - expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) - tm.assert_index_equal(result, expected) - - msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1" - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - msg = "index -5 is out of bounds for( axis 0 with)? 
size 4" - with pytest.raises(IndexError, match=msg): - idx.take(np.array([1, -5])) - - def test_iter(idx): result = list(idx) expected = [ diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index 545a7ddef29bb..9a76f0623eb31 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -112,8 +112,8 @@ def test_ndarray_compat_properties(idx, compat_props): idx.values.nbytes -def test_pickle_compat_construction(holder): +def test_pickle_compat_construction(): # this is testing for pickle compat # need an object to create with with pytest.raises(TypeError, match="Must pass both levels and codes"): - holder() + MultiIndex() diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 074072ae581b2..675a1e2e832f3 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -57,8 +57,6 @@ def test_get_value_duplicates(): ) assert index.get_loc("D") == slice(0, 3) - with pytest.raises(KeyError, match=r"^'D'$"): - index._engine.get_value(np.array([]), "D") def test_get_level_values_all_na(): @@ -159,7 +157,7 @@ def test_set_levels_codes_directly(idx): minor_codes = [(x + 1) % 1 for x in minor_codes] new_codes = [major_codes, minor_codes] - msg = "can't set attribute" + msg = "[Cc]an't set attribute" with pytest.raises(AttributeError, match=msg): idx.levels = new_levels with pytest.raises(AttributeError, match=msg): diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index b08280a712642..21a4773fa3683 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -392,7 +392,7 @@ def test_get_loc_missing_nan(): # GH 8569 idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]]) assert isinstance(idx.get_loc(1), slice) - with pytest.raises(KeyError, match=r"^3\.0$"): + with pytest.raises(KeyError, match=r"^3$"): idx.get_loc(3) with pytest.raises(KeyError, match=r"^nan$"): idx.get_loc(np.nan) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 50242c1cac549..bb40612b9a55a 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -1,3 +1,5 @@ +import random + import numpy as np import pytest @@ -9,8 +11,6 @@ def test_sortlevel(idx): - import random - tuples = list(idx) random.shuffle(tuples) diff --git a/pandas/tests/indexes/multi/test_take.py b/pandas/tests/indexes/multi/test_take.py new file mode 100644 index 0000000000000..85043ff8812af --- /dev/null +++ b/pandas/tests/indexes/multi/test_take.py @@ -0,0 +1,82 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_take(idx): + indexer = [4, 3, 0, 2] + result = idx.take(indexer) + expected = idx[indexer] + assert result.equals(expected) + + # FIXME: Remove Commented Code + # if not isinstance(idx, + # (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # GH 10791 + msg = "'MultiIndex' object has no attribute 'freq'" + with pytest.raises(AttributeError, match=msg): + idx.freq + + +def test_take_invalid_kwargs(idx): + idx = idx + indices = [1, 2] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' 
+    with pytest.raises(ValueError, match=msg):
+        idx.take(indices, mode="clip")
+
+
+def test_take_fill_value():
+    # GH 12631
+    vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]
+    idx = pd.MultiIndex.from_product(vals, names=["str", "dt"])
+
+    result = idx.take(np.array([1, 0, -1]))
+    exp_vals = [
+        ("A", pd.Timestamp("2011-01-02")),
+        ("A", pd.Timestamp("2011-01-01")),
+        ("B", pd.Timestamp("2011-01-02")),
+    ]
+    expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
+    tm.assert_index_equal(result, expected)
+
+    # fill_value
+    result = idx.take(np.array([1, 0, -1]), fill_value=True)
+    exp_vals = [
+        ("A", pd.Timestamp("2011-01-02")),
+        ("A", pd.Timestamp("2011-01-01")),
+        (np.nan, pd.NaT),
+    ]
+    expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
+    tm.assert_index_equal(result, expected)
+
+    # allow_fill=False
+    result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+    exp_vals = [
+        ("A", pd.Timestamp("2011-01-02")),
+        ("A", pd.Timestamp("2011-01-01")),
+        ("B", pd.Timestamp("2011-01-02")),
+    ]
+    expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
+    tm.assert_index_equal(result, expected)
+
+    msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1"
+    with pytest.raises(ValueError, match=msg):
+        idx.take(np.array([1, 0, -2]), fill_value=True)
+    with pytest.raises(ValueError, match=msg):
+        idx.take(np.array([1, 0, -5]), fill_value=True)
+
+    msg = "index -5 is out of bounds for( axis 0 with)? size 4"
+    with pytest.raises(IndexError, match=msg):
+        idx.take(np.array([1, -5]))
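The new test_take.py above exercises MultiIndex.take and its fill_value handling; informally, and not as part of the patch:

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_product([["A", "B"], [1, 2]])  # 4 entries

    # plain positional take: -1 means "last element", as in numpy
    print(mi.take(np.array([1, 0, -1])).tolist())

    # with a fill_value, -1 is reinterpreted as "missing" and filled with NaN
    print(mi.take(np.array([1, 0, -1]), fill_value=True).tolist())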
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index 38514594efe09..fffc4a7562306 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -486,15 +486,17 @@ def test_get_value_datetime_hourly(self, freq):
         assert ser.loc[ts2] == 7
 
     def test_get_value_integer(self):
+        msg = "index 16801 is out of bounds for axis 0 with size 3"
         dti = pd.date_range("2016-01-01", periods=3)
         pi = dti.to_period("D")
         ser = pd.Series(range(3), index=pi)
-        with pytest.raises(IndexError, match="index out of bounds"):
+        with pytest.raises(IndexError, match=msg):
             pi.get_value(ser, 16801)
 
+        msg = "index 46 is out of bounds for axis 0 with size 3"
         pi2 = dti.to_period("Y")  # duplicates, ordinals are all 46
         ser2 = pd.Series(range(3), index=pi2)
-        with pytest.raises(IndexError, match="index out of bounds"):
+        with pytest.raises(IndexError, match=msg):
             pi2.get_value(ser2, 46)
 
     def test_is_monotonic_increasing(self):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index e72963de09ab4..04af9b09bbf89 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -34,7 +34,6 @@
     period_range,
 )
 import pandas._testing as tm
-from pandas.core.algorithms import safe_sort
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
@@ -108,23 +107,6 @@ def test_constructor_copy(self, index):
         # arr = np.array(5.)
         # pytest.raises(Exception, arr.view, Index)
 
-    def test_constructor_corner(self):
-        # corner case
-        msg = (
-            r"Index\(\.\.\.\) must be called with a collection of some "
-            "kind, 0 was passed"
-        )
-        with pytest.raises(TypeError, match=msg):
-            Index(0)
-
-    @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]])
-    def test_construction_list_mixed_tuples(self, index_vals):
-        # see gh-10697: if we are constructing from a mixed list of tuples,
-        # make sure that we are independent of the sorting order.
-        index = Index(index_vals)
-        assert isinstance(index, Index)
-        assert not isinstance(index, MultiIndex)
-
     @pytest.mark.parametrize("na_value", [None, np.nan])
     @pytest.mark.parametrize("vtype", [list, tuple, iter])
     def test_construction_list_tuples_nan(self, na_value, vtype):
@@ -359,11 +341,6 @@ def test_constructor_simple_new(self, vals, dtype):
         result = index._simple_new(index.values, dtype)
         tm.assert_index_equal(result, index)
 
-    def test_constructor_wrong_kwargs(self):
-        # GH #19348
-        with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"):
-            Index([], foo="bar")
-
     @pytest.mark.parametrize(
         "vals",
         [
@@ -554,12 +531,6 @@ def test_constructor_overflow_int64(self):
         with pytest.raises(OverflowError, match=msg):
             Index([np.iinfo(np.uint64).max - 1], dtype="int64")
 
-    @pytest.mark.xfail(reason="see GH#21311: Index doesn't enforce dtype argument")
-    def test_constructor_cast(self):
-        msg = "could not convert string to float"
-        with pytest.raises(ValueError, match=msg):
-            Index(["a", "b", "c"], dtype=float)
-
     @pytest.mark.parametrize(
         "index",
         [
@@ -1047,6 +1018,32 @@ def test_setops_disallow_true(self, method):
         with pytest.raises(ValueError, match="The 'sort' keyword only takes"):
             getattr(idx1, method)(idx2, sort=True)
 
+    def test_setops_preserve_object_dtype(self):
+        idx = pd.Index([1, 2, 3], dtype=object)
+        result = idx.intersection(idx[1:])
+        expected = idx[1:]
+        tm.assert_index_equal(result, expected)
+
+        # if other is not monotonic increasing, intersection goes through
+        # a different route
+        result = idx.intersection(idx[1:][::-1])
+        tm.assert_index_equal(result, expected)
+
+        result = idx._union(idx[1:], sort=None)
+        expected = idx
+        tm.assert_index_equal(result, expected)
+
+        result = idx.union(idx[1:], sort=None)
+        tm.assert_index_equal(result, expected)
+
+        # if other is not monotonic increasing, _union goes through
+        # a different route
+        result = idx._union(idx[1:][::-1], sort=None)
+        tm.assert_index_equal(result, expected)
+
+        result = idx.union(idx[1:][::-1], sort=None)
+        tm.assert_index_equal(result, expected)
+
     def test_map_identity_mapping(self, indices):
         # GH 12766
         tm.assert_index_equal(indices, indices.map(lambda x: x))
@@ -2502,78 +2499,12 @@ def test_copy_name2(self):
         assert index3.name == "NewName"
         assert index3.names == ["NewName"]
 
-    def test_union_base(self):
-        index = self.create_index()
-        first = index[3:]
-        second = index[:5]
-
-        result = first.union(second)
-
-        expected = Index([0, 1, 2, "a", "b", "c"])
-        tm.assert_index_equal(result, expected)
-
-    @pytest.mark.parametrize("klass", [np.array, Series, list])
-    def test_union_different_type_base(self, klass):
-        # GH 10149
-        index = self.create_index()
-        first = index[3:]
-        second = index[:5]
-
-        result = first.union(klass(second.values))
-
-        assert tm.equalContents(result, index)
-
     def test_unique_na(self):
         idx = pd.Index([2, np.nan, 2, 1], name="my_index")
         expected = pd.Index([2, np.nan, 1], name="my_index")
         result = idx.unique()
         tm.assert_index_equal(result, expected)
 
@pytest.mark.parametrize("sort", [None, False]) - def test_intersection_base(self, sort): - # (same results for py2 and py3 but sortedness not tested elsewhere) - index = self.create_index() - first = index[:5] - second = index[:3] - - expected = Index([0, 1, "a"]) if sort is None else Index([0, "a", 1]) - result = first.intersection(second, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("klass", [np.array, Series, list]) - @pytest.mark.parametrize("sort", [None, False]) - def test_intersection_different_type_base(self, klass, sort): - # GH 10149 - index = self.create_index() - first = index[:5] - second = index[:3] - - result = first.intersection(klass(second.values), sort=sort) - assert tm.equalContents(result, second) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_base(self, sort): - # (same results for py2 and py3 but sortedness not tested elsewhere) - index = self.create_index() - first = index[:4] - second = index[3:] - - result = first.difference(second, sort) - expected = Index([0, "a", 1]) - if sort is None: - expected = Index(safe_sort(expected)) - tm.assert_index_equal(result, expected) - - def test_symmetric_difference(self): - # (same results for py2 and py3 but sortedness not tested elsewhere) - index = self.create_index() - first = index[:4] - second = index[3:] - - result = first.symmetric_difference(second) - expected = Index([0, 1, 2, "a", "c"]) - tm.assert_index_equal(result, expected) - def test_logical_compat(self): index = self.create_index() assert index.all() == index.values.all() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 992a91ad8a528..1b504ce99604d 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -385,7 +385,7 @@ def test_get_loc_missing_nan(self): # GH 8569 idx = Float64Index([1, 2]) assert idx.get_loc(1) == 0 - with pytest.raises(KeyError, match=r"^3\.0$"): + with pytest.raises(KeyError, match=r"^3$"): idx.get_loc(3) with pytest.raises(KeyError, match="^nan$"): idx.get_loc(np.nan) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index c15fa34283f21..7e75b5324445e 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -87,8 +87,8 @@ def test_series_getitem_returns_scalar( (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), - (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"), - (lambda s: s[len(s)], IndexError, "index out of bounds"), + (lambda s: s.__getitem__(len(s)), IndexError, "is out of bounds"), + (lambda s: s[len(s)], IndexError, "is out of bounds"), ( lambda s: s.iloc[len(s)], IndexError, diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index aebd1ad2573ed..1e641760f7e8d 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -414,6 +414,16 @@ def test_astype_assignment_with_dups(self): df["A"] = df["A"].astype(np.float64) tm.assert_index_equal(df.index, index) + def test_setitem_nonmonotonic(self): + # https://github.com/pandas-dev/pandas/issues/31449 + index = pd.MultiIndex.from_tuples( + [("a", "c"), ("b", "x"), ("a", "d")], names=["l1", "l2"] + ) + df = 
+        df.loc["a", "e"] = np.arange(99, 101, dtype="int64")
+        expected = pd.DataFrame({"e": [99, 1, 100]}, index=index)
+        tm.assert_frame_equal(df, expected)
+
 
 def test_frame_setitem_view_direct(multiindex_dataframe_random_data):
     # this works because we are modifying the underlying array
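The test added above targets GH 31449: with the fix in place, a partial-label .loc assignment on a non-monotonic MultiIndex writes to exactly the matching rows. A sketch, assuming a pandas build that includes the fix:

    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_tuples(
        [("a", "c"), ("b", "x"), ("a", "d")], names=["l1", "l2"]
    )
    df = pd.DataFrame({"e": [0, 1, 2]}, index=index)

    # rows for "a" sit at positions 0 and 2, so the index is non-monotonic
    df.loc["a", "e"] = np.arange(99, 101, dtype="int64")
    print(df["e"].tolist())  # [99, 1, 100]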
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 8c8dece53277e..da935b1c911d0 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -83,8 +83,8 @@ def test_loc_scalar(self):
             df.loc["d", "C"] = 10
 
         msg = (
-            r"cannot do label indexing on <class 'pandas\.core\.indexes\.category"
-            r"\.CategoricalIndex'> with these indexers \[1\] of <class 'int'>"
+            "cannot do label indexing on CategoricalIndex with these "
+            r"indexers \[1\] of type int"
         )
         with pytest.raises(TypeError, match=msg):
             df.loc[1]
diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py
index 5530896a90941..8bb88cd9fd63a 100644
--- a/pandas/tests/indexing/test_floats.py
+++ b/pandas/tests/indexing/test_floats.py
@@ -22,16 +22,9 @@ def check(self, result, original, indexer, getitem):
 
         tm.assert_almost_equal(result, expected)
 
-    def test_scalar_error(self):
-
-        # GH 4892
-        # float_indexers should raise exceptions
-        # on appropriate Index types & accessors
-        # this duplicates the code below
-        # but is specifically testing for the error
-        # message
-
-        for index in [
+    @pytest.mark.parametrize(
+        "index_func",
+        [
             tm.makeStringIndex,
             tm.makeUnicodeIndex,
             tm.makeCategoricalIndex,
@@ -40,22 +33,31 @@ def test_scalar_error(self):
             tm.makePeriodIndex,
             tm.makeIntIndex,
             tm.makeRangeIndex,
-        ]:
+        ],
+    )
+    def test_scalar_error(self, index_func):
 
-            i = index(5)
+        # GH 4892
+        # float_indexers should raise exceptions
+        # on appropriate Index types & accessors
+        # this duplicates the code below
+        # but is specifically testing for the error
+        # message
 
-            s = Series(np.arange(len(i)), index=i)
+        i = index_func(5)
 
-            msg = "Cannot index by location index"
-            with pytest.raises(TypeError, match=msg):
-                s.iloc[3.0]
+        s = Series(np.arange(len(i)), index=i)
 
-            msg = (
-                "cannot do positional indexing on {klass} with these "
-                r"indexers \[3\.0\] of {kind}".format(klass=type(i), kind=str(float))
-            )
-            with pytest.raises(TypeError, match=msg):
-                s.iloc[3.0] = 0
+        msg = "Cannot index by location index"
+        with pytest.raises(TypeError, match=msg):
+            s.iloc[3.0]
+
+        msg = (
+            "cannot do positional indexing on {klass} with these "
+            r"indexers \[3\.0\] of type float".format(klass=type(i).__name__)
+        )
+        with pytest.raises(TypeError, match=msg):
+            s.iloc[3.0] = 0
 
     def test_scalar_non_numeric(self):
 
@@ -90,11 +92,11 @@ def test_scalar_non_numeric(self):
             else:
                 error = TypeError
                 msg = (
-                    r"cannot do (label|index|positional) indexing "
+                    r"cannot do (label|positional) indexing "
                     r"on {klass} with these indexers \[3\.0\] of "
-                    r"{kind}|"
+                    r"type float|"
                    "Cannot index by location index with a "
-                    "non-integer key".format(klass=type(i), kind=str(float))
+                    "non-integer key".format(klass=type(i).__name__)
                 )
             with pytest.raises(error, match=msg):
                 idxr(s)[3.0]
@@ -107,13 +109,13 @@ def test_scalar_non_numeric(self):
                 "mixed",
             }:
                 error = KeyError
-                msg = r"^3$"
+                msg = r"^3\.0$"
             else:
                 error = TypeError
                 msg = (
-                    r"cannot do (label|index) indexing "
+                    r"cannot do label indexing "
                     r"on {klass} with these indexers \[3\.0\] of "
-                    r"{kind}".format(klass=type(i), kind=str(float))
+                    r"type float".format(klass=type(i).__name__)
                 )
             with pytest.raises(error, match=msg):
                 s.loc[3.0]
@@ -123,9 +125,9 @@ def test_scalar_non_numeric(self):
 
             # setting with a float fails with iloc
             msg = (
-                r"cannot do (label|index|positional) indexing "
+                r"cannot do (label|positional) indexing "
                 r"on {klass} with these indexers \[3\.0\] of "
-                r"{kind}".format(klass=type(i), kind=str(float))
+                r"type float".format(klass=type(i).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s.iloc[3.0] = 0
@@ -160,9 +162,9 @@ def test_scalar_non_numeric(self):
         s = Series(np.arange(len(i)), index=i)
         s[3]
         msg = (
-            r"cannot do (label|index) indexing "
+            r"cannot do label indexing "
            r"on {klass} with these indexers \[3\.0\] of "
-            r"{kind}".format(klass=type(i), kind=str(float))
+            r"type float".format(klass=type(i).__name__)
         )
         with pytest.raises(TypeError, match=msg):
             s[3.0]
@@ -179,15 +181,15 @@ def test_scalar_with_mixed(self):
             msg = (
                 r"cannot do label indexing "
                 r"on {klass} with these indexers \[1\.0\] of "
-                r"{kind}|"
+                r"type float|"
                 "Cannot index by location index with a non-integer key".format(
-                    klass=str(Index), kind=str(float)
+                    klass=Index.__name__
                 )
             )
             with pytest.raises(TypeError, match=msg):
                 idxr(s2)[1.0]
 
-        with pytest.raises(KeyError, match=r"^1$"):
+        with pytest.raises(KeyError, match=r"^1\.0$"):
             s2.loc[1.0]
 
         result = s2.loc["b"]
@@ -201,7 +203,7 @@ def test_scalar_with_mixed(self):
             msg = (
                 r"cannot do label indexing "
                 r"on {klass} with these indexers \[1\.0\] of "
-                r"{kind}".format(klass=str(Index), kind=str(float))
+                r"type float".format(klass=Index.__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 idxr(s3)[1.0]
@@ -213,7 +215,7 @@ def test_scalar_with_mixed(self):
         msg = "Cannot index by location index with a non-integer key"
         with pytest.raises(TypeError, match=msg):
             s3.iloc[1.0]
-        with pytest.raises(KeyError, match=r"^1$"):
+        with pytest.raises(KeyError, match=r"^1\.0$"):
             s3.loc[1.0]
 
         result = s3.loc[1.5]
@@ -315,7 +317,7 @@ def test_scalar_float(self):
         msg = (
             r"cannot do positional indexing "
             r"on {klass} with these indexers \[3\.0\] of "
-            r"{kind}".format(klass=str(Float64Index), kind=str(float))
+            r"type float".format(klass=Float64Index.__name__)
         )
         with pytest.raises(TypeError, match=msg):
             s2.iloc[3.0] = 0
@@ -344,9 +346,9 @@ def test_slice_non_numeric(self):
 
         for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:
             msg = (
-                "cannot do slice indexing "
+                "cannot do positional indexing "
                 r"on {klass} with these indexers \[(3|4)\.0\] of "
-                "{kind}".format(klass=type(index), kind=str(float))
+                "type float".format(klass=type(index).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s.iloc[l]
@@ -354,14 +356,10 @@ def test_slice_non_numeric(self):
             for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]:
 
                 msg = (
-                    "cannot do slice indexing "
+                    "cannot do (slice|positional) indexing "
                     r"on {klass} with these indexers "
                     r"\[(3|4)(\.0)?\] "
-                    r"of ({kind_float}|{kind_int})".format(
-                        klass=type(index),
-                        kind_float=str(float),
-                        kind_int=str(int),
-                    )
+                    r"of type (float|int)".format(klass=type(index).__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     idxr(s)[l]
@@ -370,23 +368,19 @@ def test_slice_non_numeric(self):
 
         for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:
             msg = (
-                "cannot do slice indexing "
+                "cannot do positional indexing "
                 r"on {klass} with these indexers \[(3|4)\.0\] of "
-                "{kind}".format(klass=type(index), kind=str(float))
+                "type float".format(klass=type(index).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s.iloc[l] = 0
 
             for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]:
                 msg = (
-                    "cannot do slice indexing "
+                    "cannot do (slice|positional) indexing "
                     r"on {klass} with these indexers "
                     r"\[(3|4)(\.0)?\] "
-                    r"of ({kind_float}|{kind_int})".format(
-                        klass=type(index),
-                        kind_float=str(float),
-                        kind_int=str(int),
-                    )
+                    r"of type (float|int)".format(klass=type(index).__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     idxr(s)[l] = 0
@@ -426,7 +420,7 @@ def test_slice_integer(self):
                 msg = (
                     "cannot do slice indexing "
                     r"on {klass} with these indexers \[(3|4)\.0\] of "
-                    "{kind}".format(klass=type(index), kind=str(float))
+                    "type float".format(klass=type(index).__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     s[l]
@@ -450,7 +444,7 @@ def test_slice_integer(self):
             msg = (
                 "cannot do slice indexing "
                 r"on {klass} with these indexers \[-6\.0\] of "
-                "{kind}".format(klass=type(index), kind=str(float))
+                "type float".format(klass=type(index).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s[slice(-6.0, 6.0)]
@@ -476,7 +470,7 @@ def test_slice_integer(self):
                 msg = (
                     "cannot do slice indexing "
                     r"on {klass} with these indexers \[(2|3)\.5\] of "
-                    "{kind}".format(klass=type(index), kind=str(float))
+                    "type float".format(klass=type(index).__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     s[l]
@@ -494,7 +488,7 @@ def test_slice_integer(self):
             msg = (
                 "cannot do slice indexing "
                 r"on {klass} with these indexers \[(3|4)\.0\] of "
-                "{kind}".format(klass=type(index), kind=str(float))
+                "type float".format(klass=type(index).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s[l] = 0
@@ -515,9 +509,9 @@ def test_integer_positional_indexing(self):
                 klass = RangeIndex
                 msg = (
-                    "cannot do slice indexing "
+                    "cannot do (slice|positional) indexing "
                     r"on {klass} with these indexers \[(2|4)\.0\] of "
-                    "{kind}".format(klass=str(klass), kind=str(float))
+                    "type float".format(klass=klass.__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     idxr(s)[l]
@@ -542,7 +536,7 @@ def f(idxr):
                 msg = (
                     "cannot do slice indexing "
                     r"on {klass} with these indexers \[(0|1)\.0\] of "
-                    "{kind}".format(klass=type(index), kind=str(float))
+                    "type float".format(klass=type(index).__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     s[l]
@@ -557,7 +551,7 @@ def f(idxr):
             msg = (
                 "cannot do slice indexing "
                 r"on {klass} with these indexers \[-10\.0\] of "
-                "{kind}".format(klass=type(index), kind=str(float))
+                "type float".format(klass=type(index).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s[slice(-10.0, 10.0)]
@@ -576,7 +570,7 @@ def f(idxr):
                 msg = (
                     "cannot do slice indexing "
                     r"on {klass} with these indexers \[0\.5\] of "
-                    "{kind}".format(klass=type(index), kind=str(float))
+                    "type float".format(klass=type(index).__name__)
                 )
                 with pytest.raises(TypeError, match=msg):
                     s[l]
@@ -593,7 +587,7 @@ def f(idxr):
             msg = (
                 "cannot do slice indexing "
                 r"on {klass} with these indexers \[(3|4)\.0\] of "
-                "{kind}".format(klass=type(index), kind=str(float))
+                "type float".format(klass=type(index).__name__)
             )
             with pytest.raises(TypeError, match=msg):
                 s[l] = 0
@@ -666,11 +660,11 @@ def test_floating_misc(self):
 
         # value not found (and no fallbacking at all)
 
         # scalar integers
-        with pytest.raises(KeyError, match=r"^4\.0$"):
+        with pytest.raises(KeyError, match=r"^4$"):
             s.loc[4]
-        with pytest.raises(KeyError, match=r"^4\.0$"):
+        with pytest.raises(KeyError, match=r"^4$"):
             s.loc[4]
-        with pytest.raises(KeyError, match=r"^4\.0$"):
+        with pytest.raises(KeyError, match=r"^4$"):
             s[4]
 
         # fancy floats/integers create the correct entry (as nan)
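All of these message rewrites orbit the same rule: .iloc is strictly positional and rejects float keys outright. As a quick aside, not part of the patch:

    import pandas as pd

    ser = pd.Series([10, 20, 30])

    print(ser.iloc[1])  # 20 -- integers are valid positional indexers
    try:
        ser.iloc[1.0]   # floats are rejected, even integer-valued ones
    except TypeError as err:
        print(err)      # "Cannot index by location index with a non-integer key"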
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index d67259e8b7d40..08ea4c1579ef8 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -15,6 +15,44 @@
 class TestiLoc(Base):
+    def test_iloc_getitem_int(self):
+        # integer
+        self.check_result(
+            "iloc",
+            2,
+            "iloc",
+            2,
+            typs=["labels", "mixed", "ts", "floats", "empty"],
+            fails=IndexError,
+        )
+
+    def test_iloc_getitem_neg_int(self):
+        # neg integer
+        self.check_result(
+            "iloc",
+            -1,
+            "iloc",
+            -1,
+            typs=["labels", "mixed", "ts", "floats", "empty"],
+            fails=IndexError,
+        )
+
+    def test_iloc_getitem_list_int(self):
+        self.check_result(
+            "iloc",
+            [0, 1, 2],
+            "iloc",
+            [0, 1, 2],
+            typs=["labels", "mixed", "ts", "floats", "empty"],
+            fails=IndexError,
+        )
+
+    # array of ints (GH5006), make sure that a single indexer is returning
+    # the correct type
+
+
+class TestiLoc2:
+    # TODO: better name, just separating out things that don't rely on base class
 
     def test_iloc_exceeds_bounds(self):
         # GH6296
@@ -135,28 +173,6 @@ def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals):
         with pytest.raises(IndexError, match=msg):
             df.iloc[index_vals, column_vals]
 
-    def test_iloc_getitem_int(self):
-        # integer
-        self.check_result(
-            "iloc",
-            2,
-            "iloc",
-            2,
-            typs=["labels", "mixed", "ts", "floats", "empty"],
-            fails=IndexError,
-        )
-
-    def test_iloc_getitem_neg_int(self):
-        # neg integer
-        self.check_result(
-            "iloc",
-            -1,
-            "iloc",
-            -1,
-            typs=["labels", "mixed", "ts", "floats", "empty"],
-            fails=IndexError,
-        )
-
     @pytest.mark.parametrize("dims", [1, 2])
     def test_iloc_getitem_invalid_scalar(self, dims):
         # GH 21982
@@ -183,19 +199,6 @@ def test_iloc_array_not_mutating_negative_indices(self):
             df.iloc[:, array_with_neg_numbers]
         tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy)
 
-    def test_iloc_getitem_list_int(self):
-        self.check_result(
-            "iloc",
-            [0, 1, 2],
-            "iloc",
-            [0, 1, 2],
-            typs=["labels", "mixed", "ts", "floats", "empty"],
-            fails=IndexError,
-        )
-
-    # array of ints (GH5006), make sure that a single indexer is returning
-    # the correct type
-
     def test_iloc_getitem_neg_int_can_reach_first_index(self):
         # GH10547 and GH10779
         # negative integers should be able to reach index 0
@@ -286,7 +289,9 @@ def test_iloc_getitem_slice_dups(self):
         tm.assert_frame_equal(df.iloc[10:, 2:], df1)
 
     def test_iloc_setitem(self):
-        df = self.frame_ints
+        df = DataFrame(
+            np.random.randn(4, 4), index=np.arange(0, 8, 2), columns=np.arange(0, 12, 3)
+        )
 
         df.iloc[1, 1] = 1
 
         result = df.iloc[1, 1]
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 1913caae93932..98940b64330b4 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -17,13 +17,13 @@
 from pandas.core.generic import NDFrame
 from pandas.core.indexers import validate_indices
 from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice
-from pandas.tests.indexing.common import Base, _mklbl
+from pandas.tests.indexing.common import _mklbl
 
 # ------------------------------------------------------------------------
 # Indexing test cases
 
 
-class TestFancy(Base):
+class TestFancy:
     """ pure get/set item & fancy indexing """
 
     def test_setitem_ndarray_1d(self):
@@ -137,7 +137,7 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id):
             r"Buffer has wrong number of dimensions \(expected 1, "
             r"got 3\)|"
             "'pandas._libs.interval.IntervalTree' object has no attribute "
-            "'set_value'|"  # AttributeError
+            "'get_loc'|"  # AttributeError
             "unhashable type: 'numpy.ndarray'|"  # TypeError
             "No matching signature found|"  # TypeError
             r"^\[\[\[|"  # pandas.core.indexing.IndexingError
@@ -750,7 +750,7 @@ def test_index_type_coercion(self):
                 assert s2.index.is_object()
 
 
-class TestMisc(Base):
+class TestMisc:
     def test_float_index_to_mixed(self):
         df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
         df["a"] = 10
@@ -875,21 +875,21 @@ def test_indexing_dtypes_on_empty(self):
         assert df2.loc[:, "a"].dtype == np.int64
         tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0])
 
-    def test_range_in_series_indexing(self):
+    @pytest.mark.parametrize("size", [5, 999999, 1000000])
+    def test_range_in_series_indexing(self, size):
         # range can cause an indexing error
         # GH 11652
-        for x in [5, 999999, 1000000]:
-            s = Series(index=range(x), dtype=np.float64)
-            s.loc[range(1)] = 42
-            tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
+        s = Series(index=range(size), dtype=np.float64)
+        s.loc[range(1)] = 42
+        tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
 
-            s.loc[range(2)] = 43
-            tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
+        s.loc[range(2)] = 43
+        tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
 
-    def test_non_reducing_slice(self):
-        df = DataFrame([[0, 1], [2, 3]])
-
-        slices = [
+    @pytest.mark.parametrize(
+        "slc",
+        [
+            # FIXME: don't leave commented-out
             # pd.IndexSlice[:, :],
             pd.IndexSlice[:, 1],
             pd.IndexSlice[1, :],
@@ -902,10 +902,13 @@ def test_non_reducing_slice(self):
             [0, 1],
             np.array([0, 1]),
             Series([0, 1]),
-        ]
-        for slice_ in slices:
-            tslice_ = _non_reducing_slice(slice_)
-            assert isinstance(df.loc[tslice_], DataFrame)
+        ],
+    )
+    def test_non_reducing_slice(self, slc):
+        df = DataFrame([[0, 1], [2, 3]])
+
+        tslice_ = _non_reducing_slice(slc)
+        assert isinstance(df.loc[tslice_], DataFrame)
 
     def test_list_slice(self):
         # like dataframe getitem
@@ -965,37 +968,37 @@ class TestSeriesNoneCoercion:
         (["foo", "bar", "baz"], [None, "bar", "baz"]),
     ]
 
-    def test_coercion_with_setitem(self):
-        for start_data, expected_result in self.EXPECTED_RESULTS:
-            start_series = Series(start_data)
-            start_series[0] = None
+    @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+    def test_coercion_with_setitem(self, start_data, expected_result):
+        start_series = Series(start_data)
+        start_series[0] = None
 
-            expected_series = Series(expected_result)
-            tm.assert_series_equal(start_series, expected_series)
+        expected_series = Series(expected_result)
+        tm.assert_series_equal(start_series, expected_series)
 
-    def test_coercion_with_loc_setitem(self):
-        for start_data, expected_result in self.EXPECTED_RESULTS:
-            start_series = Series(start_data)
-            start_series.loc[0] = None
+    @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+    def test_coercion_with_loc_setitem(self, start_data, expected_result):
+        start_series = Series(start_data)
+        start_series.loc[0] = None
 
-            expected_series = Series(expected_result)
-            tm.assert_series_equal(start_series, expected_series)
+        expected_series = Series(expected_result)
+        tm.assert_series_equal(start_series, expected_series)
 
-    def test_coercion_with_setitem_and_series(self):
-        for start_data, expected_result in self.EXPECTED_RESULTS:
-            start_series = Series(start_data)
-            start_series[start_series == start_series[0]] = None
+    @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+    def test_coercion_with_setitem_and_series(self, start_data, expected_result):
+        start_series = Series(start_data)
+        start_series[start_series == start_series[0]] = None
 
-            expected_series = Series(expected_result)
-            tm.assert_series_equal(start_series, expected_series)
+        expected_series = Series(expected_result)
+        tm.assert_series_equal(start_series, expected_series)
 
-    def test_coercion_with_loc_and_series(self):
-        for start_data, expected_result in self.EXPECTED_RESULTS:
-            start_series = Series(start_data)
-            start_series.loc[start_series == start_series[0]] = None
+    @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+    def test_coercion_with_loc_and_series(self, start_data, expected_result):
+        start_series = Series(start_data)
+        start_series.loc[start_series == start_series[0]] = None
 
-            expected_series = Series(expected_result)
-            tm.assert_series_equal(start_series, expected_series)
+        expected_series = Series(expected_result)
+        tm.assert_series_equal(start_series, expected_series)
 
 
 class TestDataframeNoneCoercion:
@@ -1012,31 +1015,35 @@ class TestDataframeNoneCoercion:
         (["foo", "bar", "baz"], [None, "bar", "baz"]),
     ]
 
-    def test_coercion_with_loc(self):
-        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
-            start_dataframe = DataFrame({"foo": start_data})
-            start_dataframe.loc[0, ["foo"]] = None
+    @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
+    def test_coercion_with_loc(self, expected):
+        start_data, expected_result = expected
+
+        start_dataframe = DataFrame({"foo": start_data})
+        start_dataframe.loc[0, ["foo"]] = None
+
+        expected_dataframe = DataFrame({"foo": expected_result})
+        tm.assert_frame_equal(start_dataframe, expected_dataframe)
+
+    @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
+    def test_coercion_with_setitem_and_dataframe(self, expected):
+        start_data, expected_result = expected
 
-            expected_dataframe = DataFrame({"foo": expected_result})
-            tm.assert_frame_equal(start_dataframe, expected_dataframe)
+        start_dataframe = DataFrame({"foo": start_data})
+        start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
 
-    def test_coercion_with_setitem_and_dataframe(self):
-        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
-            start_dataframe = DataFrame({"foo": start_data})
-            start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
+        expected_dataframe = DataFrame({"foo": expected_result})
+        tm.assert_frame_equal(start_dataframe, expected_dataframe)
 
-            expected_dataframe = DataFrame({"foo": expected_result})
-            tm.assert_frame_equal(start_dataframe, expected_dataframe)
+    @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
+    def test_none_coercion_loc_and_dataframe(self, expected):
+        start_data, expected_result = expected
 
-    def test_none_coercion_loc_and_dataframe(self):
-        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
-            start_dataframe = DataFrame({"foo": start_data})
-            start_dataframe.loc[
-                start_dataframe["foo"] == start_dataframe["foo"][0]
-            ] = None
+        start_dataframe = DataFrame({"foo": start_data})
+        start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
 
-            expected_dataframe = DataFrame({"foo": expected_result})
-            tm.assert_frame_equal(start_dataframe, expected_dataframe)
+        expected_dataframe = DataFrame({"foo": expected_result})
+        tm.assert_frame_equal(start_dataframe, expected_dataframe)
 
     def test_none_coercion_mixed_dtypes(self):
         start_dataframe = DataFrame(
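The loop-to-parametrize conversions above follow one pattern: hoist the for-loop's iterable into @pytest.mark.parametrize so each case is collected, passed, or failed on its own. A minimal before/after sketch with hypothetical test names:

    import pytest

    # before: one test body looping over cases; the first failure hides the rest
    def test_sizes_loop():
        for size in [5, 999999, 1000000]:
            assert size > 0

    # after: three independently collected and reported tests
    @pytest.mark.parametrize("size", [5, 999999, 1000000])
    def test_sizes(size):
        assert size > 0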
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index b9dc96adfa738..3a726fb9923ee 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -13,85 +13,6 @@
 class TestLoc(Base):
-    def test_loc_getitem_dups(self):
-        # GH 5678
-        # repeated getitems on a dup index returning a ndarray
-        df = DataFrame(
-            np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)]
-        )
-        expected = df.loc["A", 0]
-        result = df.loc[:, 0].loc["A"]
-        tm.assert_series_equal(result, expected)
-
-    def test_loc_getitem_dups2(self):
-
-        # GH4726
-        # dup indexing with iloc/loc
-        df = DataFrame(
-            [[1, 2, "foo", "bar", Timestamp("20130101")]],
-            columns=["a", "a", "a", "a", "a"],
-            index=[1],
-        )
-        expected = Series(
-            [1, 2, "foo", "bar", Timestamp("20130101")],
-            index=["a", "a", "a", "a", "a"],
-            name=1,
-        )
-
-        result = df.iloc[0]
-        tm.assert_series_equal(result, expected)
-
-        result = df.loc[1]
-        tm.assert_series_equal(result, expected)
-
-    def test_loc_setitem_dups(self):
-
-        # GH 6541
-        df_orig = DataFrame(
-            {
-                "me": list("rttti"),
-                "foo": list("aaade"),
-                "bar": np.arange(5, dtype="float64") * 1.34 + 2,
-                "bar2": np.arange(5, dtype="float64") * -0.34 + 2,
-            }
-        ).set_index("me")
-
-        indexer = tuple(["r", ["bar", "bar2"]])
-        df = df_orig.copy()
-        df.loc[indexer] *= 2.0
-        tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
-
-        indexer = tuple(["r", "bar"])
-        df = df_orig.copy()
-        df.loc[indexer] *= 2.0
-        assert df.loc[indexer] == 2.0 * df_orig.loc[indexer]
-
-        indexer = tuple(["t", ["bar", "bar2"]])
-        df = df_orig.copy()
-        df.loc[indexer] *= 2.0
-        tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
-
-    def test_loc_setitem_slice(self):
-        # GH10503
-
-        # assigning the same type should not change the type
-        df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")})
-        ix = df1["a"] == 1
-        newb1 = df1.loc[ix, "b"] + 1
-        df1.loc[ix, "b"] = newb1
-        expected = DataFrame(
-            {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")}
-        )
-        tm.assert_frame_equal(df1, expected)
-
-        # assigning a new type should get the inferred type
-        df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
-        ix = df1["a"] == 1
-        newb2 = df2.loc[ix, "b"]
-        df1.loc[ix, "b"] = newb2
-        expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
-        tm.assert_frame_equal(df2, expected)
-
     def test_loc_getitem_int(self):
 
         # int label
@@ -162,17 +83,6 @@ def test_loc_getitem_label_list_with_missing(self):
             fails=KeyError,
         )
 
-    def test_getitem_label_list_with_missing(self):
-        s = Series(range(3), index=["a", "b", "c"])
-
-        # consistency
-        with pytest.raises(KeyError, match="with any missing labels"):
-            s[["a", "d"]]
-
-        s = Series(range(3))
-        with pytest.raises(KeyError, match="with any missing labels"):
-            s[[0, 3]]
-
     def test_loc_getitem_label_list_fails(self):
         # fails
         self.check_result(
@@ -196,6 +106,168 @@ def test_loc_getitem_bool(self):
 
         self.check_result("loc", b, "loc", b, typs=["empty"], fails=IndexError)
 
+    def test_loc_getitem_label_slice(self):
+
+        # label slices (with ints)
+
+        # real label slices
+
+        # GH 14316
+
+        self.check_result(
+            "loc",
+            slice(1, 3),
+            "loc",
+            slice(1, 3),
+            typs=["labels", "mixed", "empty", "ts", "floats"],
+            fails=TypeError,
+        )
+
+        self.check_result(
+            "loc",
+            slice("20130102", "20130104"),
+            "loc",
+            slice("20130102", "20130104"),
+            typs=["ts"],
+            axes=1,
+            fails=TypeError,
+        )
+
+        self.check_result(
+            "loc",
+            slice(2, 8),
+            "loc",
+            slice(2, 8),
+            typs=["mixed"],
+            axes=0,
+            fails=TypeError,
+        )
+        self.check_result(
+            "loc",
+            slice(2, 8),
+            "loc",
+            slice(2, 8),
+            typs=["mixed"],
+            axes=1,
+            fails=KeyError,
+        )
+
+        self.check_result(
+            "loc",
+            slice(2, 4, 2),
+            "loc",
+            slice(2, 4, 2),
+            typs=["mixed"],
+            axes=0,
+            fails=TypeError,
+        )
+
+
+class TestLoc2:
+    # TODO: better name, just separating out things that rely on base class
+
+    def test_loc_getitem_dups(self):
+        # GH 5678
+        # repeated getitems on a dup index returning a ndarray
+        df = DataFrame(
+            np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)]
+        )
+        expected = df.loc["A", 0]
+        result = df.loc[:, 0].loc["A"]
+        tm.assert_series_equal(result, expected)
+
+    def test_loc_getitem_dups2(self):
+
+        # GH4726
+        # dup indexing with iloc/loc
+        df = DataFrame(
+            [[1, 2, "foo", "bar", Timestamp("20130101")]],
+            columns=["a", "a", "a", "a", "a"],
+            index=[1],
+        )
+        expected = Series(
+            [1, 2, "foo", "bar", Timestamp("20130101")],
+            index=["a", "a", "a", "a", "a"],
+            name=1,
+        )
+
+        result = df.iloc[0]
+        tm.assert_series_equal(result, expected)
+
+        result = df.loc[1]
+        tm.assert_series_equal(result, expected)
+
+    def test_loc_setitem_dups(self):
+
+        # GH 6541
+        df_orig = DataFrame(
+            {
+                "me": list("rttti"),
+                "foo": list("aaade"),
+                "bar": np.arange(5, dtype="float64") * 1.34 + 2,
+                "bar2": np.arange(5, dtype="float64") * -0.34 + 2,
+            }
+        ).set_index("me")
+
+        indexer = tuple(["r", ["bar", "bar2"]])
+        df = df_orig.copy()
+        df.loc[indexer] *= 2.0
+        tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+        indexer = tuple(["r", "bar"])
+        df = df_orig.copy()
+        df.loc[indexer] *= 2.0
+        assert df.loc[indexer] == 2.0 * df_orig.loc[indexer]
+
+        indexer = tuple(["t", ["bar", "bar2"]])
+        df = df_orig.copy()
+        df.loc[indexer] *= 2.0
+        tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+    def test_loc_setitem_slice(self):
+        # GH10503
+
+        # assigning the same type should not change the type
+        df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")})
+        ix = df1["a"] == 1
+        newb1 = df1.loc[ix, "b"] + 1
+        df1.loc[ix, "b"] = newb1
+        expected = DataFrame(
+            {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")}
+        )
+        tm.assert_frame_equal(df1, expected)
+
+        # assigning a new type should get the inferred type
+        df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
+        ix = df1["a"] == 1
+        newb2 = df2.loc[ix, "b"]
+        df1.loc[ix, "b"] = newb2
+        expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
+        tm.assert_frame_equal(df2, expected)
+
+    def test_loc_setitem_dtype(self):
+        # GH31340
+        df = DataFrame({"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]})
+        cols = ["a", "b", "c"]
+        df.loc[:, cols] = df.loc[:, cols].astype("float32")
+
+        expected = DataFrame(
+            {"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}, dtype="float32"
+        )  # id is inferred as object
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_getitem_label_list_with_missing(self):
+        s = Series(range(3), index=["a", "b", "c"])
+
+        # consistency
+        with pytest.raises(KeyError, match="with any missing labels"):
+            s[["a", "d"]]
+
+        s = Series(range(3))
+        with pytest.raises(KeyError, match="with any missing labels"):
+            s[[0, 3]]
+
     @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]])
     def test_loc_getitem_bool_diff_len(self, index):
         # GH26658
@@ -297,62 +369,6 @@ def test_loc_getitem_list_with_fail(self):
         with pytest.raises(KeyError, match="with any missing labels"):
             s.loc[[2, 3]]
 
-    def test_loc_getitem_label_slice(self):
-
-        # label slices (with ints)
-
-        # real label slices
-
-        # GH 14316
-
-        self.check_result(
-            "loc",
-            slice(1, 3),
-            "loc",
-            slice(1, 3),
-            typs=["labels", "mixed", "empty", "ts", "floats"],
"floats"], - fails=TypeError, - ) - - self.check_result( - "loc", - slice("20130102", "20130104"), - "loc", - slice("20130102", "20130104"), - typs=["ts"], - axes=1, - fails=TypeError, - ) - - self.check_result( - "loc", - slice(2, 8), - "loc", - slice(2, 8), - typs=["mixed"], - axes=0, - fails=TypeError, - ) - self.check_result( - "loc", - slice(2, 8), - "loc", - slice(2, 8), - typs=["mixed"], - axes=1, - fails=KeyError, - ) - - self.check_result( - "loc", - slice(2, 4, 2), - "loc", - slice(2, 4, 2), - typs=["mixed"], - axes=0, - fails=TypeError, - ) - def test_loc_index(self): # gh-17131 # a boolean index should index like a boolean numpy array @@ -559,7 +575,7 @@ def test_loc_modify_datetime(self): tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): - df = self.frame_labels + df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD")) result = df.iloc[0, 0] diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index a567fb9b8ccc7..3622b12b853a4 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -65,6 +65,10 @@ def _check(f, func, values=False): for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: _check(f, "at") + +class TestScalar2: + # TODO: Better name, just separating things that dont need Base class + def test_at_iat_coercion(self): # as timestamp is not a tuple! @@ -125,38 +129,79 @@ def test_imethods_with_dups(self): result = df.iat[2, 0] assert result == 2 - def test_at_to_fail(self): + def test_series_at_raises_type_error(self): # at should not fallback # GH 7814 - s = Series([1, 2, 3], index=list("abc")) - result = s.at["a"] + # GH#31724 .at should match .loc + ser = Series([1, 2, 3], index=list("abc")) + result = ser.at["a"] assert result == 1 + result = ser.loc["a"] + assert result == 1 + msg = ( - "At based indexing on an non-integer index can only have " - "non-integer indexers" + "cannot do label indexing on Index " + r"with these indexers \[0\] of type int" ) - with pytest.raises(ValueError, match=msg): - s.at[0] + with pytest.raises(TypeError, match=msg): + ser.at[0] + with pytest.raises(TypeError, match=msg): + ser.loc[0] + def test_frame_raises_type_error(self): + # GH#31724 .at should match .loc df = DataFrame({"A": [1, 2, 3]}, index=list("abc")) result = df.at["a", "A"] assert result == 1 - with pytest.raises(ValueError, match=msg): + result = df.loc["a", "A"] + assert result == 1 + + msg = ( + "cannot do label indexing on Index " + r"with these indexers \[0\] of type int" + ) + with pytest.raises(TypeError, match=msg): df.at["a", 0] + with pytest.raises(TypeError, match=msg): + df.loc["a", 0] - s = Series([1, 2, 3], index=[3, 2, 1]) - result = s.at[1] + def test_series_at_raises_key_error(self): + # GH#31724 .at should match .loc + + ser = Series([1, 2, 3], index=[3, 2, 1]) + result = ser.at[1] + assert result == 3 + result = ser.loc[1] assert result == 3 - msg = "At based indexing on an integer index can only have integer indexers" - with pytest.raises(ValueError, match=msg): - s.at["a"] + + with pytest.raises(KeyError, match="a"): + ser.at["a"] + with pytest.raises(KeyError, match="a"): + # .at should match .loc + ser.loc["a"] + + def test_frame_at_raises_key_error(self): + # GH#31724 .at should match .loc df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1]) + result = df.at[1, 0] assert result == 3 - with pytest.raises(ValueError, match=msg): + result = df.loc[1, 0] + assert result == 3 + + with pytest.raises(KeyError, match="a"): 
             df.at["a", 0]
+        with pytest.raises(KeyError, match="a"):
+            df.loc["a", 0]
+
+        with pytest.raises(KeyError, match="a"):
+            df.at[1, "a"]
+        with pytest.raises(KeyError, match="a"):
+            df.loc[1, "a"]
 
+    # TODO: belongs somewhere else?
+    def test_getitem_list_missing_key(self):
         # GH 13822, incorrect error string with non-unique columns when missing
         # column is accessed
         df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]})
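The renamed tests above encode GH#31724: .at should fail the same way .loc does instead of raising its own ValueError. Schematically, with the patch applied:

    import pandas as pd

    ser = pd.Series([1, 2, 3], index=list("abc"))
    assert ser.at["a"] == ser.loc["a"] == 1

    for lookup in (lambda: ser.at[0], lambda: ser.loc[0]):
        try:
            lookup()
        except TypeError:
            print("integer key on a string index: same TypeError for .at and .loc")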
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index f7b49ccb1a72d..91665a24fc4c5 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -1048,6 +1048,27 @@ def test_invalid_columns(self, path):
         ):
             write_frame.to_excel(path, "test1", columns=["C", "D"])
 
+    @pytest.mark.parametrize(
+        "to_excel_index,read_excel_index_col",
+        [
+            (True, 0),  # Include index in write to file
+            (False, None),  # Don't include index in write to file
+        ],
+    )
+    def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col):
+        # GH 31677
+        write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]})
+        write_frame.to_excel(
+            path, "col_subset_bug", columns=["A", "B"], index=to_excel_index
+        )
+
+        expected = write_frame[["A", "B"]]
+        read_frame = pd.read_excel(
+            path, "col_subset_bug", index_col=read_excel_index_col
+        )
+
+        tm.assert_frame_equal(expected, read_frame)
+
     def test_comment_arg(self, path):
         # see gh-18735
         #
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index 7650561d3072d..bf7b98eb78f11 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -239,6 +239,15 @@ def test_repr_truncation(self):
         with option_context("display.max_colwidth", max_len + 2):
             assert "..." not in repr(df)
 
+    def test_repr_deprecation_negative_int(self):
+        # FIXME: remove in future version after deprecation cycle
+        # Non-regression test for:
+        # https://github.com/pandas-dev/pandas/issues/31532
+        width = get_option("display.max_colwidth")
+        with tm.assert_produces_warning(FutureWarning):
+            set_option("display.max_colwidth", -1)
+        set_option("display.max_colwidth", width)
+
     def test_repr_chop_threshold(self):
         df = DataFrame([[0.1, 0.5], [0.5, -0.1]])
         pd.reset_option("display.chop_threshold")  # default None
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 94d51589023c4..f2d35bfb3b5ae 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1662,3 +1662,22 @@ def test_json_multiindex(self, dataframe, expected):
         series = dataframe.stack()
         result = series.to_json(orient="index")
         assert result == expected
+
+    def test_to_s3(self, s3_resource):
+        # GH 28375
+        mock_bucket_name, target_file = "pandas-test", "test.json"
+        df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
+        df.to_json(f"s3://{mock_bucket_name}/{target_file}")
+        assert target_file in (
+            obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
+        )
+
+    def test_json_pandas_na(self):
+        # GH 31615
+        result = pd.DataFrame([[pd.NA]]).to_json()
+        assert result == '{"0":{"0":null}}'
+
+    def test_json_pandas_nulls(self, nulls_fixture):
+        # GH 31615
+        result = pd.DataFrame([[nulls_fixture]]).to_json()
+        assert result == '{"0":{"0":null}}'
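The two new JSON tests assert one invariant: every pandas missing-value sentinel serializes to JSON null. Stated directly, as a sketch mirroring the tests:

    import numpy as np
    import pandas as pd

    for na in (pd.NA, np.nan, pd.NaT, None):
        assert pd.DataFrame([[na]]).to_json() == '{"0":{"0":null}}'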
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 6c17f40b790ac..c19056d434ec3 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -2040,6 +2040,17 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
             pass
 
 
+def test_read_csv_raises_on_header_prefix(all_parsers):
+    # gh-27394
+    parser = all_parsers
+    msg = "Argument prefix must be None if argument header is not None"
+
+    s = StringIO("0,1\n2,3")
+
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(s, header=0, prefix="_X")
+
+
 def test_read_table_equivalency_to_read_csv(all_parsers):
     # see gh-21948
     # As of 0.25.0, read_table is undeprecated
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 406e7bedfd298..13f72a0414bac 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -141,6 +141,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
 )
 def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
     # gh-23779: Python csv engine shouldn't error on files opened in binary.
+    # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
     parser = all_parsers
     fpath = os.path.join(csv_dir_path, fname)
@@ -154,6 +155,10 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
         result = parser.read_csv(fb, encoding=encoding)
     tm.assert_frame_equal(expected, result)
 
+    with open(fpath, mode="rb", buffering=0) as fb:
+        result = parser.read_csv(fb, encoding=encoding)
+    tm.assert_frame_equal(expected, result)
+
 
 @pytest.mark.parametrize("pass_encoding", [True, False])
 def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index b01b22e811ee3..073af758f0b29 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1516,3 +1516,15 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_datetime):
 
     assert except_out_dateutil == except_in_dateutil
     assert result == expected
+
+
+@pytest.mark.parametrize("parse_dates", [["time", ], {"date": ["time", ]}])
+def test_missing_column(all_parsers, parse_dates):
+    """GH31251 column names provided in parse_dates could be missing."""
+    parser = all_parsers
+    content = StringIO("time,val\n2020-01-31,32\n")
+    msg = "Missing column provided to 'parse_dates': 'time'"
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(
+            content, sep=",", usecols=["val", ], parse_dates=parse_dates,
+        )
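test_missing_column above documents the error contract when parse_dates names a column that usecols filtered out; for example:

    from io import StringIO

    import pandas as pd

    data = StringIO("time,val\n2020-01-31,32\n")
    try:
        pd.read_csv(data, usecols=["val"], parse_dates=["time"])
    except ValueError as err:
        print(err)  # Missing column provided to 'parse_dates': 'time'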
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index d7a21b27308e8..404f5a477187b 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -141,24 +141,7 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext):
         pytest.importorskip(module)
 
         path = os.path.join(HERE, "data", "does_not_exist." + fn_ext)
-        msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext)
-        msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'"
-        msg3 = "Expected object or value"
-        msg4 = "path_or_buf needs to be a string file path or file-like"
-        msg5 = (
-            fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: "
-            fr"'.+does_not_exist\.{fn_ext}'"
-        )
-        msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'"
-        msg7 = (
-            fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'"
-        )
-        msg8 = fr"Failed to open local file.+does_not_exist\.{fn_ext}.?, error: .*"
-
-        with pytest.raises(
-            error_class,
-            match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7}|{msg8})",
-        ):
+        with tm.external_error_raised(error_class):
             reader(path)
 
     @pytest.mark.parametrize(
@@ -184,24 +167,7 @@ def test_read_expands_user_home_dir(
 
         path = os.path.join("~", "does_not_exist." + fn_ext)
         monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x))
-        msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist"
-        msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'"
-        msg3 = "Unexpected character found when decoding 'false'"
-        msg4 = "path_or_buf needs to be a string file path or file-like"
-        msg5 = (
-            fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: "
-            fr"'.+does_not_exist\.{fn_ext}'"
-        )
-        msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'"
-        msg7 = (
-            fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'"
-        )
-        msg8 = fr"Failed to open local file.+does_not_exist\.{fn_ext}.?, error: .*"
-
-        with pytest.raises(
-            error_class,
-            match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7}|{msg8})",
-        ):
+        with tm.external_error_raised(error_class):
             reader(path)
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d51c712ed5abd..7ed8d8f22764c 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -564,6 +564,13 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
+    @td.skip_if_no("pyarrow", min_version="0.14")
+    def test_timestamp_nanoseconds(self, pa):
+        # with version 2.0, pyarrow defaults to writing the nanoseconds, so
+        # this should work without error
+        df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)})
+        check_round_trip(df, pa, write_kwargs={"version": "2.0"})
+
 
 class TestParquetFastParquet(Base):
     @td.skip_if_no("fastparquet", min_version="0.3.2")
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 04fd4835469a9..78b630bb5ada1 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -60,9 +60,7 @@ def compare_element(result, expected, typ, version=None):
         assert result == expected
         assert result.freq == expected.freq
     else:
-        comparator = getattr(
-            tm, "assert_{typ}_equal".format(typ=typ), tm.assert_almost_equal
-        )
+        comparator = getattr(tm, f"assert_{typ}_equal", tm.assert_almost_equal)
         comparator(result, expected)
 
 
@@ -77,7 +75,7 @@ def compare(data, vf, version):
 
     # use a specific comparator
     # if available
-    comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
+    comparator = f"compare_{typ}_{dt}"
 
     comparator = m.get(comparator, m["compare_element"])
     comparator(result, expected, typ, version)
@@ -234,7 +232,7 @@ def test_legacy_sparse_warning(datapath):
 
 @pytest.fixture
 def get_random_path():
-    return "__{}__.pickle".format(tm.rands(10))
+    return f"__{tm.rands(10)}__.pickle"
 
 
 class TestCompression:
@@ -262,7 +260,7 @@ def compress_file(self, src_path, dest_path, compression):
         elif compression == "xz":
             f = _get_lzma_file(lzma)(dest_path, "w")
         else:
-            msg = "Unrecognized compression type: {}".format(compression)
+            msg = f"Unrecognized compression type: {compression}"
             raise ValueError(msg)
 
         if compression != "zip":
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index 9cd3ccbf9214e..e54f4784e9c4f 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -8,6 +8,7 @@
 import pandas._config.config as cf
 
 from pandas.compat.numpy import np_datetime64_compat
+import pandas.util._test_decorators as td
 
 from pandas import Index, Period, Series, Timestamp, date_range
 import pandas._testing as tm
@@ -59,6 +60,7 @@ def test_register_by_default(self):
         call = [sys.executable, "-c", code]
         assert subprocess.check_call(call) == 0
 
+    @td.skip_if_no("matplotlib", min_version="3.1.3")
     def test_registering_no_warning(self):
         plt = pytest.importorskip("matplotlib.pyplot")
pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) @@ -66,9 +68,7 @@ def test_registering_no_warning(self): # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - # GH#30588 DeprecationWarning from 2D indexing - ax.plot(s.index, s.values) + ax.plot(s.index, s.values) def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") @@ -91,6 +91,7 @@ def test_matplotlib_formatters(self): assert Timestamp not in units.registry assert Timestamp in units.registry + @td.skip_if_no("matplotlib", min_version="3.1.3") def test_option_no_warning(self): pytest.importorskip("matplotlib.pyplot") ctx = cf.option_context("plotting.matplotlib.register_converters", False) @@ -100,15 +101,12 @@ def test_option_no_warning(self): # Test without registering first, no warning with ctx: - # GH#30588 DeprecationWarning from 2D indexing on Index - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - ax.plot(s.index, s.values) + ax.plot(s.index, s.values) # Now test with registering register_matplotlib_converters() with ctx: - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - ax.plot(s.index, s.values) + ax.plot(s.index, s.values) def test_registry_resets(self): units = pytest.importorskip("matplotlib.units") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 84d298cd7c6fe..979b89a87d843 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -43,19 +43,19 @@ def setup_method(self, method): def teardown_method(self, method): tm.close() - # Ignore warning - # ``` - # Converting to PeriodArray/Index representation will drop timezone information. - # ``` - # which occurs for UTC-like timezones. @pytest.mark.slow - @pytest.mark.filterwarnings("ignore:msg:UserWarning") def test_ts_plot_with_tz(self, tz_aware_fixture): - # GH2877, GH17173 + # GH2877, GH17173, GH31205, GH31580 tz = tz_aware_fixture index = date_range("1/1/2011", periods=2, freq="H", tz=tz) ts = Series([188.5, 328.25], index=index) - _check_plot_works(ts.plot) + with tm.assert_produces_warning(None): + _check_plot_works(ts.plot) + ax = ts.plot() + xdata = list(ax.get_lines())[0].get_xdata() + # Check first and last points' labels are correct + assert (xdata[0].hour, xdata[0].minute) == (0, 0) + assert (xdata[-1].hour, xdata[-1].minute) == (1, 0) def test_fontsize_set_correctly(self): # For issue #8765 diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f9acf5b60a3cd..fd189c7435b29 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -371,10 +371,8 @@ def test_no_overlap_more_informative_error(self): msg = ( "No common columns to perform merge on. 
" - "Merge options: left_on={lon}, right_on={ron}, " - "left_index={lidx}, right_index={ridx}".format( - lon=None, ron=None, lidx=False, ridx=False - ) + f"Merge options: left_on={None}, right_on={None}, " + f"left_index={False}, right_index={False}" ) with pytest.raises(MergeError, match=msg): diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 13b6f05ed304a..830e786fd1c6d 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -612,3 +612,16 @@ def test_cut_incorrect_labels(labels): msg = "Bin labels must either be False, None or passed in as a list-like argument" with pytest.raises(ValueError, match=msg): cut(values, 4, labels=labels) + + +@pytest.mark.parametrize("bins", [3, [0, 5, 15]]) +@pytest.mark.parametrize("right", [True, False]) +@pytest.mark.parametrize("include_lowest", [True, False]) +def test_cut_nullable_integer(bins, right, include_lowest): + a = np.random.randint(0, 10, size=50).astype(float) + a[::2] = np.nan + result = cut( + pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest + ) + expected = cut(a, bins, right=right, include_lowest=include_lowest) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 95406a5ebf4f7..c436ab5d90578 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Categorical, DatetimeIndex, @@ -286,3 +287,14 @@ def test_qcut_bool_coercion_to_int(bins, box, compare): expected = qcut(data_expected, bins, duplicates="drop") result = qcut(data_result, bins, duplicates="drop") compare(result, expected) + + +@pytest.mark.parametrize("q", [2, 5, 10]) +def test_qcut_nullable_integer(q, any_nullable_int_dtype): + arr = pd.array(np.arange(100), dtype=any_nullable_int_dtype) + arr[::2] = pd.NA + + result = qcut(arr, q) + expected = qcut(arr.astype(float), q) + + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 357274e724c68..436810042186a 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -3,7 +3,7 @@ from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_code_map from pandas.errors import OutOfBoundsDatetime -from pandas import Period, offsets +from pandas import Period, Timestamp, offsets class TestFreqConversion: @@ -656,6 +656,23 @@ def test_conv_secondly(self): assert ival_S.asfreq("S") == ival_S + def test_conv_microsecond(self): + # GH#31475 Avoid floating point errors dropping the start_time to + # before the beginning of the Period + per = Period("2020-01-30 15:57:27.576166", freq="U") + assert per.ordinal == 1580399847576166 + + start = per.start_time + expected = Timestamp("2020-01-30 15:57:27.576166") + assert start == expected + assert start.value == per.ordinal * 1000 + + per2 = Period("2300-01-01", "us") + with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"): + per2.start_time + with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"): + per2.end_time + def test_asfreq_mult(self): # normal freq to mult freq p = Period(freq="A", year=2007) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index bbc81e0dbb6e6..995d47c1473be 100644 --- a/pandas/tests/scalar/period/test_period.py +++ 
b/pandas/tests/scalar/period/test_period.py @@ -925,7 +925,7 @@ def test_properties_secondly(self): class TestPeriodField: def test_get_period_field_array_raises_on_out_of_range(self): - msg = "Buffer dtype mismatch, expected 'int64_t' but got 'double'" + msg = "Buffer dtype mismatch, expected 'const int64_t' but got 'double'" with pytest.raises(ValueError, match=msg): libperiod.get_period_field_arr(-1, np.empty(1), 0) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index ae1e84576c092..25c9fc19981be 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -274,3 +274,10 @@ def test_td_constructor_on_nanoseconds(constructed_td, conversion): def test_td_constructor_value_error(): with pytest.raises(TypeError): Timedelta(nanoseconds="abc") + + +def test_timedelta_constructor_identity(): + # Test for #30543 + expected = Timedelta(np.timedelta64(1, "s")) + result = Timedelta(expected) + assert result is expected diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index e1d965bbb14e9..9cdbeb6ab4845 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -821,3 +821,16 @@ def test_resolution_deprecated(self): def test_truthiness(value, expected): # https://github.com/pandas-dev/pandas/issues/21484 assert bool(value) is expected + + +def test_timedelta_attribute_precision(): + # GH 31354 + td = Timedelta(1552211999999999872, unit="ns") + result = td.days * 86400 + result += td.seconds + result *= 1000000 + result += td.microseconds + result *= 1000 + result += td.nanoseconds + expected = td.value + assert result == expected diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py new file mode 100644 index 0000000000000..737a85faa4c9b --- /dev/null +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -0,0 +1,552 @@ +import calendar +from datetime import datetime, timedelta + +import dateutil.tz +from dateutil.tz import tzutc +import numpy as np +import pytest +import pytz + +from pandas.errors import OutOfBoundsDatetime + +from pandas import Period, Timedelta, Timestamp, compat + +from pandas.tseries import offsets + + +class TestTimestampConstructors: + def test_constructor(self): + base_str = "2014-07-01 09:00" + base_dt = datetime(2014, 7, 1, 9) + base_expected = 1_404_205_200_000_000_000 + + # confirm base representation is correct + assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected + + tests = [ + (base_str, base_dt, base_expected), + ( + "2014-07-01 10:00", + datetime(2014, 7, 1, 10), + base_expected + 3600 * 1_000_000_000, + ), + ( + "2014-07-01 09:00:00.000008000", + datetime(2014, 7, 1, 9, 0, 0, 8), + base_expected + 8000, + ), + ( + "2014-07-01 09:00:00.000000005", + Timestamp("2014-07-01 09:00:00.000000005"), + base_expected + 5, + ), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] + + for date_str, date, expected in tests: + for result in [Timestamp(date_str), Timestamp(date)]: + # only with timestring + assert result.value == expected + + # re-creation shouldn't affect to internal value + result = Timestamp(result) + assert result.value == expected + + # with 
timezone + for tz, offset in timezones: + for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]: + expected_tz = expected - offset * 3600 * 1_000_000_000 + assert result.value == expected_tz + + # should preserve tz + result = Timestamp(result) + assert result.value == expected_tz + + # should convert to UTC + if tz is not None: + result = Timestamp(result).tz_convert("UTC") + else: + result = Timestamp(result, tz="UTC") + expected_utc = expected - offset * 3600 * 1_000_000_000 + assert result.value == expected_utc + + def test_constructor_with_stringoffset(self): + # GH 7833 + base_str = "2014-07-01 11:00:00+02:00" + base_dt = datetime(2014, 7, 1, 9) + base_expected = 1_404_205_200_000_000_000 + + # confirm base representation is correct + assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected + + tests = [ + (base_str, base_expected), + ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1_000_000_000), + ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000), + ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] + + for date_str, expected in tests: + for result in [Timestamp(date_str)]: + # only with timestring + assert result.value == expected + + # re-creation shouldn't affect to internal value + result = Timestamp(result) + assert result.value == expected + + # with timezone + for tz, offset in timezones: + result = Timestamp(date_str, tz=tz) + expected_tz = expected + assert result.value == expected_tz + + # should preserve tz + result = Timestamp(result) + assert result.value == expected_tz + + # should convert to UTC + result = Timestamp(result).tz_convert("UTC") + expected_utc = expected + assert result.value == expected_utc + + # This should be 2013-11-01 05:00 in UTC + # converted to Chicago tz + result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago") + assert result.value == Timestamp("2013-11-01 05:00").value + expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa + assert repr(result) == expected + assert result == eval(repr(result)) + + # This should be 2013-11-01 05:00 in UTC + # converted to Tokyo tz (+09:00) + result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo") + assert result.value == Timestamp("2013-11-01 05:00").value + expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" + assert repr(result) == expected + assert result == eval(repr(result)) + + # GH11708 + # This should be 2015-11-18 10:00 in UTC + # converted to Asia/Katmandu + result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu") + assert result.value == Timestamp("2015-11-18 10:00").value + expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')" + assert repr(result) == expected + assert result == eval(repr(result)) + + # This should be 2015-11-18 10:00 in UTC + # converted to Asia/Kolkata + result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata") + assert result.value == Timestamp("2015-11-18 10:00").value + expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')" + assert repr(result) == expected + assert result == eval(repr(result)) + + def test_constructor_invalid(self): + with pytest.raises(TypeError, match="Cannot convert input"): + Timestamp(slice(2)) + with pytest.raises(ValueError, match="Cannot convert Period"): + 
Timestamp(Period("1000-01-01")) + + def test_constructor_invalid_tz(self): + # GH#17690 + with pytest.raises(TypeError, match="must be a datetime.tzinfo"): + Timestamp("2017-10-22", tzinfo="US/Eastern") + + with pytest.raises(ValueError, match="at most one of"): + Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") + + with pytest.raises(ValueError, match="Invalid frequency:"): + # GH#5168 + # case where user tries to pass tz as an arg, not kwarg, gets + # interpreted as a `freq` + Timestamp("2012-01-01", "US/Pacific") + + def test_constructor_strptime(self): + # GH25016 + # Test support for Timestamp.strptime + fmt = "%Y%m%d-%H%M%S-%f%z" + ts = "20190129-235348-000001+0000" + with pytest.raises(NotImplementedError): + Timestamp.strptime(ts, fmt) + + def test_constructor_tz_or_tzinfo(self): + # GH#17943, GH#17690, GH#5168 + stamps = [ + Timestamp(year=2017, month=10, day=22, tz="UTC"), + Timestamp(year=2017, month=10, day=22, tzinfo=pytz.utc), + Timestamp(year=2017, month=10, day=22, tz=pytz.utc), + Timestamp(datetime(2017, 10, 22), tzinfo=pytz.utc), + Timestamp(datetime(2017, 10, 22), tz="UTC"), + Timestamp(datetime(2017, 10, 22), tz=pytz.utc), + ] + assert all(ts == stamps[0] for ts in stamps) + + def test_constructor_positional(self): + # see gh-10758 + with pytest.raises(TypeError): + Timestamp(2000, 1) + with pytest.raises(ValueError): + Timestamp(2000, 0, 1) + with pytest.raises(ValueError): + Timestamp(2000, 13, 1) + with pytest.raises(ValueError): + Timestamp(2000, 1, 0) + with pytest.raises(ValueError): + Timestamp(2000, 1, 32) + + # see gh-11630 + assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112")) + assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( + Timestamp("2015-11-12 01:02:03.999999") + ) + + def test_constructor_keyword(self): + # GH 10758 + with pytest.raises(TypeError): + Timestamp(year=2000, month=1) + with pytest.raises(ValueError): + Timestamp(year=2000, month=0, day=1) + with pytest.raises(ValueError): + Timestamp(year=2000, month=13, day=1) + with pytest.raises(ValueError): + Timestamp(year=2000, month=1, day=0) + with pytest.raises(ValueError): + Timestamp(year=2000, month=1, day=32) + + assert repr(Timestamp(year=2015, month=11, day=12)) == repr( + Timestamp("20151112") + ) + + assert repr( + Timestamp( + year=2015, + month=11, + day=12, + hour=1, + minute=2, + second=3, + microsecond=999999, + ) + ) == repr(Timestamp("2015-11-12 01:02:03.999999")) + + def test_constructor_fromordinal(self): + base = datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal(), freq="D") + assert base == ts + assert ts.freq == "D" + assert base.toordinal() == ts.toordinal() + + ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") + assert Timestamp("2000-01-01", tz="US/Eastern") == ts + assert base.toordinal() == ts.toordinal() + + # GH#3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + assert ts.to_pydatetime() == dt + + # with a tzinfo + stamp = Timestamp("2011-4-16", tz="US/Eastern") + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") + assert ts.to_pydatetime() == dt_tz + + @pytest.mark.parametrize( + "result", + [ + Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + ), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + tz="UTC", + ), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 
1, None), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), + ], + ) + def test_constructor_nanosecond(self, result): + # GH 18898 + expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) + expected = expected + Timedelta(nanoseconds=1) + assert result == expected + + @pytest.mark.parametrize("z", ["Z0", "Z00"]) + def test_constructor_invalid_Z0_isostring(self, z): + # GH 8910 + with pytest.raises(ValueError): + Timestamp("2014-11-02 01:00{}".format(z)) + + @pytest.mark.parametrize( + "arg", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + ], + ) + def test_invalid_date_kwarg_with_string_input(self, arg): + kwarg = {arg: 1} + with pytest.raises(ValueError): + Timestamp("2010-10-10 12:59:59.999999999", **kwarg) + + def test_out_of_bounds_integer_value(self): + # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.max.value * 2) + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.min.value * 2) + + def test_out_of_bounds_value(self): + one_us = np.timedelta64(1).astype("timedelta64[us]") + + # By definition we can't go out of bounds in [ns], so we + # convert the datetime64s to [us] so we can go out of bounds + min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]") + max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]") + + # No error for the min/max datetimes + Timestamp(min_ts_us) + Timestamp(max_ts_us) + + # One us less than the minimum is an error + with pytest.raises(ValueError): + Timestamp(min_ts_us - one_us) + + # One us more than the maximum is an error + with pytest.raises(ValueError): + Timestamp(max_ts_us + one_us) + + def test_out_of_bounds_string(self): + with pytest.raises(ValueError): + Timestamp("1676-01-01") + with pytest.raises(ValueError): + Timestamp("2263-01-01") + + def test_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + with pytest.raises(OutOfBoundsDatetime): + Timestamp("2262-04-11 23:47:16.854775808") + + def test_bounds_with_different_units(self): + out_of_bounds_dates = ("1677-09-21", "2262-04-12") + + time_units = ("D", "h", "m", "s", "ms", "us") + + for date_string in out_of_bounds_dates: + for unit in time_units: + dt64 = np.datetime64(date_string, unit) + with pytest.raises(ValueError): + Timestamp(dt64) + + in_bounds_dates = ("1677-09-23", "2262-04-11") + + for date_string in in_bounds_dates: + for unit in time_units: + dt64 = np.datetime64(date_string, unit) + Timestamp(dt64) + + def test_min_valid(self): + # Ensure that Timestamp.min is a valid Timestamp + Timestamp(Timestamp.min) + + def test_max_valid(self): + # Ensure that Timestamp.max is a valid Timestamp + Timestamp(Timestamp.max) + + def test_now(self): + # GH#9000 + ts_from_string = Timestamp("now") + ts_from_method = Timestamp.now() + ts_datetime = datetime.now() + + ts_from_string_tz = Timestamp("now", tz="US/Eastern") + ts_from_method_tz = Timestamp.now(tz="US/Eastern") + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + assert abs(ts_from_method - ts_from_string) < delta + assert abs(ts_datetime - ts_from_method) < delta + assert abs(ts_from_method_tz - ts_from_string_tz) < delta + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + def test_today(self): + ts_from_string = Timestamp("today") + ts_from_method 
= Timestamp.today() + ts_datetime = datetime.today() + + ts_from_string_tz = Timestamp("today", tz="US/Eastern") + ts_from_method_tz = Timestamp.today(tz="US/Eastern") + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + assert abs(ts_from_method - ts_from_string) < delta + assert abs(ts_datetime - ts_from_method) < delta + assert abs(ts_from_method_tz - ts_from_string_tz) < delta + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + def test_disallow_setting_tz(self, tz): + # GH 3746 + ts = Timestamp("2010") + with pytest.raises(AttributeError): + ts.tz = tz + + @pytest.mark.parametrize("offset", ["+0300", "+0200"]) + def test_construct_timestamp_near_dst(self, offset): + # GH 20854 + expected = Timestamp( + "2016-10-30 03:00:00{}".format(offset), tz="Europe/Helsinki" + ) + result = Timestamp(expected).tz_convert("Europe/Helsinki") + assert result == expected + + @pytest.mark.parametrize( + "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"] + ) + def test_construct_with_different_string_format(self, arg): + # GH 12064 + result = Timestamp(arg) + expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + assert result == expected + + def test_construct_timestamp_preserve_original_frequency(self): + # GH 22311 + result = Timestamp(Timestamp("2010-08-08", freq="D")).freq + expected = offsets.Day() + assert result == expected + + def test_constructor_invalid_frequency(self): + # GH 22311 + with pytest.raises(ValueError, match="Invalid frequency:"): + Timestamp("2012-01-01", freq=[]) + + @pytest.mark.parametrize("box", [datetime, Timestamp]) + def test_raise_tz_and_tzinfo_in_datetime_input(self, box): + # GH 23579 + kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} + with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + Timestamp(box(**kwargs), tz="US/Pacific") + with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) + + def test_dont_convert_dateutil_utc_to_pytz_utc(self): + result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) + expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) + assert result == expected + + def test_constructor_subclassed_datetime(self): + # GH 25851 + # ensure that subclassed datetime works for + # Timestamp creation + class SubDatetime(datetime): + pass + + data = SubDatetime(2000, 1, 1) + result = Timestamp(data) + expected = Timestamp(2000, 1, 1) + assert result == expected + + @pytest.mark.skipif( + not compat.PY38, + reason="datetime.fromisocalendar was added in Python version 3.8", + ) + def test_constructor_fromisocalendar(self): + # GH 30395 + expected_timestamp = Timestamp("2000-01-03 00:00:00") + expected_stdlib = datetime.fromisocalendar(2000, 1, 1) + result = Timestamp.fromisocalendar(2000, 1, 1) + assert result == expected_timestamp + assert result == expected_stdlib + assert isinstance(result, Timestamp) + + +def test_constructor_ambigous_dst(): + # GH 24329 + # Make sure that calling Timestamp constructor + # on Timestamp created from ambiguous time + # doesn't change Timestamp.value + ts = Timestamp(1382835600000000000, tz="dateutil/Europe/London") + expected = ts.value + result = Timestamp(ts).value + assert result == expected + + +@pytest.mark.parametrize("epoch", [1552211999999999872, 
1552211999999999999]) +def test_constructor_before_dst_switch(epoch): + # GH 31043 + # Make sure that calling Timestamp constructor + # on time just before DST switch doesn't lead to + # nonexistent time or value change + ts = Timestamp(epoch, tz="dateutil/America/Los_Angeles") + result = ts.tz.dst(ts) + expected = timedelta(seconds=0) + assert Timestamp(ts).value == epoch + assert result == expected + + +def test_timestamp_constructor_identity(): + # Test for #30543 + expected = Timestamp("2017-01-01T12") + result = Timestamp(expected) + assert result is expected diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 692eb6cd8bc43..cee7ac450e411 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -2,11 +2,9 @@ import calendar from datetime import datetime, timedelta -from distutils.version import LooseVersion import locale import unicodedata -import dateutil from dateutil.tz import tzutc import numpy as np import pytest @@ -14,12 +12,10 @@ from pytz import timezone, utc from pandas._libs.tslibs.timezones import dateutil_gettz as gettz, get_timezone -import pandas.compat as compat from pandas.compat.numpy import np_datetime64_compat -from pandas.errors import OutOfBoundsDatetime import pandas.util._test_decorators as td -from pandas import NaT, Period, Timedelta, Timestamp +from pandas import NaT, Timedelta, Timestamp import pandas._testing as tm from pandas.tseries import offsets @@ -198,513 +194,6 @@ def test_resolution(self): assert Timestamp.resolution == Timedelta(nanoseconds=1) -class TestTimestampConstructors: - def test_constructor(self): - base_str = "2014-07-01 09:00" - base_dt = datetime(2014, 7, 1, 9) - base_expected = 1_404_205_200_000_000_000 - - # confirm base representation is correct - assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected - - tests = [ - (base_str, base_dt, base_expected), - ( - "2014-07-01 10:00", - datetime(2014, 7, 1, 10), - base_expected + 3600 * 1_000_000_000, - ), - ( - "2014-07-01 09:00:00.000008000", - datetime(2014, 7, 1, 9, 0, 0, 8), - base_expected + 8000, - ), - ( - "2014-07-01 09:00:00.000000005", - Timestamp("2014-07-01 09:00:00.000000005"), - base_expected + 5, - ), - ] - - timezones = [ - (None, 0), - ("UTC", 0), - (pytz.utc, 0), - ("Asia/Tokyo", 9), - ("US/Eastern", -4), - ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5), - ] - - for date_str, date, expected in tests: - for result in [Timestamp(date_str), Timestamp(date)]: - # only with timestring - assert result.value == expected - - # re-creation shouldn't affect to internal value - result = Timestamp(result) - assert result.value == expected - - # with timezone - for tz, offset in timezones: - for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]: - expected_tz = expected - offset * 3600 * 1_000_000_000 - assert result.value == expected_tz - - # should preserve tz - result = Timestamp(result) - assert result.value == expected_tz - - # should convert to UTC - if tz is not None: - result = Timestamp(result).tz_convert("UTC") - else: - result = Timestamp(result, tz="UTC") - expected_utc = expected - offset * 3600 * 1_000_000_000 - assert result.value == expected_utc - - def test_constructor_with_stringoffset(self): - # GH 7833 - base_str = "2014-07-01 11:00:00+02:00" - base_dt = datetime(2014, 7, 1, 9) - base_expected = 1_404_205_200_000_000_000 - - # confirm base 
representation is correct - assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected - - tests = [ - (base_str, base_expected), - ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1_000_000_000), - ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000), - ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5), - ] - - timezones = [ - (None, 0), - ("UTC", 0), - (pytz.utc, 0), - ("Asia/Tokyo", 9), - ("US/Eastern", -4), - ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5), - ] - - for date_str, expected in tests: - for result in [Timestamp(date_str)]: - # only with timestring - assert result.value == expected - - # re-creation shouldn't affect to internal value - result = Timestamp(result) - assert result.value == expected - - # with timezone - for tz, offset in timezones: - result = Timestamp(date_str, tz=tz) - expected_tz = expected - assert result.value == expected_tz - - # should preserve tz - result = Timestamp(result) - assert result.value == expected_tz - - # should convert to UTC - result = Timestamp(result).tz_convert("UTC") - expected_utc = expected - assert result.value == expected_utc - - # This should be 2013-11-01 05:00 in UTC - # converted to Chicago tz - result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago") - assert result.value == Timestamp("2013-11-01 05:00").value - expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa - assert repr(result) == expected - assert result == eval(repr(result)) - - # This should be 2013-11-01 05:00 in UTC - # converted to Tokyo tz (+09:00) - result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo") - assert result.value == Timestamp("2013-11-01 05:00").value - expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" - assert repr(result) == expected - assert result == eval(repr(result)) - - # GH11708 - # This should be 2015-11-18 10:00 in UTC - # converted to Asia/Katmandu - result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu") - assert result.value == Timestamp("2015-11-18 10:00").value - expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')" - assert repr(result) == expected - assert result == eval(repr(result)) - - # This should be 2015-11-18 10:00 in UTC - # converted to Asia/Kolkata - result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata") - assert result.value == Timestamp("2015-11-18 10:00").value - expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')" - assert repr(result) == expected - assert result == eval(repr(result)) - - def test_constructor_invalid(self): - with pytest.raises(TypeError, match="Cannot convert input"): - Timestamp(slice(2)) - with pytest.raises(ValueError, match="Cannot convert Period"): - Timestamp(Period("1000-01-01")) - - def test_constructor_invalid_tz(self): - # GH#17690 - with pytest.raises(TypeError, match="must be a datetime.tzinfo"): - Timestamp("2017-10-22", tzinfo="US/Eastern") - - with pytest.raises(ValueError, match="at most one of"): - Timestamp("2017-10-22", tzinfo=utc, tz="UTC") - - with pytest.raises(ValueError, match="Invalid frequency:"): - # GH#5168 - # case where user tries to pass tz as an arg, not kwarg, gets - # interpreted as a `freq` - Timestamp("2012-01-01", "US/Pacific") - - def test_constructor_strptime(self): - # GH25016 - # Test support for Timestamp.strptime - fmt = "%Y%m%d-%H%M%S-%f%z" - ts = "20190129-235348-000001+0000" - with pytest.raises(NotImplementedError): - Timestamp.strptime(ts, 
fmt) - - def test_constructor_tz_or_tzinfo(self): - # GH#17943, GH#17690, GH#5168 - stamps = [ - Timestamp(year=2017, month=10, day=22, tz="UTC"), - Timestamp(year=2017, month=10, day=22, tzinfo=utc), - Timestamp(year=2017, month=10, day=22, tz=utc), - Timestamp(datetime(2017, 10, 22), tzinfo=utc), - Timestamp(datetime(2017, 10, 22), tz="UTC"), - Timestamp(datetime(2017, 10, 22), tz=utc), - ] - assert all(ts == stamps[0] for ts in stamps) - - def test_constructor_positional(self): - # see gh-10758 - with pytest.raises(TypeError): - Timestamp(2000, 1) - with pytest.raises(ValueError): - Timestamp(2000, 0, 1) - with pytest.raises(ValueError): - Timestamp(2000, 13, 1) - with pytest.raises(ValueError): - Timestamp(2000, 1, 0) - with pytest.raises(ValueError): - Timestamp(2000, 1, 32) - - # see gh-11630 - assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112")) - assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( - Timestamp("2015-11-12 01:02:03.999999") - ) - - def test_constructor_keyword(self): - # GH 10758 - with pytest.raises(TypeError): - Timestamp(year=2000, month=1) - with pytest.raises(ValueError): - Timestamp(year=2000, month=0, day=1) - with pytest.raises(ValueError): - Timestamp(year=2000, month=13, day=1) - with pytest.raises(ValueError): - Timestamp(year=2000, month=1, day=0) - with pytest.raises(ValueError): - Timestamp(year=2000, month=1, day=32) - - assert repr(Timestamp(year=2015, month=11, day=12)) == repr( - Timestamp("20151112") - ) - - assert repr( - Timestamp( - year=2015, - month=11, - day=12, - hour=1, - minute=2, - second=3, - microsecond=999999, - ) - ) == repr(Timestamp("2015-11-12 01:02:03.999999")) - - def test_constructor_fromordinal(self): - base = datetime(2000, 1, 1) - - ts = Timestamp.fromordinal(base.toordinal(), freq="D") - assert base == ts - assert ts.freq == "D" - assert base.toordinal() == ts.toordinal() - - ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") - assert Timestamp("2000-01-01", tz="US/Eastern") == ts - assert base.toordinal() == ts.toordinal() - - # GH#3042 - dt = datetime(2011, 4, 16, 0, 0) - ts = Timestamp.fromordinal(dt.toordinal()) - assert ts.to_pydatetime() == dt - - # with a tzinfo - stamp = Timestamp("2011-4-16", tz="US/Eastern") - dt_tz = stamp.to_pydatetime() - ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") - assert ts.to_pydatetime() == dt_tz - - @pytest.mark.parametrize( - "result", - [ - Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), - Timestamp( - year=2000, - month=1, - day=2, - hour=3, - minute=4, - second=5, - microsecond=6, - nanosecond=1, - ), - Timestamp( - year=2000, - month=1, - day=2, - hour=3, - minute=4, - second=5, - microsecond=6, - nanosecond=1, - tz="UTC", - ), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), - ], - ) - def test_constructor_nanosecond(self, result): - # GH 18898 - expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) - expected = expected + Timedelta(nanoseconds=1) - assert result == expected - - @pytest.mark.parametrize("z", ["Z0", "Z00"]) - def test_constructor_invalid_Z0_isostring(self, z): - # GH 8910 - with pytest.raises(ValueError): - Timestamp("2014-11-02 01:00{}".format(z)) - - @pytest.mark.parametrize( - "arg", - [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ], - ) - def test_invalid_date_kwarg_with_string_input(self, arg): - kwarg = {arg: 1} - with pytest.raises(ValueError): - Timestamp("2010-10-10 
12:59:59.999999999", **kwarg) - - def test_out_of_bounds_integer_value(self): - # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError - with pytest.raises(OutOfBoundsDatetime): - Timestamp(Timestamp.max.value * 2) - with pytest.raises(OutOfBoundsDatetime): - Timestamp(Timestamp.min.value * 2) - - def test_out_of_bounds_value(self): - one_us = np.timedelta64(1).astype("timedelta64[us]") - - # By definition we can't go out of bounds in [ns], so we - # convert the datetime64s to [us] so we can go out of bounds - min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]") - max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]") - - # No error for the min/max datetimes - Timestamp(min_ts_us) - Timestamp(max_ts_us) - - # One us less than the minimum is an error - with pytest.raises(ValueError): - Timestamp(min_ts_us - one_us) - - # One us more than the maximum is an error - with pytest.raises(ValueError): - Timestamp(max_ts_us + one_us) - - def test_out_of_bounds_string(self): - with pytest.raises(ValueError): - Timestamp("1676-01-01") - with pytest.raises(ValueError): - Timestamp("2263-01-01") - - def test_barely_out_of_bounds(self): - # GH#19529 - # GH#19382 close enough to bounds that dropping nanos would result - # in an in-bounds datetime - with pytest.raises(OutOfBoundsDatetime): - Timestamp("2262-04-11 23:47:16.854775808") - - def test_bounds_with_different_units(self): - out_of_bounds_dates = ("1677-09-21", "2262-04-12") - - time_units = ("D", "h", "m", "s", "ms", "us") - - for date_string in out_of_bounds_dates: - for unit in time_units: - dt64 = np.datetime64(date_string, unit) - with pytest.raises(ValueError): - Timestamp(dt64) - - in_bounds_dates = ("1677-09-23", "2262-04-11") - - for date_string in in_bounds_dates: - for unit in time_units: - dt64 = np.datetime64(date_string, unit) - Timestamp(dt64) - - def test_min_valid(self): - # Ensure that Timestamp.min is a valid Timestamp - Timestamp(Timestamp.min) - - def test_max_valid(self): - # Ensure that Timestamp.max is a valid Timestamp - Timestamp(Timestamp.max) - - def test_now(self): - # GH#9000 - ts_from_string = Timestamp("now") - ts_from_method = Timestamp.now() - ts_datetime = datetime.now() - - ts_from_string_tz = Timestamp("now", tz="US/Eastern") - ts_from_method_tz = Timestamp.now(tz="US/Eastern") - - # Check that the delta between the times is less than 1s (arbitrarily - # small) - delta = Timedelta(seconds=1) - assert abs(ts_from_method - ts_from_string) < delta - assert abs(ts_datetime - ts_from_method) < delta - assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert ( - abs( - ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None) - ) - < delta - ) - - def test_today(self): - ts_from_string = Timestamp("today") - ts_from_method = Timestamp.today() - ts_datetime = datetime.today() - - ts_from_string_tz = Timestamp("today", tz="US/Eastern") - ts_from_method_tz = Timestamp.today(tz="US/Eastern") - - # Check that the delta between the times is less than 1s (arbitrarily - # small) - delta = Timedelta(seconds=1) - assert abs(ts_from_method - ts_from_string) < delta - assert abs(ts_datetime - ts_from_method) < delta - assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert ( - abs( - ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None) - ) - < delta - ) - - @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) - def test_disallow_setting_tz(self, tz): - # GH 3746 - ts = Timestamp("2010") - with pytest.raises(AttributeError): - 
ts.tz = tz - - @pytest.mark.parametrize("offset", ["+0300", "+0200"]) - def test_construct_timestamp_near_dst(self, offset): - # GH 20854 - expected = Timestamp( - "2016-10-30 03:00:00{}".format(offset), tz="Europe/Helsinki" - ) - result = Timestamp(expected).tz_convert("Europe/Helsinki") - assert result == expected - - @pytest.mark.parametrize( - "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"] - ) - def test_construct_with_different_string_format(self, arg): - # GH 12064 - result = Timestamp(arg) - expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) - assert result == expected - - def test_construct_timestamp_preserve_original_frequency(self): - # GH 22311 - result = Timestamp(Timestamp("2010-08-08", freq="D")).freq - expected = offsets.Day() - assert result == expected - - def test_constructor_invalid_frequency(self): - # GH 22311 - with pytest.raises(ValueError, match="Invalid frequency:"): - Timestamp("2012-01-01", freq=[]) - - @pytest.mark.parametrize("box", [datetime, Timestamp]) - def test_raise_tz_and_tzinfo_in_datetime_input(self, box): - # GH 23579 - kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": utc} - with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): - Timestamp(box(**kwargs), tz="US/Pacific") - with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): - Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) - - def test_dont_convert_dateutil_utc_to_pytz_utc(self): - result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) - expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) - assert result == expected - - def test_constructor_subclassed_datetime(self): - # GH 25851 - # ensure that subclassed datetime works for - # Timestamp creation - class SubDatetime(datetime): - pass - - data = SubDatetime(2000, 1, 1) - result = Timestamp(data) - expected = Timestamp(2000, 1, 1) - assert result == expected - - @pytest.mark.skipif( - not compat.PY38, - reason="datetime.fromisocalendar was added in Python version 3.8", - ) - def test_constructor_fromisocalendar(self): - # GH 30395 - expected_timestamp = Timestamp("2000-01-03 00:00:00") - expected_stdlib = datetime.fromisocalendar(2000, 1, 1) - result = Timestamp.fromisocalendar(2000, 1, 1) - assert result == expected_timestamp - assert result == expected_stdlib - assert isinstance(result, Timestamp) - - class TestTimestamp: def test_tz(self): tstr = "2014-02-01 09:00" @@ -1075,34 +564,3 @@ def test_dt_subclass_add_timedelta(lh, rh): result = lh + rh expected = SubDatetime(2000, 1, 1, 1) assert result == expected - - -def test_constructor_ambigous_dst(): - # GH 24329 - # Make sure that calling Timestamp constructor - # on Timestamp created from ambiguous time - # doesn't change Timestamp.value - ts = Timestamp(1382835600000000000, tz="dateutil/Europe/London") - expected = ts.value - result = Timestamp(ts).value - assert result == expected - - -@pytest.mark.xfail( - LooseVersion(compat._optional._get_version(dateutil)) < LooseVersion("2.7.0"), - reason="dateutil moved to Timedelta.total_seconds() in 2.7.0", -) -@pytest.mark.parametrize("epoch", [1552211999999999872, 1552211999999999999]) -def test_constructor_before_dst_switch(epoch): - # GH 31043 - # Make sure that calling Timestamp constructor - # on time just before DST switch doesn't lead to - # nonexistent time or value change - # Works only with dateutil >= 2.7.0 as dateutil overrid - # pandas.Timedelta.total_seconds with - # datetime.timedelta.total_seconds before - ts = 
Timestamp(epoch, tz="dateutil/US/Pacific") - result = ts.tz.dst(ts) - expected = timedelta(seconds=0) - assert Timestamp(ts).value == epoch - assert result == expected diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index 16a29d10eb414..28f3c0f7429f8 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -1,10 +1,7 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_integer - -import pandas as pd -from pandas import Index, Series, Timestamp, date_range, isna +from pandas import Index, Series import pandas._testing as tm from pandas.core.indexing import IndexingError @@ -136,492 +133,3 @@ def test_get_set_boolean_different_order(string_series): sel = string_series[ordered > 0] exp = string_series[string_series > 0] tm.assert_series_equal(sel, exp) - - -def test_where_unsafe_int(sint_dtype): - s = Series(np.arange(10), dtype=sint_dtype) - mask = s < 5 - - s[mask] = range(2, 7) - expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype=sint_dtype) - - tm.assert_series_equal(s, expected) - - -def test_where_unsafe_float(float_dtype): - s = Series(np.arange(10), dtype=float_dtype) - mask = s < 5 - - s[mask] = range(2, 7) - data = list(range(2, 7)) + list(range(5, 10)) - expected = Series(data, dtype=float_dtype) - - tm.assert_series_equal(s, expected) - - -@pytest.mark.parametrize( - "dtype,expected_dtype", - [ - (np.int8, np.float64), - (np.int16, np.float64), - (np.int32, np.float64), - (np.int64, np.float64), - (np.float32, np.float32), - (np.float64, np.float64), - ], -) -def test_where_unsafe_upcast(dtype, expected_dtype): - # see gh-9743 - s = Series(np.arange(10), dtype=dtype) - values = [2.5, 3.5, 4.5, 5.5, 6.5] - mask = s < 5 - expected = Series(values + list(range(5, 10)), dtype=expected_dtype) - s[mask] = values - tm.assert_series_equal(s, expected) - - -def test_where_unsafe(): - # see gh-9731 - s = Series(np.arange(10), dtype="int64") - values = [2.5, 3.5, 4.5, 5.5] - - mask = s > 5 - expected = Series(list(range(6)) + values, dtype="float64") - - s[mask] = values - tm.assert_series_equal(s, expected) - - # see gh-3235 - s = Series(np.arange(10), dtype="int64") - mask = s < 5 - s[mask] = range(2, 7) - expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64") - tm.assert_series_equal(s, expected) - assert s.dtype == expected.dtype - - s = Series(np.arange(10), dtype="int64") - mask = s > 5 - s[mask] = [0] * 4 - expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64") - tm.assert_series_equal(s, expected) - - s = Series(np.arange(10)) - mask = s > 5 - - msg = "cannot assign mismatch length to masked array" - with pytest.raises(ValueError, match=msg): - s[mask] = [5, 4, 3, 2, 1] - - with pytest.raises(ValueError, match=msg): - s[mask] = [0] * 5 - - # dtype changes - s = Series([1, 2, 3, 4]) - result = s.where(s > 2, np.nan) - expected = Series([np.nan, np.nan, 3, 4]) - tm.assert_series_equal(result, expected) - - # GH 4667 - # setting with None changes dtype - s = Series(range(10)).astype(float) - s[8] = None - result = s[8] - assert isna(result) - - s = Series(range(10)).astype(float) - s[s > 8] = None - result = s[isna(s)] - expected = Series(np.nan, index=[9]) - tm.assert_series_equal(result, expected) - - -def test_where(): - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.where(cond).dropna() - rs2 = s[cond] - tm.assert_series_equal(rs, rs2) - - rs = s.where(cond, -s) - tm.assert_series_equal(rs, s.abs()) 
- - rs = s.where(cond) - assert s.shape == rs.shape - assert rs is not s - - # test alignment - cond = Series([True, False, False, True, False], index=s.index) - s2 = -(s.abs()) - - expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index) - rs = s2.where(cond[:3]) - tm.assert_series_equal(rs, expected) - - expected = s2.abs() - expected.iloc[0] = s2[0] - rs = s2.where(cond[:3], -s2) - tm.assert_series_equal(rs, expected) - - -def test_where_error(): - s = Series(np.random.randn(5)) - cond = s > 0 - - msg = "Array conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - s.where(1) - with pytest.raises(ValueError, match=msg): - s.where(cond[:3].values, -s) - - # GH 2745 - s = Series([1, 2]) - s[[True, False]] = [0, 1] - expected = Series([0, 2]) - tm.assert_series_equal(s, expected) - - # failures - msg = "cannot assign mismatch length to masked array" - with pytest.raises(ValueError, match=msg): - s[[True, False]] = [0, 2, 3] - msg = ( - "NumPy boolean array indexing assignment cannot assign 0 input " - "values to the 1 output values where the mask is true" - ) - with pytest.raises(ValueError, match=msg): - s[[True, False]] = [] - - -@pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) -def test_where_array_like(klass): - # see gh-15414 - s = Series([1, 2, 3]) - cond = [False, True, True] - expected = Series([np.nan, 2, 3]) - - result = s.where(klass(cond)) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "cond", - [ - [1, 0, 1], - Series([2, 5, 7]), - ["True", "False", "True"], - [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")], - ], -) -def test_where_invalid_input(cond): - # see gh-15414: only boolean arrays accepted - s = Series([1, 2, 3]) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - s.where(cond) - - msg = "Array conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - s.where([True]) - - -def test_where_ndframe_align(): - msg = "Array conditional must be same shape as self" - s = Series([1, 2, 3]) - - cond = [True] - with pytest.raises(ValueError, match=msg): - s.where(cond) - - expected = Series([1, np.nan, np.nan]) - - out = s.where(Series(cond)) - tm.assert_series_equal(out, expected) - - cond = np.array([False, True, False, True]) - with pytest.raises(ValueError, match=msg): - s.where(cond) - - expected = Series([np.nan, 2, np.nan]) - - out = s.where(Series(cond)) - tm.assert_series_equal(out, expected) - - -def test_where_setitem_invalid(): - # GH 2702 - # make sure correct exceptions are raised on invalid list assignment - - msg = "cannot set using a {} indexer with a different length than the value" - - # slice - s = Series(list("abc")) - - with pytest.raises(ValueError, match=msg.format("slice")): - s[0:3] = list(range(27)) - - s[0:3] = list(range(3)) - expected = Series([0, 1, 2]) - tm.assert_series_equal(s.astype(np.int64), expected) - - # slice with step - s = Series(list("abcdef")) - - with pytest.raises(ValueError, match=msg.format("slice")): - s[0:4:2] = list(range(27)) - - s = Series(list("abcdef")) - s[0:4:2] = list(range(2)) - expected = Series([0, "b", 1, "d", "e", "f"]) - tm.assert_series_equal(s, expected) - - # neg slices - s = Series(list("abcdef")) - - with pytest.raises(ValueError, match=msg.format("slice")): - s[:-1] = list(range(27)) - - s[-3:-1] = list(range(2)) - expected = Series(["a", "b", "c", 0, 1, "f"]) - tm.assert_series_equal(s, expected) - - # list - s = Series(list("abc")) - - with 
pytest.raises(ValueError, match=msg.format("list-like")): - s[[0, 1, 2]] = list(range(27)) - - s = Series(list("abc")) - - with pytest.raises(ValueError, match=msg.format("list-like")): - s[[0, 1, 2]] = list(range(2)) - - # scalar - s = Series(list("abc")) - s[0] = list(range(10)) - expected = Series([list(range(10)), "b", "c"]) - tm.assert_series_equal(s, expected) - - -@pytest.mark.parametrize("size", range(2, 6)) -@pytest.mark.parametrize( - "mask", [[True, False, False, False, False], [True, False], [False]] -) -@pytest.mark.parametrize( - "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min] -) -# Test numpy arrays, lists and tuples as the input to be -# broadcast -@pytest.mark.parametrize( - "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] -) -def test_broadcast(size, mask, item, box): - selection = np.resize(mask, size) - - data = np.arange(size, dtype=float) - - # Construct the expected series by taking the source - # data or item based on the selection - expected = Series( - [item if use_item else data[i] for i, use_item in enumerate(selection)] - ) - - s = Series(data) - s[selection] = box(item) - tm.assert_series_equal(s, expected) - - s = Series(data) - result = s.where(~selection, box(item)) - tm.assert_series_equal(result, expected) - - s = Series(data) - result = s.mask(selection, box(item)) - tm.assert_series_equal(result, expected) - - -def test_where_inplace(): - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.copy() - - rs.where(cond, inplace=True) - tm.assert_series_equal(rs.dropna(), s[cond]) - tm.assert_series_equal(rs, s.where(cond)) - - rs = s.copy() - rs.where(cond, -s, inplace=True) - tm.assert_series_equal(rs, s.where(cond, -s)) - - -def test_where_dups(): - # GH 4550 - # where crashes with dups in index - s1 = Series(list(range(3))) - s2 = Series(list(range(3))) - comb = pd.concat([s1, s2]) - result = comb.where(comb < 2) - expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2]) - tm.assert_series_equal(result, expected) - - # GH 4548 - # inplace updating not working with dups - comb[comb < 1] = 5 - expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2]) - tm.assert_series_equal(comb, expected) - - comb[comb < 2] += 10 - expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2]) - tm.assert_series_equal(comb, expected) - - -def test_where_numeric_with_string(): - # GH 9280 - s = pd.Series([1, 2, 3]) - w = s.where(s > 1, "X") - - assert not is_integer(w[0]) - assert is_integer(w[1]) - assert is_integer(w[2]) - assert isinstance(w[0], str) - assert w.dtype == "object" - - w = s.where(s > 1, ["X", "Y", "Z"]) - assert not is_integer(w[0]) - assert is_integer(w[1]) - assert is_integer(w[2]) - assert isinstance(w[0], str) - assert w.dtype == "object" - - w = s.where(s > 1, np.array(["X", "Y", "Z"])) - assert not is_integer(w[0]) - assert is_integer(w[1]) - assert is_integer(w[2]) - assert isinstance(w[0], str) - assert w.dtype == "object" - - -def test_where_timedelta_coerce(): - s = Series([1, 2], dtype="timedelta64[ns]") - expected = Series([10, 10]) - mask = np.array([False, False]) - - rs = s.where(mask, [10, 10]) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, 10) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, 10.0) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, 10.0]) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype="object") - tm.assert_series_equal(rs, expected) - - -def 
test_where_datetime_conversion(): - s = Series(date_range("20130102", periods=2)) - expected = Series([10, 10]) - mask = np.array([False, False]) - - rs = s.where(mask, [10, 10]) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, 10) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, 10.0) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, 10.0]) - tm.assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype="object") - tm.assert_series_equal(rs, expected) - - # GH 15701 - timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"] - s = Series([pd.Timestamp(t) for t in timestamps]) - rs = s.where(Series([False, True])) - expected = Series([pd.NaT, s[1]]) - tm.assert_series_equal(rs, expected) - - -def test_where_dt_tz_values(tz_naive_fixture): - ser1 = pd.Series( - pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture) - ) - ser2 = pd.Series( - pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture) - ) - mask = pd.Series([True, True, False]) - result = ser1.where(mask, ser2) - exp = pd.Series( - pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) - ) - tm.assert_series_equal(exp, result) - - -def test_mask(): - # compare with tested results in test_where - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.where(~cond, np.nan) - tm.assert_series_equal(rs, s.mask(cond)) - - rs = s.where(~cond) - rs2 = s.mask(cond) - tm.assert_series_equal(rs, rs2) - - rs = s.where(~cond, -s) - rs2 = s.mask(cond, -s) - tm.assert_series_equal(rs, rs2) - - cond = Series([True, False, False, True, False], index=s.index) - s2 = -(s.abs()) - rs = s2.where(~cond[:3]) - rs2 = s2.mask(cond[:3]) - tm.assert_series_equal(rs, rs2) - - rs = s2.where(~cond[:3], -s2) - rs2 = s2.mask(cond[:3], -s2) - tm.assert_series_equal(rs, rs2) - - msg = "Array conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - s.mask(1) - with pytest.raises(ValueError, match=msg): - s.mask(cond[:3].values, -s) - - # dtype changes - s = Series([1, 2, 3, 4]) - result = s.mask(s > 2, np.nan) - expected = Series([1, 2, np.nan, np.nan]) - tm.assert_series_equal(result, expected) - - # see gh-21891 - s = Series([1, 2]) - res = s.mask([True, False]) - - exp = Series([np.nan, 2]) - tm.assert_series_equal(res, exp) - - -def test_mask_inplace(): - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.copy() - rs.mask(cond, inplace=True) - tm.assert_series_equal(rs.dropna(), s[~cond]) - tm.assert_series_equal(rs, s.mask(cond)) - - rs = s.copy() - rs.mask(cond, -s, inplace=True) - tm.assert_series_equal(rs, s.mask(cond, -s)) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 77085ef547690..acaa9de88a836 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +import re import numpy as np import pytest @@ -147,7 +148,6 @@ def test_frame_datetime64_duplicated(): def test_getitem_setitem_datetime_tz_pytz(): from pytz import timezone as tz - from pandas import date_range N = 50 # testing with timezone, GH #2785 @@ -188,8 +188,6 @@ def test_getitem_setitem_datetime_tz_dateutil(): lambda x: tzutc() if x == "UTC" else gettz(x) ) # handle special case for utc in dateutil - from pandas import date_range - N = 50 # testing with timezone, GH #2785 @@ -372,7 +370,6 @@ def 
test_getitem_median_slice_bug(): def test_datetime_indexing(): - from pandas import date_range index = date_range("1/1/2000", "1/7/2000") index = index.repeat(3) @@ -380,7 +377,7 @@ def test_datetime_indexing(): s = Series(len(index), index=index) stamp = Timestamp("1/8/2000") - with pytest.raises(KeyError, match=r"^947289600000000000$"): + with pytest.raises(KeyError, match=re.escape(repr(stamp))): s[stamp] s[stamp] = 0 assert s[stamp] == 0 @@ -389,7 +386,7 @@ def test_datetime_indexing(): s = Series(len(index), index=index) s = s[::-1] - with pytest.raises(KeyError, match=r"^947289600000000000$"): + with pytest.raises(KeyError, match=re.escape(repr(stamp))): s[stamp] s[stamp] = 0 assert s[stamp] == 0 @@ -495,8 +492,9 @@ def test_duplicate_dates_indexing(dups): expected = Series(np.where(mask, 0, ts), index=ts.index) tm.assert_series_equal(cp, expected) - with pytest.raises(KeyError, match=r"^947116800000000000$"): - ts[datetime(2000, 1, 6)] + key = datetime(2000, 1, 6) + with pytest.raises(KeyError, match=re.escape(repr(key))): + ts[key] # new index ts[datetime(2000, 1, 6)] = 0 diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py new file mode 100644 index 0000000000000..438b61ed203a3 --- /dev/null +++ b/pandas/tests/series/indexing/test_get.py @@ -0,0 +1,134 @@ +import numpy as np + +import pandas as pd +from pandas import Series + + +def test_get(): + # GH 6383 + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ) + ) + + result = s.get(25, 0) + expected = 0 + assert result == expected + + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ), + index=pd.Float64Index( + [ + 25.0, + 36.0, + 49.0, + 64.0, + 81.0, + 100.0, + 121.0, + 144.0, + 169.0, + 196.0, + 1225.0, + 1296.0, + 1369.0, + 1444.0, + 1521.0, + 1600.0, + 1681.0, + 1764.0, + 1849.0, + 1936.0, + ] + ), + ) + + result = s.get(25, 0) + expected = 43 + assert result == expected + + # GH 7407 + # with a boolean accessor + df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3}) + vc = df.i.value_counts() + result = vc.get(99, default="Missing") + assert result == "Missing" + + vc = df.b.value_counts() + result = vc.get(False, default="Missing") + assert result == 3 + + result = vc.get(True, default="Missing") + assert result == "Missing" + + +def test_get_nan(): + # GH 8569 + s = pd.Float64Index(range(10)).to_series() + assert s.get(np.nan) is None + assert s.get(np.nan, default="Missing") == "Missing" + + +def test_get_nan_multiple(): + # GH 8569 + # ensure that fixing "test_get_nan" above hasn't broken get + # with multiple elements + s = pd.Float64Index(range(10)).to_series() + + idx = [2, 30] + assert s.get(idx) is None + + idx = [2, np.nan] + assert s.get(idx) is None + + # GH 17295 - all missing keys + idx = [20, 30] + assert s.get(idx) is None + + idx = [np.nan, np.nan] + assert s.get(idx) is None diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 18dbd22b73b35..fa5c75d5e4ad9 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -17,10 +17,9 @@ def test_basic_indexing(): s = Series(np.random.randn(5), index=["a", "b", "a", "a", "b"]) - msg = "index out of bounds" + msg = "index 5 is out of bounds for axis 0 with size 5" with pytest.raises(IndexError, match=msg): 
s[5] - msg = "index 5 is out of bounds for axis 0 with size 5" with pytest.raises(IndexError, match=msg): s[5] = 0 @@ -29,7 +28,6 @@ def test_basic_indexing(): s = s.sort_index() - msg = r"index out of bounds|^5$" with pytest.raises(IndexError, match=msg): s[5] msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$" @@ -165,11 +163,12 @@ def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1): def test_getitem_out_of_bounds(datetime_series): # don't segfault, GH #495 - msg = "index out of bounds" + msg = r"index \d+ is out of bounds for axis 0 with size \d+" with pytest.raises(IndexError, match=msg): datetime_series[len(datetime_series)] # GH #917 + msg = r"index -\d+ is out of bounds for axis 0 with size \d+" s = Series([], dtype=object) with pytest.raises(IndexError, match=msg): s[-1] @@ -430,7 +429,7 @@ def test_basic_getitem_setitem_corner(datetime_series): @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) def test_setitem_with_tz(tz): orig = pd.Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz)) - assert orig.dtype == "datetime64[ns, {0}]".format(tz) + assert orig.dtype == f"datetime64[ns, {tz}]" # scalar s = orig.copy() @@ -457,7 +456,7 @@ def test_setitem_with_tz(tz): [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], index=[1, 2], ) - assert vals.dtype == "datetime64[ns, {0}]".format(tz) + assert vals.dtype == f"datetime64[ns, {tz}]" s[[1, 2]] = vals exp = pd.Series( @@ -482,7 +481,7 @@ def test_setitem_with_tz_dst(): # GH XXX tz = "US/Eastern" orig = pd.Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz)) - assert orig.dtype == "datetime64[ns, {0}]".format(tz) + assert orig.dtype == f"datetime64[ns, {tz}]" # scalar s = orig.copy() @@ -509,7 +508,7 @@ def test_setitem_with_tz_dst(): [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], index=[1, 2], ) - assert vals.dtype == "datetime64[ns, {0}]".format(tz) + assert vals.dtype == f"datetime64[ns, {tz}]" s[[1, 2]] = vals exp = pd.Series( @@ -883,41 +882,6 @@ def test_pop(): tm.assert_series_equal(k, expected) -def test_take(): - s = Series([-1, 5, 6, 2, 4]) - - actual = s.take([1, 3, 4]) - expected = Series([5, 2, 4], index=[1, 3, 4]) - tm.assert_series_equal(actual, expected) - - actual = s.take([-1, 3, 4]) - expected = Series([4, 2, 4], index=[4, 3, 4]) - tm.assert_series_equal(actual, expected) - - msg = "index {} is out of bounds for( axis 0 with)? 
size 5" - with pytest.raises(IndexError, match=msg.format(10)): - s.take([1, 10]) - with pytest.raises(IndexError, match=msg.format(5)): - s.take([2, 5]) - - -def test_take_categorical(): - # https://github.com/pandas-dev/pandas/issues/20664 - s = Series(pd.Categorical(["a", "b", "c"])) - result = s.take([-2, -2, 0]) - expected = Series( - pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0] - ) - tm.assert_series_equal(result, expected) - - -def test_head_tail(string_series): - tm.assert_series_equal(string_series.head(), string_series[:5]) - tm.assert_series_equal(string_series.head(0), string_series[0:0]) - tm.assert_series_equal(string_series.tail(), string_series[-5:]) - tm.assert_series_equal(string_series.tail(0), string_series[0:0]) - - def test_uint_drop(any_int_dtype): # see GH18311 # assigning series.loc[0] = 4 changed series.dtype to int diff --git a/pandas/tests/series/indexing/test_mask.py b/pandas/tests/series/indexing/test_mask.py new file mode 100644 index 0000000000000..dc4fb530dbb52 --- /dev/null +++ b/pandas/tests/series/indexing/test_mask.py @@ -0,0 +1,65 @@ +import numpy as np +import pytest + +from pandas import Series +import pandas._testing as tm + + +def test_mask(): + # compare with tested results in test_where + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(~cond, np.nan) + tm.assert_series_equal(rs, s.mask(cond)) + + rs = s.where(~cond) + rs2 = s.mask(cond) + tm.assert_series_equal(rs, rs2) + + rs = s.where(~cond, -s) + rs2 = s.mask(cond, -s) + tm.assert_series_equal(rs, rs2) + + cond = Series([True, False, False, True, False], index=s.index) + s2 = -(s.abs()) + rs = s2.where(~cond[:3]) + rs2 = s2.mask(cond[:3]) + tm.assert_series_equal(rs, rs2) + + rs = s2.where(~cond[:3], -s2) + rs2 = s2.mask(cond[:3], -s2) + tm.assert_series_equal(rs, rs2) + + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + s.mask(1) + with pytest.raises(ValueError, match=msg): + s.mask(cond[:3].values, -s) + + # dtype changes + s = Series([1, 2, 3, 4]) + result = s.mask(s > 2, np.nan) + expected = Series([1, 2, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + # see gh-21891 + s = Series([1, 2]) + res = s.mask([True, False]) + + exp = Series([np.nan, 2]) + tm.assert_series_equal(res, exp) + + +def test_mask_inplace(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + rs.mask(cond, inplace=True) + tm.assert_series_equal(rs.dropna(), s[~cond]) + tm.assert_series_equal(rs, s.mask(cond)) + + rs = s.copy() + rs.mask(cond, -s, inplace=True) + tm.assert_series_equal(rs, s.mask(cond, -s)) diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index 3684ca00c2f17..7e73e6366438b 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -1,141 +1,10 @@ import numpy as np import pytest -import pandas as pd from pandas import DataFrame, Index, Series import pandas._testing as tm -def test_get(): - # GH 6383 - s = Series( - np.array( - [ - 43, - 48, - 60, - 48, - 50, - 51, - 50, - 45, - 57, - 48, - 56, - 45, - 51, - 39, - 55, - 43, - 54, - 52, - 51, - 54, - ] - ) - ) - - result = s.get(25, 0) - expected = 0 - assert result == expected - - s = Series( - np.array( - [ - 43, - 48, - 60, - 48, - 50, - 51, - 50, - 45, - 57, - 48, - 56, - 45, - 51, - 39, - 55, - 43, - 54, - 52, - 51, - 54, - ] - ), - index=pd.Float64Index( - [ - 25.0, - 36.0, - 49.0, - 64.0, - 81.0, - 100.0, 
-                121.0,
-                144.0,
-                169.0,
-                196.0,
-                1225.0,
-                1296.0,
-                1369.0,
-                1444.0,
-                1521.0,
-                1600.0,
-                1681.0,
-                1764.0,
-                1849.0,
-                1936.0,
-            ]
-        ),
-    )
-
-    result = s.get(25, 0)
-    expected = 43
-    assert result == expected
-
-    # GH 7407
-    # with a boolean accessor
-    df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3})
-    vc = df.i.value_counts()
-    result = vc.get(99, default="Missing")
-    assert result == "Missing"
-
-    vc = df.b.value_counts()
-    result = vc.get(False, default="Missing")
-    assert result == 3
-
-    result = vc.get(True, default="Missing")
-    assert result == "Missing"
-
-
-def test_get_nan():
-    # GH 8569
-    s = pd.Float64Index(range(10)).to_series()
-    assert s.get(np.nan) is None
-    assert s.get(np.nan, default="Missing") == "Missing"
-
-
-def test_get_nan_multiple():
-    # GH 8569
-    # ensure that fixing "test_get_nan" above hasn't broken get
-    # with multiple elements
-    s = pd.Float64Index(range(10)).to_series()
-
-    idx = [2, 30]
-    assert s.get(idx) is None
-
-    idx = [2, np.nan]
-    assert s.get(idx) is None
-
-    # GH 17295 - all missing keys
-    idx = [20, 30]
-    assert s.get(idx) is None
-
-    idx = [np.nan, np.nan]
-    assert s.get(idx) is None
-
-
 def test_delitem():
     # GH 5542
     # should delete the item inplace
@@ -202,10 +71,9 @@ def test_slice_float64():
 
 def test_getitem_negative_out_of_bounds():
     s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
 
-    msg = "index out of bounds"
+    msg = "index -11 is out of bounds for axis 0 with size 10"
     with pytest.raises(IndexError, match=msg):
         s[-11]
-    msg = "index -11 is out of bounds for axis 0 with size 10"
     with pytest.raises(IndexError, match=msg):
         s[-11] = "foo"
 
@@ -260,9 +128,8 @@ def test_setitem_float_labels():
 
 def test_slice_float_get_set(datetime_series):
     msg = (
-        r"cannot do slice indexing on <class 'pandas\.core\.indexes\."
-        r"datetimes\.DatetimeIndex'> with these indexers \[{key}\] "
-        r"of <class 'float'>"
+        "cannot do slice indexing on DatetimeIndex with these indexers "
+        r"\[{key}\] of type float"
     )
     with pytest.raises(TypeError, match=msg.format(key=r"4\.0")):
         datetime_series[4.0:10.0]
diff --git a/pandas/tests/series/indexing/test_take.py b/pandas/tests/series/indexing/test_take.py
new file mode 100644
index 0000000000000..9368d49e5ff2b
--- /dev/null
+++ b/pandas/tests/series/indexing/test_take.py
@@ -0,0 +1,33 @@
+import pytest
+
+import pandas as pd
+from pandas import Series
+import pandas._testing as tm
+
+
+def test_take():
+    ser = Series([-1, 5, 6, 2, 4])
+
+    actual = ser.take([1, 3, 4])
+    expected = Series([5, 2, 4], index=[1, 3, 4])
+    tm.assert_series_equal(actual, expected)
+
+    actual = ser.take([-1, 3, 4])
+    expected = Series([4, 2, 4], index=[4, 3, 4])
+    tm.assert_series_equal(actual, expected)
+
+    msg = "index {} is out of bounds for( axis 0 with)? 
size 5" + with pytest.raises(IndexError, match=msg.format(10)): + ser.take([1, 10]) + with pytest.raises(IndexError, match=msg.format(5)): + ser.take([2, 5]) + + +def test_take_categorical(): + # https://github.com/pandas-dev/pandas/issues/20664 + ser = Series(pd.Categorical(["a", "b", "c"])) + result = ser.take([-2, -2, 0]) + expected = Series( + pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0] + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py new file mode 100644 index 0000000000000..9703f5afaf689 --- /dev/null +++ b/pandas/tests/series/indexing/test_where.py @@ -0,0 +1,437 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer + +import pandas as pd +from pandas import Series, Timestamp, date_range, isna +import pandas._testing as tm + + +def test_where_unsafe_int(sint_dtype): + s = Series(np.arange(10), dtype=sint_dtype) + mask = s < 5 + + s[mask] = range(2, 7) + expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype=sint_dtype) + + tm.assert_series_equal(s, expected) + + +def test_where_unsafe_float(float_dtype): + s = Series(np.arange(10), dtype=float_dtype) + mask = s < 5 + + s[mask] = range(2, 7) + data = list(range(2, 7)) + list(range(5, 10)) + expected = Series(data, dtype=float_dtype) + + tm.assert_series_equal(s, expected) + + +@pytest.mark.parametrize( + "dtype,expected_dtype", + [ + (np.int8, np.float64), + (np.int16, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + (np.float32, np.float32), + (np.float64, np.float64), + ], +) +def test_where_unsafe_upcast(dtype, expected_dtype): + # see gh-9743 + s = Series(np.arange(10), dtype=dtype) + values = [2.5, 3.5, 4.5, 5.5, 6.5] + mask = s < 5 + expected = Series(values + list(range(5, 10)), dtype=expected_dtype) + s[mask] = values + tm.assert_series_equal(s, expected) + + +def test_where_unsafe(): + # see gh-9731 + s = Series(np.arange(10), dtype="int64") + values = [2.5, 3.5, 4.5, 5.5] + + mask = s > 5 + expected = Series(list(range(6)) + values, dtype="float64") + + s[mask] = values + tm.assert_series_equal(s, expected) + + # see gh-3235 + s = Series(np.arange(10), dtype="int64") + mask = s < 5 + s[mask] = range(2, 7) + expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64") + tm.assert_series_equal(s, expected) + assert s.dtype == expected.dtype + + s = Series(np.arange(10), dtype="int64") + mask = s > 5 + s[mask] = [0] * 4 + expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64") + tm.assert_series_equal(s, expected) + + s = Series(np.arange(10)) + mask = s > 5 + + msg = "cannot assign mismatch length to masked array" + with pytest.raises(ValueError, match=msg): + s[mask] = [5, 4, 3, 2, 1] + + with pytest.raises(ValueError, match=msg): + s[mask] = [0] * 5 + + # dtype changes + s = Series([1, 2, 3, 4]) + result = s.where(s > 2, np.nan) + expected = Series([np.nan, np.nan, 3, 4]) + tm.assert_series_equal(result, expected) + + # GH 4667 + # setting with None changes dtype + s = Series(range(10)).astype(float) + s[8] = None + result = s[8] + assert isna(result) + + s = Series(range(10)).astype(float) + s[s > 8] = None + result = s[isna(s)] + expected = Series(np.nan, index=[9]) + tm.assert_series_equal(result, expected) + + +def test_where(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond).dropna() + rs2 = s[cond] + tm.assert_series_equal(rs, rs2) + + rs = s.where(cond, -s) + 
tm.assert_series_equal(rs, s.abs()) + + rs = s.where(cond) + assert s.shape == rs.shape + assert rs is not s + + # test alignment + cond = Series([True, False, False, True, False], index=s.index) + s2 = -(s.abs()) + + expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index) + rs = s2.where(cond[:3]) + tm.assert_series_equal(rs, expected) + + expected = s2.abs() + expected.iloc[0] = s2[0] + rs = s2.where(cond[:3], -s2) + tm.assert_series_equal(rs, expected) + + +def test_where_error(): + s = Series(np.random.randn(5)) + cond = s > 0 + + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + s.where(1) + with pytest.raises(ValueError, match=msg): + s.where(cond[:3].values, -s) + + # GH 2745 + s = Series([1, 2]) + s[[True, False]] = [0, 1] + expected = Series([0, 2]) + tm.assert_series_equal(s, expected) + + # failures + msg = "cannot assign mismatch length to masked array" + with pytest.raises(ValueError, match=msg): + s[[True, False]] = [0, 2, 3] + msg = ( + "NumPy boolean array indexing assignment cannot assign 0 input " + "values to the 1 output values where the mask is true" + ) + with pytest.raises(ValueError, match=msg): + s[[True, False]] = [] + + +@pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) +def test_where_array_like(klass): + # see gh-15414 + s = Series([1, 2, 3]) + cond = [False, True, True] + expected = Series([np.nan, 2, 3]) + + result = s.where(klass(cond)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "cond", + [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")], + ], +) +def test_where_invalid_input(cond): + # see gh-15414: only boolean arrays accepted + s = Series([1, 2, 3]) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + s.where(cond) + + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + s.where([True]) + + +def test_where_ndframe_align(): + msg = "Array conditional must be same shape as self" + s = Series([1, 2, 3]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + s.where(cond) + + expected = Series([1, np.nan, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + cond = np.array([False, True, False, True]) + with pytest.raises(ValueError, match=msg): + s.where(cond) + + expected = Series([np.nan, 2, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + +def test_where_setitem_invalid(): + # GH 2702 + # make sure correct exceptions are raised on invalid list assignment + + msg = "cannot set using a {} indexer with a different length than the value" + + # slice + s = Series(list("abc")) + + with pytest.raises(ValueError, match=msg.format("slice")): + s[0:3] = list(range(27)) + + s[0:3] = list(range(3)) + expected = Series([0, 1, 2]) + tm.assert_series_equal(s.astype(np.int64), expected) + + # slice with step + s = Series(list("abcdef")) + + with pytest.raises(ValueError, match=msg.format("slice")): + s[0:4:2] = list(range(27)) + + s = Series(list("abcdef")) + s[0:4:2] = list(range(2)) + expected = Series([0, "b", 1, "d", "e", "f"]) + tm.assert_series_equal(s, expected) + + # neg slices + s = Series(list("abcdef")) + + with pytest.raises(ValueError, match=msg.format("slice")): + s[:-1] = list(range(27)) + + s[-3:-1] = list(range(2)) + expected = Series(["a", "b", "c", 0, 1, "f"]) + tm.assert_series_equal(s, expected) + + # list 
+ s = Series(list("abc")) + + with pytest.raises(ValueError, match=msg.format("list-like")): + s[[0, 1, 2]] = list(range(27)) + + s = Series(list("abc")) + + with pytest.raises(ValueError, match=msg.format("list-like")): + s[[0, 1, 2]] = list(range(2)) + + # scalar + s = Series(list("abc")) + s[0] = list(range(10)) + expected = Series([list(range(10)), "b", "c"]) + tm.assert_series_equal(s, expected) + + +@pytest.mark.parametrize("size", range(2, 6)) +@pytest.mark.parametrize( + "mask", [[True, False, False, False, False], [True, False], [False]] +) +@pytest.mark.parametrize( + "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min] +) +# Test numpy arrays, lists and tuples as the input to be +# broadcast +@pytest.mark.parametrize( + "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] +) +def test_broadcast(size, mask, item, box): + selection = np.resize(mask, size) + + data = np.arange(size, dtype=float) + + # Construct the expected series by taking the source + # data or item based on the selection + expected = Series( + [item if use_item else data[i] for i, use_item in enumerate(selection)] + ) + + s = Series(data) + s[selection] = box(item) + tm.assert_series_equal(s, expected) + + s = Series(data) + result = s.where(~selection, box(item)) + tm.assert_series_equal(result, expected) + + s = Series(data) + result = s.mask(selection, box(item)) + tm.assert_series_equal(result, expected) + + +def test_where_inplace(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + + rs.where(cond, inplace=True) + tm.assert_series_equal(rs.dropna(), s[cond]) + tm.assert_series_equal(rs, s.where(cond)) + + rs = s.copy() + rs.where(cond, -s, inplace=True) + tm.assert_series_equal(rs, s.where(cond, -s)) + + +def test_where_dups(): + # GH 4550 + # where crashes with dups in index + s1 = Series(list(range(3))) + s2 = Series(list(range(3))) + comb = pd.concat([s1, s2]) + result = comb.where(comb < 2) + expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + # GH 4548 + # inplace updating not working with dups + comb[comb < 1] = 5 + expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(comb, expected) + + comb[comb < 2] += 10 + expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(comb, expected) + + +def test_where_numeric_with_string(): + # GH 9280 + s = pd.Series([1, 2, 3]) + w = s.where(s > 1, "X") + + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == "object" + + w = s.where(s > 1, ["X", "Y", "Z"]) + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == "object" + + w = s.where(s > 1, np.array(["X", "Y", "Z"])) + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == "object" + + +def test_where_timedelta_coerce(): + s = Series([1, 2], dtype="timedelta64[ns]") + expected = Series([10, 10]) + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype="object") + 
tm.assert_series_equal(rs, expected) + + +def test_where_datetime_conversion(): + s = Series(date_range("20130102", periods=2)) + expected = Series([10, 10]) + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype="object") + tm.assert_series_equal(rs, expected) + + # GH 15701 + timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"] + s = Series([pd.Timestamp(t) for t in timestamps]) + rs = s.where(Series([False, True])) + expected = Series([pd.NaT, s[1]]) + tm.assert_series_equal(rs, expected) + + +def test_where_dt_tz_values(tz_naive_fixture): + ser1 = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture) + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture) + ) + mask = pd.Series([True, True, False]) + result = ser1.where(mask, ser2) + exp = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) + ) + tm.assert_series_equal(exp, result) diff --git a/pandas/tests/series/indexing/test_xs.py b/pandas/tests/series/indexing/test_xs.py new file mode 100644 index 0000000000000..43458ca2ebeb2 --- /dev/null +++ b/pandas/tests/series/indexing/test_xs.py @@ -0,0 +1,17 @@ +import numpy as np + +import pandas as pd + + +def test_xs_datetimelike_wrapping(): + # GH#31630 a case where we shouldn't wrap datetime64 in Timestamp + arr = pd.date_range("2016-01-01", periods=3)._data._data + + ser = pd.Series(arr, dtype=object) + for i in range(len(ser)): + ser.iloc[i] = arr[i] + assert ser.dtype == object + assert isinstance(ser[0], np.datetime64) + + result = ser.xs(0) + assert isinstance(result, np.datetime64) diff --git a/pandas/tests/series/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py similarity index 100% rename from pandas/tests/series/test_convert_dtypes.py rename to pandas/tests/series/methods/test_convert_dtypes.py diff --git a/pandas/tests/series/methods/test_head_tail.py b/pandas/tests/series/methods/test_head_tail.py new file mode 100644 index 0000000000000..d9f8d85eda350 --- /dev/null +++ b/pandas/tests/series/methods/test_head_tail.py @@ -0,0 +1,8 @@ +import pandas._testing as tm + + +def test_head_tail(string_series): + tm.assert_series_equal(string_series.head(), string_series[:5]) + tm.assert_series_equal(string_series.head(0), string_series[0:0]) + tm.assert_series_equal(string_series.tail(), string_series[-5:]) + tm.assert_series_equal(string_series.tail(0), string_series[0:0]) diff --git a/pandas/tests/series/test_reshaping.py b/pandas/tests/series/methods/test_unstack.py similarity index 100% rename from pandas/tests/series/test_reshaping.py rename to pandas/tests/series/methods/test_unstack.py diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 239353d3955b4..4cb471597b67a 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame, Series, to_datetime import pandas._testing as tm @@ -252,7 +252,6 @@ def test_concat_empty_series_dtypes(self): assert result.dtype == 
expected def test_combine_first_dt64(self): - from pandas.core.tools.datetimes import to_datetime s0 = to_datetime(Series(["2010", np.NaN])) s1 = to_datetime(Series([np.NaN, "2011"])) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 640cd8faf6811..b377ca2869bd3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2534,3 +2534,29 @@ def test_sort_ascending_list(self): result = s.sort_index(level=["third", "first"], ascending=[False, True]) expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "keys, expected", + [ + (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), + (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), + ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), + ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), + ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), + ], + ) + @pytest.mark.parametrize("dim", ["index", "columns"]) + def test_multilevel_index_loc_order(self, dim, keys, expected): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} + df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs,) + exp_index = MultiIndex.from_arrays(expected) + if dim == "index": + res = df.loc[keys, :] + tm.assert_index_equal(res.index, exp_index) + elif dim == "columns": + res = df.loc[:, keys] + tm.assert_index_equal(res.columns, exp_index) diff --git a/pandas/tests/tseries/frequencies/test_to_offset.py b/pandas/tests/tseries/frequencies/test_to_offset.py index b6069c446160d..beaefe9109e91 100644 --- a/pandas/tests/tseries/frequencies/test_to_offset.py +++ b/pandas/tests/tseries/frequencies/test_to_offset.py @@ -86,7 +86,7 @@ def test_to_offset_invalid(freqstr): # We escape string because some of our # inputs contain regex special characters. 
-    msg = re.escape("Invalid frequency: {freqstr}".format(freqstr=freqstr))
+    msg = re.escape(f"Invalid frequency: {freqstr}")
 
     with pytest.raises(ValueError, match=msg):
         frequencies.to_offset(freqstr)
 
diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py
index 6a19adef728e4..8860e6fe272ce 100644
--- a/pandas/tests/util/test_util.py
+++ b/pandas/tests/util/test_util.py
@@ -76,3 +76,8 @@ def test_rng_context():
         with tm.RNGContext(1):
             assert np.random.randn() == expected1
         assert np.random.randn() == expected0
+
+
+def test_external_error_raised():
+    with tm.external_error_raised(TypeError):
+        raise TypeError("Should not check this error message, so it will pass")
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index 2801a2bf9c371..fdfa436ce6536 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -43,7 +43,8 @@ def get_sys_info() -> List[Tuple[str, Optional[Union[str, int]]]]:
         ("python-bits", struct.calcsize("P") * 8),
         ("OS", f"{sysname}"),
         ("OS-release", f"{release}"),
-        # ("Version", "{version}".format(version=version)),
+        # FIXME: don't leave commented-out
+        # ("Version", f"{version}"),
         ("machine", f"{machine}"),
         ("processor", f"{processor}"),
         ("byteorder", f"{sys.byteorder}"),
@@ -114,14 +115,13 @@ def show_versions(as_json=False):
     else:
         maxlen = max(len(x) for x in deps)
-        tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen)
         print("\nINSTALLED VERSIONS")
         print("------------------")
         for k, stat in sys_info:
-            print(tpl.format(k=k, stat=stat))
+            print(f"{k:<{maxlen}}: {stat}")
         print("")
         for k, stat in deps_blob:
-            print(tpl.format(k=k, stat=stat))
+            print(f"{k:<{maxlen}}: {stat}")
 
 def main() -> int:
diff --git a/setup.cfg b/setup.cfg
index cf931f52489a8..c298aa652824c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -138,9 +138,6 @@ ignore_errors=True
 [mypy-pandas.tests.extension.decimal.test_decimal]
 ignore_errors=True
 
-[mypy-pandas.tests.extension.json.array]
-ignore_errors=True
-
 [mypy-pandas.tests.extension.json.test_json]
 ignore_errors=True
 
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index ef0b2a0270a0b..83eb152c9d944 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -35,15 +35,7 @@ navbar:
   - name: "Getting started"
     target: /getting_started.html
   - name: "Documentation"
-    target:
-    - name: "User guide"
-      target: /docs/user_guide/index.html
-    - name: "API reference"
-      target: /docs/reference/index.html
-    - name: "Release notes"
-      target: /docs/whatsnew/index.html
-    - name: "Older versions"
-      target: https://pandas.pydata.org/pandas-docs/version/
+    target: /docs/
   - name: "Community"
     target:
     - name: "Blog"
diff --git a/web/pandas/index.html b/web/pandas/index.html
index fedb0b0c5f712..83d0f48197033 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -63,7 +63,7 @@
With the support of:
{% if releases %}

Latest version: {{ releases[0].name }}