diff --git a/README.md b/README.md
index 1130eb30954dc..5342eda4390eb 100644
--- a/README.md
+++ b/README.md
@@ -5,82 +5,16 @@
-----------------
# pandas: powerful Python data analysis toolkit
-
-
-
- Latest Release |
-
-
-
-
- |
-
- |
-
-
-
-
- |
-
-
- Package Status |
-
-
-
-
- |
-
-
- License |
-
-
-
-
- |
-
-
- Build Status |
-
-
-
-
- |
-
-
- |
-
-
-
-
- |
-
-
- Coverage |
-
-
-
-
- |
-
-
- Downloads |
-
-
-
-
- |
-
-
- Gitter |
-
-
-
-
- |
-
-
-
-
+[](https://pypi.org/project/pandas/)
+[](https://anaconda.org/anaconda/pandas/)
+[](https://pypi.org/project/pandas/)
+[](https://github.com/pandas-dev/pandas/blob/master/LICENSE)
+[](https://travis-ci.org/pandas-dev/pandas)
+[](https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master)
+[](https://codecov.io/gh/pandas-dev/pandas)
+[](https://pandas.pydata.org)
+[](https://gitter.im/pydata/pandas)
+[](https://numfocus.org)
## What is it?
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 0cc42be42d61e..b46989894ae12 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -113,7 +113,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
# Imports - Check formatting using isort see setup.cfg for settings
MSG='Check import format using isort' ; echo $MSG
- ISORT_CMD="isort --recursive --check-only pandas asv_bench"
+ ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench"
if [[ "$GITHUB_ACTIONS" == "true" ]]; then
eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]}))
else
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
index 869d2ab683f0c..6883301a63a9b 100644
--- a/ci/deps/travis-36-cov.yaml
+++ b/ci/deps/travis-36-cov.yaml
@@ -27,8 +27,7 @@ dependencies:
- numexpr
- numpy=1.15.*
- odfpy
- - openpyxl<=3.0.1
- # https://github.com/pandas-dev/pandas/pull/30009 openpyxl 3.0.2 broke
+ - openpyxl
- pandas-gbq
- psycopg2
- pyarrow>=0.13.0
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
index 73e2c20b31438..682b1016ff3a2 100644
--- a/ci/deps/travis-37.yaml
+++ b/ci/deps/travis-37.yaml
@@ -2,7 +2,6 @@ name: pandas-dev
channels:
- defaults
- conda-forge
- - c3i_test
dependencies:
- python=3.7.*
diff --git a/doc/redirects.csv b/doc/redirects.csv
index 0a71f037d23c3..3a990b09e7f7d 100644
--- a/doc/redirects.csv
+++ b/doc/redirects.csv
@@ -46,7 +46,10 @@ developer,development/developer
extending,development/extending
internals,development/internals
-# api
+# api moved function
+reference/api/pandas.io.json.json_normalize,pandas.json_normalize
+
+# api rename
api,reference/index
generated/pandas.api.extensions.ExtensionArray.argsort,../reference/api/pandas.api.extensions.ExtensionArray.argsort
generated/pandas.api.extensions.ExtensionArray.astype,../reference/api/pandas.api.extensions.ExtensionArray.astype
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 28df08a8607b9..c12c148d0f10d 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -209,6 +209,7 @@
"external_links": [],
"github_url": "https://github.com/pandas-dev/pandas",
"twitter_url": "https://twitter.com/pandas_dev",
+ "google_analytics_id": "UA-27880019-2",
}
# Add any paths that contain custom themes here, relative to this directory.
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 4fef5efbd1551..277080006cb3c 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1973,7 +1973,7 @@ Pandas has two ways to store strings.
1. ``object`` dtype, which can hold any Python object, including strings.
2. :class:`StringDtype`, which is dedicated to strings.
-Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` fore more.
+Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` for more.
Finally, arbitrary objects may be stored using the ``object`` dtype, but should
be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 6680ba854cb6f..756dd06aced7f 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -1641,3 +1641,46 @@ when plotting a large number of points.
:suppress:
plt.close('all')
+
+Plotting backends
+-----------------
+
+Starting in version 0.25, pandas can be extended with third-party plotting backends. The
+main idea is to let users select a plotting backend other than the default
+one based on Matplotlib.
+
+This can be done by passing 'backend.module' as the ``backend`` argument to the ``plot``
+function. For example:
+
+.. code-block:: python
+
+    >>> pd.Series([1, 2, 3]).plot(backend='backend.module')
+
+Alternatively, you can also set this option globally, so you don't need to specify
+the keyword in each ``plot`` call. For example:
+
+.. code-block:: python
+
+ >>> pd.set_option('plotting.backend', 'backend.module')
+ >>> pd.Series([1, 2, 3]).plot()
+
+Or:
+
+.. code-block:: python
+
+ >>> pd.options.plotting.backend = 'backend.module'
+ >>> pd.Series([1, 2, 3]).plot()
+
+This would be more or less equivalent to:
+
+.. code-block:: python
+
+ >>> import backend.module
+ >>> backend.module.plot(pd.Series([1, 2, 3]))
+
+The backend module can then use other visualization tools (Bokeh, Altair, hvplot, ...)
+to generate the plots. Some libraries implementing a backend for pandas are listed
+on the ecosystem :ref:`ecosystem.visualization` page.
+
+The developer guide can be found at
+https://dev.pandas.io/docs/development/extending.html#plotting-backends
\ No newline at end of file
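
For illustration, a minimal sketch of what such a backend module could look like (the module name ``my_backend`` is hypothetical, and this assumes pandas only requires a module-level ``plot`` callable, per the developer guide linked above):

.. code-block:: python

    # my_backend.py -- hypothetical third-party plotting backend
    def plot(data, kind=None, **kwargs):
        # pandas passes the object being plotted plus the keyword arguments
        # from the original ``.plot()`` call; a real backend would delegate
        # to Bokeh, Altair, hvplot, ... and return that library's figure
        print(f"would plot a {type(data).__name__} as kind={kind!r}")

    # elsewhere:
    # >>> pd.set_option('plotting.backend', 'my_backend')
    # >>> pd.Series([1, 2, 3]).plot(kind='line')
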
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 111caa81f7169..68aabfe76d8de 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -26,6 +26,7 @@ Version 1.0
v1.0.0
v1.0.1
+ v1.0.2
Version 0.25
------------
diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst
index 801d97b777e00..ef3bb8161d13f 100644
--- a/doc/source/whatsnew/v1.0.1.rst
+++ b/doc/source/whatsnew/v1.0.1.rst
@@ -1,7 +1,7 @@
.. _whatsnew_101:
-What's new in 1.0.1 (??)
-------------------------
+What's new in 1.0.1 (February 5, 2020)
+--------------------------------------
These are the changes in pandas 1.0.1. See :ref:`release` for a full changelog
including other versions of pandas.
@@ -10,126 +10,64 @@ including other versions of pandas.
.. ---------------------------------------------------------------------------
+.. _whatsnew_101.regressions:
-.. _whatsnew_101.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-- Bug in :meth:`GroupBy.apply` was raising ``TypeError`` if called with function which returned a non-pandas non-scalar object (e.g. a list) (:issue:`31441`)
-
-Categorical
-^^^^^^^^^^^
-
--
--
-
-Datetimelike
-^^^^^^^^^^^^
-- Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
-- Fixed bug in :meth:`to_datetime` raising when ``cache=True`` and out-of-bound values are present (:issue:`31491`)
-
-Timedelta
-^^^^^^^^^
-
--
--
-
-Timezones
-^^^^^^^^^
-
--
--
-
-
-Numeric
-^^^^^^^
-- Bug in dtypes being lost in ``DataFrame.__invert__`` (``~`` operator) with mixed dtypes (:issue:`31183`)
-- Bug in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31467`)
--
-
-Conversion
-^^^^^^^^^^
-
--
--
-
-Strings
-^^^^^^^
-
--
--
-
+Fixed regressions
+~~~~~~~~~~~~~~~~~
-Interval
-^^^^^^^^
-
--
--
-
-Indexing
-^^^^^^^^
-
-- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`)
- Fixed regression in :class:`DataFrame` setting values with a slice (e.g. ``df[-4:] = 1``) indexing by label instead of position (:issue:`31469`)
--
--
-- Bug where assigning to a :class:`Series` using a IntegerArray / BooleanArray as a mask would raise ``TypeError`` (:issue:`31446`)
-
-Missing
-^^^^^^^
-
--
--
-
-MultiIndex
-^^^^^^^^^^
-
--
--
+- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`)
+- Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`)
+- Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`)
+- Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`)
+- Fixed regression in ``.groupby()`` aggregations with categorical dtype using Cythonized reduction functions (e.g. ``first``) (:issue:`31450`)
+- Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
+- Fixed regression in :meth:`DataFrame.groupby` whereby taking the minimum or maximum of a column with period dtype would raise a ``TypeError`` (:issue:`31471`)
+- Fixed regression in :meth:`DataFrame.groupby` with an empty DataFrame grouping by a level of a MultiIndex (:issue:`31670`).
+- Fixed regression in :meth:`DataFrame.apply` with object dtype and non-reducing function (:issue:`31505`)
+- Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
+- Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`)
+- Fixed regression in :class:`Categorical` construction with ``numpy.str_`` categories (:issue:`31499`)
+- Fixed regression in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` when selecting a row containing a single ``datetime64`` or ``timedelta64`` column (:issue:`31649`)
+- Fixed regression where setting :attr:`pd.options.display.max_colwidth` was not accepting a negative integer. In addition, this behavior has been deprecated in favor of using ``None`` (:issue:`31532`)
+- Fixed return-type compiler warning in ``objToJSON.c`` (:issue:`31463`)
+- Fixed regression in :meth:`qcut` when passed a nullable integer (:issue:`31389`)
+- Fixed regression in assigning to a :class:`Series` using a nullable integer dtype (:issue:`31446`)
+- Fixed performance regression when indexing a ``DataFrame`` or ``Series`` with a :class:`MultiIndex` for the index using a list of labels (:issue:`31648`)
+- Fixed regression in :meth:`read_csv` where the ``encoding`` option was not recognized when reading from a file-like object based on ``RawIOBase`` (:issue:`31575`)
-I/O
-^^^
+.. ---------------------------------------------------------------------------
-- Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`)
--
--
+.. _whatsnew_101.deprecations:
-Plotting
-^^^^^^^^
+Deprecations
+~~~~~~~~~~~~
--
--
+- Support for negative integer for :attr:`pd.options.display.max_colwidth` is deprecated in favor of using ``None`` (:issue:`31532`)
-Groupby/resample/rolling
-^^^^^^^^^^^^^^^^^^^^^^^^
+.. ---------------------------------------------------------------------------
--
--
+.. _whatsnew_101.bug_fixes:
+Bug fixes
+~~~~~~~~~
-Reshaping
-^^^^^^^^^
+**Datetimelike**
--
--
+- Fixed bug in :meth:`to_datetime` raising when ``cache=True`` and out-of-bound values are present (:issue:`31491`)
-Sparse
-^^^^^^
+**Numeric**
--
--
+- Bug in dtypes being lost in ``DataFrame.__invert__`` (``~`` operator) with mixed dtypes (:issue:`31183`)
+ and for extension-array backed ``Series`` and ``DataFrame`` (:issue:`23087`)
-ExtensionArray
-^^^^^^^^^^^^^^
+**Plotting**
-- Bug in dtype being lost in ``__invert__`` (``~`` operator) for extension-array backed ``Series`` and ``DataFrame`` (:issue:`23087`)
--
+- Plotting a tz-aware timeseries no longer gives a ``UserWarning`` (:issue:`31205`)
+**Interval**
-Other
-^^^^^
--
--
+- Bug in :meth:`Series.shift` with ``interval`` dtype raising a ``TypeError`` when shifting an interval array of integers or datetimes (:issue:`34195`)
.. ---------------------------------------------------------------------------
@@ -137,3 +75,5 @@ Other
Contributors
~~~~~~~~~~~~
+
+.. contributors:: v1.0.0..v1.0.1|HEAD
diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
new file mode 100644
index 0000000000000..70aaaa6d0a60d
--- /dev/null
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -0,0 +1,39 @@
+.. _whatsnew_102:
+
+What's new in 1.0.2 (February ??, 2020)
+---------------------------------------
+
+These are the changes in pandas 1.0.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_102.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+
+- Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`)
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_102.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+**I/O**
+
+- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_102.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.0.1..v1.0.2|HEAD
\ No newline at end of file
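
In user-facing terms, the ``pd.NA`` fix above means roughly the following (a sketch; the exact JSON string assumes the default ``orient``):

.. code-block:: python

    >>> import pandas as pd
    >>> pd.DataFrame({"a": [1, pd.NA]}).to_json()
    '{"a":{"0":1,"1":null}}'  # pd.NA previously serialized as an empty object
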
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index e07a8fa0469f4..aea5695a96388 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -60,7 +60,11 @@ Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`.
Previously a ``AttributeError`` was raised (:issue:`31126`)
-
+- :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` (and similarly for :meth:`~DataFrameGroupBy.median`, :meth:`~DataFrameGroupBy.std` and :meth:`~DataFrameGroupBy.var`)
+  now raise a ``TypeError`` if an unsupported keyword argument is passed.
+  Previously an ``UnsupportedFunctionCall`` was raised (an ``AssertionError`` if ``min_count`` was passed to :meth:`~DataFrameGroupBy.median`) (:issue:`31485`)
+- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
+-
.. ---------------------------------------------------------------------------
@@ -105,11 +109,13 @@ Datetimelike
- Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`)
- :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`)
- Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
+- Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`)
+- Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
Timedelta
^^^^^^^^^
--
+- Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`)
-
Timezones
@@ -150,7 +156,8 @@ Indexing
- Bug in :meth:`PeriodIndex.get_loc` treating higher-resolution strings differently from :meth:`PeriodIndex.get_value` (:issue:`31172`)
- Bug in :meth:`Series.at` and :meth:`DataFrame.at` not matching ``.loc`` behavior when looking up an integer in a :class:`Float64Index` (:issue:`31329`)
- Bug in :meth:`PeriodIndex.is_monotonic` incorrectly returning ``True`` when containing leading ``NaT`` entries (:issue:`31437`)
--
+- Bug in :meth:`DatetimeIndex.get_loc` raising ``KeyError`` with converted-integer key instead of the user-passed key (:issue:`31425`)
+- Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`)
Missing
^^^^^^^
@@ -160,15 +167,24 @@ Missing
MultiIndex
^^^^^^^^^^
+- Bug in :meth:`DataFrame.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`)
--
+.. ipython:: python
+
+ df = pd.DataFrame(np.arange(4),
+ index=[["a", "a", "b", "b"], [1, 2, 1, 2]])
+ # Rows are now ordered as the requested keys
+ df.loc[(['b', 'a'], [2, 1]), :]
-
I/O
^^^
- Bug in :meth:`read_json` where integer overflow was occuring when json contains big number strings. (:issue:`30320`)
--
--
+- :func:`read_csv` will now raise a ``ValueError`` when the arguments ``header`` and ``prefix`` are both not ``None`` (:issue:`27394`)
+- Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`)
+- Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for
+ ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
+ timestamps with ``version="2.0"`` (:issue:`31652`).
Plotting
^^^^^^^^
@@ -210,7 +226,7 @@ Other
^^^^^
- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True``
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
--
+- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
.. ---------------------------------------------------------------------------
diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
index f394aac5c545b..e4859157f73de 100755
--- a/doc/sphinxext/announce.py
+++ b/doc/sphinxext/announce.py
@@ -57,6 +57,16 @@ def get_authors(revision_range):
pat = "^.*\\t(.*)$"
lst_release, cur_release = [r.strip() for r in revision_range.split("..")]
+ if "|" in cur_release:
+ # e.g. v1.0.1|HEAD
+ maybe_tag, head = cur_release.split("|")
+ assert head == "HEAD"
+ if maybe_tag in this_repo.tags:
+ cur_release = maybe_tag
+ else:
+ cur_release = head
+ revision_range = f"{lst_release}..{cur_release}"
+
# authors, in current release and previous to current release.
cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M))
pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M))
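
The ``|HEAD`` handling added above can be illustrated with a standalone sketch of the same parsing logic (``resolve_range`` is a made-up helper name):

.. code-block:: python

    def resolve_range(revision_range, tags):
        # "v1.0.1..v1.0.2|HEAD" means: use the v1.0.2 tag if it already
        # exists, otherwise fall back to the HEAD of the branch
        lst_release, cur_release = [r.strip() for r in revision_range.split("..")]
        if "|" in cur_release:
            maybe_tag, head = cur_release.split("|")
            assert head == "HEAD"
            cur_release = maybe_tag if maybe_tag in tags else head
        return f"{lst_release}..{cur_release}"

    resolve_range("v1.0.1..v1.0.2|HEAD", tags=["v1.0.0", "v1.0.1"])
    # -> 'v1.0.1..HEAD'
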
diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py
index d9ba2bb2cfb07..c2b21e40cadad 100644
--- a/doc/sphinxext/contributors.py
+++ b/doc/sphinxext/contributors.py
@@ -6,7 +6,13 @@
This will be replaced with a message indicating the number of
code contributors and commits, and then list each contributor
-individually.
+individually. For development versions (before a tag is available)
+use::
+
+ .. contributors:: v0.23.0..v0.23.1|HEAD
+
+While the v0.23.1 tag does not exist, that will use the HEAD of the
+branch as the end of the revision range.
"""
from announce import build_components
from docutils import nodes
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index cacd6f5454de7..8b6116d3abd60 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -155,9 +155,7 @@ def _describe_option(pat: str = "", _print_desc: bool = True):
if len(keys) == 0:
raise OptionError("No such keys(s)")
- s = ""
- for k in keys: # filter by pat
- s += _build_option_description(k)
+ s = "\n".join([_build_option_description(k) for k in keys])
if _print_desc:
print(s)
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 7d57c67e70b58..6671375f628e7 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -670,7 +670,9 @@ cdef class StringHashTable(HashTable):
val = values[i]
if isinstance(val, str):
- v = get_c_string(val)
+                # GH#31499 if we have a np.str_ get_c_string won't recognize
+                # it as a str, even though isinstance does.
+                v = get_c_string(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
vecs[i] = v
@@ -703,7 +705,9 @@ cdef class StringHashTable(HashTable):
val = values[i]
if isinstance(val, str):
- v = get_c_string(val)
+                # GH#31499 if we have a np.str_ get_c_string won't recognize
+                # it as a str, even though isinstance does.
+                v = get_c_string(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
vecs[i] = v
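
The comment added here is worth spelling out: ``np.str_`` passes the ``isinstance`` check because it subclasses ``str``, but it is not exactly ``str``, hence the explicit ``<str>`` cast before calling ``get_c_string``:

.. code-block:: python

    >>> import numpy as np
    >>> val = np.str_("a")
    >>> isinstance(val, str)  # np.str_ subclasses str ...
    True
    >>> type(val) is str      # ... but is not exactly str
    False
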
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 1915eaf6e07dd..4185cc2084469 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -1,17 +1,12 @@
-from datetime import datetime, timedelta, date
import warnings
-import cython
-
import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray, intp_t,
float64_t, float32_t,
int64_t, int32_t, int16_t, int8_t,
- uint64_t, uint32_t, uint16_t, uint8_t,
- # Note: NPY_DATETIME, NPY_TIMEDELTA are only available
- # for cimport in cython>=0.27.3
- NPY_DATETIME, NPY_TIMEDELTA)
+ uint64_t, uint32_t, uint16_t, uint8_t
+)
cnp.import_array()
@@ -23,7 +18,7 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp
from pandas._libs.hashtable cimport HashTable
from pandas._libs import algos, hashtable as _hash
-from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
+from pandas._libs.tslibs import Timedelta, period as periodlib
from pandas._libs.missing import checknull
@@ -35,16 +30,6 @@ cdef inline bint is_definitely_invalid_key(object val):
return False
-cpdef get_value_at(ndarray arr, object loc, object tz=None):
- obj = util.get_value_at(arr, loc)
-
- if arr.descr.type_num == NPY_DATETIME:
- return Timestamp(obj, tz=tz)
- elif arr.descr.type_num == NPY_TIMEDELTA:
- return Timedelta(obj)
- return obj
-
-
# Don't populate hash tables in monotonic indexes larger than this
_SIZE_CUTOFF = 1_000_000
@@ -72,35 +57,6 @@ cdef class IndexEngine:
self._ensure_mapping_populated()
return val in self.mapping
- cpdef get_value(self, ndarray arr, object key, object tz=None):
- """
- Parameters
- ----------
- arr : 1-dimensional ndarray
- """
- cdef:
- object loc
-
- loc = self.get_loc(key)
- if isinstance(loc, slice) or util.is_array(loc):
- return arr[loc]
- else:
- return get_value_at(arr, loc, tz=tz)
-
- cpdef set_value(self, ndarray arr, object key, object value):
- """
- Parameters
- ----------
- arr : 1-dimensional ndarray
- """
- cdef:
- object loc
-
- loc = self.get_loc(key)
- value = convert_scalar(arr, value)
-
- arr[loc] = value
-
cpdef get_loc(self, object val):
cdef:
Py_ssize_t loc
@@ -549,54 +505,6 @@ cdef class PeriodEngine(Int64Engine):
return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
-cpdef convert_scalar(ndarray arr, object value):
- # we don't turn integers
- # into datetimes/timedeltas
-
- # we don't turn bools into int/float/complex
-
- if arr.descr.type_num == NPY_DATETIME:
- if util.is_array(value):
- pass
- elif isinstance(value, (datetime, np.datetime64, date)):
- return Timestamp(value).to_datetime64()
- elif util.is_timedelta64_object(value):
- # exclude np.timedelta64("NaT") from value != value below
- pass
- elif value is None or value != value:
- return np.datetime64("NaT", "ns")
- raise ValueError("cannot set a Timestamp with a non-timestamp "
- f"{type(value).__name__}")
-
- elif arr.descr.type_num == NPY_TIMEDELTA:
- if util.is_array(value):
- pass
- elif isinstance(value, timedelta) or util.is_timedelta64_object(value):
- value = Timedelta(value)
- if value is NaT:
- return np.timedelta64("NaT", "ns")
- return value.to_timedelta64()
- elif util.is_datetime64_object(value):
- # exclude np.datetime64("NaT") which would otherwise be picked up
- # by the `value != value check below
- pass
- elif value is None or value != value:
- return np.timedelta64("NaT", "ns")
- raise ValueError("cannot set a Timedelta with a non-timedelta "
- f"{type(value).__name__}")
-
- if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
- not issubclass(arr.dtype.type, np.bool_)):
- if util.is_bool_object(value):
- raise ValueError("Cannot assign bool to float/integer series")
-
- if issubclass(arr.dtype.type, (np.integer, np.bool_)):
- if util.is_float_object(value) and value != value:
- raise ValueError("Cannot assign nan to integer series")
-
- return value
-
-
cdef class BaseMultiIndexCodesEngine:
"""
Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 377d49f2bbd29..3077f73a8d1a4 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -638,7 +638,7 @@ cdef class TextReader:
raise ValueError(f'Unrecognized compression type: '
f'{self.compression}')
- if self.encoding and isinstance(source, io.BufferedIOBase):
+ if self.encoding and isinstance(source, (io.BufferedIOBase, io.RawIOBase)):
source = io.TextIOWrapper(
source, self.encoding.decode('utf-8'), newline='')
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 89164c527002a..43d253f632f0f 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -114,7 +114,8 @@ cdef class Reducer:
if self.typ is not None:
# In this case, we also have self.index
name = labels[i]
- cached_typ = self.typ(chunk, index=self.index, name=name)
+ cached_typ = self.typ(
+ chunk, index=self.index, name=name, dtype=arr.dtype)
# use the cached_typ if possible
if cached_typ is not None:
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
index 62c2870c198c4..8cfc20ffd2c1c 100644
--- a/pandas/_libs/src/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -53,6 +53,7 @@ static PyTypeObject *cls_dataframe;
static PyTypeObject *cls_series;
static PyTypeObject *cls_index;
static PyTypeObject *cls_nat;
+static PyTypeObject *cls_na;
PyObject *cls_timedelta;
npy_int64 get_nat(void) { return NPY_MIN_INT64; }
@@ -127,7 +128,6 @@ typedef struct __PyObjectEncoder {
// pass-through to encode numpy data directly
int npyType;
void *npyValue;
- TypeContext basicTypeContext;
int datetimeIso;
NPY_DATETIMEUNIT datetimeUnit;
@@ -150,6 +150,7 @@ int PdBlock_iterNext(JSOBJ, JSONTypeContext *);
void *initObjToJSON(void) {
PyObject *mod_pandas;
PyObject *mod_nattype;
+ PyObject *mod_natype;
PyObject *mod_decimal = PyImport_ImportModule("decimal");
type_decimal =
(PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal");
@@ -175,8 +176,16 @@ void *initObjToJSON(void) {
Py_DECREF(mod_nattype);
}
+ mod_natype = PyImport_ImportModule("pandas._libs.missing");
+ if (mod_natype) {
+ cls_na = (PyTypeObject *)PyObject_GetAttrString(mod_natype, "NAType");
+ Py_DECREF(mod_natype);
+ }
+
/* Initialise numpy API */
import_array();
+ // GH 31463
+ return NULL;
}
static TypeContext *createTypeContext(void) {
@@ -925,15 +934,15 @@ char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
}
//=============================================================================
-// Iterator iteration functions
+// Set iteration functions
// itemValue is borrowed reference, no ref counting
//=============================================================================
-void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
GET_TC(tc)->itemValue = NULL;
GET_TC(tc)->iterator = PyObject_GetIter(obj);
}
-int Iter_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
PyObject *item;
if (GET_TC(tc)->itemValue) {
@@ -951,7 +960,7 @@ int Iter_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
return 1;
}
-void Iter_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
if (GET_TC(tc)->itemValue) {
Py_DECREF(GET_TC(tc)->itemValue);
GET_TC(tc)->itemValue = NULL;
@@ -963,11 +972,11 @@ void Iter_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
}
}
-JSOBJ Iter_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
return GET_TC(tc)->itemValue;
}
-char *Iter_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
+char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
size_t *Py_UNUSED(outLen)) {
return NULL;
}
@@ -1788,6 +1797,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
"%R (0d array) is not JSON serializable at the moment",
obj);
goto INVALID;
+ } else if (PyObject_TypeCheck(obj, cls_na)) {
+ PRINTMARK();
+ tc->type = JT_NULL;
+ return;
}
ISITERABLE:
@@ -2040,11 +2053,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
} else if (PyAnySet_Check(obj)) {
PRINTMARK();
tc->type = JT_ARRAY;
- pc->iterBegin = Iter_iterBegin;
- pc->iterEnd = Iter_iterEnd;
- pc->iterNext = Iter_iterNext;
- pc->iterGetValue = Iter_iterGetValue;
- pc->iterGetName = Iter_iterGetName;
+ pc->iterBegin = Set_iterBegin;
+ pc->iterEnd = Set_iterEnd;
+ pc->iterNext = Set_iterNext;
+ pc->iterGetValue = Set_iterGetValue;
+ pc->iterGetName = Set_iterGetName;
return;
}
@@ -2115,10 +2128,7 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
PyObject_Free(GET_TC(tc)->cStr);
GET_TC(tc)->cStr = NULL;
- if (tc->prv !=
- &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT
- PyObject_Free(tc->prv);
- }
+ PyObject_Free(tc->prv);
tc->prv = NULL;
}
}
@@ -2216,16 +2226,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
pyEncoder.datetimeUnit = NPY_FR_ms;
pyEncoder.outputFormat = COLUMNS;
pyEncoder.defaultHandler = 0;
- pyEncoder.basicTypeContext.newObj = NULL;
- pyEncoder.basicTypeContext.dictObj = NULL;
- pyEncoder.basicTypeContext.itemValue = NULL;
- pyEncoder.basicTypeContext.itemName = NULL;
- pyEncoder.basicTypeContext.attrList = NULL;
- pyEncoder.basicTypeContext.iterator = NULL;
- pyEncoder.basicTypeContext.cStr = NULL;
- pyEncoder.basicTypeContext.npyarr = NULL;
- pyEncoder.basicTypeContext.rowLabels = NULL;
- pyEncoder.basicTypeContext.columnLabels = NULL;
PRINTMARK();
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index e0862b9250045..bf38fcfb6103c 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -29,7 +29,7 @@ from pandas._libs.tslibs.util cimport (
from pandas._libs.tslibs.timedeltas cimport cast_from_unit
from pandas._libs.tslibs.timezones cimport (
is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info,
- get_timezone, maybe_get_tz, tz_compare, treat_tz_as_dateutil)
+ get_timezone, maybe_get_tz, tz_compare)
from pandas._libs.tslibs.timezones import UTC
from pandas._libs.tslibs.parsing import parse_datetime_string
@@ -341,14 +341,6 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz,
obj.tzinfo = tz
else:
obj.value = pydatetime_to_dt64(ts, &obj.dts)
- # GH 24329 When datetime is ambiguous,
- # pydatetime_to_dt64 doesn't take DST into account
- # but with dateutil timezone, get_utcoffset does
- # so we need to correct for it
- if treat_tz_as_dateutil(ts.tzinfo):
- if ts.tzinfo.is_ambiguous(ts):
- dst_offset = ts.tzinfo.dst(ts)
- obj.value += int(dst_offset.total_seconds() * 1e9)
obj.tzinfo = ts.tzinfo
if obj.tzinfo is not None and not is_utc(obj.tzinfo):
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 357f183b3a845..9f6f401a1a5f5 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -2,7 +2,7 @@ from cpython.object cimport (
PyObject_RichCompare,
Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE)
-from cpython.datetime cimport (datetime,
+from cpython.datetime cimport (datetime, timedelta,
PyDateTime_Check, PyDelta_Check,
PyDateTime_IMPORT)
@@ -276,13 +276,6 @@ cdef class _NaT(datetime):
def __long__(self):
return NPY_NAT
- def total_seconds(self):
- """
- Total duration of timedelta in seconds (to microsecond precision).
- """
- # GH#10939
- return np.nan
-
@property
def is_leap_year(self):
return False
@@ -386,6 +379,7 @@ class NaTType(_NaT):
# nan methods
weekday = _make_nan_func('weekday', datetime.weekday.__doc__)
isoweekday = _make_nan_func('isoweekday', datetime.isoweekday.__doc__)
+ total_seconds = _make_nan_func('total_seconds', timedelta.total_seconds.__doc__)
month_name = _make_nan_func('month_name', # noqa:E128
"""
Return the month name of the Timestamp with specified locale.
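
With this change ``NaT.total_seconds`` is generated by ``_make_nan_func``, so it still returns ``nan`` while reusing ``timedelta.total_seconds``'s docstring instead of a hand-written one:

.. code-block:: python

    >>> import pandas as pd
    >>> pd.NaT.total_seconds()
    nan
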
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index 3dd560ece188d..9419f0eba39aa 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -22,7 +22,7 @@ PyDateTime_IMPORT
from pandas._libs.tslibs.np_datetime cimport (
npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct,
pandas_datetime_to_datetimestruct, check_dts_bounds,
- NPY_DATETIMEUNIT, NPY_FR_D)
+ NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us)
cdef extern from "src/datetime/np_datetime.h":
int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr,
@@ -272,6 +272,8 @@ cdef int64_t DtoB_weekday(int64_t unix_date) nogil:
cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back,
int64_t unix_date) nogil:
+ # calculate the current week (counting from 1970-01-01) treating
+ # sunday as last day of a week
cdef:
int day_of_week = dayofweek(dts.year, dts.month, dts.day)
@@ -473,9 +475,6 @@ cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year) nogil:
int quarter
pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts)
- # TODO: Another version of this function used
- # date_info_from_days_and_time(&dts, unix_date, 0)
- # instead of pandas_datetime_to_datetimestruct; is one more performant?
if af_info.to_end != 12:
dts.month -= af_info.to_end
if dts.month <= 0:
@@ -509,14 +508,18 @@ cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info) nogil:
cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info) nogil:
ordinal = downsample_daytime(ordinal, af_info)
- return (ordinal + 3 - af_info.to_end) // 7 + 1
+ return unix_date_to_week(ordinal, af_info.to_end)
+
+
+cdef int64_t unix_date_to_week(int64_t unix_date, int to_end) nogil:
+ return (unix_date + 3 - to_end) // 7 + 1
# --------------------------------------------------------------------
# Conversion _from_ BusinessDay Freq
cdef int64_t asfreq_BtoDT(int64_t ordinal, asfreq_info *af_info) nogil:
- ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 -3
+ ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 - 3
return upsample_daytime(ordinal, af_info)
@@ -753,14 +756,7 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil:
if fmonth == 0:
fmonth = 12
- mdiff = dts.month - fmonth
- # TODO: Aren't the next two conditions equivalent to
- # unconditional incrementing?
- if mdiff < 0:
- mdiff += 12
- if dts.month >= fmonth:
- mdiff += 12
-
+ mdiff = dts.month - fmonth + 12
return (dts.year - 1970) * 4 + (mdiff - 1) // 3
elif freq == FR_MTH:
@@ -797,23 +793,10 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil:
return unix_date
elif freq == FR_BUS:
- # calculate the current week (counting from 1970-01-01) treating
- # sunday as last day of a week
- weeks = (unix_date + 3) // 7
- # calculate the current weekday (in range 1 .. 7)
- delta = (unix_date + 3) % 7 + 1
- # return the number of business days in full weeks plus the business
- # days in the last - possible partial - week
- if delta <= 5:
- return (5 * weeks) + delta - 4
- else:
- return (5 * weeks) + (5 + 1) - 4
+ return DtoB(dts, 0, unix_date)
elif freq_group == FR_WK:
- day_adj = freq - FR_WK
- return (unix_date + 3 - day_adj) // 7 + 1
-
- # raise ValueError
+ return unix_date_to_week(unix_date, freq - FR_WK)
cdef void get_date_info(int64_t ordinal, int freq,
@@ -983,7 +966,7 @@ cdef inline int month_to_quarter(int month) nogil:
@cython.wraparound(False)
@cython.boundscheck(False)
-def dt64arr_to_periodarr(int64_t[:] dtarr, int freq, tz=None):
+def dt64arr_to_periodarr(const int64_t[:] dtarr, int freq, tz=None):
"""
Convert array of datetime64 values (passed in as 'i8' dtype) to a set of
periods corresponding to desired frequency, per period convention.
@@ -1186,7 +1169,12 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1:
if ordinal == NPY_NAT:
return NPY_NAT
- get_date_info(ordinal, freq, &dts)
+ if freq == 11000:
+ # Microsecond, avoid get_date_info to prevent floating point errors
+ pandas_datetime_to_datetimestruct(ordinal, NPY_FR_us, &dts)
+ else:
+ get_date_info(ordinal, freq, &dts)
+
check_dts_bounds(&dts)
return dtstruct_to_dt64(&dts)
@@ -1383,7 +1371,7 @@ cdef int pdays_in_month(int64_t ordinal, int freq):
@cython.wraparound(False)
@cython.boundscheck(False)
-def get_period_field_arr(int code, int64_t[:] arr, int freq):
+def get_period_field_arr(int code, const int64_t[:] arr, int freq):
cdef:
Py_ssize_t i, sz
int64_t[:] out
@@ -1496,7 +1484,7 @@ def extract_freq(ndarray[object] values):
@cython.wraparound(False)
@cython.boundscheck(False)
-cdef int64_t[:] localize_dt64arr_to_period(int64_t[:] stamps,
+cdef int64_t[:] localize_dt64arr_to_period(const int64_t[:] stamps,
int freq, object tz):
cdef:
Py_ssize_t n = len(stamps)
@@ -1584,7 +1572,7 @@ cdef class _Period:
return freq
@classmethod
- def _from_ordinal(cls, ordinal, freq):
+ def _from_ordinal(cls, ordinal: int, freq) -> "Period":
"""
Fast creation from an ordinal and freq that are already validated!
"""
@@ -1704,7 +1692,7 @@ cdef class _Period:
else:
return NotImplemented
- def asfreq(self, freq, how='E'):
+ def asfreq(self, freq, how='E') -> "Period":
"""
Convert Period to desired frequency, at the start or end of the interval.
@@ -1735,7 +1723,7 @@ cdef class _Period:
return Period(ordinal=ordinal, freq=freq)
@property
- def start_time(self):
+ def start_time(self) -> Timestamp:
"""
Get the Timestamp for the start of the period.
@@ -1765,13 +1753,13 @@ cdef class _Period:
return self.to_timestamp(how='S')
@property
- def end_time(self):
+ def end_time(self) -> Timestamp:
# freq.n can't be negative or 0
# ordinal = (self + self.freq.n).start_time.value - 1
ordinal = (self + self.freq).start_time.value - 1
return Timestamp(ordinal)
- def to_timestamp(self, freq=None, how='start', tz=None):
+ def to_timestamp(self, freq=None, how='start', tz=None) -> Timestamp:
"""
Return the Timestamp representation of the Period.
@@ -1811,17 +1799,17 @@ cdef class _Period:
return Timestamp(dt64, tz=tz)
@property
- def year(self):
+ def year(self) -> int:
base, mult = get_freq_code(self.freq)
return pyear(self.ordinal, base)
@property
- def month(self):
+ def month(self) -> int:
base, mult = get_freq_code(self.freq)
return pmonth(self.ordinal, base)
@property
- def day(self):
+ def day(self) -> int:
"""
Get day of the month that a Period falls on.
@@ -1844,7 +1832,7 @@ cdef class _Period:
return pday(self.ordinal, base)
@property
- def hour(self):
+ def hour(self) -> int:
"""
Get the hour of the day component of the Period.
@@ -1874,7 +1862,7 @@ cdef class _Period:
return phour(self.ordinal, base)
@property
- def minute(self):
+ def minute(self) -> int:
"""
Get minute of the hour component of the Period.
@@ -1898,7 +1886,7 @@ cdef class _Period:
return pminute(self.ordinal, base)
@property
- def second(self):
+ def second(self) -> int:
"""
Get the second component of the Period.
@@ -1922,12 +1910,12 @@ cdef class _Period:
return psecond(self.ordinal, base)
@property
- def weekofyear(self):
+ def weekofyear(self) -> int:
base, mult = get_freq_code(self.freq)
return pweek(self.ordinal, base)
@property
- def week(self):
+ def week(self) -> int:
"""
Get the week of the year on the given Period.
@@ -1957,7 +1945,7 @@ cdef class _Period:
return self.weekofyear
@property
- def dayofweek(self):
+ def dayofweek(self) -> int:
"""
Day of the week the period lies in, with Monday=0 and Sunday=6.
@@ -2008,7 +1996,7 @@ cdef class _Period:
return pweekday(self.ordinal, base)
@property
- def weekday(self):
+ def weekday(self) -> int:
"""
Day of the week the period lies in, with Monday=0 and Sunday=6.
@@ -2061,7 +2049,7 @@ cdef class _Period:
return self.dayofweek
@property
- def dayofyear(self):
+ def dayofyear(self) -> int:
"""
Return the day of the year.
@@ -2096,12 +2084,12 @@ cdef class _Period:
return pday_of_year(self.ordinal, base)
@property
- def quarter(self):
+ def quarter(self) -> int:
base, mult = get_freq_code(self.freq)
return pquarter(self.ordinal, base)
@property
- def qyear(self):
+ def qyear(self) -> int:
"""
Fiscal year the Period lies in according to its starting-quarter.
@@ -2145,7 +2133,7 @@ cdef class _Period:
return pqyear(self.ordinal, base)
@property
- def days_in_month(self):
+ def days_in_month(self) -> int:
"""
Get the total number of days in the month that this period falls on.
@@ -2179,7 +2167,7 @@ cdef class _Period:
return pdays_in_month(self.ordinal, base)
@property
- def daysinmonth(self):
+ def daysinmonth(self) -> int:
"""
Get the total number of days of the month that the Period falls in.
@@ -2209,7 +2197,7 @@ cdef class _Period:
return Period(datetime.now(), freq=freq)
@property
- def freqstr(self):
+ def freqstr(self) -> str:
return self.freq.freqstr
def __repr__(self) -> str:
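
The microsecond special case in ``period_ordinal_to_dt64`` above fixes the off-by-one-nanosecond round trip noted in the v1.1.0 release notes; roughly (exact repr assumed):

.. code-block:: python

    >>> import pandas as pd
    >>> pd.Period("2012-01-01 00:00:00.000001", freq="U").to_timestamp()
    Timestamp('2012-01-01 00:00:00.000001')  # previously 1 nanosecond earlier
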
diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx
index c0b20c14e9920..1e0eb7f97ec54 100644
--- a/pandas/_libs/tslibs/resolution.pyx
+++ b/pandas/_libs/tslibs/resolution.pyx
@@ -27,7 +27,7 @@ cdef:
# ----------------------------------------------------------------------
-cpdef resolution(int64_t[:] stamps, tz=None):
+cpdef resolution(const int64_t[:] stamps, tz=None):
cdef:
Py_ssize_t i, n = len(stamps)
npy_datetimestruct dts
@@ -38,7 +38,7 @@ cpdef resolution(int64_t[:] stamps, tz=None):
return _reso_local(stamps, tz)
-cdef _reso_local(int64_t[:] stamps, object tz):
+cdef _reso_local(const int64_t[:] stamps, object tz):
cdef:
Py_ssize_t i, n = len(stamps)
int reso = RESO_DAY, curr_reso
@@ -106,7 +106,7 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts):
return RESO_DAY
-def get_freq_group(freq):
+def get_freq_group(freq) -> int:
"""
Return frequency code group of given frequency str or offset.
@@ -189,7 +189,7 @@ class Resolution:
_freq_reso_map = {v: k for k, v in _reso_freq_map.items()}
@classmethod
- def get_str(cls, reso):
+ def get_str(cls, reso: int) -> str:
"""
Return resolution str against resolution code.
@@ -201,7 +201,7 @@ class Resolution:
return cls._reso_str_map.get(reso, 'day')
@classmethod
- def get_reso(cls, resostr):
+ def get_reso(cls, resostr: str) -> int:
"""
Return resolution str against resolution code.
@@ -216,7 +216,7 @@ class Resolution:
return cls._str_reso_map.get(resostr, cls.RESO_DAY)
@classmethod
- def get_freq_group(cls, resostr):
+ def get_freq_group(cls, resostr: str) -> int:
"""
Return frequency str against resolution str.
@@ -228,7 +228,7 @@ class Resolution:
return get_freq_group(cls.get_freq(resostr))
@classmethod
- def get_freq(cls, resostr):
+ def get_freq(cls, resostr: str) -> str:
"""
Return frequency str against resolution str.
@@ -240,7 +240,7 @@ class Resolution:
return cls._reso_freq_map[resostr]
@classmethod
- def get_str_from_freq(cls, freq):
+ def get_str_from_freq(cls, freq: str) -> str:
"""
Return resolution str against frequency str.
@@ -252,7 +252,7 @@ class Resolution:
return cls._freq_reso_map.get(freq, 'day')
@classmethod
- def get_reso_from_freq(cls, freq):
+ def get_reso_from_freq(cls, freq: str) -> int:
"""
Return resolution code against frequency str.
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 9c031baf70a77..3742506a7f8af 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1,5 +1,4 @@
import collections
-import textwrap
import cython
@@ -859,14 +858,6 @@ cdef class _Timedelta(timedelta):
"""
return self.to_timedelta64()
- def total_seconds(self):
- """
- Total duration of timedelta in seconds (to microsecond precision).
- """
- # GH 31043
- # Microseconds precision to avoid confusing tzinfo.utcoffset
- return (self.value - self.value % 1000) / 1e9
-
def view(self, dtype):
"""
Array view compatibility.
@@ -1250,7 +1241,7 @@ class Timedelta(_Timedelta):
return NaT
# make timedelta happy
- td_base = _Timedelta.__new__(cls, microseconds=int(value) / 1000)
+ td_base = _Timedelta.__new__(cls, microseconds=int(value) // 1000)
td_base.value = value
td_base.is_populated = 0
return td_base
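
The switch from ``/`` to ``//`` matters because float division of a large nanosecond count can round the quotient before it is handed to ``timedelta`` as whole microseconds:

.. code-block:: python

    >>> ns = 10**18 + 999  # a high-precision nanosecond value
    >>> int(ns / 1000)     # float division rounds the quotient ...
    1000000000000001
    >>> ns // 1000         # ... floor division keeps it exact
    1000000000000000
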
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 4915671aa6512..b8c462abe35f1 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -1,4 +1,3 @@
-import sys
import warnings
import numpy as np
diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd
index 15fedbb20beec..828bccf7d5641 100644
--- a/pandas/_libs/util.pxd
+++ b/pandas/_libs/util.pxd
@@ -1,7 +1,5 @@
from pandas._libs.tslibs.util cimport *
-from cython cimport Py_ssize_t
-
cimport numpy as cnp
from numpy cimport ndarray
@@ -51,49 +49,3 @@ cdef inline void set_array_not_contiguous(ndarray ao) nogil:
PyArray_CLEARFLAGS(ao,
(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS))
-
-cdef inline Py_ssize_t validate_indexer(ndarray arr, object loc) except -1:
- """
- Cast the given indexer `loc` to an integer. If it is negative, i.e. a
- python-style indexing-from-the-end indexer, translate it to a
- from-the-front indexer. Raise if this is not possible.
-
- Parameters
- ----------
- arr : ndarray
- loc : object
-
- Returns
- -------
- idx : Py_ssize_t
-
- Raises
- ------
- IndexError
- """
- cdef:
- Py_ssize_t idx, size
- int casted
-
- if is_float_object(loc):
- casted = int(loc)
- if casted == loc:
- loc = casted
-
- idx = loc
- size = cnp.PyArray_SIZE(arr)
-
- if idx < 0 and size > 0:
- idx += size
- if idx >= size or size == 0 or idx < 0:
- raise IndexError('index out of bounds')
-
- return idx
-
-
-cdef inline object get_value_at(ndarray arr, object loc):
- cdef:
- Py_ssize_t i
-
- i = validate_indexer(arr, loc)
- return arr[i]
diff --git a/pandas/_testing.py b/pandas/_testing.py
index 631d550c60534..13af8703cef93 100644
--- a/pandas/_testing.py
+++ b/pandas/_testing.py
@@ -8,7 +8,7 @@
from shutil import rmtree
import string
import tempfile
-from typing import Any, List, Optional, Union, cast
+from typing import Any, Callable, List, Optional, Type, Union, cast
import warnings
import zipfile
@@ -2757,3 +2757,24 @@ def convert_rows_list_to_csv_str(rows_list: List[str]):
sep = os.linesep
expected = sep.join(rows_list) + sep
return expected
+
+
+def external_error_raised(
+ expected_exception: Type[Exception],
+) -> Callable[[Type[Exception], None], None]:
+ """
+ Helper function to mark pytest.raises that have an external error message.
+
+ Parameters
+ ----------
+ expected_exception : Exception
+ Expected error to raise.
+
+ Returns
+ -------
+ Callable
+ Regular `pytest.raises` function with `match` equal to `None`.
+ """
+ import pytest
+
+ return pytest.raises(expected_exception, match=None)
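
Usage in a test would look roughly like this (a sketch; ``pandas._testing`` is conventionally imported as ``tm``):

.. code-block:: python

    import pandas._testing as tm

    def test_third_party_error():
        # assert that an external dependency raises, without coupling the
        # test to that dependency's (potentially unstable) message text
        with tm.external_error_raised(ValueError):
            raise ValueError("whatever message the dependency produces")
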
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 3a6662d3e3ae2..d26ff7490e714 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2504,10 +2504,6 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
>>> s.cat.as_unordered()
"""
- _deprecations = PandasObject._deprecations | frozenset(
- ["categorical", "index", "name"]
- )
-
def __init__(self, data):
self._validate(data)
self._parent = data.values
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 48ad659b771f6..4bfd5f5770b69 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -25,6 +25,7 @@
from pandas.core.dtypes.missing import isna
from pandas.core import nanops, ops
+import pandas.core.common as com
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
@@ -586,9 +587,8 @@ def _reduce(self, name, skipna=True, **kwargs):
# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name in ["sum", "min", "max", "prod"]:
- int_result = int(result)
- if int_result == result:
- result = int_result
+ # GH#31409 more performant than casting-then-checking
+ result = com.cast_scalar_indexer(result)
return result
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 398ed75c060ca..0b35a031bc53f 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -27,6 +27,7 @@
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
+ ABCExtensionArray,
ABCIndexClass,
ABCInterval,
ABCIntervalIndex,
@@ -789,6 +790,33 @@ def size(self) -> int:
# Avoid materializing self.values
return self.left.size
+ def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray:
+ if not len(self) or periods == 0:
+ return self.copy()
+
+ if isna(fill_value):
+ fill_value = self.dtype.na_value
+
+ # ExtensionArray.shift doesn't work for two reasons
+ # 1. IntervalArray.dtype.na_value may not be correct for the dtype.
+ # 2. IntervalArray._from_sequence only accepts NaN for missing values,
+ # not other values like NaT
+
+ empty_len = min(abs(periods), len(self))
+ if isna(fill_value):
+ fill_value = self.left._na_value
+ empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1))
+ else:
+ empty = self._from_sequence([fill_value] * empty_len)
+
+ if periods > 0:
+ a = empty
+ b = self[:-periods]
+ else:
+ a = self[abs(periods) :]
+ b = empty
+ return self._concat_same_type([a, b])
+
def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs):
"""
Take elements from the IntervalArray.
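
A quick sketch of what the new ``shift`` enables (reprs abbreviated in the comments):

.. code-block:: python

    >>> import pandas as pd
    >>> arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])
    >>> arr.shift(1)            # roughly [NaN, (0, 1], (1, 2]]
    >>> pd.Series(arr).shift()  # no longer raises TypeError
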
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index b476a019c66cc..8008805ddcf87 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -9,7 +9,7 @@
import numpy as np
-from pandas._libs import index as libindex, lib
+from pandas._libs import lib
import pandas._libs.sparse as splib
from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex
from pandas._libs.tslibs import NaT
@@ -794,7 +794,9 @@ def _get_val_at(self, loc):
if sp_loc == -1:
return self.fill_value
else:
- return libindex.get_value_at(self.sp_values, sp_loc)
+ val = self.sp_values[sp_loc]
+ val = com.maybe_box_datetimelike(val, self.sp_values.dtype)
+ return val
def take(self, indices, allow_fill=False, fill_value=None):
if is_scalar(indices):
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 9fe1af776dd2b..f3c8b50e774af 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1027,12 +1027,10 @@ def tolist(self):
--------
numpy.ndarray.tolist
"""
- if self.dtype.kind in ["m", "M"]:
- return [com.maybe_box_datetimelike(x) for x in self._values]
- elif is_extension_array_dtype(self._values):
+ if not isinstance(self._values, np.ndarray):
+ # check for ndarray instead of dtype to catch DTA/TDA
return list(self._values)
- else:
- return self._values.tolist()
+ return self._values.tolist()
to_list = tolist
@@ -1049,9 +1047,8 @@ def __iter__(self):
iterator
"""
# We are explicitly making element iterators.
- if self.dtype.kind in ["m", "M"]:
- return map(com.maybe_box_datetimelike, self._values)
- elif is_extension_array_dtype(self._values):
+ if not isinstance(self._values, np.ndarray):
+ # Check type instead of dtype to catch DTA/TDA
return iter(self._values)
else:
return map(self._values.item, range(self._values.size))
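
The ndarray check preserves the boxing behavior for datetime-likes: list-ifying or iterating a ``datetime64`` ``Series`` still yields ``Timestamp`` objects, because ``_values`` is not a plain ndarray there:

.. code-block:: python

    >>> import pandas as pd
    >>> s = pd.Series(pd.to_datetime(["2020-01-01"]))
    >>> s.tolist()
    [Timestamp('2020-01-01 00:00:00')]
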
diff --git a/pandas/core/common.py b/pandas/core/common.py
index a76119da2707a..00c7a41477017 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -72,8 +72,12 @@ def consensus_name_attr(objs):
return name
-def maybe_box_datetimelike(value):
+def maybe_box_datetimelike(value, dtype=None):
# turn a datetime like into a Timestamp/timedelta as needed
+ if dtype == object:
+        # If we don't have datetime64/timedelta64 dtype, we don't want to
+ # box datetimelike scalars
+ return value
if isinstance(value, (np.datetime64, datetime)):
value = tslibs.Timestamp(value)
@@ -156,7 +160,7 @@ def cast_scalar_indexer(val):
outval : scalar
"""
# assumes lib.is_scalar(val)
- if lib.is_float(val) and val == int(val):
+ if lib.is_float(val) and val.is_integer():
return int(val)
return val
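
The switch to ``float.is_integer`` is not only shorter: the old ``val == int(val)`` form has to materialize ``int(val)`` first, which raises for non-finite floats:

.. code-block:: python

    >>> (3.0).is_integer()
    True
    >>> float("inf").is_integer()
    False
    >>> int(float("inf"))  # the step the old check had to go through
    Traceback (most recent call last):
    ...
    OverflowError: cannot convert float infinity to integer
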
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 3776c6f816d96..b0410e31c6de7 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -9,6 +9,8 @@
module is imported, register them here rather than in the module.
"""
+import warnings
+
import pandas._config.config as cf
from pandas._config.config import (
is_bool,
@@ -341,8 +343,26 @@ def is_terminal() -> bool:
validator=is_instance_factory([type(None), int]),
)
cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)
+
+ def _deprecate_negative_int_max_colwidth(key):
+ value = cf.get_option(key)
+ if value is not None and value < 0:
+ warnings.warn(
+ "Passing a negative integer is deprecated in version 1.0 and "
+ "will not be supported in future version. Instead, use None "
+ "to not limit the column width.",
+ FutureWarning,
+ stacklevel=4,
+ )
+
cf.register_option(
- "max_colwidth", 50, max_colwidth_doc, validator=is_nonnegative_int
+ # FIXME: change `validator=is_nonnegative_int`
+ # in version 1.2
+ "max_colwidth",
+ 50,
+ max_colwidth_doc,
+ validator=is_instance_factory([type(None), int]),
+ cb=_deprecate_negative_int_max_colwidth,
)
if is_terminal():
max_cols = 0 # automatically determine optimal number of columns
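
In user-facing terms, the callback above means (a sketch of the expected behavior):

.. code-block:: python

    >>> import pandas as pd
    >>> pd.set_option("display.max_colwidth", -1)    # now emits a FutureWarning
    >>> pd.set_option("display.max_colwidth", None)  # preferred: no width limit
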
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 52c569793e499..0719b8ce6010b 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1,11 +1,18 @@
""" routings for casting """
-from datetime import datetime, timedelta
+from datetime import date, datetime, timedelta
import numpy as np
from pandas._libs import lib, tslib, tslibs
-from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
+from pandas._libs.tslibs import (
+ NaT,
+ OutOfBoundsDatetime,
+ Period,
+ Timedelta,
+ Timestamp,
+ iNaT,
+)
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import Dtype
from pandas.util._validators import validate_bool_kwarg
@@ -1599,3 +1606,59 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False):
if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)):
raise ValueError("Trying to coerce float values to integers")
+
+
+def convert_scalar_for_putitemlike(scalar, dtype: np.dtype):
+ """
+ Convert datetimelike scalar if we are setting into a datetime64
+ or timedelta64 ndarray.
+
+ Parameters
+ ----------
+ scalar : scalar
+    dtype : np.dtype
+
+ Returns
+ -------
+ scalar
+ """
+ if dtype.kind == "m":
+ if isinstance(scalar, (timedelta, np.timedelta64)):
+ # We have to cast after asm8 in case we have NaT
+ return Timedelta(scalar).asm8.view("timedelta64[ns]")
+ elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)):
+ return np.timedelta64("NaT", "ns")
+ if dtype.kind == "M":
+ if isinstance(scalar, (date, np.datetime64)):
+ # Note: we include date, not just datetime
+ return Timestamp(scalar).to_datetime64()
+ elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)):
+ return np.datetime64("NaT", "ns")
+ else:
+ validate_numeric_casting(dtype, scalar)
+ return scalar
+
+
+def validate_numeric_casting(dtype: np.dtype, value):
+ """
+ Check that we can losslessly insert the given value into an array
+ with the given dtype.
+
+ Parameters
+ ----------
+ dtype : np.dtype
+ value : scalar
+
+ Raises
+ ------
+ ValueError
+ """
+ if issubclass(dtype.type, (np.integer, np.bool_)):
+ if is_float(value) and np.isnan(value):
+ raise ValueError("Cannot assign nan to integer series")
+
+ if issubclass(dtype.type, (np.integer, np.floating, np.complex)) and not issubclass(
+ dtype.type, np.bool_
+ ):
+ if is_bool(value):
+ raise ValueError("Cannot assign bool to float/integer series")
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0dea8235e9d3f..e0efa93379bca 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -69,6 +69,7 @@
maybe_infer_to_datetimelike,
maybe_upcast,
maybe_upcast_putmask,
+ validate_numeric_casting,
)
from pandas.core.dtypes.common import (
ensure_float64,
@@ -2900,12 +2901,8 @@ def _get_value(self, index, col, takeable: bool = False):
engine = self.index._engine
try:
- if isinstance(series._values, np.ndarray):
- # i.e. not EA, we can use engine
- return engine.get_value(series._values, index)
- else:
- loc = series.index.get_loc(index)
- return series._values[loc]
+ loc = engine.get_loc(index)
+ return series._values[loc]
except KeyError:
# GH 20629
if self.index.nlevels > 1:
@@ -3028,10 +3025,14 @@ def _set_value(self, index, col, value, takeable: bool = False):
series = self._get_item_cache(col)
engine = self.index._engine
- engine.set_value(series._values, index, value)
+ loc = engine.get_loc(index)
+ validate_numeric_casting(series.dtype, value)
+
+ series._values[loc] = value
+ # Note: trying to use series._set_value breaks tests in
+ # tests.frame.indexing.test_indexing and tests.indexing.test_partial
return self
except (KeyError, TypeError):
-
# set using a non-recursive method & reset the cache
if takeable:
self.iloc[index, col] = value
@@ -6556,7 +6557,9 @@ def unstack(self, level=-1, fill_value=None):
@Appender(
_shared_docs["melt"]
% dict(
- caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt"
+ caller="df.melt(",
+ versionadded="\n .. versionadded:: 0.20.0\n",
+ other="melt",
)
)
def melt(
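These private hooks back the public .at accessor, so the switch to engine.get_loc plus validate_numeric_casting is what scalar access now exercises end to end:

    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0]}, index=["x", "y"])
    print(df.at["x", "a"])  # scalar lookup through _get_value
    df.at["y", "a"] = 3.5   # scalar assignment through _set_value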
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3b1d7e4c50be5..313d40b575629 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1922,10 +1922,8 @@ def _repr_data_resource_(self):
Parameters
----------
- buf : writable buffer, defaults to sys.stdout
- Where to send the output. By default, the output is printed to
- sys.stdout. Pass a writable buffer if you need to further process
- the output.
+ buf : str, Path or StringIO-like, optional, default None
+ Buffer to write to. If None, the output is returned as a string.
mode : str, optional
Mode in which file is opened.
**kwargs
@@ -3444,15 +3442,14 @@ class animal locomotion
new_index = self.index[loc]
if is_scalar(loc):
- new_values = self._data.fast_xs(loc)
+ # In this case loc should be an integer
+ if self.ndim == 1:
+ # if we encounter an array-like and we only have 1 dim
+ # that means that there are list/ndarrays inside the Series!
+ # so just return them (GH 6394)
+ return self._values[loc]
- # may need to box a datelike-scalar
- #
- # if we encounter an array-like and we only have 1 dim
- # that means that their are list/ndarrays inside the Series!
- # so just return them (GH 6394)
- if not is_list_like(new_values) or self.ndim == 1:
- return com.maybe_box_datetimelike(new_values)
+ new_values = self._data.fast_xs(loc)
result = self._constructor_sliced(
new_values,
@@ -3501,7 +3498,9 @@ def _iget_item_cache(self, item):
def _box_item_values(self, key, values):
raise AbstractMethodError(self)
- def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries:
+ def _slice(
+ self: FrameOrSeries, slobj: slice, axis=0, kind: str = "getitem"
+ ) -> FrameOrSeries:
"""
Construct a slice of this container.
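Reordering the ndim == 1 branch ahead of fast_xs matters for Series that hold list-likes (GH 6394): a scalar location returns the stored object untouched rather than being boxed. For example:

    import pandas as pd

    ser = pd.Series([[1, 2], [3, 4]], index=["a", "b"])
    print(ser.xs("a"))  # [1, 2] -- the stored list itself, not a sub-Series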
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 27dd6e953c219..f194c774cf329 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1022,6 +1022,10 @@ def _cython_agg_blocks(
agg_blocks: List[Block] = []
new_items: List[np.ndarray] = []
deleted_items: List[np.ndarray] = []
+ # Some object-dtype blocks might be split into List[Block[T], Block[U]]
+ split_items: List[np.ndarray] = []
+ split_frames: List[DataFrame] = []
+
no_result = object()
for block in data.blocks:
# Avoid inheriting result from earlier in the loop
@@ -1061,40 +1065,56 @@ def _cython_agg_blocks(
else:
result = cast(DataFrame, result)
# unwrap DataFrame to get array
+ if len(result._data.blocks) != 1:
+ # We've split an object block! Everything we've assumed
+ # about a single block input returning a single block output
+ # is a lie. To keep the code-path for the typical non-split case
+ # clean, we choose to clean up this mess later on.
+ split_items.append(locs)
+ split_frames.append(result)
+ continue
+
assert len(result._data.blocks) == 1
result = result._data.blocks[0].values
if isinstance(result, np.ndarray) and result.ndim == 1:
result = result.reshape(1, -1)
- finally:
- assert not isinstance(result, DataFrame)
-
- if result is not no_result:
- # see if we can cast the block back to the original dtype
- result = maybe_downcast_numeric(result, block.dtype)
-
- if block.is_extension and isinstance(result, np.ndarray):
- # e.g. block.values was an IntegerArray
- # (1, N) case can occur if block.values was Categorical
- # and result is ndarray[object]
- assert result.ndim == 1 or result.shape[0] == 1
- try:
- # Cast back if feasible
- result = type(block.values)._from_sequence(
- result.ravel(), dtype=block.values.dtype
- )
- except ValueError:
- # reshape to be valid for non-Extension Block
- result = result.reshape(1, -1)
+ assert not isinstance(result, DataFrame)
+
+ if result is not no_result:
+ # see if we can cast the block back to the original dtype
+ result = maybe_downcast_numeric(result, block.dtype)
+
+ if block.is_extension and isinstance(result, np.ndarray):
+ # e.g. block.values was an IntegerArray
+ # (1, N) case can occur if block.values was Categorical
+ # and result is ndarray[object]
+ assert result.ndim == 1 or result.shape[0] == 1
+ try:
+ # Cast back if feasible
+ result = type(block.values)._from_sequence(
+ result.ravel(), dtype=block.values.dtype
+ )
+ except ValueError:
+ # reshape to be valid for non-Extension Block
+ result = result.reshape(1, -1)
- agg_block: Block = block.make_block(result)
+ agg_block: Block = block.make_block(result)
new_items.append(locs)
agg_blocks.append(agg_block)
- if not agg_blocks:
+ if not (agg_blocks or split_frames):
raise DataError("No numeric types to aggregate")
+ if split_items:
+ # Clean up the mess left over from split blocks.
+ for locs, result in zip(split_items, split_frames):
+ assert len(locs) == result.shape[1]
+ for i, loc in enumerate(locs):
+ new_items.append(np.array([loc], dtype=locs.dtype))
+ agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])
+
# reset the locs in the blocks to correspond to our
# current ordering
indexer = np.concatenate(new_items)
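A conceptual sketch of the clean-up step, in plain pandas rather than internals: when a block's aggregation comes back as a multi-dtype DataFrame, each column is re-wrapped as its own single-dtype piece and its original position is remembered so the final indexer can restore column order. All names below are illustrative:

    import numpy as np
    import pandas as pd

    locs = np.array([2, 4])                            # original column positions
    result = pd.DataFrame({"c2": [1.0], "c4": ["x"]})  # two dtypes -> a "split"

    new_items, pieces = [], []
    for i, loc in enumerate(locs):
        new_items.append(np.array([loc], dtype=locs.dtype))
        pieces.append(result.iloc[:, [i]])             # one single-dtype slice each
    indexer = np.concatenate(new_items)                # later restores the ordering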
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 54275dc52bb56..0245b9f74d944 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1180,10 +1180,16 @@ def count(self):
@Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
- def mean(self, *args, **kwargs):
+ def mean(self, numeric_only: bool = True):
"""
Compute mean of groups, excluding missing values.
+ Parameters
+ ----------
+ numeric_only : bool, default True
+ Include only float, int, boolean columns. If None, will attempt to use
+ everything, then use only numeric data.
+
Returns
-------
pandas.Series or pandas.DataFrame
@@ -1222,19 +1228,26 @@ def mean(self, *args, **kwargs):
2 4.0
Name: B, dtype: float64
"""
- nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"])
return self._cython_agg_general(
- "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs
+ "mean",
+ alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
+ numeric_only=numeric_only,
)
@Substitution(name="groupby")
@Appender(_common_see_also)
- def median(self, **kwargs):
+ def median(self, numeric_only=True):
"""
Compute median of groups, excluding missing values.
For multiple groupings, the result index will be a MultiIndex
+ Parameters
+ ----------
+ numeric_only : bool, default True
+ Include only float, int, boolean columns. If None, will attempt to use
+ everything, then use only numeric data.
+
Returns
-------
Series or DataFrame
@@ -1242,13 +1255,13 @@ def median(self, **kwargs):
"""
return self._cython_agg_general(
"median",
- alt=lambda x, axis: Series(x).median(axis=axis, **kwargs),
- **kwargs,
+ alt=lambda x, axis: Series(x).median(axis=axis, numeric_only=numeric_only),
+ numeric_only=numeric_only,
)
@Substitution(name="groupby")
@Appender(_common_see_also)
- def std(self, ddof: int = 1, *args, **kwargs):
+ def std(self, ddof: int = 1):
"""
Compute standard deviation of groups, excluding missing values.
@@ -1266,12 +1279,11 @@ def std(self, ddof: int = 1, *args, **kwargs):
"""
# TODO: implement at Cython level?
- nv.validate_groupby_func("std", args, kwargs)
- return np.sqrt(self.var(ddof=ddof, **kwargs))
+ return np.sqrt(self.var(ddof=ddof))
@Substitution(name="groupby")
@Appender(_common_see_also)
- def var(self, ddof: int = 1, *args, **kwargs):
+ def var(self, ddof: int = 1):
"""
Compute variance of groups, excluding missing values.
@@ -1287,15 +1299,14 @@ def var(self, ddof: int = 1, *args, **kwargs):
Series or DataFrame
Variance of values within each group.
"""
- nv.validate_groupby_func("var", args, kwargs)
if ddof == 1:
return self._cython_agg_general(
- "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), **kwargs
+ "var", alt=lambda x, axis: Series(x).var(ddof=ddof)
)
else:
- f = lambda x: x.var(ddof=ddof, **kwargs)
+ func = lambda x: x.var(ddof=ddof)
with _group_selection_context(self):
- return self._python_agg_general(f)
+ return self._python_agg_general(func)
@Substitution(name="groupby")
@Appender(_common_see_also)
@@ -1383,7 +1394,9 @@ def func(self, numeric_only=numeric_only, min_count=min_count):
except DataError:
pass
except NotImplementedError as err:
- if "function is not implemented for this dtype" in str(err):
+ if "function is not implemented for this dtype" in str(
+ err
+ ) or "category dtype not supported" in str(err):
# raised in _get_cython_function, in some cases can
# be trimmed by implementing cython funcs for more dtypes
pass
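With *args/**kwargs removed, the reductions accept only their documented keywords; stray arguments now fail at call time instead of being screened by the nv validators. Roughly:

    import pandas as pd

    df = pd.DataFrame({"g": [1, 1, 2], "x": [1.0, 2.0, 3.0], "s": ["a", "b", "c"]})
    print(df.groupby("g").mean())       # numeric_only=True by default: "s" is dropped
    print(df.groupby("g").std(ddof=0))  # ddof is now the only std/var parameter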
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 77c54ec736aaa..761353ca5a6ca 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -31,6 +31,7 @@
is_extension_array_dtype,
is_integer_dtype,
is_numeric_dtype,
+ is_period_dtype,
is_sparse,
is_timedelta64_dtype,
needs_i8_conversion,
@@ -567,7 +568,12 @@ def _cython_operation(
if swapped:
result = result.swapaxes(0, axis)
- if is_datetime64tz_dtype(orig_values.dtype):
+ if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
+ orig_values.dtype
+ ):
+ # We need to use the constructors directly for these dtypes
+ # since numpy won't recognize them
+ # https://github.com/pandas-dev/pandas/issues/31471
result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
elif is_datetimelike and kind == "aggregate":
result = result.astype(orig_values.dtype)
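Per GH 31471, grouped reductions on Period data (like tz-aware datetimes) are rebuilt through the array constructor because numpy cannot represent the dtype. Assuming a build with this fix, the dtype should round-trip:

    import pandas as pd

    df = pd.DataFrame(
        {"k": [1, 1, 2], "p": pd.period_range("2020-01", periods=3, freq="M")}
    )
    out = df.groupby("k")["p"].first()
    print(out.dtype)  # period[M], preserved through the cython path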
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 95cfab4c96af3..e8ad2bef099a1 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -1,7 +1,7 @@
from datetime import datetime
import operator
from textwrap import dedent
-from typing import Any, FrozenSet, Hashable, Optional, Union
+from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Optional, Union
import warnings
import numpy as np
@@ -18,7 +18,10 @@
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.core.dtypes import concat as _concat
-from pandas.core.dtypes.cast import maybe_cast_to_integer_array
+from pandas.core.dtypes.cast import (
+ maybe_cast_to_integer_array,
+ validate_numeric_casting,
+)
from pandas.core.dtypes.common import (
ensure_categorical,
ensure_int64,
@@ -68,7 +71,7 @@
from pandas.core.arrays import ExtensionArray
from pandas.core.base import IndexOpsMixin, PandasObject
import pandas.core.common as com
-from pandas.core.indexers import deprecate_ndim_indexing, maybe_convert_indices
+from pandas.core.indexers import deprecate_ndim_indexing
from pandas.core.indexes.frozen import FrozenList
import pandas.core.missing as missing
from pandas.core.ops import get_op_result_name
@@ -83,6 +86,10 @@
pprint_thing,
)
+if TYPE_CHECKING:
+ from pandas import Series
+
+
__all__ = ["Index"]
_unsortable_types = frozenset(("mixed", "mixed-integer"))
@@ -522,6 +529,7 @@ def _shallow_copy(self, values=None, **kwargs):
values = self.values
attributes = self._get_attributes_dict()
+
attributes.update(kwargs)
return self._simple_new(values, **attributes)
@@ -2566,6 +2574,7 @@ def _union(self, other, sort):
# worth making this faster? a very unusual case
value_set = set(lvals)
result.extend([x for x in rvals if x not in value_set])
+ result = Index(result)._values # do type inference here
else:
# find indexes of things in "other" that are not in "self"
if self.is_unique:
@@ -2595,7 +2604,8 @@ def _union(self, other, sort):
return self._wrap_setop_result(other, result)
def _wrap_setop_result(self, other, result):
- return self._constructor(result, name=get_op_result_name(self, other))
+ name = get_op_result_name(self, other)
+ return self._shallow_copy(result, name=name)
# TODO: standardize return type of non-union setops type(self vs other)
def intersection(self, other, sort=False):
@@ -2652,9 +2662,10 @@ def intersection(self, other, sort=False):
if self.is_monotonic and other.is_monotonic:
try:
result = self._inner_indexer(lvals, rvals)[0]
- return self._wrap_setop_result(other, result)
except TypeError:
pass
+ else:
+ return self._wrap_setop_result(other, result)
try:
indexer = Index(rvals).get_indexer(lvals)
@@ -2880,10 +2891,15 @@ def get_loc(self, key, method=None, tolerance=None):
"tolerance argument only valid if using pad, "
"backfill or nearest lookups"
)
+ casted_key = self._maybe_cast_indexer(key)
try:
- return self._engine.get_loc(key)
+ return self._engine.get_loc(casted_key)
except KeyError:
- return self._engine.get_loc(self._maybe_cast_indexer(key))
+ raise KeyError(key)
+
+ if tolerance is not None:
+ tolerance = self._convert_tolerance(tolerance, np.asarray(key))
+
indexer = self.get_indexer([key], method=method, tolerance=tolerance)
if indexer.ndim > 1 or indexer.size > 1:
raise TypeError("get_loc requires scalar valued input")
@@ -3061,9 +3077,8 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray:
left_indexer = self.get_indexer(target, "pad", limit=limit)
right_indexer = self.get_indexer(target, "backfill", limit=limit)
- target = np.asarray(target)
- left_distances = abs(self.values[left_indexer] - target)
- right_distances = abs(self.values[right_indexer] - target)
+ left_distances = np.abs(self[left_indexer] - target)
+ right_distances = np.abs(self[right_indexer] - target)
op = operator.lt if self.is_monotonic_increasing else operator.le
indexer = np.where(
@@ -3085,20 +3100,16 @@ def _filter_indexer_tolerance(
# --------------------------------------------------------------------
# Indexer Conversion Methods
- def _convert_scalar_indexer(self, key, kind=None):
+ def _convert_scalar_indexer(self, key, kind: str_t):
"""
Convert a scalar indexer.
Parameters
----------
key : label of the slice bound
- kind : {'loc', 'getitem', 'iloc'} or None
+ kind : {'loc', 'getitem'}
"""
- assert kind in ["loc", "getitem", "iloc", None]
-
- if kind == "iloc":
- self._validate_indexer("positional", key, "iloc")
- return key
+ assert kind in ["loc", "getitem"]
if len(self) and not isinstance(self, ABCMultiIndex):
@@ -3147,9 +3158,9 @@ def _convert_slice_indexer(self, key: slice, kind=None):
# validate iloc
if kind == "iloc":
- self._validate_indexer("slice", key.start, "iloc")
- self._validate_indexer("slice", key.stop, "iloc")
- self._validate_indexer("slice", key.step, "iloc")
+ self._validate_indexer("positional", key.start, "iloc")
+ self._validate_indexer("positional", key.stop, "iloc")
+ self._validate_indexer("positional", key.step, "iloc")
return key
# potentially cast the bounds to integers
@@ -3200,7 +3211,7 @@ def is_int(v):
return indexer
- def _convert_listlike_indexer(self, keyarr, kind=None):
+ def _convert_listlike_indexer(self, keyarr):
"""
Parameters
----------
@@ -3219,7 +3230,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None):
else:
keyarr = self._convert_arr_indexer(keyarr)
- indexer = self._convert_list_indexer(keyarr, kind=kind)
+ indexer = self._convert_list_indexer(keyarr)
return indexer, keyarr
def _convert_arr_indexer(self, keyarr):
@@ -3253,7 +3264,7 @@ def _convert_index_indexer(self, keyarr):
"""
return keyarr
- def _convert_list_indexer(self, keyarr, kind=None):
+ def _convert_list_indexer(self, keyarr):
"""
Convert a list-like indexer to the appropriate dtype.
@@ -3267,29 +3278,6 @@ def _convert_list_indexer(self, keyarr, kind=None):
-------
positional indexer or None
"""
- if (
- kind in [None, "iloc"]
- and is_integer_dtype(keyarr)
- and not self.is_floating()
- ):
-
- if self.inferred_type == "mixed-integer":
- indexer = self.get_indexer(keyarr)
- if (indexer >= 0).all():
- return indexer
- # missing values are flagged as -1 by get_indexer and negative
- # indices are already converted to positive indices in the
- # above if-statement, so the negative flags are changed to
- # values outside the range of indices so as to trigger an
- # IndexError in maybe_convert_indices
- indexer[indexer < 0] = len(self)
-
- return maybe_convert_indices(indexer, len(self))
-
- elif not self.inferred_type == "integer":
- keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr)
- return keyarr
-
return None
def _invalid_indexer(self, form: str_t, key):
@@ -3297,8 +3285,8 @@ def _invalid_indexer(self, form: str_t, key):
Consistent invalid indexer message.
"""
raise TypeError(
- f"cannot do {form} indexing on {type(self)} with these "
- f"indexers [{key}] of {type(key)}"
+ f"cannot do {form} indexing on {type(self).__name__} with these "
+ f"indexers [{key}] of type {type(key).__name__}"
)
# --------------------------------------------------------------------
@@ -4096,6 +4084,11 @@ def __contains__(self, key: Any) -> bool:
bool
Whether the key search is in the index.
+ Raises
+ ------
+ TypeError
+ If the key is not hashable.
+
See Also
--------
Index.isin : Returns an ndarray of boolean dtype indicating whether the
@@ -4573,21 +4566,15 @@ def argsort(self, *args, **kwargs) -> np.ndarray:
result = np.array(self)
return result.argsort(*args, **kwargs)
- _index_shared_docs[
- "get_value"
- ] = """
+ def get_value(self, series: "Series", key):
+ """
Fast lookup of value from 1-dimensional ndarray. Only use this if you
know what you're doing.
Returns
-------
- scalar
- A value in the Series with the index of the key value in self.
+ scalar or Series
"""
-
- @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs)
- def get_value(self, series, key):
-
if not is_scalar(key):
# if key is not a scalar, directly raise an error (the code below
# would convert to numpy arrays and raise later any way) - GH29926
@@ -4599,9 +4586,9 @@ def get_value(self, series, key):
# If that fails, raise a KeyError if an integer
# index, otherwise, see if key is an integer, and
# try that
- loc = self._engine.get_loc(key)
+ loc = self.get_loc(key)
except KeyError:
- if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
+ if not self._should_fallback_to_positional():
raise
elif is_integer(key):
# If the Index cannot hold integer, then this is unambiguously
@@ -4612,7 +4599,15 @@ def get_value(self, series, key):
return self._get_values_for_loc(series, loc)
- def _get_values_for_loc(self, series, loc):
+ def _should_fallback_to_positional(self) -> bool:
+ """
+ If an integer key is not found, should we fall back to positional indexing?
+ """
+ if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
+ return False
+ return True
+
+ def _get_values_for_loc(self, series: "Series", loc):
"""
Do a positional lookup on the given Series, returning either a scalar
or a Series.
@@ -4620,10 +4615,6 @@ def _get_values_for_loc(self, series, loc):
Assumes that `series.index is self`
"""
if is_integer(loc):
- if isinstance(series._values, np.ndarray):
- # Since we have an ndarray and not DatetimeArray, we dont
- # have to worry about a tz.
- return libindex.get_value_at(series._values, loc, tz=None)
return series._values[loc]
return series.iloc[loc]
@@ -4646,9 +4637,9 @@ def set_value(self, arr, key, value):
FutureWarning,
stacklevel=2,
)
- self._engine.set_value(
- com.values_from_object(arr), com.values_from_object(key), value
- )
+ loc = self._engine.get_loc(key)
+ validate_numeric_casting(arr.dtype, value)
+ arr[loc] = value
_index_shared_docs[
"get_indexer_non_unique"
@@ -4929,13 +4920,8 @@ def _maybe_cast_indexer(self, key):
to an int if equivalent.
"""
- if is_float(key) and not self.is_floating():
- try:
- ckey = int(key)
- if ckey == key:
- key = ckey
- except (OverflowError, ValueError, TypeError):
- pass
+ if not self.is_floating():
+ return com.cast_scalar_indexer(key)
return key
def _validate_indexer(self, form: str_t, key, kind: str_t):
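The extracted _should_fallback_to_positional is the rule behind integer keys on non-integer indexes: if the index cannot hold integers, an unmatched integer key is treated as a position. For instance:

    import pandas as pd

    ser = pd.Series([10, 20, 30], index=["a", "b", "c"])
    print(ser["b"])  # 20, plain label lookup
    print(ser[1])    # 20 as well: no integer labels, so 1 falls back to a position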
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index d556c014467cf..85229c728848f 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -1,4 +1,4 @@
-from typing import Any, List
+from typing import TYPE_CHECKING, Any, List
import warnings
import numpy as np
@@ -7,7 +7,6 @@
from pandas._libs import index as libindex
from pandas._libs.hashtable import duplicated_int64
-from pandas._typing import AnyArrayLike
from pandas.util._decorators import Appender, cache_readonly
from pandas.core.dtypes.common import (
@@ -29,7 +28,9 @@
from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name
from pandas.core.indexes.extension import ExtensionIndex, inherit_names
import pandas.core.missing as missing
-from pandas.core.ops import get_op_result_name
+
+if TYPE_CHECKING:
+ from pandas import Series
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(dict(target_klass="CategoricalIndex"))
@@ -159,17 +160,6 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate):
_typ = "categoricalindex"
- _raw_inherit = {
- "argsort",
- "_internal_get_values",
- "tolist",
- "codes",
- "categories",
- "ordered",
- "_reverse_indexer",
- "searchsorted",
- }
-
codes: np.ndarray
categories: Index
_data: Categorical
@@ -386,12 +376,6 @@ def _has_complex_internals(self) -> bool:
# used to avoid libreduction code paths, which raise or require conversion
return True
- def _wrap_setop_result(self, other, result):
- name = get_op_result_name(self, other)
- # We use _shallow_copy rather than the Index implementation
- # (which uses _constructor) in order to preserve dtype.
- return self._shallow_copy(result, name=name)
-
@Appender(Index.__contains__.__doc__)
def __contains__(self, key: Any) -> bool:
# if key is a NaN, check if any NaN is in self.
@@ -455,53 +439,19 @@ def _to_safe_for_reshape(self):
""" convert to object if we are a categorical """
return self.astype("object")
- def get_loc(self, key, method=None):
- """
- Get integer location, slice or boolean mask for requested label.
-
- Parameters
- ----------
- key : label
- method : {None}
- * default: exact matches only.
-
- Returns
- -------
- loc : int if unique index, slice if monotonic index, else mask
-
- Raises
- ------
- KeyError : if the key is not in the index
-
- Examples
- --------
- >>> unique_index = pd.CategoricalIndex(list('abc'))
- >>> unique_index.get_loc('b')
- 1
-
- >>> monotonic_index = pd.CategoricalIndex(list('abbc'))
- >>> monotonic_index.get_loc('b')
- slice(1, 3, None)
-
- >>> non_monotonic_index = pd.CategoricalIndex(list('abcb'))
- >>> non_monotonic_index.get_loc('b')
- array([False, True, False, True], dtype=bool)
- """
+ def _maybe_cast_indexer(self, key):
code = self.categories.get_loc(key)
code = self.codes.dtype.type(code)
- try:
- return self._engine.get_loc(code)
- except KeyError:
- raise KeyError(key)
+ return code
- def get_value(self, series: AnyArrayLike, key: Any):
+ def get_value(self, series: "Series", key: Any):
"""
Fast lookup of value from 1-dimensional ndarray. Only use this if you
know what you're doing
Parameters
----------
- series : Series, ExtensionArray, Index, or ndarray
+ series : Series
1-dimensional array to take values from
key : scalar
The value of this index at the position of the desired value,
@@ -521,7 +471,7 @@ def get_value(self, series: AnyArrayLike, key: Any):
pass
# we might be a positional indexer
- return super().get_value(series, key)
+ return Index.get_value(self, series, key)
@Appender(Index.where.__doc__)
def where(self, cond, other=None):
@@ -674,21 +624,22 @@ def get_indexer_non_unique(self, target):
return ensure_platform_int(indexer), missing
@Appender(Index._convert_scalar_indexer.__doc__)
- def _convert_scalar_indexer(self, key, kind=None):
+ def _convert_scalar_indexer(self, key, kind: str):
+ assert kind in ["loc", "getitem"]
if kind == "loc":
try:
- return self.categories._convert_scalar_indexer(key, kind=kind)
+ return self.categories._convert_scalar_indexer(key, kind="loc")
except TypeError:
self._invalid_indexer("label", key)
return super()._convert_scalar_indexer(key, kind=kind)
@Appender(Index._convert_list_indexer.__doc__)
- def _convert_list_indexer(self, keyarr, kind=None):
+ def _convert_list_indexer(self, keyarr):
# Return our indexer or raise if all of the values are not included in
# the categories
if self.categories._defer_to_indexing:
- indexer = self.categories._convert_list_indexer(keyarr, kind=kind)
+ indexer = self.categories._convert_list_indexer(keyarr)
return Index(self.codes).get_indexer_for(indexer)
indexer = self.categories.get_indexer(np.asarray(keyarr))
@@ -852,18 +803,13 @@ def _concat_same_dtype(self, to_concat, name):
result.name = name
return result
- def _delegate_property_get(self, name: str, *args, **kwargs):
- """ method delegation to the ._values """
- prop = getattr(self._values, name)
- return prop # no wrapping for now
-
def _delegate_method(self, name: str, *args, **kwargs):
""" method delegation to the ._values """
method = getattr(self._values, name)
if "inplace" in kwargs:
raise ValueError("cannot use inplace with CategoricalIndex")
res = method(*args, **kwargs)
- if is_scalar(res) or name in self._raw_inherit:
+ if is_scalar(res):
return res
return CategoricalIndex(res, name=self.name)
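Although the get_loc override is gone, the examples from its old docstring still hold; the lookup now flows through _maybe_cast_indexer and the base-class engine:

    import pandas as pd

    unique_index = pd.CategoricalIndex(list("abc"))
    print(unique_index.get_loc("b"))     # 1

    monotonic_index = pd.CategoricalIndex(list("abbc"))
    print(monotonic_index.get_loc("b"))  # slice(1, 3, None)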
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index e3eeca2c45e76..d06d0d499ef47 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -80,7 +80,16 @@ def wrapper(left, right):
cache=True,
)
@inherit_names(
- ["__iter__", "mean", "freq", "freqstr", "_ndarray_values", "asi8", "_box_values"],
+ [
+ "__iter__",
+ "mean",
+ "freq",
+ "freqstr",
+ "_ndarray_values",
+ "asi8",
+ "_box_values",
+ "_box_func",
+ ],
DatetimeLikeArrayMixin,
)
class DatetimeIndexOpsMixin(ExtensionIndex):
@@ -191,7 +200,7 @@ def sort_values(self, return_indexer=False, ascending=True):
arr = type(self._data)._simple_new(
sorted_values, dtype=self.dtype, freq=freq
)
- return self._simple_new(arr, name=self.name)
+ return type(self)._simple_new(arr, name=self.name)
@Appender(_index_shared_docs["take"] % _index_doc_kwargs)
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
@@ -374,8 +383,9 @@ def _format_attrs(self):
return attrs
# --------------------------------------------------------------------
+ # Indexing Methods
- def _convert_scalar_indexer(self, key, kind=None):
+ def _convert_scalar_indexer(self, key, kind: str):
"""
We don't allow integer or float indexing on datetime-like when using
loc.
@@ -383,23 +393,27 @@ def _convert_scalar_indexer(self, key, kind=None):
Parameters
----------
key : label of the slice bound
- kind : {'loc', 'getitem', 'iloc'} or None
+ kind : {'loc', 'getitem'}
"""
- assert kind in ["loc", "getitem", "iloc", None]
+ assert kind in ["loc", "getitem"]
+
+ if not is_scalar(key):
+ raise TypeError(key)
# we don't allow integer/float indexing for loc
- # we don't allow float indexing for ix/getitem
- if is_scalar(key):
- is_int = is_integer(key)
- is_flt = is_float(key)
- if kind in ["loc"] and (is_int or is_flt):
- self._invalid_indexer("index", key)
- elif kind in ["getitem"] and is_flt:
- self._invalid_indexer("index", key)
+ # we don't allow float indexing for getitem
+ is_int = is_integer(key)
+ is_flt = is_float(key)
+ if kind == "loc" and (is_int or is_flt):
+ self._invalid_indexer("label", key)
+ elif kind == "getitem" and is_flt:
+ self._invalid_indexer("label", key)
return super()._convert_scalar_indexer(key, kind=kind)
+ # --------------------------------------------------------------------
+
__add__ = make_wrapped_arith_op("__add__")
__radd__ = make_wrapped_arith_op("__radd__")
__sub__ = make_wrapped_arith_op("__sub__")
@@ -514,7 +528,7 @@ def _concat_same_dtype(self, to_concat, name):
if is_diff_evenly_spaced:
new_data._freq = self.freq
- return self._simple_new(new_data, name=name)
+ return type(self)._simple_new(new_data, name=name)
def shift(self, periods=1, freq=None):
"""
@@ -617,7 +631,7 @@ def _shallow_copy(self, values=None, **kwargs):
del attributes["freq"]
attributes.update(kwargs)
- return self._simple_new(values, **attributes)
+ return type(self)._simple_new(values, **attributes)
# --------------------------------------------------------------------
# Set Operation Methods
@@ -789,11 +803,10 @@ def _union(self, other, sort):
if this._can_fast_union(other):
return this._fast_union(other, sort=sort)
else:
- result = Index._union(this, other, sort=sort)
- if isinstance(result, type(self)):
- assert result._data.dtype == this.dtype
- if result.freq is None:
- result._set_freq("infer")
+ i8self = Int64Index._simple_new(self.asi8, name=self.name)
+ i8other = Int64Index._simple_new(other.asi8, name=other.name)
+ i8result = i8self._union(i8other, sort=sort)
+ result = type(self)(i8result, dtype=self.dtype, freq="infer")
return result
# --------------------------------------------------------------------
@@ -875,7 +888,7 @@ def _wrap_joined_index(self, joined, other):
kwargs = {}
if hasattr(self, "tz"):
kwargs["tz"] = getattr(other, "tz", None)
- return self._simple_new(joined, name, **kwargs)
+ return type(self)._simple_new(joined, name, **kwargs)
# --------------------------------------------------------------------
# List-Like Methods
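The tightened _convert_scalar_indexer is what rejects integer and float labels under .loc on datetime-like indexes, while plain label strings keep working:

    import pandas as pd

    ser = pd.Series(range(3), index=pd.date_range("2020-01-01", periods=3))
    print(ser.loc["2020-01-02"])  # 1, label-based lookup
    try:
        ser.loc[0]                # integers are positions, not labels, here
    except TypeError as err:
        print(err)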
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 416c3d0701a85..3d57f0944b318 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -5,15 +5,8 @@
import numpy as np
-from pandas._libs import (
- NaT,
- Timedelta,
- Timestamp,
- index as libindex,
- lib,
- tslib as libts,
-)
-from pandas._libs.tslibs import ccalendar, fields, parsing, timezones
+from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts
+from pandas._libs.tslibs import fields, parsing, timezones
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar
@@ -29,7 +22,6 @@
from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name
from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin
from pandas.core.indexes.extension import inherit_names
-from pandas.core.ops import get_op_result_name
import pandas.core.tools.datetimes as tools
from pandas.tseries.frequencies import Resolution, to_offset
@@ -70,7 +62,6 @@ def _new_DatetimeIndex(cls, d):
"_field_ops",
"_datetimelike_ops",
"_datetimelike_methods",
- "_box_func",
"tz",
"tzinfo",
"dtype",
@@ -348,18 +339,9 @@ def union_many(self, others):
if this._can_fast_union(other):
this = this._fast_union(other)
else:
- dtype = this.dtype
this = Index.union(this, other)
- if isinstance(this, DatetimeIndex):
- # TODO: we shouldn't be setting attributes like this;
- # in all the tests this equality already holds
- this._data._dtype = dtype
return this
- def _wrap_setop_result(self, other, result):
- name = get_op_result_name(self, other)
- return self._shallow_copy(result, name=name, freq=None)
-
# --------------------------------------------------------------------
def _get_time_micros(self):
@@ -476,7 +458,7 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime):
Parameters
----------
- reso : Resolution
+ reso : str
Resolution provided by parsed string.
parsed : datetime
Datetime from parsed string.
@@ -484,7 +466,6 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime):
Returns
-------
lower, upper: pd.Timestamp
-
"""
valid_resos = {
"year",
@@ -500,50 +481,11 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime):
}
if reso not in valid_resos:
raise KeyError
- if reso == "year":
- start = Timestamp(parsed.year, 1, 1)
- end = Timestamp(parsed.year + 1, 1, 1) - Timedelta(nanoseconds=1)
- elif reso == "month":
- d = ccalendar.get_days_in_month(parsed.year, parsed.month)
- start = Timestamp(parsed.year, parsed.month, 1)
- end = start + Timedelta(days=d, nanoseconds=-1)
- elif reso == "quarter":
- qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead
- d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month
- start = Timestamp(parsed.year, parsed.month, 1)
- end = Timestamp(parsed.year, qe, 1) + Timedelta(days=d, nanoseconds=-1)
- elif reso == "day":
- start = Timestamp(parsed.year, parsed.month, parsed.day)
- end = start + Timedelta(days=1, nanoseconds=-1)
- elif reso == "hour":
- start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour)
- end = start + Timedelta(hours=1, nanoseconds=-1)
- elif reso == "minute":
- start = Timestamp(
- parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute
- )
- end = start + Timedelta(minutes=1, nanoseconds=-1)
- elif reso == "second":
- start = Timestamp(
- parsed.year,
- parsed.month,
- parsed.day,
- parsed.hour,
- parsed.minute,
- parsed.second,
- )
- end = start + Timedelta(seconds=1, nanoseconds=-1)
- elif reso == "microsecond":
- start = Timestamp(
- parsed.year,
- parsed.month,
- parsed.day,
- parsed.hour,
- parsed.minute,
- parsed.second,
- parsed.microsecond,
- )
- end = start + Timedelta(microseconds=1, nanoseconds=-1)
+
+ grp = Resolution.get_freq_group(reso)
+ per = Period(parsed, freq=(grp, 1))
+ start, end = per.start_time, per.end_time
+
# GH 24076
# If an incoming date string contained a UTC offset, need to localize
# the parsed date to this offset first before aligning with the index's
@@ -601,6 +543,7 @@ def _partial_date_slice(
raise KeyError
# a monotonic (sorted) series can be sliced
+ # Use asi8.searchsorted to avoid re-validating
left = stamps.searchsorted(t1.value, side="left") if use_lhs else None
right = stamps.searchsorted(t2.value, side="right") if use_rhs else None
@@ -617,17 +560,6 @@ def _maybe_promote(self, other):
other = DatetimeIndex(other)
return self, other
- def get_value(self, series, key):
- """
- Fast lookup of value from 1-dimensional ndarray. Only use this if you
- know what you're doing
- """
- if is_integer(key):
- loc = key
- else:
- loc = self.get_loc(key)
- return self._get_values_for_loc(series, loc)
-
def get_loc(self, key, method=None, tolerance=None):
"""
Get integer location for requested label
@@ -639,18 +571,13 @@ def get_loc(self, key, method=None, tolerance=None):
if not is_scalar(key):
raise InvalidIndexError(key)
+ orig_key = key
if is_valid_nat_for_dtype(key, self.dtype):
key = NaT
- if tolerance is not None:
- # try converting tolerance now, so errors don't get swallowed by
- # the try/except clauses below
- tolerance = self._convert_tolerance(tolerance, np.asarray(key))
-
- if isinstance(key, (datetime, np.datetime64)):
+ if isinstance(key, self._data._recognized_scalars):
# needed to localize naive datetimes
key = self._maybe_cast_for_get_loc(key)
- return Index.get_loc(self, key, method, tolerance)
elif isinstance(key, str):
try:
@@ -659,9 +586,8 @@ def get_loc(self, key, method=None, tolerance=None):
pass
try:
- stamp = self._maybe_cast_for_get_loc(key)
- return Index.get_loc(self, stamp, method, tolerance)
- except (KeyError, ValueError):
+ key = self._maybe_cast_for_get_loc(key)
+ except ValueError:
raise KeyError(key)
elif isinstance(key, timedelta):
@@ -670,14 +596,21 @@ def get_loc(self, key, method=None, tolerance=None):
f"Cannot index {type(self).__name__} with {type(key).__name__}"
)
- if isinstance(key, time):
+ elif isinstance(key, time):
if method is not None:
raise NotImplementedError(
"cannot yet lookup inexact labels when key is a time object"
)
return self.indexer_at_time(key)
- return Index.get_loc(self, key, method, tolerance)
+ else:
+ # unrecognized type
+ raise KeyError(key)
+
+ try:
+ return Index.get_loc(self, key, method, tolerance)
+ except KeyError:
+ raise KeyError(orig_key)
def _maybe_cast_for_get_loc(self, key) -> Timestamp:
# needed to localize naive datetimes
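The rewritten _parsed_string_to_bounds leans on Period to compute both endpoints in one step; the equivalence it relies on is simply:

    import pandas as pd

    per = pd.Period("2020-02", freq="M")
    print(per.start_time)  # 2020-02-01 00:00:00
    print(per.end_time)    # 2020-02-29 23:59:59.999999999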
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 6a3e808ab9821..03fb8db2e1e1e 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -1,7 +1,7 @@
""" define the IntervalIndex """
from operator import le, lt
import textwrap
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
+from typing import Any, Optional, Tuple, Union
import numpy as np
@@ -57,10 +57,6 @@
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import DateOffset
-if TYPE_CHECKING:
- from pandas import Series
-
-
_VALID_CLOSED = {"left", "right", "both", "neither"}
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
@@ -527,17 +523,22 @@ def is_overlapping(self) -> bool:
# GH 23309
return self._engine.is_overlapping
+ def _should_fallback_to_positional(self):
+ # integer lookups in Series.__getitem__ are unambiguously
+ # positional in this case
+ return self.dtype.subtype.kind in ["m", "M"]
+
@Appender(Index._convert_scalar_indexer.__doc__)
- def _convert_scalar_indexer(self, key, kind=None):
- if kind == "iloc":
- return super()._convert_scalar_indexer(key, kind=kind)
+ def _convert_scalar_indexer(self, key, kind: str):
+ assert kind in ["getitem", "loc"]
+ # never iloc, so no-op
return key
def _maybe_cast_slice_bound(self, label, side, kind):
return getattr(self, side)._maybe_cast_slice_bound(label, side, kind)
@Appender(Index._convert_list_indexer.__doc__)
- def _convert_list_indexer(self, keyarr, kind=None):
+ def _convert_list_indexer(self, keyarr):
"""
we are passed a list-like indexer. Return the
indexer for matching intervals.
@@ -884,11 +885,6 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray:
return self.get_indexer_non_unique(target)[0]
return self.get_indexer(target, **kwargs)
- @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs)
- def get_value(self, series: "Series", key):
- loc = self.get_loc(key)
- return series.iloc[loc]
-
def _convert_slice_indexer(self, key: slice, kind=None):
if not (key.step is None or key.step == 1):
raise ValueError("cannot support not-default step in a slice")
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 889622f44bbb7..708bea7d132a2 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1,14 +1,14 @@
-import datetime
from sys import getsizeof
-from typing import Any, Hashable, List, Optional, Sequence, Union
+from typing import Any, Hashable, Iterable, List, Optional, Sequence, Tuple, Union
import warnings
import numpy as np
from pandas._config import get_option
-from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs
+from pandas._libs import algos as libalgos, index as libindex, lib
from pandas._libs.hashtable import duplicated_int64
+from pandas._typing import AnyArrayLike, ArrayLike, Scalar
from pandas.compat.numpy import function as nv
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.util._decorators import Appender, cache_readonly
@@ -234,6 +234,8 @@ class MultiIndex(Index):
_comparables = ["names"]
rename = Index.set_names
+ _tuples = None
+
# --------------------------------------------------------------------
# Constructors
@@ -620,29 +622,29 @@ def from_frame(cls, df, sortorder=None, names=None):
# --------------------------------------------------------------------
- @property
- def levels(self):
- result = [
- x._shallow_copy(name=name) for x, name in zip(self._levels, self._names)
- ]
- for level in result:
- # disallow midx.levels[0].name = "foo"
- level._no_setting_name = True
- return FrozenList(result)
-
@property
def _values(self):
# We override here, since our parent uses _data, which we don't use.
return self.values
@property
- def shape(self):
- """
- Return a tuple of the shape of the underlying data.
- """
- # overriding the base Index.shape definition to avoid materializing
- # the values (GH-27384, GH-27775)
- return (len(self),)
+ def values(self):
+ if self._tuples is not None:
+ return self._tuples
+
+ values = []
+
+ for i in range(self.nlevels):
+ vals = self._get_level_values(i)
+ if is_categorical_dtype(vals):
+ vals = vals._internal_get_values()
+ if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"):
+ vals = vals.astype(object)
+ vals = np.array(vals, copy=False)
+ values.append(vals)
+
+ self._tuples = lib.fast_zip(values)
+ return self._tuples
@property
def array(self):
@@ -659,6 +661,34 @@ def array(self):
"'MultiIndex.to_numpy()' to get a NumPy array of tuples."
)
+ @property
+ def shape(self):
+ """
+ Return a tuple of the shape of the underlying data.
+ """
+ # overriding the base Index.shape definition to avoid materializing
+ # the values (GH-27384, GH-27775)
+ return (len(self),)
+
+ def __len__(self) -> int:
+ return len(self.codes[0])
+
+ # --------------------------------------------------------------------
+ # Levels Methods
+
+ @cache_readonly
+ def levels(self):
+ # Use cache_readonly to ensure that self.get_locs doesn't repeatedly
+ # create new IndexEngine
+ # https://github.com/pandas-dev/pandas/issues/31648
+ result = [
+ x._shallow_copy(name=name) for x, name in zip(self._levels, self._names)
+ ]
+ for level in result:
+ # disallow midx.levels[0].name = "foo"
+ level._no_setting_name = True
+ return FrozenList(result)
+
def _set_levels(
self, levels, level=None, copy=False, validate=True, verify_integrity=False
):
@@ -785,6 +815,23 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
if not inplace:
return idx
+ @property
+ def nlevels(self) -> int:
+ """
+ Integer number of levels in this MultiIndex.
+ """
+ return len(self._levels)
+
+ @property
+ def levshape(self):
+ """
+ A tuple with the length of each level.
+ """
+ return tuple(len(x) for x in self.levels)
+
+ # --------------------------------------------------------------------
+ # Codes Methods
+
@property
def codes(self):
return self._codes
@@ -895,6 +942,57 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
if not inplace:
return idx
+ # --------------------------------------------------------------------
+ # Index Internals
+
+ @cache_readonly
+ def _engine(self):
+ # Calculate the number of bits needed to represent labels in each
+ # level, as log2 of their sizes (including -1 for NaN):
+ sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels]))
+
+ # Sum bit counts, starting from the _right_....
+ lev_bits = np.cumsum(sizes[::-1])[::-1]
+
+ # ... in order to obtain offsets such that sorting the combination of
+ # shifted codes (one for each level, resulting in a unique integer) is
+ # equivalent to sorting lexicographically the codes themselves. Notice
+ # that each level needs to be shifted by the number of bits needed to
+ # represent the _previous_ ones:
+ offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")
+
+ # Check the total number of bits needed for our representation:
+ if lev_bits[0] > 64:
+ # The levels would overflow a 64 bit uint - use Python integers:
+ return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
+ return MultiIndexUIntEngine(self.levels, self.codes, offsets)
+
+ @property
+ def _constructor(self):
+ return MultiIndex.from_tuples
+
+ @Appender(Index._shallow_copy.__doc__)
+ def _shallow_copy(self, values=None, **kwargs):
+ if values is not None:
+ names = kwargs.pop("names", kwargs.pop("name", self.names))
+ # discards freq
+ kwargs.pop("freq", None)
+ return MultiIndex.from_tuples(values, names=names, **kwargs)
+ return self.copy(**kwargs)
+
+ def _shallow_copy_with_infer(self, values, **kwargs):
+ # On equal MultiIndexes the difference is empty.
+ # Therefore, an empty MultiIndex is returned GH13490
+ if len(values) == 0:
+ return MultiIndex(
+ levels=[[] for _ in range(self.nlevels)],
+ codes=[[] for _ in range(self.nlevels)],
+ **kwargs,
+ )
+ return self._shallow_copy(values, **kwargs)
+
+ # --------------------------------------------------------------------
+
def copy(
self,
names=None,
@@ -961,17 +1059,6 @@ def view(self, cls=None):
result._id = self._id
return result
- def _shallow_copy_with_infer(self, values, **kwargs):
- # On equal MultiIndexes the difference is empty.
- # Therefore, an empty MultiIndex is returned GH13490
- if len(values) == 0:
- return MultiIndex(
- levels=[[] for _ in range(self.nlevels)],
- codes=[[] for _ in range(self.nlevels)],
- **kwargs,
- )
- return self._shallow_copy(values, **kwargs)
-
@Appender(Index.__contains__.__doc__)
def __contains__(self, key: Any) -> bool:
hash(key)
@@ -981,15 +1068,6 @@ def __contains__(self, key: Any) -> bool:
except (LookupError, TypeError, ValueError):
return False
- @Appender(Index._shallow_copy.__doc__)
- def _shallow_copy(self, values=None, **kwargs):
- if values is not None:
- names = kwargs.pop("names", kwargs.pop("name", self.names))
- # discards freq
- kwargs.pop("freq", None)
- return MultiIndex.from_tuples(values, names=names, **kwargs)
- return self.copy(**kwargs)
-
@cache_readonly
def dtype(self) -> np.dtype:
return np.dtype("O")
@@ -1039,6 +1117,7 @@ def _nbytes(self, deep: bool = False) -> int:
# --------------------------------------------------------------------
# Rendering Methods
+
def _formatter_func(self, tup):
"""
Formats each item in tup according to its level's formatter function.
@@ -1165,9 +1244,7 @@ def format(
return result_levels
# --------------------------------------------------------------------
-
- def __len__(self) -> int:
- return len(self.codes[0])
+ # Names Methods
def _get_names(self):
return FrozenList(self._names)
@@ -1227,10 +1304,15 @@ def _set_names(self, names, level=None, validate=True):
)
self._names[lev] = name
+ # If .levels has been accessed, the names in our cache will be stale.
+ self._reset_cache()
+
names = property(
fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n"""
)
+ # --------------------------------------------------------------------
+
@Appender(Index._get_grouper_for_level.__doc__)
def _get_grouper_for_level(self, mapper, level):
indexer = self.codes[level]
@@ -1268,10 +1350,6 @@ def _get_grouper_for_level(self, mapper, level):
return grouper, codes, level_index
- @property
- def _constructor(self):
- return MultiIndex.from_tuples
-
@cache_readonly
def inferred_type(self) -> str:
return "mixed"
@@ -1303,49 +1381,6 @@ def _get_level_number(self, level) -> int:
)
return level
- _tuples = None
-
- @cache_readonly
- def _engine(self):
- # Calculate the number of bits needed to represent labels in each
- # level, as log2 of their sizes (including -1 for NaN):
- sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels]))
-
- # Sum bit counts, starting from the _right_....
- lev_bits = np.cumsum(sizes[::-1])[::-1]
-
- # ... in order to obtain offsets such that sorting the combination of
- # shifted codes (one for each level, resulting in a unique integer) is
- # equivalent to sorting lexicographically the codes themselves. Notice
- # that each level needs to be shifted by the number of bits needed to
- # represent the _previous_ ones:
- offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")
-
- # Check the total number of bits needed for our representation:
- if lev_bits[0] > 64:
- # The levels would overflow a 64 bit uint - use Python integers:
- return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
- return MultiIndexUIntEngine(self.levels, self.codes, offsets)
-
- @property
- def values(self):
- if self._tuples is not None:
- return self._tuples
-
- values = []
-
- for i in range(self.nlevels):
- vals = self._get_level_values(i)
- if is_categorical_dtype(vals):
- vals = vals._internal_get_values()
- if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"):
- vals = vals.astype(object)
- vals = np.array(vals, copy=False)
- values.append(vals)
-
- self._tuples = lib.fast_zip(values)
- return self._tuples
-
@property
def _has_complex_internals(self) -> bool:
# used to avoid libreduction code paths, which raise or require conversion
@@ -1461,68 +1496,6 @@ def dropna(self, how="any"):
new_codes = [level_codes[~indexer] for level_codes in self.codes]
return self.copy(codes=new_codes, deep=True)
- def get_value(self, series, key):
- # Label-based
- s = com.values_from_object(series)
- k = com.values_from_object(key)
-
- def _try_mi(k):
- # TODO: what if a level contains tuples??
- loc = self.get_loc(k)
- new_values = series._values[loc]
- new_index = self[loc]
- new_index = maybe_droplevels(new_index, k)
- return series._constructor(
- new_values, index=new_index, name=series.name
- ).__finalize__(self)
-
- try:
- return self._engine.get_value(s, k)
- except KeyError as e1:
- try:
- return _try_mi(key)
- except KeyError:
- pass
-
- try:
- return libindex.get_value_at(s, k)
- except IndexError:
- raise
- except TypeError:
- # generator/iterator-like
- if is_iterator(key):
- raise InvalidIndexError(key)
- else:
- raise e1
- except Exception: # pragma: no cover
- raise e1
- except TypeError:
-
- # a Timestamp will raise a TypeError in a multi-index
- # rather than a KeyError, try it here
- # note that a string that 'looks' like a Timestamp will raise
- # a KeyError! (GH5725)
- if isinstance(key, (datetime.datetime, np.datetime64, str)):
- try:
- return _try_mi(key)
- except KeyError:
- raise
- except (IndexError, ValueError, TypeError):
- pass
-
- try:
- return _try_mi(Timestamp(key))
- except (
- KeyError,
- TypeError,
- IndexError,
- ValueError,
- tslibs.OutOfBoundsDatetime,
- ):
- pass
-
- raise InvalidIndexError(key)
-
def _get_level_values(self, level, unique=False):
"""
Return vector of label values for requested level,
@@ -1869,19 +1842,8 @@ def remove_unused_levels(self):
return result
- @property
- def nlevels(self) -> int:
- """
- Integer number of levels in this MultiIndex.
- """
- return len(self._levels)
-
- @property
- def levshape(self):
- """
- A tuple with the length of each level.
- """
- return tuple(len(x) for x in self.levels)
+ # --------------------------------------------------------------------
+ # Pickling Methods
def __reduce__(self):
"""Necessary for making this object picklable"""
@@ -1915,6 +1877,8 @@ def __setstate__(self, state):
self.sortorder = sortorder
self._reset_identity()
+ # --------------------------------------------------------------------
+
def __getitem__(self, key):
if is_scalar(key):
key = com.cast_scalar_indexer(key)
@@ -2287,7 +2251,104 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
return new_index, indexer
- def _convert_listlike_indexer(self, keyarr, kind=None):
+ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
+ """
+ Create index with target's values (move/add/delete values as necessary)
+
+ Returns
+ -------
+ new_index : pd.MultiIndex
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index.
+
+ """
+ # GH6552: preserve names when reindexing to non-named target
+ # (i.e. neither Index nor Series).
+ preserve_names = not hasattr(target, "names")
+
+ if level is not None:
+ if method is not None:
+ raise TypeError("Fill method not supported if level passed")
+
+ # GH7774: preserve dtype/tz if target is empty and not an Index.
+ # target may be an iterator
+ target = ibase._ensure_has_len(target)
+ if len(target) == 0 and not isinstance(target, Index):
+ idx = self.levels[level]
+ attrs = idx._get_attributes_dict()
+ attrs.pop("freq", None) # don't preserve freq
+ target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs)
+ else:
+ target = ensure_index(target)
+ target, indexer, _ = self._join_level(
+ target, level, how="right", return_indexers=True, keep_order=False
+ )
+ else:
+ target = ensure_index(target)
+ if self.equals(target):
+ indexer = None
+ else:
+ if self.is_unique:
+ indexer = self.get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
+ else:
+ raise ValueError("cannot handle a non-unique multi-index!")
+
+ if not isinstance(target, MultiIndex):
+ if indexer is None:
+ target = self
+ elif (indexer >= 0).all():
+ target = self.take(indexer)
+ else:
+ # hopefully?
+ target = MultiIndex.from_tuples(target)
+
+ if (
+ preserve_names
+ and target.nlevels == self.nlevels
+ and target.names != self.names
+ ):
+ target = target.copy(deep=False)
+ target.names = self.names
+
+ return target, indexer
+
+ # --------------------------------------------------------------------
+ # Indexing Methods
+
+ def get_value(self, series, key):
+ # Label-based
+ if not is_hashable(key) or is_iterator(key):
+ # We allow tuples if they are hashable, whereas other Index
+ # subclasses require scalar.
+ # We have to explicitly exclude generators, as these are hashable.
+ raise InvalidIndexError(key)
+
+ def _try_mi(k):
+ # TODO: what if a level contains tuples??
+ loc = self.get_loc(k)
+
+ new_values = series._values[loc]
+ if is_scalar(loc):
+ return new_values
+
+ new_index = self[loc]
+ new_index = maybe_droplevels(new_index, k)
+ return series._constructor(
+ new_values, index=new_index, name=series.name
+ ).__finalize__(self)
+
+ try:
+ return _try_mi(key)
+ except KeyError:
+ if is_integer(key):
+ return series._values[key]
+ else:
+ raise
+
+ def _convert_listlike_indexer(self, keyarr):
"""
Parameters
----------
@@ -2300,7 +2361,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None):
indexer is an ndarray or None if cannot convert
keyarr are tuple-safe keys
"""
- indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind)
+ indexer, keyarr = super()._convert_listlike_indexer(keyarr)
# are we indexing a specific level
if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple):
@@ -2361,70 +2422,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
def get_indexer_non_unique(self, target):
return super().get_indexer_non_unique(target)
- def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
- """
- Create index with target's values (move/add/delete values as necessary)
-
- Returns
- -------
- new_index : pd.MultiIndex
- Resulting index
- indexer : np.ndarray or None
- Indices of output values in original index.
-
- """
- # GH6552: preserve names when reindexing to non-named target
- # (i.e. neither Index nor Series).
- preserve_names = not hasattr(target, "names")
-
- if level is not None:
- if method is not None:
- raise TypeError("Fill method not supported if level passed")
-
- # GH7774: preserve dtype/tz if target is empty and not an Index.
- # target may be an iterator
- target = ibase._ensure_has_len(target)
- if len(target) == 0 and not isinstance(target, Index):
- idx = self.levels[level]
- attrs = idx._get_attributes_dict()
- attrs.pop("freq", None) # don't preserve freq
- target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs)
- else:
- target = ensure_index(target)
- target, indexer, _ = self._join_level(
- target, level, how="right", return_indexers=True, keep_order=False
- )
- else:
- target = ensure_index(target)
- if self.equals(target):
- indexer = None
- else:
- if self.is_unique:
- indexer = self.get_indexer(
- target, method=method, limit=limit, tolerance=tolerance
- )
- else:
- raise ValueError("cannot handle a non-unique multi-index!")
-
- if not isinstance(target, MultiIndex):
- if indexer is None:
- target = self
- elif (indexer >= 0).all():
- target = self.take(indexer)
- else:
- # hopefully?
- target = MultiIndex.from_tuples(target)
-
- if (
- preserve_names
- and target.nlevels == self.nlevels
- and target.names != self.names
- ):
- target = target.copy(deep=False)
- target.names = self.names
-
- return target, indexer
-
def get_slice_bound(
self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str
) -> int:
@@ -3058,8 +3055,70 @@ def _update_indexer(idxr, indexer=indexer):
# empty indexer
if indexer is None:
return Int64Index([])._ndarray_values
+
+ indexer = self._reorder_indexer(seq, indexer)
+
return indexer._ndarray_values
+ def _reorder_indexer(
+ self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike
+ ) -> ArrayLike:
+ """
+ Reorder an indexer of a MultiIndex (self) so that the labels are in the
+ same order as given in seq
+
+ Parameters
+ ----------
+ seq : label/slice/list/mask or a sequence of such
+ indexer : an Int64Index indexer of self
+
+ Returns
+ -------
+ indexer : a sorted Int64Index indexer of self ordered as seq
+ """
+ # If the index is lexsorted and the list-like labels in seq are sorted
+ # then we do not need to sort
+ if self.is_lexsorted():
+ need_sort = False
+ for i, k in enumerate(seq):
+ if is_list_like(k):
+ if not need_sort:
+ k_codes = self.levels[i].get_indexer(k)
+ k_codes = k_codes[k_codes >= 0] # Filter absent keys
+ # True if the given codes are not ordered
+ need_sort = (k_codes[:-1] > k_codes[1:]).any()
+ # Bail out if both index and seq are sorted
+ if not need_sort:
+ return indexer
+
+ n = len(self)
+ keys: Tuple[np.ndarray, ...] = tuple()
+ # For each level of the sequence in seq, map the level codes to the
+ # order in which they appear in the list-like sequence.
+ # This mapping is then used to reorder the indexer.
+ for i, k in enumerate(seq):
+ if com.is_bool_indexer(k):
+ new_order = np.arange(n)[indexer]
+ elif is_list_like(k):
+ # Generate a map with all level codes as sorted initially
+ key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
+ self.levels[i]
+ )
+ # Set order as given in the indexer list
+ level_indexer = self.levels[i].get_indexer(k)
+ level_indexer = level_indexer[level_indexer >= 0] # Filter absent keys
+ key_order_map[level_indexer] = np.arange(len(level_indexer))
+
+ new_order = key_order_map[self.codes[i][indexer]]
+ else:
+ # For all other cases, use the same order as the level
+ new_order = np.arange(n)[indexer]
+ keys = (new_order,) + keys
+
+ # Find the reordering using lexsort on the keys mapping
+ ind = np.lexsort(keys)
+ return indexer[ind]
+
def truncate(self, before=None, after=None):
"""
Slice index between two labels / tuples, return new MultiIndex
@@ -3158,6 +3217,9 @@ def equal_levels(self, other) -> bool:
return False
return True
+ # --------------------------------------------------------------------
+ # Set Methods
+
def union(self, other, sort=None):
"""
Form the union of two MultiIndex objects
@@ -3310,21 +3372,6 @@ def difference(self, other, sort=None):
else:
return MultiIndex.from_tuples(difference, sortorder=0, names=result_names)
- @Appender(Index.astype.__doc__)
- def astype(self, dtype, copy=True):
- dtype = pandas_dtype(dtype)
- if is_categorical_dtype(dtype):
- msg = "> 1 ndim Categorical are not supported at this time"
- raise NotImplementedError(msg)
- elif not is_object_dtype(dtype):
- raise TypeError(
- f"Setting {type(self)} dtype to anything other "
- "than object is not supported"
- )
- elif copy is True:
- return self._shallow_copy()
- return self
-
def _convert_can_do_setop(self, other):
result_names = self.names
@@ -3345,6 +3392,23 @@ def _convert_can_do_setop(self, other):
result_names = self.names if self.names == other.names else None
return other, result_names
+ # --------------------------------------------------------------------
+
+ @Appender(Index.astype.__doc__)
+ def astype(self, dtype, copy=True):
+ dtype = pandas_dtype(dtype)
+ if is_categorical_dtype(dtype):
+ msg = "> 1 ndim Categorical are not supported at this time"
+ raise NotImplementedError(msg)
+ elif not is_object_dtype(dtype):
+ raise TypeError(
+ f"Setting {type(self)} dtype to anything other "
+ "than object is not supported"
+ )
+ elif copy is True:
+ return self._shallow_copy()
+ return self
+
def insert(self, loc: int, item):
"""
Make new MultiIndex inserting new item at location
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 4d3d560aaa688..d67c40a78d807 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any
+from typing import Any
import numpy as np
@@ -32,12 +32,9 @@
from pandas.core import algorithms
import pandas.core.common as com
-from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name
+from pandas.core.indexes.base import Index, maybe_extract_name
from pandas.core.ops import get_op_result_name
-if TYPE_CHECKING:
- from pandas import Series
-
_num_index_shared_docs = dict()
@@ -253,12 +250,11 @@ def asi8(self) -> np.ndarray:
return self.values.view(self._default_dtype)
@Appender(Index._convert_scalar_indexer.__doc__)
- def _convert_scalar_indexer(self, key, kind=None):
- assert kind in ["loc", "getitem", "iloc", None]
+ def _convert_scalar_indexer(self, key, kind: str):
+ assert kind in ["loc", "getitem"]
- # don't coerce ilocs to integers
- if kind != "iloc":
- key = self._maybe_cast_indexer(key)
+ # kind is never "iloc" here; iloc keys are the ones we don't coerce to integers
+ key = self._maybe_cast_indexer(key)
return super()._convert_scalar_indexer(key, kind=kind)
@@ -383,13 +379,17 @@ def astype(self, dtype, copy=True):
return Int64Index(arr)
return super().astype(dtype, copy=copy)
- @Appender(Index._convert_scalar_indexer.__doc__)
- def _convert_scalar_indexer(self, key, kind=None):
- assert kind in ["loc", "getitem", "iloc", None]
+ # ----------------------------------------------------------------
+ # Indexing Methods
- if kind == "iloc":
- self._validate_indexer("positional", key, "iloc")
+ @Appender(Index._should_fallback_to_positional.__doc__)
+ def _should_fallback_to_positional(self):
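+ # A Float64Index never treats an integer key positionally; e.g. (a
+ # sketch, not from this patch) pd.Series([1, 2], index=[1.5, 2.5])[1]
+ # is a label lookup and raises KeyError rather than returning iloc[1].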
+ return False
+ @Appender(Index._convert_scalar_indexer.__doc__)
+ def _convert_scalar_indexer(self, key, kind: str):
+ assert kind in ["loc", "getitem"]
+ # no-op for non-iloc
return key
@Appender(Index._convert_slice_indexer.__doc__)
@@ -401,6 +401,8 @@ def _convert_slice_indexer(self, key: slice, kind=None):
# translate to locations
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
+ # ----------------------------------------------------------------
+
def _format_native_types(
self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs
):
@@ -416,16 +418,6 @@ def _format_native_types(
)
return formatter.get_result_as_array()
- def get_value(self, series: "Series", key):
- """
- We always want to get an index value, never a value.
- """
- if not is_scalar(key):
- raise InvalidIndexError
-
- loc = self.get_loc(key)
- return self._get_values_for_loc(series, loc)
-
def equals(self, other) -> bool:
"""
Determines if two Index objects contain the same elements.
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 75c100c9d2c08..42f0a012902a3 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -1,11 +1,11 @@
from datetime import datetime, timedelta
-from typing import TYPE_CHECKING, Any
+from typing import Any
import weakref
import numpy as np
from pandas._libs import index as libindex
-from pandas._libs.tslibs import NaT, frequencies as libfrequencies, resolution
+from pandas._libs.tslibs import frequencies as libfrequencies, resolution
from pandas._libs.tslibs.parsing import parse_time_string
from pandas._libs.tslibs.period import Period
from pandas.util._decorators import Appender, cache_readonly
@@ -18,7 +18,6 @@
is_float,
is_integer,
is_integer_dtype,
- is_list_like,
is_object_dtype,
is_scalar,
pandas_dtype,
@@ -51,9 +50,6 @@
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods"))
-if TYPE_CHECKING:
- from pandas import Series
-
# --- Period index sketch
@@ -280,22 +276,6 @@ def _shallow_copy_with_infer(self, values=None, **kwargs):
""" we always want to return a PeriodIndex """
return self._shallow_copy(values=values, **kwargs)
- @property
- def _box_func(self):
- """Maybe box an ordinal or Period"""
- # TODO(DatetimeArray): Avoid double-boxing
- # PeriodArray takes care of boxing already, so we need to check
- # whether we're given an ordinal or a Period. It seems like some
- # places outside of indexes/period.py are calling this _box_func,
- # but passing data that's already boxed.
- def func(x):
- if isinstance(x, Period) or x is NaT:
- return x
- else:
- return Period._from_ordinal(ordinal=x, freq=self.freq)
-
- return func
-
def _maybe_convert_timedelta(self, other):
"""
Convert timedelta-like input to an integer multiple of self.freq
@@ -471,17 +451,6 @@ def inferred_type(self) -> str:
# indexing
return "period"
- def get_value(self, series: "Series", key):
- """
- Fast lookup of value from 1-dimensional ndarray. Only use this if you
- know what you're doing
- """
- if is_integer(key):
- loc = key
- else:
- loc = self.get_loc(key)
- return self._get_values_for_loc(series, loc)
-
@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ensure_index(target)
@@ -576,12 +545,9 @@ def get_loc(self, key, method=None, tolerance=None):
key = Period(key, freq=self.freq)
except ValueError:
# we cannot construct the Period
- # as we have an invalid type
- if is_list_like(key):
- raise TypeError(f"'{key}' is an invalid key")
raise KeyError(key)
- ordinal = key.ordinal if key is not NaT else key.value
+ ordinal = self._data._unbox_scalar(key)
try:
return self._engine.get_loc(ordinal)
except KeyError:
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index 08a07e8d30348..ec0414adc1376 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -1,7 +1,5 @@
""" implement the TimedeltaIndex """
-import numpy as np
-
from pandas._libs import NaT, Timedelta, index as libindex
from pandas.util._decorators import Appender
@@ -53,7 +51,6 @@
"_datetimelike_methods",
"_other_ops",
"components",
- "_box_func",
"to_pytimedelta",
"sum",
"std",
@@ -225,17 +222,6 @@ def _maybe_promote(self, other):
other = TimedeltaIndex(other)
return self, other
- def get_value(self, series, key):
- """
- Fast lookup of value from 1-dimensional ndarray. Only use this if you
- know what you're doing
- """
- if is_integer(key):
- loc = key
- else:
- loc = self.get_loc(key)
- return self._get_values_for_loc(series, loc)
-
def get_loc(self, key, method=None, tolerance=None):
"""
Get integer location for requested label
@@ -262,11 +248,6 @@ def get_loc(self, key, method=None, tolerance=None):
else:
raise KeyError(key)
- if tolerance is not None:
- # try converting tolerance now, so errors don't get swallowed by
- # the try/except clauses below
- tolerance = self._convert_tolerance(tolerance, np.asarray(key))
-
return Index.get_loc(self, key, method, tolerance)
def _maybe_cast_slice_bound(self, label, side: str, kind):
@@ -297,12 +278,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind):
return label
- def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True):
- # TODO: Check for non-True use_lhs/use_rhs
- assert isinstance(key, str), type(key)
- # given a key, try to figure out a location for a partial slice
- raise NotImplementedError
-
def is_type_compatible(self, typ) -> bool:
return typ == self.inferred_type or typ == "timedelta"
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 7e56148b7569e..5c0f893554957 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -566,7 +566,7 @@ def iat(self) -> "_iAtIndexer":
return _iAtIndexer("iat", self)
-class _NDFrameIndexer(_NDFrameIndexerBase):
+class _LocationIndexer(_NDFrameIndexerBase):
_valid_types: str
axis = None
@@ -591,15 +591,9 @@ def _get_label(self, label, axis: int):
return self.obj._xs(label, axis=axis)
- def _get_loc(self, key: int, axis: int):
- return self.obj._ixs(key, axis=axis)
-
- def _slice(self, obj, axis: int, kind=None):
- return self.obj._slice(obj, axis=axis, kind=kind)
-
def _get_setitem_indexer(self, key):
if self.axis is not None:
- return self._convert_tuple(key)
+ return self._convert_tuple(key, is_setter=True)
ax = self.obj._get_axis(0)
@@ -612,7 +606,7 @@ def _get_setitem_indexer(self, key):
if isinstance(key, tuple):
try:
- return self._convert_tuple(key)
+ return self._convert_tuple(key, is_setter=True)
except IndexingError:
pass
@@ -620,7 +614,7 @@ def _get_setitem_indexer(self, key):
return list(key)
try:
- return self._convert_to_indexer(key, axis=0)
+ return self._convert_to_indexer(key, axis=0, is_setter=True)
except TypeError as e:
# invalid indexer type vs 'other' indexing errors
@@ -683,68 +677,25 @@ def _is_nested_tuple_indexer(self, tup: Tuple) -> bool:
return any(is_nested_tuple(tup, ax) for ax in self.obj.axes)
return False
- def _convert_tuple(self, key):
+ def _convert_tuple(self, key, is_setter: bool = False):
keyidx = []
if self.axis is not None:
axis = self.obj._get_axis_number(self.axis)
for i in range(self.ndim):
if i == axis:
- keyidx.append(self._convert_to_indexer(key, axis=axis))
+ keyidx.append(
+ self._convert_to_indexer(key, axis=axis, is_setter=is_setter)
+ )
else:
keyidx.append(slice(None))
else:
for i, k in enumerate(key):
if i >= self.ndim:
raise IndexingError("Too many indexers")
- idx = self._convert_to_indexer(k, axis=i)
+ idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter)
keyidx.append(idx)
return tuple(keyidx)
- def _convert_scalar_indexer(self, key, axis: int):
- # if we are accessing via lowered dim, use the last dim
- ax = self.obj._get_axis(min(axis, self.ndim - 1))
- # a scalar
- return ax._convert_scalar_indexer(key, kind=self.name)
-
- def _convert_slice_indexer(self, key: slice, axis: int):
- # if we are accessing via lowered dim, use the last dim
- ax = self.obj._get_axis(min(axis, self.ndim - 1))
- return ax._convert_slice_indexer(key, kind=self.name)
-
- def _has_valid_setitem_indexer(self, indexer) -> bool:
- return True
-
- def _has_valid_positional_setitem_indexer(self, indexer) -> bool:
- """
- Validate that a positional indexer cannot enlarge its target
- will raise if needed, does not modify the indexer externally.
-
- Returns
- -------
- bool
- """
- if isinstance(indexer, dict):
- raise IndexError(f"{self.name} cannot enlarge its target object")
- else:
- if not isinstance(indexer, tuple):
- indexer = _tuplify(self.ndim, indexer)
- for ax, i in zip(self.obj.axes, indexer):
- if isinstance(i, slice):
- # should check the stop slice?
- pass
- elif is_list_like_indexer(i):
- # should check the elements?
- pass
- elif is_integer(i):
- if i >= len(ax):
- raise IndexError(
- f"{self.name} cannot enlarge its target object"
- )
- elif isinstance(i, dict):
- raise IndexError(f"{self.name} cannot enlarge its target object")
-
- return True
-
def _setitem_with_indexer(self, indexer, value):
self._has_valid_setitem_indexer(indexer)
@@ -893,7 +844,8 @@ def _setitem_with_indexer(self, indexer, value):
# we can directly set the series here
# as we select a slice indexer on the mi
- idx = index._convert_slice_indexer(idx)
+ if isinstance(idx, slice):
+ idx = index._convert_slice_indexer(idx)
obj._consolidate_inplace()
obj = obj.copy()
obj._data = obj._data.setitem(indexer=tuple([idx]), value=value)
@@ -1232,80 +1184,6 @@ def _align_frame(self, indexer, df: ABCDataFrame):
raise ValueError("Incompatible indexer with DataFrame")
- def _getitem_tuple(self, tup: Tuple):
- try:
- return self._getitem_lowerdim(tup)
- except IndexingError:
- pass
-
- # no multi-index, so validate all of the indexers
- self._has_valid_tuple(tup)
-
- # ugly hack for GH #836
- if self._multi_take_opportunity(tup):
- return self._multi_take(tup)
-
- # no shortcut needed
- retval = self.obj
- for i, key in enumerate(tup):
- if com.is_null_slice(key):
- continue
-
- retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
-
- return retval
-
- def _multi_take_opportunity(self, tup: Tuple) -> bool:
- """
- Check whether there is the possibility to use ``_multi_take``.
-
- Currently the limit is that all axes being indexed, must be indexed with
- list-likes.
-
- Parameters
- ----------
- tup : tuple
- Tuple of indexers, one per axis.
-
- Returns
- -------
- bool
- Whether the current indexing,
- can be passed through `_multi_take`.
- """
- if not all(is_list_like_indexer(x) for x in tup):
- return False
-
- # just too complicated
- if any(com.is_bool_indexer(x) for x in tup):
- return False
-
- return True
-
- def _multi_take(self, tup: Tuple):
- """
- Create the indexers for the passed tuple of keys, and
- executes the take operation. This allows the take operation to be
- executed all at once, rather than once for each dimension.
- Improving efficiency.
-
- Parameters
- ----------
- tup : tuple
- Tuple of indexers, one per axis.
-
- Returns
- -------
- values: same type as the object being indexed
- """
- # GH 836
- o = self.obj
- d = {
- axis: self._get_listlike_indexer(key, axis)
- for (key, axis) in zip(tup, o._AXIS_ORDERS)
- }
- return o._reindex_with_indexers(d, copy=True, allow_dups=True)
-
def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
# we have an axis0 multi-index, handle or raise
axis = self.axis or 0
@@ -1426,97 +1304,6 @@ def _getitem_nested_tuple(self, tup: Tuple):
return obj
- def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False):
- """
- Transform a list-like of keys into a new index and an indexer.
-
- Parameters
- ----------
- key : list-like
- Targeted labels.
- axis: int
- Dimension on which the indexing is being made.
- raise_missing: bool, default False
- Whether to raise a KeyError if some labels were not found.
- Will be removed in the future, and then this method will always behave as
- if ``raise_missing=True``.
-
- Raises
- ------
- KeyError
- If at least one key was requested but none was found, and
- raise_missing=True.
-
- Returns
- -------
- keyarr: Index
- New index (coinciding with 'key' if the axis is unique).
- values : array-like
- Indexer for the return object, -1 denotes keys not found.
- """
- o = self.obj
- ax = o._get_axis(axis)
-
- # Have the index compute an indexer or return None
- # if it cannot handle:
- indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name)
- # We only act on all found values:
- if indexer is not None and (indexer != -1).all():
- self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing)
- return ax[indexer], indexer
-
- if ax.is_unique and not getattr(ax, "is_overlapping", False):
- indexer = ax.get_indexer_for(key)
- keyarr = ax.reindex(keyarr)[0]
- else:
- keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
-
- self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
- return keyarr, indexer
-
- def _getitem_iterable(self, key, axis: int):
- """
- Index current object with an an iterable key.
-
- The iterable key can be a boolean indexer or a collection of keys.
-
- Parameters
- ----------
- key : iterable
- Targeted labels or boolean indexer.
- axis: int
- Dimension on which the indexing is being made.
-
- Raises
- ------
- KeyError
- If no key was found. Will change in the future to raise if not all
- keys were found.
- IndexingError
- If the boolean indexer is unalignable with the object being
- indexed.
-
- Returns
- -------
- scalar, DataFrame, or Series: indexed value(s).
- """
- # caller is responsible for ensuring non-None axis
- self._validate_key(key, axis)
-
- labels = self.obj._get_axis(axis)
-
- if com.is_bool_indexer(key):
- # A boolean indexer
- key = check_bool_indexer(labels, key)
- (inds,) = key.nonzero()
- return self.obj._take_with_is_copy(inds, axis=axis)
- else:
- # A collection of keys
- keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
- return self.obj._reindex_with_indexers(
- {axis: [keyarr, indexer]}, copy=True, allow_dups=True
- )
-
def _validate_read_indexer(
self, key, indexer, axis: int, raise_missing: bool = False
):
@@ -1577,135 +1364,59 @@ def _validate_read_indexer(
"https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501
)
- def _convert_to_indexer(self, key, axis: int):
- """
- Convert indexing key into something we can use to do actual fancy
- indexing on a ndarray.
-
- Examples
- ix[:5] -> slice(0, 5)
- ix[[1,2,3]] -> [1,2,3]
- ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)
-
- Going by Zen of Python?
- 'In the face of ambiguity, refuse the temptation to guess.'
- raise AmbiguousIndexError with integer labels?
- - No, prefer label-based indexing
- """
- labels = self.obj._get_axis(axis)
-
- if isinstance(key, slice):
- return self._convert_slice_indexer(key, axis)
+ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
+ raise AbstractMethodError(self)
- # try to find out correct indexer, if not type correct raise
- try:
- key = self._convert_scalar_indexer(key, axis)
- except TypeError:
- # but we will allow setting
- pass
+ def __getitem__(self, key):
+ if type(key) is tuple:
+ key = tuple(com.apply_if_callable(x, self.obj) for x in key)
+ if self._is_scalar_access(key):
+ try:
+ return self.obj._get_value(*key, takeable=self._takeable)
+ except (KeyError, IndexError, AttributeError):
+ # AttributeError for IntervalTree get_value
+ pass
+ return self._getitem_tuple(key)
+ else:
+ # we by definition only have the 0th axis
+ axis = self.axis or 0
- # see if we are positional in nature
- is_int_index = labels.is_integer()
- is_int_positional = is_integer(key) and not is_int_index
+ maybe_callable = com.apply_if_callable(key, self.obj)
+ return self._getitem_axis(maybe_callable, axis=axis)
- if is_scalar(key) or isinstance(labels, ABCMultiIndex):
- # Otherwise get_loc will raise InvalidIndexError
+ def _is_scalar_access(self, key: Tuple):
+ raise NotImplementedError()
- # if we are a label return me
- try:
- return labels.get_loc(key)
- except LookupError:
- if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex):
- if len(key) == labels.nlevels:
- return {"key": key}
- raise
- except TypeError:
- pass
- except ValueError:
- if not is_int_positional:
- raise
+ def _getitem_tuple(self, tup: Tuple):
+ raise AbstractMethodError(self)
- # a positional
- if is_int_positional:
+ def _getitem_axis(self, key, axis: int):
+ raise NotImplementedError()
- # if we are setting and its not a valid location
- # its an insert which fails by definition
+ def _has_valid_setitem_indexer(self, indexer) -> bool:
+ raise AbstractMethodError(self)
- if self.name == "loc":
- # always valid
- return {"key": key}
-
- if key >= self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex):
- # a positional
- raise ValueError("cannot set by positional indexing with enlargement")
-
- return key
-
- if is_nested_tuple(key, labels):
- return labels.get_locs(key)
-
- elif is_list_like_indexer(key):
-
- if com.is_bool_indexer(key):
- key = check_bool_indexer(labels, key)
- (inds,) = key.nonzero()
- return inds
- else:
- # When setting, missing keys are not allowed, even with .loc:
- return self._get_listlike_indexer(key, axis, raise_missing=True)[1]
- else:
- try:
- return labels.get_loc(key)
- except LookupError:
- # allow a not found key only if we are a setter
- if not is_list_like_indexer(key):
- return {"key": key}
- raise
-
-
-class _LocationIndexer(_NDFrameIndexer):
- _takeable: bool = False
-
- def __getitem__(self, key):
- if type(key) is tuple:
- key = tuple(com.apply_if_callable(x, self.obj) for x in key)
- if self._is_scalar_access(key):
- try:
- return self.obj._get_value(*key, takeable=self._takeable)
- except (KeyError, IndexError, AttributeError):
- # AttributeError for IntervalTree get_value
- pass
- return self._getitem_tuple(key)
- else:
- # we by definition only have the 0th axis
- axis = self.axis or 0
-
- maybe_callable = com.apply_if_callable(key, self.obj)
- return self._getitem_axis(maybe_callable, axis=axis)
-
- def _is_scalar_access(self, key: Tuple):
- raise NotImplementedError()
-
- def _getitem_axis(self, key, axis: int):
- raise NotImplementedError()
-
- def _getbool_axis(self, key, axis: int):
- # caller is responsible for ensuring non-None axis
- labels = self.obj._get_axis(axis)
- key = check_bool_indexer(labels, key)
- inds = key.nonzero()[0]
- return self.obj._take_with_is_copy(inds, axis=axis)
+ def _getbool_axis(self, key, axis: int):
+ # caller is responsible for ensuring non-None axis
+ labels = self.obj._get_axis(axis)
+ key = check_bool_indexer(labels, key)
+ inds = key.nonzero()[0]
+ return self.obj._take_with_is_copy(inds, axis=axis)
@Appender(IndexingMixin.loc.__doc__)
class _LocIndexer(_LocationIndexer):
+ _takeable: bool = False
_valid_types = (
"labels (MUST BE IN THE INDEX), slices of labels (BOTH "
"endpoints included! Can be slices of integers if the "
"index is integers), listlike of labels, boolean"
)
- @Appender(_NDFrameIndexer._validate_key.__doc__)
+ # -------------------------------------------------------------------
+ # Key Checks
+
+ @Appender(_LocationIndexer._validate_key.__doc__)
def _validate_key(self, key, axis: int):
# valid for a collection of labels (we check their presence later)
@@ -1720,7 +1431,11 @@ def _validate_key(self, key, axis: int):
return
if not is_list_like_indexer(key):
- self._convert_scalar_indexer(key, axis)
+ labels = self.obj._get_axis(axis)
+ labels._convert_scalar_indexer(key, kind="loc")
+
+ def _has_valid_setitem_indexer(self, indexer) -> bool:
+ return True
def _is_scalar_access(self, key: Tuple) -> bool:
"""
@@ -1753,6 +1468,61 @@ def _is_scalar_access(self, key: Tuple) -> bool:
return True
+ # -------------------------------------------------------------------
+ # MultiIndex Handling
+
+ def _multi_take_opportunity(self, tup: Tuple) -> bool:
+ """
+ Check whether we can use ``_multi_take``.
+
+ Currently the limit is that all axes being indexed must be indexed with
+ list-likes.
+
+ Parameters
+ ----------
+ tup : tuple
+ Tuple of indexers, one per axis.
+
+ Returns
+ -------
+ bool
+ Whether the current indexing can be passed through ``_multi_take``.
+ """
+ if not all(is_list_like_indexer(x) for x in tup):
+ return False
+
+ # just too complicated
+ if any(com.is_bool_indexer(x) for x in tup):
+ return False
+
+ return True
+
+ def _multi_take(self, tup: Tuple):
+ """
+ Create the indexers for the passed tuple of keys, and
+ execute the take operation. This allows the take operation to be
+ executed all at once, rather than once for each dimension,
+ improving efficiency.
+
+ Parameters
+ ----------
+ tup : tuple
+ Tuple of indexers, one per axis.
+
+ Returns
+ -------
+ values : same type as the object being indexed
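+
+ Examples
+ --------
+ A small sketch with a hypothetical frame; ``.loc`` dispatches here
+ when every axis is indexed with a list-like:
+
+ >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ ...                   index=list("abc"), columns=list("xyz"))
+ >>> df.loc[["a", "c"], ["x", "z"]]
+    x  z
+ a  1  3
+ c  7  9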
+ """
+ # GH 836
+ d = {
+ axis: self._get_listlike_indexer(key, axis)
+ for (key, axis) in zip(tup, self.obj._AXIS_ORDERS)
+ }
+ return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True)
+
+ # -------------------------------------------------------------------
+
def _get_partial_string_timestamp_match_key(self, key, labels):
"""
Translate any partial string timestamp matches in key, returning the
@@ -1785,6 +1555,60 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
return key
+ def _getitem_iterable(self, key, axis: int):
+ """
+ Index the current object with an iterable collection of keys.
+
+ Parameters
+ ----------
+ key : iterable
+ Targeted labels.
+ axis : int
+ Dimension on which the indexing is being made.
+
+ Raises
+ ------
+ KeyError
+ If no key was found. Will change in the future to raise if not all
+ keys were found.
+
+ Returns
+ -------
+ scalar, DataFrame, or Series: indexed value(s).
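+
+ Examples
+ --------
+ A sketch of the duplicate-friendly path (illustrative values):
+
+ >>> ser = pd.Series([1, 2, 3], index=["a", "a", "b"])
+ >>> ser.loc[["b", "a"]]
+ b    3
+ a    1
+ a    2
+ dtype: int64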
+ """
+ # we assume that not com.is_bool_indexer(key), as that is
+ # handled before we get here.
+ self._validate_key(key, axis)
+
+ # A collection of keys
+ keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
+ return self.obj._reindex_with_indexers(
+ {axis: [keyarr, indexer]}, copy=True, allow_dups=True
+ )
+
+ def _getitem_tuple(self, tup: Tuple):
+ try:
+ return self._getitem_lowerdim(tup)
+ except IndexingError:
+ pass
+
+ # no multi-index, so validate all of the indexers
+ self._has_valid_tuple(tup)
+
+ # ugly hack for GH #836
+ if self._multi_take_opportunity(tup):
+ return self._multi_take(tup)
+
+ # no shortcut needed
+ retval = self.obj
+ for i, key in enumerate(tup):
+ if com.is_null_slice(key):
+ continue
+
+ retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
+
+ return retval
+
def _getitem_axis(self, key, axis: int):
key = item_from_zerodim(key)
if is_iterator(key):
@@ -1865,12 +1689,139 @@ def _get_slice_axis(self, slice_obj: slice, axis: int):
)
if isinstance(indexer, slice):
- return self._slice(indexer, axis=axis, kind="iloc")
+ return self.obj._slice(indexer, axis=axis, kind="iloc")
else:
# DatetimeIndex overrides Index.slice_indexer and may
# return a DatetimeIndex instead of a slice object.
return self.obj.take(indexer, axis=axis)
+ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
+ """
+ Convert indexing key into something we can use to do actual fancy
+ indexing on a ndarray.
+
+ Examples
+ ix[:5] -> slice(0, 5)
+ ix[[1,2,3]] -> [1,2,3]
+ ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)
+
+ Going by Zen of Python?
+ 'In the face of ambiguity, refuse the temptation to guess.'
+ raise AmbiguousIndexError with integer labels?
+ - No, prefer label-based indexing
+ """
+ labels = self.obj._get_axis(axis)
+
+ if isinstance(key, slice):
+ return labels._convert_slice_indexer(key, kind="loc")
+
+ if is_scalar(key):
+ # try to find out the correct indexer; if the type is not correct, raise
+ try:
+ key = labels._convert_scalar_indexer(key, kind="loc")
+ except TypeError:
+ # but we will allow setting
+ if not is_setter:
+ raise
+
+ # see if we are positional in nature
+ is_int_index = labels.is_integer()
+ is_int_positional = is_integer(key) and not is_int_index
+
+ if is_scalar(key) or isinstance(labels, ABCMultiIndex):
+ # Otherwise get_loc will raise InvalidIndexError
+
+ # if we are a label return me
+ try:
+ return labels.get_loc(key)
+ except LookupError:
+ if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex):
+ if len(key) == labels.nlevels:
+ return {"key": key}
+ raise
+ except TypeError:
+ pass
+ except ValueError:
+ if not is_int_positional:
+ raise
+
+ # a positional
+ if is_int_positional:
+
+ # if we are setting and it's not a valid location,
+ # it's an insert, which fails by definition
+
+ # always valid
+ return {"key": key}
+
+ if is_nested_tuple(key, labels):
+ return labels.get_locs(key)
+
+ elif is_list_like_indexer(key):
+
+ if com.is_bool_indexer(key):
+ key = check_bool_indexer(labels, key)
+ (inds,) = key.nonzero()
+ return inds
+ else:
+ # When setting, missing keys are not allowed, even with .loc:
+ return self._get_listlike_indexer(key, axis, raise_missing=True)[1]
+ else:
+ try:
+ return labels.get_loc(key)
+ except LookupError:
+ # allow a not found key only if we are a setter
+ if not is_list_like_indexer(key):
+ return {"key": key}
+ raise
+
+ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False):
+ """
+ Transform a list-like of keys into a new index and an indexer.
+
+ Parameters
+ ----------
+ key : list-like
+ Targeted labels.
+ axis : int
+ Dimension on which the indexing is being made.
+ raise_missing : bool, default False
+ Whether to raise a KeyError if some labels were not found.
+ Will be removed in the future, and then this method will always behave as
+ if ``raise_missing=True``.
+
+ Raises
+ ------
+ KeyError
+ If at least one key was requested but none was found, and
+ raise_missing=True.
+
+ Returns
+ -------
+ keyarr : Index
+ New index (coinciding with 'key' if the axis is unique).
+ values : array-like
+ Indexer for the return object, -1 denotes keys not found.
+ """
+ ax = self.obj._get_axis(axis)
+
+ # Have the index compute an indexer or return None
+ # if it cannot handle:
+ indexer, keyarr = ax._convert_listlike_indexer(key)
+ # We only act on all found values:
+ if indexer is not None and (indexer != -1).all():
+ self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing)
+ return ax[indexer], indexer
+
+ if ax.is_unique and not getattr(ax, "is_overlapping", False):
+ indexer = ax.get_indexer_for(key)
+ keyarr = ax.reindex(keyarr)[0]
+ else:
+ keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
+
+ self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
+ return keyarr, indexer
+
@Appender(IndexingMixin.iloc.__doc__)
class _iLocIndexer(_LocationIndexer):
@@ -1880,6 +1831,9 @@ class _iLocIndexer(_LocationIndexer):
)
_takeable = True
+ # -------------------------------------------------------------------
+ # Key Checks
+
def _validate_key(self, key, axis: int):
if com.is_bool_indexer(key):
if hasattr(key, "index") and isinstance(key.index, Index):
@@ -1920,6 +1874,37 @@ def _validate_key(self, key, axis: int):
def _has_valid_setitem_indexer(self, indexer):
self._has_valid_positional_setitem_indexer(indexer)
+ def _has_valid_positional_setitem_indexer(self, indexer) -> bool:
+ """
+ Validate that a positional indexer cannot enlarge its target;
+ raises if needed, and does not modify the indexer externally.
+
+ Returns
+ -------
+ bool
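+
+ Notes
+ -----
+ A sketch of the guard: a dict indexer, or an integer at or beyond the
+ axis length, raises ``IndexError("<name> cannot enlarge its target
+ object")``, e.g. when assigning through ``.iloc`` past the end.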
+ """
+ if isinstance(indexer, dict):
+ raise IndexError(f"{self.name} cannot enlarge its target object")
+ else:
+ if not isinstance(indexer, tuple):
+ indexer = _tuplify(self.ndim, indexer)
+ for ax, i in zip(self.obj.axes, indexer):
+ if isinstance(i, slice):
+ # should check the stop slice?
+ pass
+ elif is_list_like_indexer(i):
+ # should check the elements?
+ pass
+ elif is_integer(i):
+ if i >= len(ax):
+ raise IndexError(
+ f"{self.name} cannot enlarge its target object"
+ )
+ elif isinstance(i, dict):
+ raise IndexError(f"{self.name} cannot enlarge its target object")
+
+ return True
+
def _is_scalar_access(self, key: Tuple) -> bool:
"""
Returns
@@ -1963,6 +1948,8 @@ def _validate_integer(self, key: int, axis: int) -> None:
if key >= len_axis or key < -len_axis:
raise IndexError("single positional indexer is out-of-bounds")
+ # -------------------------------------------------------------------
+
def _getitem_tuple(self, tup: Tuple):
self._has_valid_tuple(tup)
@@ -2038,7 +2025,7 @@ def _getitem_axis(self, key, axis: int):
# validate the location
self._validate_integer(key, axis)
- return self._get_loc(key, axis=axis)
+ return self.obj._ixs(key, axis=axis)
def _get_slice_axis(self, slice_obj: slice, axis: int):
# caller is responsible for ensuring non-None axis
@@ -2047,25 +2034,26 @@ def _get_slice_axis(self, slice_obj: slice, axis: int):
if not need_slice(slice_obj):
return obj.copy(deep=False)
- indexer = self._convert_slice_indexer(slice_obj, axis)
- return self._slice(indexer, axis=axis, kind="iloc")
+ labels = obj._get_axis(axis)
+ indexer = labels._convert_slice_indexer(slice_obj, kind="iloc")
+ return self.obj._slice(indexer, axis=axis, kind="iloc")
- def _convert_to_indexer(self, key, axis: int):
+ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
"""
Much simpler as we only have to deal with our valid types.
"""
+ labels = self.obj._get_axis(axis)
+
# may need to convert a float key
if isinstance(key, slice):
- return self._convert_slice_indexer(key, axis)
+ return labels._convert_slice_indexer(key, kind="iloc")
elif is_float(key):
- return self._convert_scalar_indexer(key, axis)
-
- try:
- self._validate_key(key, axis)
+ labels._validate_indexer("positional", key, "iloc")
return key
- except ValueError:
- raise ValueError(f"Can only index by location with a [{self._valid_types}]")
+
+ self._validate_key(key, axis)
+ return key
class _ScalarAccessIndexer(_NDFrameIndexerBase):
@@ -2116,21 +2104,11 @@ def _convert_key(self, key, is_setter: bool = False):
if is_setter:
return list(key)
- for ax, i in zip(self.obj.axes, key):
- if ax.is_integer():
- if not is_integer(i):
- raise ValueError(
- "At based indexing on an integer index "
- "can only have integer indexers"
- )
- else:
- if is_integer(i) and not (ax.holds_integer() or ax.is_floating()):
- raise ValueError(
- "At based indexing on an non-integer "
- "index can only have non-integer "
- "indexers"
- )
- return key
+ lkey = list(key)
+ for n, (ax, i) in enumerate(zip(self.obj.axes, key)):
+ lkey[n] = ax._convert_scalar_indexer(i, kind="loc")
+
+ return tuple(lkey)
@Appender(IndexingMixin.iat.__doc__)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 9e31ccebd0f1b..85a26179276f5 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -7,8 +7,7 @@
import numpy as np
-from pandas._libs import NaT, algos as libalgos, lib, tslib, writers
-from pandas._libs.index import convert_scalar
+from pandas._libs import NaT, Timestamp, algos as libalgos, lib, tslib, writers
import pandas._libs.internals as libinternals
from pandas._libs.tslibs import Timedelta, conversion
from pandas._libs.tslibs.timezones import tz_compare
@@ -16,6 +15,7 @@
from pandas.core.dtypes.cast import (
astype_nansafe,
+ convert_scalar_for_putitemlike,
find_common_type,
infer_dtype_from,
infer_dtype_from_scalar,
@@ -762,7 +762,7 @@ def replace(
# The only non-DatetimeLike class that also has a non-trivial
# try_coerce_args is ObjectBlock, but that overrides replace,
# so does not get here.
- to_replace = convert_scalar(values, to_replace)
+ to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype)
mask = missing.mask_missing(values, to_replace)
if filter is not None:
@@ -841,7 +841,7 @@ def setitem(self, indexer, value):
# We only get here for non-Extension Blocks, so _try_coerce_args
# is only relevant for DatetimeBlock and TimedeltaBlock
if lib.is_scalar(value):
- value = convert_scalar(values, value)
+ value = convert_scalar_for_putitemlike(value, values.dtype)
else:
# current dtype cannot store value, coerce to common dtype
@@ -957,7 +957,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False)
# We only get here for non-Extension Blocks, so _try_coerce_args
# is only relevant for DatetimeBlock and TimedeltaBlock
if lib.is_scalar(new):
- new = convert_scalar(new_values, new)
+ new = convert_scalar_for_putitemlike(new, new_values.dtype)
if transpose:
new_values = new_values.T
@@ -1200,7 +1200,7 @@ def _interpolate_with_fill(
values = self.values if inplace else self.values.copy()
# We only get here for non-ExtensionBlock
- fill_value = convert_scalar(self.values, fill_value)
+ fill_value = convert_scalar_for_putitemlike(fill_value, self.values.dtype)
values = missing.interpolate_2d(
values,
@@ -1405,7 +1405,7 @@ def where_func(cond, values, other):
raise TypeError
if lib.is_scalar(other) and isinstance(values, np.ndarray):
# convert datetime to datetime64, timedelta to timedelta64
- other = convert_scalar(values, other)
+ other = convert_scalar_for_putitemlike(other, values.dtype)
# By the time we get here, we should have all Series/Index
# args extracted to ndarray
@@ -2158,6 +2158,16 @@ def internal_values(self):
# Override to return DatetimeArray and TimedeltaArray
return self.array_values()
+ def iget(self, key):
+ # GH#31649 we need to wrap scalars in Timestamp/Timedelta
+ # TODO: this can be removed if we ever have 2D EA
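+ # e.g. (a sketch) a datetime64 value fetched through this block now
+ # comes back as a Timestamp scalar, matching Series element access.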
+ result = super().iget(key)
+ if isinstance(result, np.datetime64):
+ result = Timestamp(result)
+ elif isinstance(result, np.timedelta64):
+ result = Timedelta(result)
+ return result
+
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 526863d2e5ec3..08ae0b02169d4 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1565,7 +1565,7 @@ def fast_xs(self, loc):
fast path for getting a cross-section
return a view of the data
"""
- return self._block.values[loc]
+ raise NotImplementedError("Use series._values[loc] instead")
def concat(self, to_concat, new_axis) -> "SingleBlockManager":
"""
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 8829c242b1129..d9f21f0b274ac 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -352,8 +352,8 @@ def __init__(
for obj in objs:
if not isinstance(obj, (Series, DataFrame)):
msg = (
- "cannot concatenate object of type '{typ}'; "
- "only Series and DataFrame objs are valid".format(typ=type(obj))
+ f"cannot concatenate object of type '{type(obj)}'; "
+ "only Series and DataFrame objs are valid"
)
raise TypeError(msg)
@@ -403,8 +403,7 @@ def __init__(
self._is_series = isinstance(sample, ABCSeries)
if not 0 <= axis <= sample.ndim:
raise AssertionError(
- "axis must be between 0 and {ndim}, input was "
- "{axis}".format(ndim=sample.ndim, axis=axis)
+ f"axis must be between 0 and {sample.ndim}, input was {axis}"
)
# if we have mixed ndims, then convert to highest ndim
@@ -622,11 +621,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
try:
i = level.get_loc(key)
except KeyError:
- raise ValueError(
- "Key {key!s} not in level {level!s}".format(
- key=key, level=level
- )
- )
+ raise ValueError(f"Key {key} not in level {level}")
to_concat.append(np.repeat(i, len(index)))
codes_list.append(np.concatenate(to_concat))
@@ -677,11 +672,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
mask = mapped == -1
if mask.any():
- raise ValueError(
- "Values not found in passed level: {hlevel!s}".format(
- hlevel=hlevel[mask]
- )
- )
+ raise ValueError(f"Values not found in passed level: {hlevel[mask]!s}")
new_codes.append(np.repeat(mapped, n))
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index d04287e1e9088..782b8043430e1 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -88,9 +88,7 @@ def melt(
if len(frame.columns.names) == len(set(frame.columns.names)):
var_name = frame.columns.names
else:
- var_name = [
- "variable_{i}".format(i=i) for i in range(len(frame.columns.names))
- ]
+ var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
else:
var_name = [
frame.columns.name if frame.columns.name is not None else "variable"
@@ -417,9 +415,7 @@ def wide_to_long(
"""
def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]:
- regex = r"^{stub}{sep}{suffix}$".format(
- stub=re.escape(stub), sep=re.escape(sep), suffix=suffix
- )
+ regex = fr"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
pattern = re.compile(regex)
return [col for col in df.columns if pattern.match(col)]
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index ceee2f66dba42..480c5279ad3f6 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -611,8 +611,9 @@ def __init__(
if _left.columns.nlevels != _right.columns.nlevels:
msg = (
"merging between different levels can give an unintended "
- "result ({left} levels on the left, {right} on the right)"
- ).format(left=_left.columns.nlevels, right=_right.columns.nlevels)
+ f"result ({left.columns.nlevels} levels on the left,"
+ f"{right.columns.nlevels} on the right)"
+ )
warnings.warn(msg, UserWarning)
self._validate_specification()
@@ -679,7 +680,7 @@ def _indicator_pre_merge(
if i in columns:
raise ValueError(
"Cannot use `indicator=True` option when "
- "data contains a column named {name}".format(name=i)
+ f"data contains a column named {i}"
)
if self.indicator_name in columns:
raise ValueError(
@@ -831,7 +832,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
else:
result.index = Index(key_col, name=name)
else:
- result.insert(i, name or "key_{i}".format(i=i), key_col)
+ result.insert(i, name or f"key_{i}", key_col)
def _get_join_indexers(self):
""" return the join indexers """
@@ -1185,13 +1186,10 @@ def _validate_specification(self):
if len(common_cols) == 0:
raise MergeError(
"No common columns to perform merge on. "
- "Merge options: left_on={lon}, right_on={ron}, "
- "left_index={lidx}, right_index={ridx}".format(
- lon=self.left_on,
- ron=self.right_on,
- lidx=self.left_index,
- ridx=self.right_index,
- )
+ f"Merge options: left_on={self.left_on}, "
+ f"right_on={self.right_on}, "
+ f"left_index={self.left_index}, "
+ f"right_index={self.right_index}"
)
if not common_cols.is_unique:
raise MergeError(f"Data columns not unique: {repr(common_cols)}")
@@ -1486,12 +1484,12 @@ def get_result(self):
def _asof_function(direction: str):
- name = "asof_join_{dir}".format(dir=direction)
+ name = f"asof_join_{direction}"
return getattr(libjoin, name, None)
def _asof_by_function(direction: str):
- name = "asof_join_{dir}_on_X_by_Y".format(dir=direction)
+ name = f"asof_join_{direction}_on_X_by_Y"
return getattr(libjoin, name, None)
@@ -1601,9 +1599,7 @@ def _validate_specification(self):
# check 'direction' is valid
if self.direction not in ["backward", "forward", "nearest"]:
- raise MergeError(
- "direction invalid: {direction}".format(direction=self.direction)
- )
+ raise MergeError(f"direction invalid: {self.direction}")
@property
def _asof_key(self):
@@ -1628,17 +1624,13 @@ def _get_merge_keys(self):
# later with a ValueError, so we don't *need* to check
# for them here.
msg = (
- "incompatible merge keys [{i}] {lkdtype} and "
- "{rkdtype}, both sides category, but not equal ones".format(
- i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
- )
+ f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
+ f"{repr(rk.dtype)}, both sides category, but not equal ones"
)
else:
msg = (
- "incompatible merge keys [{i}] {lkdtype} and "
- "{rkdtype}, must be the same type".format(
- i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
- )
+ f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
+ f"{repr(rk.dtype)}, must be the same type"
)
raise MergeError(msg)
@@ -1651,10 +1643,8 @@ def _get_merge_keys(self):
lt = left_join_keys[-1]
msg = (
- "incompatible tolerance {tolerance}, must be compat "
- "with type {lkdtype}".format(
- tolerance=type(self.tolerance), lkdtype=repr(lt.dtype)
- )
+ f"incompatible tolerance {self.tolerance}, must be compat "
+ f"with type {repr(lk.dtype)}"
)
if needs_i8_conversion(lt):
@@ -1680,8 +1670,11 @@ def _get_merge_keys(self):
# validate allow_exact_matches
if not is_bool(self.allow_exact_matches):
- msg = "allow_exact_matches must be boolean, passed {passed}"
- raise MergeError(msg.format(passed=self.allow_exact_matches))
+ msg = (
+ "allow_exact_matches must be boolean, "
+ f"passed {self.allow_exact_matches}"
+ )
+ raise MergeError(msg)
return left_join_keys, right_join_keys, join_names
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index a5a9ec9fb79ba..053fb86836ff8 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -200,7 +200,7 @@ def _add_margins(
if not isinstance(margins_name, str):
raise ValueError("margins_name argument must be a string")
- msg = 'Conflicting name "{name}" in margins'.format(name=margins_name)
+ msg = f'Conflicting name "{margins_name}" in margins'
for level in table.index.names:
if margins_name in table.index.get_level_values(level):
raise ValueError(msg)
@@ -650,9 +650,7 @@ def _normalize(table, normalize, margins: bool, margins_name="All"):
if (margins_name not in table.iloc[-1, :].name) | (
margins_name != table.iloc[:, -1].name
):
- raise ValueError(
- "{mname} not in pivoted DataFrame".format(mname=margins_name)
- )
+ raise ValueError(f"{margins_name} not in pivoted DataFrame")
column_margin = table.iloc[:-1, -1]
index_margin = table.iloc[-1, :-1]
@@ -702,7 +700,7 @@ def _get_names(arrs, names, prefix: str = "row"):
if isinstance(arr, ABCSeries) and arr.name is not None:
names.append(arr.name)
else:
- names.append("{prefix}_{i}".format(prefix=prefix, i=i))
+ names.append(f"{prefix}_{i}")
else:
if len(names) != len(arrs):
raise AssertionError("arrays and names must have the same length")
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index f00ff0d4ba5ed..359e5b956f8a5 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -873,15 +873,13 @@ def get_dummies(
# validate prefixes and separator to avoid silently dropping cols
def check_len(item, name):
- len_msg = (
- "Length of '{name}' ({len_item}) did not match the "
- "length of the columns being encoded ({len_enc})."
- )
if is_list_like(item):
if not len(item) == data_to_encode.shape[1]:
- len_msg = len_msg.format(
- name=name, len_item=len(item), len_enc=data_to_encode.shape[1]
+ len_msg = (
+ f"Length of '{name}' ({len(item)}) did not match the "
+ "length of the columns being encoded "
+ f"({data_to_encode.shape[1]})."
)
raise ValueError(len_msg)
@@ -990,8 +988,7 @@ def get_empty_frame(data) -> DataFrame:
# PY2 embedded unicode, gh-22084
def _make_col_name(prefix, prefix_sep, level) -> str:
- fstr = "{prefix}{prefix_sep}{level}"
- return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level)
+ return f"{prefix}{prefix_sep}{level}"
dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels]
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 00a7645d0c7a5..a18b45a077be0 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -202,17 +202,10 @@ def cut(
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
- # for handling the cut for datetime and timedelta objects
original = x
x = _preprocess_for_cut(x)
x, dtype = _coerce_to_type(x)
- # To support cut(IntegerArray), we convert to object dtype with NaN
- # Will properly support in the future.
- # https://github.com/pandas-dev/pandas/pull/31290
- if is_extension_array_dtype(x.dtype) and is_integer_dtype(x.dtype):
- x = x.to_numpy(dtype=object, na_value=np.nan)
-
if not np.iterable(bins):
if is_scalar(bins) and bins < 1:
raise ValueError("`bins` should be a positive integer.")
@@ -434,7 +427,7 @@ def _bins_to_cuts(
def _coerce_to_type(x):
"""
- if the passed data is of datetime/timedelta or bool type,
+ if the passed data is of datetime/timedelta, bool or nullable int type,
this method converts it to numeric so that cut or qcut method can
handle it
"""
@@ -451,6 +444,12 @@ def _coerce_to_type(x):
elif is_bool_dtype(x):
# GH 20303
x = x.astype(np.int64)
+ # To support cut and qcut for IntegerArray we convert to float dtype.
+ # Will properly support in the future.
+ # https://github.com/pandas-dev/pandas/pull/31290
+ # https://github.com/pandas-dev/pandas/issues/31389
+ elif is_extension_array_dtype(x) and is_integer_dtype(x):
+ x = x.to_numpy(dtype=np.float64, na_value=np.nan)
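+ # e.g. pd.cut(pd.array([1, 2, 3, None], dtype="Int64"), bins=2) now
+ # bins the integers and yields NaN for the missing value (a sketch).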
if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
diff --git a/pandas/core/series.py b/pandas/core/series.py
index e5cea8ebfc914..0786674daf874 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -22,13 +22,13 @@
from pandas._config import get_option
-from pandas._libs import index as libindex, lib, properties, reshape, tslibs
+from pandas._libs import lib, properties, reshape, tslibs
from pandas._typing import Label
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, Substitution
from pandas.util._validators import validate_bool_kwarg, validate_percentile
-from pandas.core.dtypes.cast import convert_dtypes
+from pandas.core.dtypes.cast import convert_dtypes, validate_numeric_casting
from pandas.core.dtypes.common import (
_is_unorderable_exception,
ensure_platform_int,
@@ -838,16 +838,11 @@ def _ixs(self, i: int, axis: int = 0):
-------
scalar (int) or Series (slice, sequence)
"""
+ return self._values[i]
- # dispatch to the values if we need
- values = self._values
- if isinstance(values, np.ndarray):
- return libindex.get_value_at(values, i)
- else:
- return values[i]
-
- def _slice(self, slobj: slice, axis: int = 0, kind=None) -> "Series":
- slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem")
+ def _slice(self, slobj: slice, axis: int = 0, kind: str = "getitem") -> "Series":
+ assert kind in ["getitem", "iloc"]
+ slobj = self.index._convert_slice_indexer(slobj, kind=kind)
return self._get_values(slobj)
def __getitem__(self, key):
@@ -856,31 +851,33 @@ def __getitem__(self, key):
if key is Ellipsis:
return self
- try:
- result = self.index.get_value(self, key)
+ key_is_scalar = is_scalar(key)
+ if key_is_scalar:
+ key = self.index._convert_scalar_indexer(key, kind="getitem")
- return result
- except InvalidIndexError:
- pass
- except (KeyError, ValueError):
- if isinstance(key, tuple) and isinstance(self.index, MultiIndex):
- # kludge
- pass
- elif com.is_bool_indexer(key):
- pass
- else:
+ if key_is_scalar or isinstance(self.index, MultiIndex):
+ # Otherwise index.get_value will raise InvalidIndexError
+ try:
+ result = self.index.get_value(self, key)
- # we can try to coerce the indexer (or this will raise)
- new_key = self.index._convert_scalar_indexer(key, kind="getitem")
- if type(new_key) != type(key):
- return self.__getitem__(new_key)
- raise
+ return result
+ except InvalidIndexError:
+ pass
+ except (KeyError, ValueError):
+ if isinstance(key, tuple) and isinstance(self.index, MultiIndex):
+ # kludge
+ pass
+ else:
+ raise
- if is_iterator(key):
- key = list(key)
+ if not key_is_scalar:
+ # avoid expensive checks if we know we have a scalar
+ if is_iterator(key):
+ key = list(key)
- if com.is_bool_indexer(key):
- key = check_bool_indexer(self.index, key)
+ if com.is_bool_indexer(key):
+ key = check_bool_indexer(self.index, key)
+ return self._get_values(key)
return self._get_with(key)
@@ -913,6 +910,8 @@ def _get_with(self, key):
else:
key_type = lib.infer_dtype(key, skipna=False)
+ # Note: The key_type == "boolean" case should be caught by the
+ # com.is_bool_indexer check in __getitem__
if key_type == "integer":
if self.index.is_integer() or self.index.is_floating():
return self.loc[key]
@@ -921,8 +920,6 @@ def _get_with(self, key):
return self.iloc[indexer]
else:
return self._get_values(key)
- elif key_type == "boolean":
- return self._get_values(key)
if isinstance(key, (list, tuple)):
# TODO: de-dup with tuple case handled above?
@@ -981,7 +978,7 @@ def _get_value(self, label, takeable: bool = False):
scalar value
"""
if takeable:
- return com.maybe_box_datetimelike(self._values[label])
+ return self._values[label]
return self.index.get_value(self, label)
def __setitem__(self, key, value):
@@ -1026,17 +1023,10 @@ def __setitem__(self, key, value):
self._maybe_update_cacher()
def _set_with_engine(self, key, value):
- values = self._values
- if is_extension_array_dtype(values.dtype):
- # The cython indexing engine does not support ExtensionArrays.
- values[self.index.get_loc(key)] = value
- return
- try:
- self.index._engine.set_value(values, key, value)
- return
- except KeyError:
- values[self.index.get_loc(key)] = value
- return
+ # fails with AttributeError for IntervalIndex
+ loc = self.index._engine.get_loc(key)
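+ # e.g. assigning np.nan to an int64 Series fails the numeric-cast
+ # check below with a ValueError (illustrative).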
+ validate_numeric_casting(self.dtype, value)
+ self._values[loc] = value
def _set_with(self, key, value):
# other: fancy integer or otherwise
@@ -1116,11 +1106,10 @@ def _set_value(self, label, value, takeable: bool = False):
try:
if takeable:
self._values[label] = value
- elif isinstance(self._values, np.ndarray):
- # i.e. not EA, so we can use _engine
- self.index._engine.set_value(self._values, label, value)
else:
- self.loc[label] = value
+ loc = self.index.get_loc(label)
+ validate_numeric_casting(self.dtype, value)
+ self._values[loc] = value
except KeyError:
# set using a non-recursive method
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 00f2961e41617..c4772895afd1e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -3,11 +3,23 @@
import bz2
from collections import abc
import gzip
-from io import BufferedIOBase, BytesIO
+from io import BufferedIOBase, BytesIO, RawIOBase
import mmap
import os
import pathlib
-from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union
+from typing import (
+ IO,
+ TYPE_CHECKING,
+ Any,
+ AnyStr,
+ Dict,
+ List,
+ Mapping,
+ Optional,
+ Tuple,
+ Type,
+ Union,
+)
from urllib.parse import ( # noqa
urlencode,
urljoin,
@@ -37,6 +49,10 @@
_VALID_URLS.discard("")
+if TYPE_CHECKING:
+ from io import IOBase # noqa: F401
+
+
def is_url(url) -> bool:
"""
Check to see if a URL has a valid protocol.
@@ -356,12 +372,13 @@ def get_handle(
handles : list of file-like objects
A list of file-like object that were opened in this function.
"""
+ need_text_wrapping: Tuple[Type["IOBase"], ...]
try:
from s3fs import S3File
- need_text_wrapping = (BufferedIOBase, S3File)
+ need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
except ImportError:
- need_text_wrapping = BufferedIOBase # type: ignore
+ need_text_wrapping = (BufferedIOBase, RawIOBase)
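+ # e.g. an unbuffered binary handle such as open(path, "rb", buffering=0)
+ # is a RawIOBase and is now wrapped by TextIOWrapper below (a sketch).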
handles: List[IO] = list()
f = path_or_buf
@@ -437,7 +454,7 @@ def get_handle(
from io import TextIOWrapper
g = TextIOWrapper(f, encoding=encoding, newline="")
- if not isinstance(f, BufferedIOBase):
+ if not isinstance(f, (BufferedIOBase, RawIOBase)):
handles.append(g)
f = g
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index 14e79538541af..28a069bc9fc1b 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -403,7 +403,7 @@ def __init__(
# Deprecated in GH#17295, enforced in 1.0.0
raise KeyError("Not all names specified in 'columns' are found")
- self.df = df
+ self.df = df.reindex(columns=cols)
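+ # e.g. to_excel(..., columns=["b", "a"]) now writes exactly those
+ # columns, in that order (illustrative).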
self.columns = self.df.columns
self.float_format = float_format
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 204807b55c877..04fd17a00041b 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -19,12 +19,7 @@
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat
-from pandas.io.common import (
- get_filepath_or_buffer,
- get_handle,
- infer_compression,
- stringify_path,
-)
+from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
from pandas.io.parsers import _validate_integer
@@ -56,7 +51,11 @@ def to_json(
"'index=False' is only valid when 'orient' is 'split' or 'table'"
)
- path_or_buf = stringify_path(path_or_buf)
+ if path_or_buf is not None:
+ path_or_buf, _, _, _ = get_filepath_or_buffer(
+ path_or_buf, compression=compression, mode="w"
+ )
+
if lines and orient != "records":
raise ValueError("'lines' keyword only valid when 'orient' is records")
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 98f2eb3929b59..926635062d853 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -85,7 +85,6 @@ def write(
df: DataFrame,
path,
compression="snappy",
- coerce_timestamps="ms",
index: Optional[bool] = None,
partition_cols=None,
**kwargs,
@@ -103,17 +102,12 @@ def write(
table,
path,
compression=compression,
- coerce_timestamps=coerce_timestamps,
partition_cols=partition_cols,
**kwargs,
)
else:
self.api.parquet.write_table(
- table,
- path,
- compression=compression,
- coerce_timestamps=coerce_timestamps,
- **kwargs,
+ table, path, compression=compression, **kwargs,
)
def read(self, path, columns=None, **kwargs):
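
Dropping the hard-coded `coerce_timestamps="ms"` leaves timestamp resolution to pyarrow's own defaults; callers who want the old millisecond coercion can still request it through `**kwargs`. A sketch (`coerce_timestamps` and `allow_truncated_timestamps` are both real `pyarrow.parquet.write_table` options):

```python
import pandas as pd

df = pd.DataFrame({"ts": pd.to_datetime(["2000-01-01 00:00:00.123456789"])})
df.to_parquet("default.parquet")  # pyarrow decides how to store the timestamps
df.to_parquet(
    "ms.parquet", coerce_timestamps="ms", allow_truncated_timestamps=True
)  # previous behavior, now an explicit opt-in
```
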
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index a33d81ff437bf..a7d8c374a9aae 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -5,7 +5,8 @@
from collections import abc, defaultdict
import csv
import datetime
-from io import BufferedIOBase, StringIO, TextIOWrapper
+from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper
+from itertools import chain
import re
import sys
from textwrap import fill
@@ -1399,17 +1400,21 @@ def __init__(self, kwds):
"index_col must only contain row numbers "
"when specifying a multi-index header"
)
-
- # GH 16338
- elif self.header is not None and not is_integer(self.header):
- raise ValueError("header must be integer or list of integers")
-
- # GH 27779
- elif self.header is not None and self.header < 0:
- raise ValueError(
- "Passing negative integer to header is invalid. "
- "For no header, use header=None instead"
- )
+ elif self.header is not None:
+ # GH 27394
+ if self.prefix is not None:
+ raise ValueError(
+ "Argument prefix must be None if argument header is not None"
+ )
+ # GH 16338
+ elif not is_integer(self.header):
+ raise ValueError("header must be integer or list of integers")
+ # GH 27779
+ elif self.header < 0:
+ raise ValueError(
+ "Passing negative integer to header is invalid. "
+ "For no header, use header=None instead"
+ )
self._name_processed = False
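
`prefix` only applies when pandas has to invent column names, i.e. `header=None`, so combining it with an explicit header row now fails fast. A sketch of the new validation (era-specific; `prefix` exists in this version of `read_csv`):

```python
import io
import pandas as pd

data = "a,b\n1,2\n"
pd.read_csv(io.StringIO(data), header=None, prefix="X")  # ok: columns X0, X1
pd.read_csv(io.StringIO(data), header=0, prefix="X")
# ValueError: Argument prefix must be None if argument header is not None
```
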
@@ -1419,6 +1424,26 @@ def __init__(self, kwds):
# keep references to file handles opened by the parser itself
self.handles = []
+ def _confirm_parse_dates_presence(self, columns):
+ """
+ If the user has provided names for parse_dates, check whether those
+ columns are available.
+ """
+ if isinstance(self.parse_dates, list):
+ cols_needed = self.parse_dates
+ elif isinstance(self.parse_dates, dict):
+ cols_needed = chain(*self.parse_dates.values())
+ else:
+ cols_needed = []
+
+ missing_cols = ", ".join(
+ [col for col in cols_needed if isinstance(col, str) and col not in columns]
+ )
+ if missing_cols:
+ raise ValueError(
+ f"Missing column provided to 'parse_dates': '{missing_cols}'"
+ )
+
def close(self):
for f in self.handles:
f.close()
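
The check turns a misspelled `parse_dates` entry into an immediate, descriptive error instead of a confusing failure later in parsing. A sketch:

```python
import io
import pandas as pd

data = io.StringIO("date,value\n2000-01-01,1\n")
pd.read_csv(data, parse_dates=["datee"])
# ValueError: Missing column provided to 'parse_dates': 'datee'
```
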
@@ -1868,7 +1893,7 @@ def __init__(self, src, **kwds):
# Handle the file object with universal line mode enabled.
# We will handle the newline character ourselves later on.
- if isinstance(src, BufferedIOBase):
+ if isinstance(src, (BufferedIOBase, RawIOBase)):
src = TextIOWrapper(src, encoding=encoding, newline="")
kwds["encoding"] = "utf-8"
@@ -1938,6 +1963,7 @@ def __init__(self, src, **kwds):
if len(self.names) < len(usecols):
_validate_usecols_names(usecols, self.names)
+ self._confirm_parse_dates_presence(self.names)
self._set_noconvert_columns()
self.orig_names = self.names
@@ -2308,6 +2334,7 @@ def __init__(self, f, **kwds):
if self.index_names is None:
self.index_names = index_names
+ self._confirm_parse_dates_presence(self.columns)
if self.parse_dates:
self._no_thousands_columns = self._set_no_thousands_columns()
else:
@@ -3278,6 +3305,10 @@ def _isindex(colspec):
if is_scalar(colspec):
if isinstance(colspec, int) and colspec not in data_dict:
colspec = orig_names[colspec]
+ elif colspec not in orig_names:
+ raise ValueError(
+ f"Missing column provided to 'parse_dates': '{colspec}'"
+ )
if _isindex(colspec):
continue
data_dict[colspec] = converter(data_dict[colspec])
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index dd048114142f3..3abce690cbe6b 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -251,7 +251,7 @@ def _maybe_convert_index(ax, data):
freq = frequencies.get_period_alias(freq)
if isinstance(data.index, ABCDatetimeIndex):
- data = data.to_period(freq=freq)
+ data = data.tz_localize(None).to_period(freq=freq)
elif isinstance(data.index, ABCPeriodIndex):
data.index = data.index.asfreq(freq=freq)
return data
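
`Period` has no timezone concept, and calling `to_period` on a tz-aware index warns that the timezone will be dropped; localizing to naive first makes the conversion explicit and silent. A sketch of what `_maybe_convert_index` now effectively does:

```python
import pandas as pd

idx = pd.date_range("2000-01-01", periods=3, freq="D", tz="US/Eastern")
ser = pd.Series(range(3), index=idx)
periods = ser.index.tz_localize(None).to_period("D")  # no drop-tz warning
print(periods)
```
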
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index cfba3da354d44..70e1421c8dcf4 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -408,6 +408,11 @@ def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
+ def test_constructor_np_strs(self):
+ # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
+ cat = pd.Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
+ assert all(isinstance(x, np.str_) for x in cat.categories)
+
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
index 35eda4a0ec5bc..7e7762d8973a0 100644
--- a/pandas/tests/arrays/interval/test_interval.py
+++ b/pandas/tests/arrays/interval/test_interval.py
@@ -81,6 +81,24 @@ def test_where_raises(self, other):
with pytest.raises(ValueError, match=match):
ser.where([True, False, True], other=other)
+ def test_shift(self):
+ # https://github.com/pandas-dev/pandas/issues/31495
+ a = IntervalArray.from_breaks([1, 2, 3])
+ result = a.shift()
+ # int -> float
+ expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)])
+ tm.assert_interval_array_equal(result, expected)
+
+ def test_shift_datetime(self):
+ a = IntervalArray.from_breaks(pd.date_range("2000", periods=4))
+ result = a.shift(2)
+ expected = a.take([-1, -1, 0], allow_fill=True)
+ tm.assert_interval_array_equal(result, expected)
+
+ result = a.shift(-1)
+ expected = a.take([1, 2, -1], allow_fill=True)
+ tm.assert_interval_array_equal(result, expected)
+
class TestSetitem:
def test_set_na(self, left_right_dtypes):
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index cc81ae4504dd8..7a0c9300a43a2 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -1061,19 +1061,6 @@ def test_value_counts_na():
tm.assert_series_equal(result, expected)
-@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
-@pytest.mark.parametrize("right", [True, False])
-@pytest.mark.parametrize("include_lowest", [True, False])
-def test_cut(bins, right, include_lowest):
- a = np.random.randint(0, 10, size=50).astype(object)
- a[::2] = np.nan
- result = pd.cut(
- pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest
- )
- expected = pd.cut(a, bins, right=right, include_lowest=include_lowest)
- tm.assert_categorical_equal(result, expected)
-
-
def test_array_setitem_nullable_boolean_mask():
# GH 31446
ser = pd.Series([1, 2], dtype="Int64")
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 097e83d93ee71..4c917b9bb42d2 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -675,6 +675,8 @@ def test__get_dtype(input_param, result):
)
def test__get_dtype_fails(input_param, expected_error_message):
# python objects
+ # 2020-02-02 npdev changed error message
+ expected_error_message += f"|Cannot interpret '{input_param}' as a data type"
with pytest.raises(TypeError, match=expected_error_message):
com._get_dtype(input_param)
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index a599a086ae92b..dd99b81fb6764 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -32,66 +32,60 @@
class Base:
- def setup_method(self, method):
- self.dtype = self.create()
-
- def test_hash(self):
- hash(self.dtype)
-
- def test_equality_invalid(self):
- assert not self.dtype == "foo"
- assert not is_dtype_equal(self.dtype, np.int64)
-
- def test_numpy_informed(self):
- with pytest.raises(TypeError, match="data type not understood"):
- np.dtype(self.dtype)
+ def test_hash(self, dtype):
+ hash(dtype)
+
+ def test_equality_invalid(self, dtype):
+ assert not dtype == "foo"
+ assert not is_dtype_equal(dtype, np.int64)
+
+ def test_numpy_informed(self, dtype):
+ # npdev 2020-02-02 changed from "data type not understood" to
+ # "Cannot interpret 'foo' as a data type"
+ msg = "|".join(
+ ["data type not understood", "Cannot interpret '.*' as a data type"]
+ )
+ with pytest.raises(TypeError, match=msg):
+ np.dtype(dtype)
- assert not self.dtype == np.str_
- assert not np.str_ == self.dtype
+ assert not dtype == np.str_
+ assert not np.str_ == dtype
- def test_pickle(self):
+ def test_pickle(self, dtype):
# make sure our cache is NOT pickled
# clear the cache
- type(self.dtype).reset_cache()
- assert not len(self.dtype._cache)
+ type(dtype).reset_cache()
+ assert not len(dtype._cache)
# force back to the cache
- result = tm.round_trip_pickle(self.dtype)
- assert not len(self.dtype._cache)
- assert result == self.dtype
+ result = tm.round_trip_pickle(dtype)
+ assert not len(dtype._cache)
+ assert result == dtype
class TestCategoricalDtype(Base):
- def create(self):
+ @pytest.fixture
+ def dtype(self):
+ """
+ Class level fixture of dtype for TestCategoricalDtype
+ """
return CategoricalDtype()
- def test_pickle(self):
- # make sure our cache is NOT pickled
-
- # clear the cache
- type(self.dtype).reset_cache()
- assert not len(self.dtype._cache)
-
- # force back to the cache
- result = tm.round_trip_pickle(self.dtype)
- assert result == self.dtype
-
- def test_hash_vs_equality(self):
- dtype = self.dtype
+ def test_hash_vs_equality(self, dtype):
dtype2 = CategoricalDtype()
assert dtype == dtype2
assert dtype2 == dtype
assert hash(dtype) == hash(dtype2)
- def test_equality(self):
- assert is_dtype_equal(self.dtype, "category")
- assert is_dtype_equal(self.dtype, CategoricalDtype())
- assert not is_dtype_equal(self.dtype, "foo")
+ def test_equality(self, dtype):
+ assert is_dtype_equal(dtype, "category")
+ assert is_dtype_equal(dtype, CategoricalDtype())
+ assert not is_dtype_equal(dtype, "foo")
- def test_construction_from_string(self):
+ def test_construction_from_string(self, dtype):
result = CategoricalDtype.construct_from_string("category")
- assert is_dtype_equal(self.dtype, result)
+ assert is_dtype_equal(dtype, result)
msg = "Cannot construct a 'CategoricalDtype' from 'foo'"
with pytest.raises(TypeError, match=msg):
CategoricalDtype.construct_from_string("foo")
@@ -133,16 +127,16 @@ def test_from_values_or_dtype_raises(self, values, categories, ordered, dtype):
with pytest.raises(ValueError, match=msg):
CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype)
- def test_is_dtype(self):
- assert CategoricalDtype.is_dtype(self.dtype)
+ def test_is_dtype(self, dtype):
+ assert CategoricalDtype.is_dtype(dtype)
assert CategoricalDtype.is_dtype("category")
assert CategoricalDtype.is_dtype(CategoricalDtype())
assert not CategoricalDtype.is_dtype("foo")
assert not CategoricalDtype.is_dtype(np.float64)
- def test_basic(self):
+ def test_basic(self, dtype):
- assert is_categorical_dtype(self.dtype)
+ assert is_categorical_dtype(dtype)
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
@@ -180,7 +174,11 @@ def test_is_boolean(self, categories, expected):
class TestDatetimeTZDtype(Base):
- def create(self):
+ @pytest.fixture
+ def dtype(self):
+ """
+ Class level fixture of dtype for TestDatetimeTZDtype
+ """
return DatetimeTZDtype("ns", "US/Eastern")
def test_alias_to_unit_raises(self):
@@ -196,9 +194,8 @@ def test_alias_to_unit_bad_alias_raises(self):
with pytest.raises(TypeError, match=""):
DatetimeTZDtype("datetime64[ns, US/NotATZ]")
- def test_hash_vs_equality(self):
+ def test_hash_vs_equality(self, dtype):
# make sure that we satisfy is semantics
- dtype = self.dtype
dtype2 = DatetimeTZDtype("ns", "US/Eastern")
dtype3 = DatetimeTZDtype(dtype2)
assert dtype == dtype2
@@ -223,54 +220,54 @@ def test_subclass(self):
assert issubclass(type(a), type(a))
assert issubclass(type(a), type(b))
- def test_compat(self):
- assert is_datetime64tz_dtype(self.dtype)
+ def test_compat(self, dtype):
+ assert is_datetime64tz_dtype(dtype)
assert is_datetime64tz_dtype("datetime64[ns, US/Eastern]")
- assert is_datetime64_any_dtype(self.dtype)
+ assert is_datetime64_any_dtype(dtype)
assert is_datetime64_any_dtype("datetime64[ns, US/Eastern]")
- assert is_datetime64_ns_dtype(self.dtype)
+ assert is_datetime64_ns_dtype(dtype)
assert is_datetime64_ns_dtype("datetime64[ns, US/Eastern]")
- assert not is_datetime64_dtype(self.dtype)
+ assert not is_datetime64_dtype(dtype)
assert not is_datetime64_dtype("datetime64[ns, US/Eastern]")
- def test_construction_from_string(self):
+ def test_construction_from_string(self, dtype):
result = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]")
- assert is_dtype_equal(self.dtype, result)
- msg = "Cannot construct a 'DatetimeTZDtype' from 'foo'"
- with pytest.raises(TypeError, match=msg):
- DatetimeTZDtype.construct_from_string("foo")
-
- def test_construct_from_string_raises(self):
- with pytest.raises(TypeError, match="notatz"):
- DatetimeTZDtype.construct_from_string("datetime64[ns, notatz]")
+ assert is_dtype_equal(dtype, result)
- msg = "'construct_from_string' expects a string, got "
- with pytest.raises(TypeError, match=re.escape(msg)):
- # list instead of string
- DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"])
-
- msg = "^Cannot construct a 'DatetimeTZDtype'"
- with pytest.raises(TypeError, match=msg):
+ @pytest.mark.parametrize(
+ "string",
+ [
+ "foo",
+ "datetime64[ns, notatz]",
# non-nano unit
- DatetimeTZDtype.construct_from_string("datetime64[ps, UTC]")
+ "datetime64[ps, UTC]",
+ # dateutil str that returns None from gettz
+ "datetime64[ns, dateutil/invalid]",
+ ],
+ )
+ def test_construct_from_string_invalid_raises(self, string):
+ msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
+ with pytest.raises(TypeError, match=re.escape(msg)):
+ DatetimeTZDtype.construct_from_string(string)
+ def test_construct_from_string_wrong_type_raises(self):
+ msg = "'construct_from_string' expects a string, got "
with pytest.raises(TypeError, match=msg):
- # dateutil str that returns None from gettz
- DatetimeTZDtype.construct_from_string("datetime64[ns, dateutil/invalid]")
+ DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"])
- def test_is_dtype(self):
+ def test_is_dtype(self, dtype):
assert not DatetimeTZDtype.is_dtype(None)
- assert DatetimeTZDtype.is_dtype(self.dtype)
+ assert DatetimeTZDtype.is_dtype(dtype)
assert DatetimeTZDtype.is_dtype("datetime64[ns, US/Eastern]")
assert not DatetimeTZDtype.is_dtype("foo")
assert DatetimeTZDtype.is_dtype(DatetimeTZDtype("ns", "US/Pacific"))
assert not DatetimeTZDtype.is_dtype(np.float64)
- def test_equality(self):
- assert is_dtype_equal(self.dtype, "datetime64[ns, US/Eastern]")
- assert is_dtype_equal(self.dtype, DatetimeTZDtype("ns", "US/Eastern"))
- assert not is_dtype_equal(self.dtype, "foo")
- assert not is_dtype_equal(self.dtype, DatetimeTZDtype("ns", "CET"))
+ def test_equality(self, dtype):
+ assert is_dtype_equal(dtype, "datetime64[ns, US/Eastern]")
+ assert is_dtype_equal(dtype, DatetimeTZDtype("ns", "US/Eastern"))
+ assert not is_dtype_equal(dtype, "foo")
+ assert not is_dtype_equal(dtype, DatetimeTZDtype("ns", "CET"))
assert not is_dtype_equal(
DatetimeTZDtype("ns", "US/Eastern"), DatetimeTZDtype("ns", "US/Pacific")
)
@@ -278,9 +275,9 @@ def test_equality(self):
# numpy compat
assert is_dtype_equal(np.dtype("M8[ns]"), "datetime64[ns]")
- def test_basic(self):
+ def test_basic(self, dtype):
- assert is_datetime64tz_dtype(self.dtype)
+ assert is_datetime64tz_dtype(dtype)
dr = date_range("20130101", periods=3, tz="US/Eastern")
s = Series(dr, name="A")
@@ -326,12 +323,15 @@ def test_tz_standardize(self):
class TestPeriodDtype(Base):
- def create(self):
+ @pytest.fixture
+ def dtype(self):
+ """
+ Class level fixture of dtype for TestPeriodDtype
+ """
return PeriodDtype("D")
- def test_hash_vs_equality(self):
+ def test_hash_vs_equality(self, dtype):
# make sure that we satisfy is semantics
- dtype = self.dtype
dtype2 = PeriodDtype("D")
dtype3 = PeriodDtype(dtype2)
assert dtype == dtype2
@@ -386,17 +386,17 @@ def test_identity(self):
assert PeriodDtype("period[1S1U]") == PeriodDtype("period[1000001U]")
assert PeriodDtype("period[1S1U]") is PeriodDtype("period[1000001U]")
- def test_compat(self):
- assert not is_datetime64_ns_dtype(self.dtype)
+ def test_compat(self, dtype):
+ assert not is_datetime64_ns_dtype(dtype)
assert not is_datetime64_ns_dtype("period[D]")
- assert not is_datetime64_dtype(self.dtype)
+ assert not is_datetime64_dtype(dtype)
assert not is_datetime64_dtype("period[D]")
- def test_construction_from_string(self):
+ def test_construction_from_string(self, dtype):
result = PeriodDtype("period[D]")
- assert is_dtype_equal(self.dtype, result)
+ assert is_dtype_equal(dtype, result)
result = PeriodDtype.construct_from_string("period[D]")
- assert is_dtype_equal(self.dtype, result)
+ assert is_dtype_equal(dtype, result)
with pytest.raises(TypeError):
PeriodDtype.construct_from_string("foo")
with pytest.raises(TypeError):
@@ -412,8 +412,8 @@ def test_construction_from_string(self):
with pytest.raises(TypeError, match="list"):
PeriodDtype.construct_from_string([1, 2, 3])
- def test_is_dtype(self):
- assert PeriodDtype.is_dtype(self.dtype)
+ def test_is_dtype(self, dtype):
+ assert PeriodDtype.is_dtype(dtype)
assert PeriodDtype.is_dtype("period[D]")
assert PeriodDtype.is_dtype("period[3D]")
assert PeriodDtype.is_dtype(PeriodDtype("3D"))
@@ -431,17 +431,17 @@ def test_is_dtype(self):
assert not PeriodDtype.is_dtype(np.int64)
assert not PeriodDtype.is_dtype(np.float64)
- def test_equality(self):
- assert is_dtype_equal(self.dtype, "period[D]")
- assert is_dtype_equal(self.dtype, PeriodDtype("D"))
- assert is_dtype_equal(self.dtype, PeriodDtype("D"))
+ def test_equality(self, dtype):
+ assert is_dtype_equal(dtype, "period[D]")
+ assert is_dtype_equal(dtype, PeriodDtype("D"))
+ assert is_dtype_equal(dtype, PeriodDtype("D"))
assert is_dtype_equal(PeriodDtype("D"), PeriodDtype("D"))
- assert not is_dtype_equal(self.dtype, "D")
+ assert not is_dtype_equal(dtype, "D")
assert not is_dtype_equal(PeriodDtype("D"), PeriodDtype("2D"))
- def test_basic(self):
- assert is_period_dtype(self.dtype)
+ def test_basic(self, dtype):
+ assert is_period_dtype(dtype)
pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H")
@@ -467,12 +467,15 @@ def test_not_string(self):
class TestIntervalDtype(Base):
- def create(self):
+ @pytest.fixture
+ def dtype(self):
+ """
+ Class level fixture of dtype for TestIntervalDtype
+ """
return IntervalDtype("int64")
- def test_hash_vs_equality(self):
+ def test_hash_vs_equality(self, dtype):
# make sure that we satisfy is semantics
- dtype = self.dtype
dtype2 = IntervalDtype("int64")
dtype3 = IntervalDtype(dtype2)
assert dtype == dtype2
@@ -539,11 +542,11 @@ def test_construction_errors(self, subtype):
with pytest.raises(TypeError, match=msg):
IntervalDtype(subtype)
- def test_construction_from_string(self):
+ def test_construction_from_string(self, dtype):
result = IntervalDtype("interval[int64]")
- assert is_dtype_equal(self.dtype, result)
+ assert is_dtype_equal(dtype, result)
result = IntervalDtype.construct_from_string("interval[int64]")
- assert is_dtype_equal(self.dtype, result)
+ assert is_dtype_equal(dtype, result)
@pytest.mark.parametrize("string", [0, 3.14, ("a", "b"), None])
def test_construction_from_string_errors(self, string):
@@ -572,8 +575,8 @@ def test_subclass(self):
assert issubclass(type(a), type(a))
assert issubclass(type(a), type(b))
- def test_is_dtype(self):
- assert IntervalDtype.is_dtype(self.dtype)
+ def test_is_dtype(self, dtype):
+ assert IntervalDtype.is_dtype(dtype)
assert IntervalDtype.is_dtype("interval")
assert IntervalDtype.is_dtype(IntervalDtype("float64"))
assert IntervalDtype.is_dtype(IntervalDtype("int64"))
@@ -589,12 +592,12 @@ def test_is_dtype(self):
assert not IntervalDtype.is_dtype(np.int64)
assert not IntervalDtype.is_dtype(np.float64)
- def test_equality(self):
- assert is_dtype_equal(self.dtype, "interval[int64]")
- assert is_dtype_equal(self.dtype, IntervalDtype("int64"))
+ def test_equality(self, dtype):
+ assert is_dtype_equal(dtype, "interval[int64]")
+ assert is_dtype_equal(dtype, IntervalDtype("int64"))
assert is_dtype_equal(IntervalDtype("int64"), IntervalDtype("int64"))
- assert not is_dtype_equal(self.dtype, "int64")
+ assert not is_dtype_equal(dtype, "int64")
assert not is_dtype_equal(IntervalDtype("int64"), IntervalDtype("float64"))
# invalid subtype comparisons do not raise when directly compared
@@ -650,8 +653,8 @@ def test_name_repr_generic(self, subtype):
assert str(dtype) == "interval"
assert dtype.name == "interval"
- def test_basic(self):
- assert is_interval_dtype(self.dtype)
+ def test_basic(self, dtype):
+ assert is_interval_dtype(dtype)
ii = IntervalIndex.from_breaks(range(3))
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 4a84a21084de2..22e53dbc89f01 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -280,6 +280,13 @@ def test_shift_empty_array(self, data, periods):
expected = empty
self.assert_extension_array_equal(result, expected)
+ def test_shift_zero_copies(self, data):
+ result = data.shift(0)
+ assert result is not data
+
+ result = data[:0].shift(2)
+ assert result is not data
+
def test_shift_fill_value(self, data):
arr = data[:4]
fill_value = data[0]
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 9e741bb7f267c..1ba1b872fa5e2 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -16,7 +16,7 @@
import random
import string
import sys
-from typing import Type
+from typing import Any, Mapping, Type
import numpy as np
@@ -27,7 +27,7 @@
class JSONDtype(ExtensionDtype):
type = abc.Mapping
name = "json"
- na_value = UserDict()
+ na_value: Mapping[str, Any] = UserDict()
@classmethod
def construct_array_type(cls) -> Type["JSONArray"]:
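
The class-level annotation widens the attribute's type for mypy to the intended interface (`Mapping[str, Any]`) instead of the inferred concrete `UserDict`. A minimal sketch of the pattern:

```python
from collections import UserDict
from typing import Any, Mapping

class Dtype:
    # Without the annotation, mypy infers the narrow UserDict type and
    # flags subclasses that assign a plain dict; annotating widens it.
    na_value: Mapping[str, Any] = UserDict()
```
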
diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py
index a1c12be2b0180..6bfcac3793584 100644
--- a/pandas/tests/frame/indexing/test_datetime.py
+++ b/pandas/tests/frame/indexing/test_datetime.py
@@ -45,13 +45,6 @@ def test_set_reset(self):
df = result.set_index("foo")
tm.assert_index_equal(df.index, idx)
- def test_transpose(self, timezone_frame):
-
- result = timezone_frame.T
- expected = DataFrame(timezone_frame.values.T)
- expected.index = ["A", "B", "C"]
- tm.assert_frame_equal(result, expected)
-
def test_scalar_assignment(self):
# issue #19843
df = pd.DataFrame(index=(0, 1, 2))
diff --git a/pandas/tests/frame/indexing/test_iat.py b/pandas/tests/frame/indexing/test_iat.py
new file mode 100644
index 0000000000000..23e3392251a3a
--- /dev/null
+++ b/pandas/tests/frame/indexing/test_iat.py
@@ -0,0 +1,7 @@
+def test_iat(float_frame):
+
+ for i, row in enumerate(float_frame.index):
+ for j, col in enumerate(float_frame.columns):
+ result = float_frame.iat[i, j]
+ expected = float_frame.at[row, col]
+ assert result == expected
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 64d0f9ee2b062..6fc8c0e9ad459 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -28,6 +28,29 @@
from pandas.tseries.offsets import BDay
+class TestGet:
+ def test_get(self, float_frame):
+ b = float_frame.get("B")
+ tm.assert_series_equal(b, float_frame["B"])
+
+ assert float_frame.get("foo") is None
+ tm.assert_series_equal(
+ float_frame.get("foo", float_frame["B"]), float_frame["B"]
+ )
+
+ @pytest.mark.parametrize(
+ "df",
+ [
+ DataFrame(),
+ DataFrame(columns=list("AB")),
+ DataFrame(columns=list("AB"), index=range(3)),
+ ],
+ )
+ def test_get_none(self, df):
+ # see gh-5652
+ assert df.get(None) is None
+
+
class TestDataFrameIndexing:
def test_getitem(self, float_frame):
# Slicing
@@ -64,27 +87,6 @@ def test_getitem_dupe_cols(self):
with pytest.raises(KeyError, match=re.escape(msg)):
df[["baf"]]
- def test_get(self, float_frame):
- b = float_frame.get("B")
- tm.assert_series_equal(b, float_frame["B"])
-
- assert float_frame.get("foo") is None
- tm.assert_series_equal(
- float_frame.get("foo", float_frame["B"]), float_frame["B"]
- )
-
- @pytest.mark.parametrize(
- "df",
- [
- DataFrame(),
- DataFrame(columns=list("AB")),
- DataFrame(columns=list("AB"), index=range(3)),
- ],
- )
- def test_get_none(self, df):
- # see gh-5652
- assert df.get(None) is None
-
@pytest.mark.parametrize("key_type", [iter, np.array, Series, Index])
def test_loc_iterable(self, float_frame, key_type):
idx = key_type(["A", "B", "C"])
@@ -1048,9 +1050,8 @@ def test_getitem_setitem_float_labels(self):
# positional slicing only via iloc!
msg = (
- "cannot do slice indexing on "
- r" with "
- r"these indexers \[1.0\] of "
+ "cannot do positional indexing on Float64Index with "
+ r"these indexers \[1.0\] of type float"
)
with pytest.raises(TypeError, match=msg):
df.iloc[1.0:5]
@@ -1547,14 +1548,6 @@ def test_loc_duplicates(self):
df.loc[trange[bool_idx], "A"] += 6
tm.assert_frame_equal(df, expected)
- def test_iat(self, float_frame):
-
- for i, row in enumerate(float_frame.index):
- for j, col in enumerate(float_frame.columns):
- result = float_frame.iat[i, j]
- expected = float_frame.at[row, col]
- assert result == expected
-
@pytest.mark.parametrize(
"method,expected_values",
[
@@ -1608,6 +1601,16 @@ def test_reindex_methods_nearest_special(self):
actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1])
tm.assert_frame_equal(expected, actual)
+ def test_reindex_nearest_tz(self, tz_aware_fixture):
+ # GH26683
+ tz = tz_aware_fixture
+ idx = pd.date_range("2019-01-01", periods=5, tz=tz)
+ df = pd.DataFrame({"x": list(range(5))}, index=idx)
+
+ expected = df.head(3)
+ actual = df.reindex(idx[:3], method="nearest")
+ tm.assert_frame_equal(expected, actual)
+
def test_reindex_frame_add_nat(self):
rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s")
df = DataFrame({"A": np.random.randn(len(rng)), "B": rng})
@@ -1916,89 +1919,6 @@ def test_at_time_between_time_datetimeindex(self):
result.loc[bkey] = df.iloc[binds]
tm.assert_frame_equal(result, df)
- def test_xs(self, float_frame, datetime_frame):
- idx = float_frame.index[5]
- xs = float_frame.xs(idx)
- for item, value in xs.items():
- if np.isnan(value):
- assert np.isnan(float_frame[item][idx])
- else:
- assert value == float_frame[item][idx]
-
- # mixed-type xs
- test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
- frame = DataFrame(test_data)
- xs = frame.xs("1")
- assert xs.dtype == np.object_
- assert xs["A"] == 1
- assert xs["B"] == "1"
-
- with pytest.raises(
- KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00', freq='B')")
- ):
- datetime_frame.xs(datetime_frame.index[0] - BDay())
-
- # xs get column
- series = float_frame.xs("A", axis=1)
- expected = float_frame["A"]
- tm.assert_series_equal(series, expected)
-
- # view is returned if possible
- series = float_frame.xs("A", axis=1)
- series[:] = 5
- assert (expected == 5).all()
-
- def test_xs_corner(self):
- # pathological mixed-type reordering case
- df = DataFrame(index=[0])
- df["A"] = 1.0
- df["B"] = "foo"
- df["C"] = 2.0
- df["D"] = "bar"
- df["E"] = 3.0
-
- xs = df.xs(0)
- exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0)
- tm.assert_series_equal(xs, exp)
-
- # no columns but Index(dtype=object)
- df = DataFrame(index=["a", "b", "c"])
- result = df.xs("a")
- expected = Series([], name="a", index=pd.Index([]), dtype=np.float64)
- tm.assert_series_equal(result, expected)
-
- def test_xs_duplicates(self):
- df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"])
-
- cross = df.xs("c")
- exp = df.iloc[2]
- tm.assert_series_equal(cross, exp)
-
- def test_xs_keep_level(self):
- df = DataFrame(
- {
- "day": {0: "sat", 1: "sun"},
- "flavour": {0: "strawberry", 1: "strawberry"},
- "sales": {0: 10, 1: 12},
- "year": {0: 2008, 1: 2008},
- }
- ).set_index(["year", "flavour", "day"])
- result = df.xs("sat", level="day", drop_level=False)
- expected = df[:1]
- tm.assert_frame_equal(result, expected)
-
- result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False)
- tm.assert_frame_equal(result, expected)
-
- def test_xs_view(self):
- # in 0.14 this will return a view if possible a copy otherwise, but
- # this is numpy dependent
-
- dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5))
-
- dm.xs(2)[:] = 10
- assert (dm.xs(2) == 10).all()
-
def test_index_namedtuple(self):
from collections import namedtuple
@@ -2154,31 +2074,6 @@ def test_mask_callable(self):
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10))
- def test_head_tail(self, float_frame):
- tm.assert_frame_equal(float_frame.head(), float_frame[:5])
- tm.assert_frame_equal(float_frame.tail(), float_frame[-5:])
-
- tm.assert_frame_equal(float_frame.head(0), float_frame[0:0])
- tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0])
-
- tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1])
- tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:])
- tm.assert_frame_equal(float_frame.head(1), float_frame[:1])
- tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:])
- # with a float index
- df = float_frame.copy()
- df.index = np.arange(len(float_frame)) + 0.1
- tm.assert_frame_equal(df.head(), df.iloc[:5])
- tm.assert_frame_equal(df.tail(), df.iloc[-5:])
- tm.assert_frame_equal(df.head(0), df[0:0])
- tm.assert_frame_equal(df.tail(0), df[0:0])
- tm.assert_frame_equal(df.head(-1), df.iloc[:-1])
- tm.assert_frame_equal(df.tail(-1), df.iloc[1:])
- # test empty dataframe
- empty_df = DataFrame()
- tm.assert_frame_equal(empty_df.tail(), empty_df)
- tm.assert_frame_equal(empty_df.head(), empty_df)
-
def test_type_error_multiindex(self):
# See gh-12218
df = DataFrame(
@@ -2270,9 +2165,40 @@ def test_set_reset(self):
df = result.set_index("foo")
tm.assert_index_equal(df.index, idx)
- def test_transpose(self, uint64_frame):
- result = uint64_frame.T
- expected = DataFrame(uint64_frame.values.T)
- expected.index = ["A", "B"]
- tm.assert_frame_equal(result, expected)
+def test_object_casting_indexing_wraps_datetimelike():
+ # GH#31649, check the indexing methods all the way down the stack
+ df = pd.DataFrame(
+ {
+ "A": [1, 2],
+ "B": pd.date_range("2000", periods=2),
+ "C": pd.timedelta_range("1 Day", periods=2),
+ }
+ )
+
+ ser = df.loc[0]
+ assert isinstance(ser.values[1], pd.Timestamp)
+ assert isinstance(ser.values[2], pd.Timedelta)
+
+ ser = df.iloc[0]
+ assert isinstance(ser.values[1], pd.Timestamp)
+ assert isinstance(ser.values[2], pd.Timedelta)
+
+ ser = df.xs(0, axis=0)
+ assert isinstance(ser.values[1], pd.Timestamp)
+ assert isinstance(ser.values[2], pd.Timedelta)
+
+ mgr = df._data
+ arr = mgr.fast_xs(0)
+ assert isinstance(arr[1], pd.Timestamp)
+ assert isinstance(arr[2], pd.Timedelta)
+
+ blk = mgr.blocks[mgr._blknos[1]]
+ assert blk.dtype == "M8[ns]" # we got the right block
+ val = blk.iget((0, 0))
+ assert isinstance(val, pd.Timestamp)
+
+ blk = mgr.blocks[mgr._blknos[2]]
+ assert blk.dtype == "m8[ns]" # we got the right block
+ val = blk.iget((0, 0))
+ assert isinstance(val, pd.Timedelta)
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
index df1b128dcd227..507b2e9cd237b 100644
--- a/pandas/tests/frame/indexing/test_where.py
+++ b/pandas/tests/frame/indexing/test_where.py
@@ -10,22 +10,30 @@
import pandas._testing as tm
-class TestDataFrameIndexingWhere:
- def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame):
- default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
-
- def _safe_add(df):
- # only add to the numeric items
- def is_ok(s):
- return (
- issubclass(s.dtype.type, (np.integer, np.floating))
- and s.dtype != "uint8"
- )
-
- return DataFrame(
- dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items())
- )
+@pytest.fixture(params=["default", "float_string", "mixed_float", "mixed_int"])
+def where_frame(request, float_string_frame, mixed_float_frame, mixed_int_frame):
+ if request.param == "default":
+ return DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
+ if request.param == "float_string":
+ return float_string_frame
+ if request.param == "mixed_float":
+ return mixed_float_frame
+ if request.param == "mixed_int":
+ return mixed_int_frame
+
+
+def _safe_add(df):
+ # only add to the numeric items
+ def is_ok(s):
+ return (
+ issubclass(s.dtype.type, (np.integer, np.floating)) and s.dtype != "uint8"
+ )
+
+ return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()))
+
+class TestDataFrameIndexingWhere:
+ def test_where_get(self, where_frame, float_string_frame):
def _check_get(df, cond, check_dtypes=True):
other1 = _safe_add(df)
rs = df.where(cond, other1)
@@ -40,19 +48,15 @@ def _check_get(df, cond, check_dtypes=True):
assert (rs.dtypes == df.dtypes).all()
# check getting
- for df in [
- default_frame,
- float_string_frame,
- mixed_float_frame,
- mixed_int_frame,
- ]:
- if df is float_string_frame:
- with pytest.raises(TypeError):
- df > 0
- continue
- cond = df > 0
- _check_get(df, cond)
-
+ df = where_frame
+ if df is float_string_frame:
+ with pytest.raises(TypeError):
+ df > 0
+ return
+ cond = df > 0
+ _check_get(df, cond)
+
+ def test_where_upcasting(self):
# upcasting case (GH # 2794)
df = DataFrame(
{
@@ -78,6 +82,7 @@ def _check_get(df, cond, check_dtypes=True):
tm.assert_series_equal(result, expected)
+ def test_where_alignment(self, where_frame, float_string_frame):
# aligning
def _check_align(df, cond, other, check_dtypes=True):
rs = df.where(cond, other)
@@ -107,27 +112,30 @@ def _check_align(df, cond, other, check_dtypes=True):
if check_dtypes and not isinstance(other, np.ndarray):
assert (rs.dtypes == df.dtypes).all()
- for df in [float_string_frame, mixed_float_frame, mixed_int_frame]:
- if df is float_string_frame:
- with pytest.raises(TypeError):
- df > 0
- continue
+ df = where_frame
+ if df is float_string_frame:
+ with pytest.raises(TypeError):
+ df > 0
+ return
- # other is a frame
- cond = (df > 0)[1:]
- _check_align(df, cond, _safe_add(df))
+ # other is a frame
+ cond = (df > 0)[1:]
+ _check_align(df, cond, _safe_add(df))
- # check other is ndarray
- cond = df > 0
- _check_align(df, cond, (_safe_add(df).values))
+ # check other is ndarray
+ cond = df > 0
+ _check_align(df, cond, (_safe_add(df).values))
- # integers are upcast, so don't check the dtypes
- cond = df > 0
- check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes)
- _check_align(df, cond, np.nan, check_dtypes=check_dtypes)
+ # integers are upcast, so don't check the dtypes
+ cond = df > 0
+ check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes)
+ _check_align(df, cond, np.nan, check_dtypes=check_dtypes)
+ def test_where_invalid(self):
# invalid conditions
- df = default_frame
+ df = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
+ cond = df > 0
+
err1 = (df + 1).values[0:2, :]
msg = "other must be the same shape as self when an ndarray"
with pytest.raises(ValueError, match=msg):
@@ -144,7 +152,9 @@ def _check_align(df, cond, other, check_dtypes=True):
with pytest.raises(ValueError, match=msg):
df.mask(0)
+ def test_where_set(self, where_frame, float_string_frame):
# where inplace
+
def _check_set(df, cond, check_dtypes=True):
dfi = df.copy()
econd = cond.reindex_like(df).fillna(True)
@@ -160,27 +170,23 @@ def _check_set(df, cond, check_dtypes=True):
v = np.dtype("float64")
assert dfi[k].dtype == v
- for df in [
- default_frame,
- float_string_frame,
- mixed_float_frame,
- mixed_int_frame,
- ]:
- if df is float_string_frame:
- with pytest.raises(TypeError):
- df > 0
- continue
+ df = where_frame
+ if df is float_string_frame:
+ with pytest.raises(TypeError):
+ df > 0
+ return
- cond = df > 0
- _check_set(df, cond)
+ cond = df > 0
+ _check_set(df, cond)
- cond = df >= 0
- _check_set(df, cond)
+ cond = df >= 0
+ _check_set(df, cond)
- # aligning
- cond = (df >= 0)[1:]
- _check_set(df, cond)
+ # aligning
+ cond = (df >= 0)[1:]
+ _check_set(df, cond)
+ def test_where_series_slicing(self):
# GH 10218
# test DataFrame.where with Series slicing
df = DataFrame({"a": range(3), "b": range(4, 7)})
diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py
new file mode 100644
index 0000000000000..71b40585f0c2f
--- /dev/null
+++ b/pandas/tests/frame/indexing/test_xs.py
@@ -0,0 +1,95 @@
+import re
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Series
+import pandas._testing as tm
+
+from pandas.tseries.offsets import BDay
+
+
+class TestXS:
+ def test_xs(self, float_frame, datetime_frame):
+ idx = float_frame.index[5]
+ xs = float_frame.xs(idx)
+ for item, value in xs.items():
+ if np.isnan(value):
+ assert np.isnan(float_frame[item][idx])
+ else:
+ assert value == float_frame[item][idx]
+
+ # mixed-type xs
+ test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
+ frame = DataFrame(test_data)
+ xs = frame.xs("1")
+ assert xs.dtype == np.object_
+ assert xs["A"] == 1
+ assert xs["B"] == "1"
+
+ with pytest.raises(
+ KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00', freq='B')")
+ ):
+ datetime_frame.xs(datetime_frame.index[0] - BDay())
+
+ # xs get column
+ series = float_frame.xs("A", axis=1)
+ expected = float_frame["A"]
+ tm.assert_series_equal(series, expected)
+
+ # view is returned if possible
+ series = float_frame.xs("A", axis=1)
+ series[:] = 5
+ assert (expected == 5).all()
+
+ def test_xs_corner(self):
+ # pathological mixed-type reordering case
+ df = DataFrame(index=[0])
+ df["A"] = 1.0
+ df["B"] = "foo"
+ df["C"] = 2.0
+ df["D"] = "bar"
+ df["E"] = 3.0
+
+ xs = df.xs(0)
+ exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0)
+ tm.assert_series_equal(xs, exp)
+
+ # no columns but Index(dtype=object)
+ df = DataFrame(index=["a", "b", "c"])
+ result = df.xs("a")
+ expected = Series([], name="a", index=pd.Index([]), dtype=np.float64)
+ tm.assert_series_equal(result, expected)
+
+ def test_xs_duplicates(self):
+ df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"])
+
+ cross = df.xs("c")
+ exp = df.iloc[2]
+ tm.assert_series_equal(cross, exp)
+
+ def test_xs_keep_level(self):
+ df = DataFrame(
+ {
+ "day": {0: "sat", 1: "sun"},
+ "flavour": {0: "strawberry", 1: "strawberry"},
+ "sales": {0: 10, 1: 12},
+ "year": {0: 2008, 1: 2008},
+ }
+ ).set_index(["year", "flavour", "day"])
+ result = df.xs("sat", level="day", drop_level=False)
+ expected = df[:1]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False)
+ tm.assert_frame_equal(result, expected)
+
+ def test_xs_view(self):
+ # in 0.14 this will return a view if possible, a copy otherwise, but
+ # this is numpy-dependent
+
+ dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5))
+
+ dm.xs(2)[:] = 10
+ assert (dm.xs(2) == 10).all()
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
new file mode 100644
index 0000000000000..7715cb1cb6eec
--- /dev/null
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -0,0 +1,349 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, Series
+import pandas._testing as tm
+
+
+class TestDataFrameCombineFirst:
+ def test_combine_first_mixed(self):
+ a = Series(["a", "b"], index=range(2))
+ b = Series(range(2), index=range(2))
+ f = DataFrame({"A": a, "B": b})
+
+ a = Series(["a", "b"], index=range(5, 7))
+ b = Series(range(2), index=range(5, 7))
+ g = DataFrame({"A": a, "B": b})
+
+ exp = pd.DataFrame(
+ {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6]
+ )
+ combined = f.combine_first(g)
+ tm.assert_frame_equal(combined, exp)
+
+ def test_combine_first(self, float_frame):
+ # disjoint
+ head, tail = float_frame[:5], float_frame[5:]
+
+ combined = head.combine_first(tail)
+ reordered_frame = float_frame.reindex(combined.index)
+ tm.assert_frame_equal(combined, reordered_frame)
+ assert tm.equalContents(combined.columns, float_frame.columns)
+ tm.assert_series_equal(combined["A"], reordered_frame["A"])
+
+ # same index
+ fcopy = float_frame.copy()
+ fcopy["A"] = 1
+ del fcopy["C"]
+
+ fcopy2 = float_frame.copy()
+ fcopy2["B"] = 0
+ del fcopy2["D"]
+
+ combined = fcopy.combine_first(fcopy2)
+
+ assert (combined["A"] == 1).all()
+ tm.assert_series_equal(combined["B"], fcopy["B"])
+ tm.assert_series_equal(combined["C"], fcopy2["C"])
+ tm.assert_series_equal(combined["D"], fcopy["D"])
+
+ # overlap
+ head, tail = reordered_frame[:10].copy(), reordered_frame
+ head["A"] = 1
+
+ combined = head.combine_first(tail)
+ assert (combined["A"][:10] == 1).all()
+
+ # reverse overlap
+ tail["A"][:10] = 0
+ combined = tail.combine_first(head)
+ assert (combined["A"][:10] == 0).all()
+
+ # no overlap
+ f = float_frame[:10]
+ g = float_frame[10:]
+ combined = f.combine_first(g)
+ tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
+ tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
+
+ # corner cases
+ comb = float_frame.combine_first(DataFrame())
+ tm.assert_frame_equal(comb, float_frame)
+
+ comb = DataFrame().combine_first(float_frame)
+ tm.assert_frame_equal(comb, float_frame)
+
+ comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
+ assert "faz" in comb.index
+
+ # #2525
+ df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
+ df2 = DataFrame(columns=["b"])
+ result = df.combine_first(df2)
+ assert "b" in result
+
+ def test_combine_first_mixed_bug(self):
+ idx = Index(["a", "b", "c", "e"])
+ ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
+ ser2 = Series(["a", "b", "c", "e"], index=idx)
+ ser3 = Series([12, 4, 5, 97], index=idx)
+
+ frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})
+
+ idx = Index(["a", "b", "c", "f"])
+ ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
+ ser2 = Series(["a", "b", "c", "f"], index=idx)
+ ser3 = Series([12, 4, 5, 97], index=idx)
+
+ frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})
+
+ combined = frame1.combine_first(frame2)
+ assert len(combined.columns) == 5
+
+ # gh 3016 (same as in update)
+ df = DataFrame(
+ [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
+ columns=["A", "B", "bool1", "bool2"],
+ )
+
+ other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
+ result = df.combine_first(other)
+ tm.assert_frame_equal(result, df)
+
+ df.loc[0, "A"] = np.nan
+ result = df.combine_first(other)
+ df.loc[0, "A"] = 45
+ tm.assert_frame_equal(result, df)
+
+ # doc example
+ df1 = DataFrame(
+ {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
+ )
+
+ df2 = DataFrame(
+ {
+ "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
+ "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
+ }
+ )
+
+ result = df1.combine_first(df2)
+ expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
+ tm.assert_frame_equal(result, expected)
+
+ # GH3552, return object dtype with bools
+ df1 = DataFrame(
+ [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
+ )
+ df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
+
+ result = df1.combine_first(df2)[2]
+ expected = Series([True, True, False], name=2)
+ tm.assert_series_equal(result, expected)
+
+ # GH 3593, converting datetime64[ns] incorrectly
+ df0 = DataFrame(
+ {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
+ )
+ df1 = DataFrame({"a": [None, None, None]})
+ df2 = df1.combine_first(df0)
+ tm.assert_frame_equal(df2, df0)
+
+ df2 = df0.combine_first(df1)
+ tm.assert_frame_equal(df2, df0)
+
+ df0 = DataFrame(
+ {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
+ )
+ df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
+ df2 = df1.combine_first(df0)
+ result = df0.copy()
+ result.iloc[0, :] = df1.iloc[0, :]
+ tm.assert_frame_equal(df2, result)
+
+ df2 = df0.combine_first(df1)
+ tm.assert_frame_equal(df2, df0)
+
+ def test_combine_first_align_nan(self):
+ # GH 7509 (not fixed)
+ dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
+ dfb = pd.DataFrame([[4], [5]], columns=["b"])
+ assert dfa["a"].dtype == "datetime64[ns]"
+ assert dfa["b"].dtype == "int64"
+
+ res = dfa.combine_first(dfb)
+ exp = pd.DataFrame(
+ {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]},
+ columns=["a", "b"],
+ )
+ tm.assert_frame_equal(res, exp)
+ assert res["a"].dtype == "datetime64[ns]"
+ # ToDo: this must be int64
+ assert res["b"].dtype == "float64"
+
+ res = dfa.iloc[:0].combine_first(dfb)
+ exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
+ tm.assert_frame_equal(res, exp)
+ # ToDo: this must be datetime64
+ assert res["a"].dtype == "float64"
+ # ToDo: this must be int64
+ assert res["b"].dtype == "int64"
+
+ def test_combine_first_timezone(self):
+ # see gh-7630
+ data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC")
+ df1 = pd.DataFrame(
+ columns=["UTCdatetime", "abc"],
+ data=data1,
+ index=pd.date_range("20140627", periods=1),
+ )
+ data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
+ df2 = pd.DataFrame(
+ columns=["UTCdatetime", "xyz"],
+ data=data2,
+ index=pd.date_range("20140628", periods=1),
+ )
+ res = df2[["UTCdatetime"]].combine_first(df1)
+ exp = pd.DataFrame(
+ {
+ "UTCdatetime": [
+ pd.Timestamp("2010-01-01 01:01", tz="UTC"),
+ pd.Timestamp("2012-12-12 12:12", tz="UTC"),
+ ],
+ "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
+ },
+ columns=["UTCdatetime", "abc"],
+ index=pd.date_range("20140627", periods=2, freq="D"),
+ )
+ tm.assert_frame_equal(res, exp)
+ assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
+ assert res["abc"].dtype == "datetime64[ns, UTC]"
+
+ # see gh-10567
+ dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC")
+ df1 = pd.DataFrame({"DATE": dts1})
+ dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC")
+ df2 = pd.DataFrame({"DATE": dts2})
+
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res["DATE"].dtype == "datetime64[ns, UTC]"
+
+ dts1 = pd.DatetimeIndex(
+ ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
+ )
+ df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
+ dts2 = pd.DatetimeIndex(
+ ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
+ )
+ df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = pd.DatetimeIndex(
+ [
+ "2011-01-01",
+ "2012-01-01",
+ "NaT",
+ "2012-01-02",
+ "2011-01-03",
+ "2011-01-04",
+ ],
+ tz="US/Eastern",
+ )
+ exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+
+ # different tz
+ dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
+ df1 = pd.DataFrame({"DATE": dts1})
+ dts2 = pd.date_range("2015-01-03", "2015-01-05")
+ df2 = pd.DataFrame({"DATE": dts2})
+
+ # if df1 doesn't have NaN, keep its dtype
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"
+
+ dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern")
+ df1 = pd.DataFrame({"DATE": dts1})
+ dts2 = pd.date_range("2015-01-01", "2015-01-03")
+ df2 = pd.DataFrame({"DATE": dts2})
+
+ res = df1.combine_first(df2)
+ exp_dts = [
+ pd.Timestamp("2015-01-01", tz="US/Eastern"),
+ pd.Timestamp("2015-01-02", tz="US/Eastern"),
+ pd.Timestamp("2015-01-03"),
+ ]
+ exp = pd.DataFrame({"DATE": exp_dts})
+ tm.assert_frame_equal(res, exp)
+ assert res["DATE"].dtype == "object"
+
+ def test_combine_first_timedelta(self):
+ data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
+ df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7])
+ data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
+ df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = pd.TimedeltaIndex(
+ ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
+ )
+ exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+ assert res["TD"].dtype == "timedelta64[ns]"
+
+ def test_combine_first_period(self):
+ data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
+ df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7])
+ data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
+ df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = pd.PeriodIndex(
+ ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
+ )
+ exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+ assert res["P"].dtype == data1.dtype
+
+ # different freq
+ dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
+ df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = [
+ pd.Period("2011-01", freq="M"),
+ pd.Period("2012-01-01", freq="D"),
+ pd.NaT,
+ pd.Period("2012-01-02", freq="D"),
+ pd.Period("2011-03", freq="M"),
+ pd.Period("2011-04", freq="M"),
+ ]
+ exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+ assert res["P"].dtype == "object"
+
+ def test_combine_first_int(self):
+ # GH14687 - integer series that do not align exactly
+
+ df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
+ df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64")
+
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res["a"].dtype == "int64"
+
+ @pytest.mark.parametrize("val", [1, 1.0])
+ def test_combine_first_with_asymmetric_other(self, val):
+ # see gh-20699
+ df1 = pd.DataFrame({"isNum": [val]})
+ df2 = pd.DataFrame({"isBool": [True]})
+
+ res = df1.combine_first(df2)
+ exp = pd.DataFrame({"isBool": [True], "isNum": [val]})
+
+ tm.assert_frame_equal(res, exp)
diff --git a/pandas/tests/frame/methods/test_head_tail.py b/pandas/tests/frame/methods/test_head_tail.py
new file mode 100644
index 0000000000000..93763bc12ce0d
--- /dev/null
+++ b/pandas/tests/frame/methods/test_head_tail.py
@@ -0,0 +1,30 @@
+import numpy as np
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+def test_head_tail(float_frame):
+ tm.assert_frame_equal(float_frame.head(), float_frame[:5])
+ tm.assert_frame_equal(float_frame.tail(), float_frame[-5:])
+
+ tm.assert_frame_equal(float_frame.head(0), float_frame[0:0])
+ tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0])
+
+ tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1])
+ tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:])
+ tm.assert_frame_equal(float_frame.head(1), float_frame[:1])
+ tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:])
+ # with a float index
+ df = float_frame.copy()
+ df.index = np.arange(len(float_frame)) + 0.1
+ tm.assert_frame_equal(df.head(), df.iloc[:5])
+ tm.assert_frame_equal(df.tail(), df.iloc[-5:])
+ tm.assert_frame_equal(df.head(0), df[0:0])
+ tm.assert_frame_equal(df.tail(0), df[0:0])
+ tm.assert_frame_equal(df.head(-1), df.iloc[:-1])
+ tm.assert_frame_equal(df.tail(-1), df.iloc[1:])
+ # test empty dataframe
+ empty_df = DataFrame()
+ tm.assert_frame_equal(empty_df.tail(), empty_df)
+ tm.assert_frame_equal(empty_df.head(), empty_df)
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
index aa91e7a489356..92b74c4409d7d 100644
--- a/pandas/tests/frame/methods/test_replace.py
+++ b/pandas/tests/frame/methods/test_replace.py
@@ -1356,3 +1356,10 @@ def test_replace_replacer_dtype(self, replacer):
result = df.replace({"a": replacer, "b": replacer})
expected = pd.DataFrame([replacer])
tm.assert_frame_equal(result, expected)
+
+ def test_replace_after_convert_dtypes(self):
+ # GH31517
+ df = pd.DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64")
+ result = df.replace(1, 10)
+ expected = pd.DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64")
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py
index 428b9e5068407..a5fe5f3a6d5e4 100644
--- a/pandas/tests/frame/methods/test_transpose.py
+++ b/pandas/tests/frame/methods/test_transpose.py
@@ -1,3 +1,5 @@
+import numpy as np
+
import pandas as pd
import pandas._testing as tm
@@ -41,3 +43,34 @@ def test_transpose_object_to_tzaware_mixed_tz(self):
assert (df2.dtypes == object).all()
res2 = df2.T
assert (res2.dtypes == [dti.dtype, dti2.dtype]).all()
+
+ def test_transpose_uint64(self, uint64_frame):
+
+ result = uint64_frame.T
+ expected = pd.DataFrame(uint64_frame.values.T)
+ expected.index = ["A", "B"]
+ tm.assert_frame_equal(result, expected)
+
+ def test_transpose_float(self, float_frame):
+ frame = float_frame
+ dft = frame.T
+ for idx, series in dft.items():
+ for col, value in series.items():
+ if np.isnan(value):
+ assert np.isnan(frame[col][idx])
+ else:
+ assert value == frame[col][idx]
+
+ # mixed type
+ index, data = tm.getMixedTypeDict()
+ mixed = pd.DataFrame(data, index=index)
+
+ mixed_T = mixed.T
+ for col, s in mixed_T.items():
+ assert s.dtype == np.object_
+
+ def test_transpose_get_view(self, float_frame):
+ dft = float_frame.T
+ dft.values[:, 5:10] = 5
+
+ assert (float_frame.values[5:10] == 5).all()
diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py
new file mode 100644
index 0000000000000..d9de026dbf4e9
--- /dev/null
+++ b/pandas/tests/frame/methods/test_update.py
@@ -0,0 +1,135 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Series, date_range
+import pandas._testing as tm
+
+
+class TestDataFrameUpdate:
+ def test_update_nan(self):
+ # #15593 #15617
+ # test 1
+ df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
+ df2 = DataFrame({"A": [None, 2, 3]})
+ expected = df1.copy()
+ df1.update(df2, overwrite=False)
+
+ tm.assert_frame_equal(df1, expected)
+
+ # test 2
+ df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
+ df2 = DataFrame({"A": [None, 2, 3]})
+ expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
+ df1.update(df2, overwrite=False)
+
+ tm.assert_frame_equal(df1, expected)
+
+ def test_update(self):
+ df = DataFrame(
+ [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
+ )
+
+ other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
+
+ df.update(other)
+
+ expected = DataFrame(
+ [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
+ )
+ tm.assert_frame_equal(df, expected)
+
+ def test_update_dtypes(self):
+
+ # gh 3016
+ df = DataFrame(
+ [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
+ columns=["A", "B", "bool1", "bool2"],
+ )
+
+ other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
+ df.update(other)
+
+ expected = DataFrame(
+ [[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
+ columns=["A", "B", "bool1", "bool2"],
+ )
+ tm.assert_frame_equal(df, expected)
+
+ def test_update_nooverwrite(self):
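+ # with overwrite=False only NaN entries in df may be filled from other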
+ df = DataFrame(
+ [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
+ )
+
+ other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
+
+ df.update(other, overwrite=False)
+
+ expected = DataFrame(
+ [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
+ )
+ tm.assert_frame_equal(df, expected)
+
+ def test_update_filtered(self):
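+ # only positions where the current value passes filter_func are eligible for update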
+ df = DataFrame(
+ [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
+ )
+
+ other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
+
+ df.update(other, filter_func=lambda x: x > 2)
+
+ expected = DataFrame(
+ [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
+ )
+ tm.assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize(
+ "bad_kwarg, exception, msg",
+ [
+ # errors must be 'ignore' or 'raise'
+ ({"errors": "something"}, ValueError, "The parameter errors must.*"),
+ ({"join": "inner"}, NotImplementedError, "Only left join is supported"),
+ ],
+ )
+ def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
+ df = DataFrame([[1.5, 1, 3.0]])
+ with pytest.raises(exception, match=msg):
+ df.update(df, **bad_kwarg)
+
+ def test_update_raise_on_overlap(self):
+ df = DataFrame(
+ [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
+ )
+
+ other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2])
+ with pytest.raises(ValueError, match="Data overlaps"):
+ df.update(other, errors="raise")
+
+ def test_update_from_non_df(self):
+ d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])}
+ df = DataFrame(d)
+
+ d["a"] = Series([5, 6, 7, 8])
+ df.update(d)
+
+ expected = DataFrame(d)
+
+ tm.assert_frame_equal(df, expected)
+
+ d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
+ df = DataFrame(d)
+
+ d["a"] = [5, 6, 7, 8]
+ df.update(d)
+
+ expected = DataFrame(d)
+
+ tm.assert_frame_equal(df, expected)
+
+ def test_update_datetime_tz(self):
+ # GH 25807
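+ # updating tz-aware data with itself must not raise and must leave the frame unchanged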
+ result = DataFrame([pd.Timestamp("2019", tz="UTC")])
+ result.update(result)
+ expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index 9de5d6fe16a0d..17cc50661e3cb 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -14,15 +14,15 @@
class TestDataFrameMisc:
- def test_copy_index_name_checking(self, float_frame):
+ @pytest.mark.parametrize("attr", ["index", "columns"])
+ def test_copy_index_name_checking(self, float_frame, attr):
# don't want to be able to modify the index stored elsewhere after
# making a copy
- for attr in ("index", "columns"):
- ind = getattr(float_frame, attr)
- ind.name = None
- cp = float_frame.copy()
- getattr(cp, attr).name = "foo"
- assert getattr(float_frame, attr).name is None
+ ind = getattr(float_frame, attr)
+ ind.name = None
+ cp = float_frame.copy()
+ getattr(cp, attr).name = "foo"
+ assert getattr(float_frame, attr).name is None
def test_getitem_pop_assign_name(self, float_frame):
s = float_frame["A"]
@@ -358,24 +358,6 @@ def test_to_numpy_copy(self):
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is None
- def test_transpose(self, float_frame):
- frame = float_frame
- dft = frame.T
- for idx, series in dft.items():
- for col, value in series.items():
- if np.isnan(value):
- assert np.isnan(frame[col][idx])
- else:
- assert value == frame[col][idx]
-
- # mixed type
- index, data = tm.getMixedTypeDict()
- mixed = DataFrame(data, index=index)
-
- mixed_T = mixed.T
- for col, s in mixed_T.items():
- assert s.dtype == np.object_
-
def test_swapaxes(self):
df = DataFrame(np.random.randn(10, 5))
tm.assert_frame_equal(df.T, df.swapaxes(0, 1))
@@ -470,12 +452,6 @@ def test_deepcopy(self, float_frame):
for idx, value in series.items():
assert float_frame["A"][idx] != value
- def test_transpose_get_view(self, float_frame):
- dft = float_frame.T
- dft.values[:, 5:10] = 5
-
- assert (float_frame.values[5:10] == 5).all()
-
def test_inplace_return_self(self):
# GH 1893
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index e98f74e133ea9..fe6abef97acc4 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -703,6 +703,14 @@ def apply_list(row):
)
tm.assert_series_equal(result, expected)
+ def test_apply_noreduction_tzaware_object(self):
+ # https://github.com/pandas-dev/pandas/issues/31505
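+ # an identity apply must not convert the object-dtype tz-aware column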
+ df = pd.DataFrame({"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="object")
+ result = df.apply(lambda x: x)
+ tm.assert_frame_equal(result, df)
+ result = df.apply(lambda x: x.copy())
+ tm.assert_frame_equal(result, df)
+
class TestInferOutputShape:
# the user has supplied an opaque UDF where
diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py
index 9bad54b051d6c..36a476d195fe5 100644
--- a/pandas/tests/frame/test_combine_concat.py
+++ b/pandas/tests/frame/test_combine_concat.py
@@ -128,115 +128,6 @@ def test_concat_tuple_keys(self):
)
tm.assert_frame_equal(results, expected)
- def test_update(self):
- df = DataFrame(
- [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
- )
-
- other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
-
- df.update(other)
-
- expected = DataFrame(
- [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
- )
- tm.assert_frame_equal(df, expected)
-
- def test_update_dtypes(self):
-
- # gh 3016
- df = DataFrame(
- [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
- columns=["A", "B", "bool1", "bool2"],
- )
-
- other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
- df.update(other)
-
- expected = DataFrame(
- [[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
- columns=["A", "B", "bool1", "bool2"],
- )
- tm.assert_frame_equal(df, expected)
-
- def test_update_nooverwrite(self):
- df = DataFrame(
- [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
- )
-
- other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
-
- df.update(other, overwrite=False)
-
- expected = DataFrame(
- [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
- )
- tm.assert_frame_equal(df, expected)
-
- def test_update_filtered(self):
- df = DataFrame(
- [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
- )
-
- other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
-
- df.update(other, filter_func=lambda x: x > 2)
-
- expected = DataFrame(
- [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
- )
- tm.assert_frame_equal(df, expected)
-
- @pytest.mark.parametrize(
- "bad_kwarg, exception, msg",
- [
- # errors must be 'ignore' or 'raise'
- ({"errors": "something"}, ValueError, "The parameter errors must.*"),
- ({"join": "inner"}, NotImplementedError, "Only left join is supported"),
- ],
- )
- def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
- df = DataFrame([[1.5, 1, 3.0]])
- with pytest.raises(exception, match=msg):
- df.update(df, **bad_kwarg)
-
- def test_update_raise_on_overlap(self):
- df = DataFrame(
- [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
- )
-
- other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2])
- with pytest.raises(ValueError, match="Data overlaps"):
- df.update(other, errors="raise")
-
- def test_update_from_non_df(self):
- d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])}
- df = DataFrame(d)
-
- d["a"] = Series([5, 6, 7, 8])
- df.update(d)
-
- expected = DataFrame(d)
-
- tm.assert_frame_equal(df, expected)
-
- d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
- df = DataFrame(d)
-
- d["a"] = [5, 6, 7, 8]
- df.update(d)
-
- expected = DataFrame(d)
-
- tm.assert_frame_equal(df, expected)
-
- def test_update_datetime_tz(self):
- # GH 25807
- result = DataFrame([pd.Timestamp("2019", tz="UTC")])
- result.update(result)
- expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
- tm.assert_frame_equal(result, expected)
-
def test_join_str_datetime(self):
str_dates = ["20120209", "20120222"]
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
@@ -422,347 +313,6 @@ def test_concat_astype_dup_col(self):
).astype("category")
tm.assert_frame_equal(result, expected)
-
-class TestDataFrameCombineFirst:
- def test_combine_first_mixed(self):
- a = Series(["a", "b"], index=range(2))
- b = Series(range(2), index=range(2))
- f = DataFrame({"A": a, "B": b})
-
- a = Series(["a", "b"], index=range(5, 7))
- b = Series(range(2), index=range(5, 7))
- g = DataFrame({"A": a, "B": b})
-
- exp = pd.DataFrame(
- {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6]
- )
- combined = f.combine_first(g)
- tm.assert_frame_equal(combined, exp)
-
- def test_combine_first(self, float_frame):
- # disjoint
- head, tail = float_frame[:5], float_frame[5:]
-
- combined = head.combine_first(tail)
- reordered_frame = float_frame.reindex(combined.index)
- tm.assert_frame_equal(combined, reordered_frame)
- assert tm.equalContents(combined.columns, float_frame.columns)
- tm.assert_series_equal(combined["A"], reordered_frame["A"])
-
- # same index
- fcopy = float_frame.copy()
- fcopy["A"] = 1
- del fcopy["C"]
-
- fcopy2 = float_frame.copy()
- fcopy2["B"] = 0
- del fcopy2["D"]
-
- combined = fcopy.combine_first(fcopy2)
-
- assert (combined["A"] == 1).all()
- tm.assert_series_equal(combined["B"], fcopy["B"])
- tm.assert_series_equal(combined["C"], fcopy2["C"])
- tm.assert_series_equal(combined["D"], fcopy["D"])
-
- # overlap
- head, tail = reordered_frame[:10].copy(), reordered_frame
- head["A"] = 1
-
- combined = head.combine_first(tail)
- assert (combined["A"][:10] == 1).all()
-
- # reverse overlap
- tail["A"][:10] = 0
- combined = tail.combine_first(head)
- assert (combined["A"][:10] == 0).all()
-
- # no overlap
- f = float_frame[:10]
- g = float_frame[10:]
- combined = f.combine_first(g)
- tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
- tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
-
- # corner cases
- comb = float_frame.combine_first(DataFrame())
- tm.assert_frame_equal(comb, float_frame)
-
- comb = DataFrame().combine_first(float_frame)
- tm.assert_frame_equal(comb, float_frame)
-
- comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
- assert "faz" in comb.index
-
- # #2525
- df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
- df2 = DataFrame(columns=["b"])
- result = df.combine_first(df2)
- assert "b" in result
-
- def test_combine_first_mixed_bug(self):
- idx = Index(["a", "b", "c", "e"])
- ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
- ser2 = Series(["a", "b", "c", "e"], index=idx)
- ser3 = Series([12, 4, 5, 97], index=idx)
-
- frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})
-
- idx = Index(["a", "b", "c", "f"])
- ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
- ser2 = Series(["a", "b", "c", "f"], index=idx)
- ser3 = Series([12, 4, 5, 97], index=idx)
-
- frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})
-
- combined = frame1.combine_first(frame2)
- assert len(combined.columns) == 5
-
- # gh 3016 (same as in update)
- df = DataFrame(
- [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
- columns=["A", "B", "bool1", "bool2"],
- )
-
- other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
- result = df.combine_first(other)
- tm.assert_frame_equal(result, df)
-
- df.loc[0, "A"] = np.nan
- result = df.combine_first(other)
- df.loc[0, "A"] = 45
- tm.assert_frame_equal(result, df)
-
- # doc example
- df1 = DataFrame(
- {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
- )
-
- df2 = DataFrame(
- {
- "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
- "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
- }
- )
-
- result = df1.combine_first(df2)
- expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
- tm.assert_frame_equal(result, expected)
-
- # GH3552, return object dtype with bools
- df1 = DataFrame(
- [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
- )
- df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
-
- result = df1.combine_first(df2)[2]
- expected = Series([True, True, False], name=2)
- tm.assert_series_equal(result, expected)
-
- # GH 3593, converting datetime64[ns] incorrectly
- df0 = DataFrame(
- {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
- )
- df1 = DataFrame({"a": [None, None, None]})
- df2 = df1.combine_first(df0)
- tm.assert_frame_equal(df2, df0)
-
- df2 = df0.combine_first(df1)
- tm.assert_frame_equal(df2, df0)
-
- df0 = DataFrame(
- {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
- )
- df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
- df2 = df1.combine_first(df0)
- result = df0.copy()
- result.iloc[0, :] = df1.iloc[0, :]
- tm.assert_frame_equal(df2, result)
-
- df2 = df0.combine_first(df1)
- tm.assert_frame_equal(df2, df0)
-
- def test_combine_first_align_nan(self):
- # GH 7509 (not fixed)
- dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
- dfb = pd.DataFrame([[4], [5]], columns=["b"])
- assert dfa["a"].dtype == "datetime64[ns]"
- assert dfa["b"].dtype == "int64"
-
- res = dfa.combine_first(dfb)
- exp = pd.DataFrame(
- {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]},
- columns=["a", "b"],
- )
- tm.assert_frame_equal(res, exp)
- assert res["a"].dtype == "datetime64[ns]"
- # ToDo: this must be int64
- assert res["b"].dtype == "float64"
-
- res = dfa.iloc[:0].combine_first(dfb)
- exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
- tm.assert_frame_equal(res, exp)
- # ToDo: this must be datetime64
- assert res["a"].dtype == "float64"
- # ToDo: this must be int64
- assert res["b"].dtype == "int64"
-
- def test_combine_first_timezone(self):
- # see gh-7630
- data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC")
- df1 = pd.DataFrame(
- columns=["UTCdatetime", "abc"],
- data=data1,
- index=pd.date_range("20140627", periods=1),
- )
- data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
- df2 = pd.DataFrame(
- columns=["UTCdatetime", "xyz"],
- data=data2,
- index=pd.date_range("20140628", periods=1),
- )
- res = df2[["UTCdatetime"]].combine_first(df1)
- exp = pd.DataFrame(
- {
- "UTCdatetime": [
- pd.Timestamp("2010-01-01 01:01", tz="UTC"),
- pd.Timestamp("2012-12-12 12:12", tz="UTC"),
- ],
- "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
- },
- columns=["UTCdatetime", "abc"],
- index=pd.date_range("20140627", periods=2, freq="D"),
- )
- tm.assert_frame_equal(res, exp)
- assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
- assert res["abc"].dtype == "datetime64[ns, UTC]"
-
- # see gh-10567
- dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC")
- df1 = pd.DataFrame({"DATE": dts1})
- dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC")
- df2 = pd.DataFrame({"DATE": dts2})
-
- res = df1.combine_first(df2)
- tm.assert_frame_equal(res, df1)
- assert res["DATE"].dtype == "datetime64[ns, UTC]"
-
- dts1 = pd.DatetimeIndex(
- ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
- )
- df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
- dts2 = pd.DatetimeIndex(
- ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
- )
- df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5])
-
- res = df1.combine_first(df2)
- exp_dts = pd.DatetimeIndex(
- [
- "2011-01-01",
- "2012-01-01",
- "NaT",
- "2012-01-02",
- "2011-01-03",
- "2011-01-04",
- ],
- tz="US/Eastern",
- )
- exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
- tm.assert_frame_equal(res, exp)
-
- # different tz
- dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
- df1 = pd.DataFrame({"DATE": dts1})
- dts2 = pd.date_range("2015-01-03", "2015-01-05")
- df2 = pd.DataFrame({"DATE": dts2})
-
- # if df1 doesn't have NaN, keep its dtype
- res = df1.combine_first(df2)
- tm.assert_frame_equal(res, df1)
- assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"
-
- dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern")
- df1 = pd.DataFrame({"DATE": dts1})
- dts2 = pd.date_range("2015-01-01", "2015-01-03")
- df2 = pd.DataFrame({"DATE": dts2})
-
- res = df1.combine_first(df2)
- exp_dts = [
- pd.Timestamp("2015-01-01", tz="US/Eastern"),
- pd.Timestamp("2015-01-02", tz="US/Eastern"),
- pd.Timestamp("2015-01-03"),
- ]
- exp = pd.DataFrame({"DATE": exp_dts})
- tm.assert_frame_equal(res, exp)
- assert res["DATE"].dtype == "object"
-
- def test_combine_first_timedelta(self):
- data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
- df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7])
- data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
- df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5])
-
- res = df1.combine_first(df2)
- exp_dts = pd.TimedeltaIndex(
- ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
- )
- exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
- tm.assert_frame_equal(res, exp)
- assert res["TD"].dtype == "timedelta64[ns]"
-
- def test_combine_first_period(self):
- data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
- df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7])
- data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
- df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5])
-
- res = df1.combine_first(df2)
- exp_dts = pd.PeriodIndex(
- ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
- )
- exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
- tm.assert_frame_equal(res, exp)
- assert res["P"].dtype == data1.dtype
-
- # different freq
- dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
- df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5])
-
- res = df1.combine_first(df2)
- exp_dts = [
- pd.Period("2011-01", freq="M"),
- pd.Period("2012-01-01", freq="D"),
- pd.NaT,
- pd.Period("2012-01-02", freq="D"),
- pd.Period("2011-03", freq="M"),
- pd.Period("2011-04", freq="M"),
- ]
- exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
- tm.assert_frame_equal(res, exp)
- assert res["P"].dtype == "object"
-
- def test_combine_first_int(self):
- # GH14687 - integer series that do no align exactly
-
- df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
- df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64")
-
- res = df1.combine_first(df2)
- tm.assert_frame_equal(res, df1)
- assert res["a"].dtype == "int64"
-
- @pytest.mark.parametrize("val", [1, 1.0])
- def test_combine_first_with_asymmetric_other(self, val):
- # see gh-20699
- df1 = pd.DataFrame({"isNum": [val]})
- df2 = pd.DataFrame({"isBool": [True]})
-
- res = df1.combine_first(df2)
- exp = pd.DataFrame({"isBool": [True], "isNum": [val]})
-
- tm.assert_frame_equal(res, exp)
-
def test_concat_datetime_datetime64_frame(self):
# #2624
rows = []
@@ -776,23 +326,3 @@ def test_concat_datetime_datetime64_frame(self):
# it works!
pd.concat([df1, df2_obj])
-
-
-class TestDataFrameUpdate:
- def test_update_nan(self):
- # #15593 #15617
- # test 1
- df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
- df2 = DataFrame({"A": [None, 2, 3]})
- expected = df1.copy()
- df1.update(df2, overwrite=False)
-
- tm.assert_frame_equal(df1, expected)
-
- # test 2
- df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
- df2 = DataFrame({"A": [None, 2, 3]})
- expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
- df1.update(df2, overwrite=False)
-
- tm.assert_frame_equal(df1, expected)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 7b1a9d8ff6ae3..5f4c78449f71d 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -1860,9 +1860,8 @@ def check(df):
# No NaN found -> error
if len(indexer) == 0:
msg = (
- "cannot do label indexing on "
- r" "
- r"with these indexers \[nan\] of "
+ "cannot do label indexing on RangeIndex "
+ r"with these indexers \[nan\] of type float"
)
with pytest.raises(TypeError, match=msg):
df.loc[:, np.nan]
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index 49e6fe4940e18..a7e01d8f1fd6d 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -223,8 +223,7 @@ def test_info_verbose(self):
for i, line in enumerate(lines):
if i >= start and i < start + size:
- index = i - start
- line_nr = " {} ".format(index)
+ line_nr = f" {i - start} "
assert line.startswith(line_nr)
def test_info_memory(self):
@@ -236,7 +235,7 @@ def test_info_memory(self):
bytes = float(df.memory_usage().sum())
expected = textwrap.dedent(
- """\
+ f"""\
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
@@ -244,10 +243,8 @@ def test_info_memory(self):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 2 non-null int64
dtypes: int64(1)
- memory usage: {} bytes
- """.format(
- bytes
- )
+ memory usage: {bytes} bytes
+ """
)
assert result == expected
@@ -313,9 +310,7 @@ def test_info_shows_column_dtypes(self):
)
assert header in res
for i, dtype in enumerate(dtypes):
- name = " {i:d} {i:d} {n:d} non-null {dtype}".format(
- i=i, n=n, dtype=dtype
- )
+ name = f" {i:d} {i:d} {n:d} non-null {dtype}"
assert name in res
def test_info_max_cols(self):
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 2d31996a8a964..ff99081521ffb 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -377,6 +377,65 @@ def test_agg_index_has_complex_internals(index):
tm.assert_frame_equal(result, expected)
+def test_agg_split_block():
+ # https://github.com/pandas-dev/pandas/issues/31522
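+ # min() over several object columns exercises the block-splitting path in groupby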
+ df = pd.DataFrame(
+ {
+ "key1": ["a", "a", "b", "b", "a"],
+ "key2": ["one", "two", "one", "two", "one"],
+ "key3": ["three", "three", "three", "six", "six"],
+ }
+ )
+ result = df.groupby("key1").min()
+ expected = pd.DataFrame(
+ {"key2": ["one", "one"], "key3": ["six", "six"]},
+ index=pd.Index(["a", "b"], name="key1"),
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_split_object_part_datetime():
+ # https://github.com/pandas-dev/pandas/pull/31616
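+ # an object block mixing datetimes, strings and ints must split cleanly while aggregating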
+ df = pd.DataFrame(
+ {
+ "A": pd.date_range("2000", periods=4),
+ "B": ["a", "b", "c", "d"],
+ "C": [1, 2, 3, 4],
+ "D": ["b", "c", "d", "e"],
+ "E": pd.date_range("2000", periods=4),
+ "F": [1, 2, 3, 4],
+ }
+ ).astype(object)
+ result = df.groupby([0, 0, 0, 0]).min()
+ expected = pd.DataFrame(
+ {
+ "A": [pd.Timestamp("2000")],
+ "B": ["a"],
+ "C": [1],
+ "D": ["b"],
+ "E": [pd.Timestamp("2000")],
+ "F": [1],
+ }
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_cython_category_not_implemented_fallback():
+ # https://github.com/pandas-dev/pandas/issues/31450
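+ # "first" has no cython path for categoricals, so groupby must fall back instead of raising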
+ df = pd.DataFrame({"col_num": [1, 1, 2, 3]})
+ df["col_cat"] = df["col_num"].astype("category")
+
+ result = df.groupby("col_num").col_cat.first()
+ expected = pd.Series(
+ [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat"
+ )
+ tm.assert_series_equal(result, expected)
+
+ result = df.groupby("col_num").agg({"col_cat": "first"})
+ expected = expected.to_frame()
+ tm.assert_frame_equal(result, expected)
+
+
class TestNamedAggregationSeries:
def test_series_named_agg(self):
df = pd.Series([1, 2, 3, 4])
@@ -684,6 +743,34 @@ def aggfunc(x):
tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_groupby_aggregate_period_column(func):
+ # GH 31471
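+ # min/max of a Period column should aggregate per group and keep the period dtype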
+ groups = [1, 2]
+ periods = pd.period_range("2020", periods=2, freq="Y")
+ df = pd.DataFrame({"a": groups, "b": periods})
+
+ result = getattr(df.groupby("a")["b"], func)()
+ idx = pd.Int64Index([1, 2], name="a")
+ expected = pd.Series(periods, index=idx, name="b")
+
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_groupby_aggregate_period_frame(func):
+ # GH 31471
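+ # as above, but through the frame-level groupby reduction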
+ groups = [1, 2]
+ periods = pd.period_range("2020", periods=2, freq="Y")
+ df = pd.DataFrame({"a": groups, "b": periods})
+
+ result = getattr(df.groupby("a"), func)()
+ idx = pd.Int64Index([1, 2], name="a")
+ expected = pd.DataFrame({"b": periods}, index=idx)
+
+ tm.assert_frame_equal(result, expected)
+
+
class TestLambdaMangling:
def test_basic(self):
df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 9c2b045079622..41ec70468aaeb 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -851,3 +851,17 @@ def test_apply_function_returns_non_pandas_non_scalar(function, expected_values)
result = df.groupby("groups").apply(function)
expected = pd.Series(expected_values, index=pd.Index(["A", "B"], name="groups"))
tm.assert_series_equal(result, expected)
+
+
+def test_apply_function_returns_numpy_array():
+ # GH 31605
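+ # a UDF returning one ndarray per group should produce a Series of those arrays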
+ def fct(group):
+ return group["B"].values.flatten()
+
+ df = pd.DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]})
+
+ result = df.groupby("A").apply(fct)
+ expected = pd.Series(
+ [[1.0, 2.0], [3.0], [np.nan]], index=pd.Index(["a", "b", "none"], name="A")
+ )
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 97cf1af1d2e9e..73e36cb5e6c84 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -671,7 +671,7 @@ def test_nsmallest():
tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
-@pytest.mark.parametrize("func", ["mean", "var", "std", "cumprod", "cumsum"])
+@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
# see gh-12811
df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 4273139b32828..efcd22f9c0c82 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -676,6 +676,19 @@ def test_groupby_level_index_value_all_na(self):
)
tm.assert_frame_equal(result, expected)
+ def test_groupby_multiindex_level_empty(self):
+ # https://github.com/pandas-dev/pandas/issues/31670
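+ # grouping an all-filtered (empty) frame by one MultiIndex level should return an empty float frame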
+ df = pd.DataFrame(
+ [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"]
+ )
+ df = df.set_index(["id", "category"])
+ empty = df[df.value < 0]
+ result = empty.groupby("id").sum()
+ expected = pd.DataFrame(
+ dtype="float64", columns=["value"], index=pd.Int64Index([], name="id")
+ )
+ tm.assert_frame_equal(result, expected)
+
# get_group
# --------------------------------
diff --git a/pandas/tests/indexes/base_class/__init__.py b/pandas/tests/indexes/base_class/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py
new file mode 100644
index 0000000000000..9e6a8f34c135d
--- /dev/null
+++ b/pandas/tests/indexes/base_class/test_constructors.py
@@ -0,0 +1,36 @@
+import pytest
+
+from pandas import Index, MultiIndex
+
+
+class TestIndexConstructor:
+ # Tests for the Index constructor, specifically for cases that do
+ # not return a subclass
+
+ def test_constructor_corner(self):
+ # corner case
+ msg = (
+ r"Index\(\.\.\.\) must be called with a collection of some "
+ "kind, 0 was passed"
+ )
+ with pytest.raises(TypeError, match=msg):
+ Index(0)
+
+ @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]])
+ def test_construction_list_mixed_tuples(self, index_vals):
+ # see gh-10697: if we are constructing from a mixed list of tuples,
+ # make sure that we are independent of the sorting order.
+ index = Index(index_vals)
+ assert isinstance(index, Index)
+ assert not isinstance(index, MultiIndex)
+
+ def test_constructor_wrong_kwargs(self):
+ # GH #19348
+ with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"):
+ Index([], foo="bar")
+
+ @pytest.mark.xfail(reason="see GH#21311: Index doesn't enforce dtype argument")
+ def test_constructor_cast(self):
+ msg = "could not convert string to float"
+ with pytest.raises(ValueError, match=msg):
+ Index(["a", "b", "c"], dtype=float)
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
new file mode 100644
index 0000000000000..e7d5e21d0ba47
--- /dev/null
+++ b/pandas/tests/indexes/base_class/test_setops.py
@@ -0,0 +1,74 @@
+import numpy as np
+import pytest
+
+from pandas import Index, Series
+import pandas._testing as tm
+from pandas.core.algorithms import safe_sort
+
+
+class TestIndexSetOps:
+ def test_union_base(self):
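+ # overlapping slices of a mixed int/str index should union to all six values, ints sorted first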
+ index = Index([0, "a", 1, "b", 2, "c"])
+ first = index[3:]
+ second = index[:5]
+
+ result = first.union(second)
+
+ expected = Index([0, 1, 2, "a", "b", "c"])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("klass", [np.array, Series, list])
+ def test_union_different_type_base(self, klass):
+ # GH 10149
+ index = Index([0, "a", 1, "b", 2, "c"])
+ first = index[3:]
+ second = index[:5]
+
+ result = first.union(klass(second.values))
+
+ assert tm.equalContents(result, index)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_base(self, sort):
+ # (same results for py2 and py3 but sortedness not tested elsewhere)
+ index = Index([0, "a", 1, "b", 2, "c"])
+ first = index[:5]
+ second = index[:3]
+
+ expected = Index([0, 1, "a"]) if sort is None else Index([0, "a", 1])
+ result = first.intersection(second, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("klass", [np.array, Series, list])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_different_type_base(self, klass, sort):
+ # GH 10149
+ index = Index([0, "a", 1, "b", 2, "c"])
+ first = index[:5]
+ second = index[:3]
+
+ result = first.intersection(klass(second.values), sort=sort)
+ assert tm.equalContents(result, second)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_base(self, sort):
+ # (same results for py2 and py3 but sortedness not tested elsewhere)
+ index = Index([0, "a", 1, "b", 2, "c"])
+ first = index[:4]
+ second = index[3:]
+
+ result = first.difference(second, sort)
+ expected = Index([0, "a", 1])
+ if sort is None:
+ expected = Index(safe_sort(expected))
+ tm.assert_index_equal(result, expected)
+
+ def test_symmetric_difference(self):
+ # (same results for py2 and py3 but sortedness not tested elsewhere)
+ index = Index([0, "a", 1, "b", 2, "c"])
+ first = index[:4]
+ second = index[3:]
+
+ result = first.symmetric_difference(second)
+ expected = Index([0, 1, 2, "a", "c"])
+ tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
index d870259c2539b..c18cd1f252c83 100644
--- a/pandas/tests/indexes/categorical/test_category.py
+++ b/pandas/tests/indexes/categorical/test_category.py
@@ -146,76 +146,6 @@ def test_contains_list(self):
with pytest.raises(TypeError, match="unhashable type"):
["a", "b"] in idx
- def test_map(self):
- ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True)
- result = ci.map(lambda x: x.lower())
- exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True)
- tm.assert_index_equal(result, exp)
-
- ci = pd.CategoricalIndex(
- list("ABABC"), categories=list("BAC"), ordered=False, name="XXX"
- )
- result = ci.map(lambda x: x.lower())
- exp = pd.CategoricalIndex(
- list("ababc"), categories=list("bac"), ordered=False, name="XXX"
- )
- tm.assert_index_equal(result, exp)
-
- # GH 12766: Return an index not an array
- tm.assert_index_equal(
- ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX")
- )
-
- # change categories dtype
- ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False)
-
- def f(x):
- return {"A": 10, "B": 20, "C": 30}.get(x)
-
- result = ci.map(f)
- exp = pd.CategoricalIndex(
- [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False
- )
- tm.assert_index_equal(result, exp)
-
- result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"]))
- tm.assert_index_equal(result, exp)
-
- result = ci.map({"A": 10, "B": 20, "C": 30})
- tm.assert_index_equal(result, exp)
-
- def test_map_with_categorical_series(self):
- # GH 12756
- a = pd.Index([1, 2, 3, 4])
- b = pd.Series(["even", "odd", "even", "odd"], dtype="category")
- c = pd.Series(["even", "odd", "even", "odd"])
-
- exp = CategoricalIndex(["odd", "even", "odd", np.nan])
- tm.assert_index_equal(a.map(b), exp)
- exp = pd.Index(["odd", "even", "odd", np.nan])
- tm.assert_index_equal(a.map(c), exp)
-
- @pytest.mark.parametrize(
- ("data", "f"),
- (
- ([1, 1, np.nan], pd.isna),
- ([1, 2, np.nan], pd.isna),
- ([1, 1, np.nan], {1: False}),
- ([1, 2, np.nan], {1: False, 2: False}),
- ([1, 1, np.nan], pd.Series([False, False])),
- ([1, 2, np.nan], pd.Series([False, False, False])),
- ),
- )
- def test_map_with_nan(self, data, f): # GH 24241
- values = pd.Categorical(data)
- result = values.map(f)
- if data[1] == 1:
- expected = pd.Categorical([False, False, np.nan])
- tm.assert_categorical_equal(result, expected)
- else:
- expected = pd.Index([False, False, np.nan])
- tm.assert_index_equal(result, expected)
-
@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
def test_where(self, klass):
i = self.create_index()
@@ -384,89 +314,6 @@ def test_astype_category(self, name, dtype_ordered, index_ordered):
expected = index
tm.assert_index_equal(result, expected)
- def test_reindex_base(self):
- # Determined by cat ordering.
- idx = CategoricalIndex(list("cab"), categories=list("cab"))
- expected = np.arange(len(idx), dtype=np.intp)
-
- actual = idx.get_indexer(idx)
- tm.assert_numpy_array_equal(expected, actual)
-
- with pytest.raises(ValueError, match="Invalid fill method"):
- idx.get_indexer(idx, method="invalid")
-
- def test_reindexing(self):
- np.random.seed(123456789)
-
- ci = self.create_index()
- oidx = Index(np.array(ci))
-
- for n in [1, 2, 5, len(ci)]:
- finder = oidx[np.random.randint(0, len(ci), size=n)]
- expected = oidx.get_indexer_non_unique(finder)[0]
-
- actual = ci.get_indexer(finder)
- tm.assert_numpy_array_equal(expected, actual)
-
- # see gh-17323
- #
- # Even when indexer is equal to the
- # members in the index, we should
- # respect duplicates instead of taking
- # the fast-track path.
- for finder in [list("aabbca"), list("aababca")]:
- expected = oidx.get_indexer_non_unique(finder)[0]
-
- actual = ci.get_indexer(finder)
- tm.assert_numpy_array_equal(expected, actual)
-
- def test_reindex_dtype(self):
- c = CategoricalIndex(["a", "b", "c", "a"])
- res, indexer = c.reindex(["a", "c"])
- tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
-
- c = CategoricalIndex(["a", "b", "c", "a"])
- res, indexer = c.reindex(Categorical(["a", "c"]))
-
- exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
- tm.assert_index_equal(res, exp, exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
-
- c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
- res, indexer = c.reindex(["a", "c"])
- exp = Index(["a", "a", "c"], dtype="object")
- tm.assert_index_equal(res, exp, exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
-
- c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
- res, indexer = c.reindex(Categorical(["a", "c"]))
- exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
- tm.assert_index_equal(res, exp, exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
-
- def test_reindex_duplicate_target(self):
- # See GH25459
- cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
- res, indexer = cat.reindex(["a", "c", "c"])
- exp = Index(["a", "c", "c"], dtype="object")
- tm.assert_index_equal(res, exp, exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
-
- res, indexer = cat.reindex(
- CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
- )
- exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
- tm.assert_index_equal(res, exp, exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
-
- def test_reindex_empty_index(self):
- # See GH16770
- c = CategoricalIndex([])
- res, indexer = c.reindex(["a", "b"])
- tm.assert_index_equal(res, Index(["a", "b"]), exact=True)
- tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp))
-
@pytest.mark.parametrize(
"data, non_lexsorted_data",
[[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]],
@@ -518,75 +365,6 @@ def test_drop_duplicates(self):
tm.assert_index_equal(idx.drop_duplicates(), expected)
tm.assert_index_equal(idx.unique(), expected)
- def test_get_indexer(self):
-
- idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
- idx2 = CategoricalIndex(list("abf"))
-
- for indexer in [idx2, list("abf"), Index(list("abf"))]:
- r1 = idx1.get_indexer(idx2)
- tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
-
- msg = (
- "method='pad' and method='backfill' not implemented yet for "
- "CategoricalIndex"
- )
- with pytest.raises(NotImplementedError, match=msg):
- idx2.get_indexer(idx1, method="pad")
- with pytest.raises(NotImplementedError, match=msg):
- idx2.get_indexer(idx1, method="backfill")
-
- msg = "method='nearest' not implemented yet for CategoricalIndex"
- with pytest.raises(NotImplementedError, match=msg):
- idx2.get_indexer(idx1, method="nearest")
-
- def test_get_loc(self):
- # GH 12531
- cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc"))
- idx1 = Index(list("abcde"))
- assert cidx1.get_loc("a") == idx1.get_loc("a")
- assert cidx1.get_loc("e") == idx1.get_loc("e")
-
- for i in [cidx1, idx1]:
- with pytest.raises(KeyError, match="'NOT-EXIST'"):
- i.get_loc("NOT-EXIST")
-
- # non-unique
- cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc"))
- idx2 = Index(list("aacded"))
-
- # results in bool array
- res = cidx2.get_loc("d")
- tm.assert_numpy_array_equal(res, idx2.get_loc("d"))
- tm.assert_numpy_array_equal(
- res, np.array([False, False, False, True, False, True])
- )
- # unique element results in scalar
- res = cidx2.get_loc("e")
- assert res == idx2.get_loc("e")
- assert res == 4
-
- for i in [cidx2, idx2]:
- with pytest.raises(KeyError, match="'NOT-EXIST'"):
- i.get_loc("NOT-EXIST")
-
- # non-unique, sliceable
- cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc"))
- idx3 = Index(list("aabbb"))
-
- # results in slice
- res = cidx3.get_loc("a")
- assert res == idx3.get_loc("a")
- assert res == slice(0, 2, None)
-
- res = cidx3.get_loc("b")
- assert res == idx3.get_loc("b")
- assert res == slice(2, 5, None)
-
- for i in [cidx3, idx3]:
- with pytest.raises(KeyError, match="'c'"):
- i.get_loc("c")
-
def test_repr_roundtrip(self):
ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
@@ -837,122 +615,6 @@ def test_fillna_categorical(self):
with pytest.raises(ValueError, match=msg):
idx.fillna(2.0)
- def test_take_fill_value(self):
- # GH 12631
-
- # numeric category
- idx = pd.CategoricalIndex([1, 2, 3], name="xxx")
- result = idx.take(np.array([1, 0, -1]))
- expected = pd.CategoricalIndex([2, 1, 3], name="xxx")
- tm.assert_index_equal(result, expected)
- tm.assert_categorical_equal(result.values, expected.values)
-
- # fill_value
- result = idx.take(np.array([1, 0, -1]), fill_value=True)
- expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx")
- tm.assert_index_equal(result, expected)
- tm.assert_categorical_equal(result.values, expected.values)
-
- # allow_fill=False
- result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
- expected = pd.CategoricalIndex([2, 1, 3], name="xxx")
- tm.assert_index_equal(result, expected)
- tm.assert_categorical_equal(result.values, expected.values)
-
- # object category
- idx = pd.CategoricalIndex(
- list("CBA"), categories=list("ABC"), ordered=True, name="xxx"
- )
- result = idx.take(np.array([1, 0, -1]))
- expected = pd.CategoricalIndex(
- list("BCA"), categories=list("ABC"), ordered=True, name="xxx"
- )
- tm.assert_index_equal(result, expected)
- tm.assert_categorical_equal(result.values, expected.values)
-
- # fill_value
- result = idx.take(np.array([1, 0, -1]), fill_value=True)
- expected = pd.CategoricalIndex(
- ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx"
- )
- tm.assert_index_equal(result, expected)
- tm.assert_categorical_equal(result.values, expected.values)
-
- # allow_fill=False
- result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
- expected = pd.CategoricalIndex(
- list("BCA"), categories=list("ABC"), ordered=True, name="xxx"
- )
- tm.assert_index_equal(result, expected)
- tm.assert_categorical_equal(result.values, expected.values)
-
- msg = (
- "When allow_fill=True and fill_value is not None, "
- "all indices must be >= -1"
- )
- with pytest.raises(ValueError, match=msg):
- idx.take(np.array([1, 0, -2]), fill_value=True)
- with pytest.raises(ValueError, match=msg):
- idx.take(np.array([1, 0, -5]), fill_value=True)
-
- with pytest.raises(IndexError):
- idx.take(np.array([1, -5]))
-
- def test_take_fill_value_datetime(self):
-
- # datetime category
- idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx")
- idx = pd.CategoricalIndex(idx)
- result = idx.take(np.array([1, 0, -1]))
- expected = pd.DatetimeIndex(
- ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx"
- )
- expected = pd.CategoricalIndex(expected)
- tm.assert_index_equal(result, expected)
-
- # fill_value
- result = idx.take(np.array([1, 0, -1]), fill_value=True)
- expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx")
- exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"])
- expected = pd.CategoricalIndex(expected, categories=exp_cats)
- tm.assert_index_equal(result, expected)
-
- # allow_fill=False
- result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
- expected = pd.DatetimeIndex(
- ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx"
- )
- expected = pd.CategoricalIndex(expected)
- tm.assert_index_equal(result, expected)
-
- msg = (
- "When allow_fill=True and fill_value is not None, "
- "all indices must be >= -1"
- )
- with pytest.raises(ValueError, match=msg):
- idx.take(np.array([1, 0, -2]), fill_value=True)
- with pytest.raises(ValueError, match=msg):
- idx.take(np.array([1, 0, -5]), fill_value=True)
-
- with pytest.raises(IndexError):
- idx.take(np.array([1, -5]))
-
- def test_take_invalid_kwargs(self):
- idx = pd.CategoricalIndex([1, 2, 3], name="foo")
- indices = [1, 0, -1]
-
- msg = r"take\(\) got an unexpected keyword argument 'foo'"
- with pytest.raises(TypeError, match=msg):
- idx.take(indices, foo=2)
-
- msg = "the 'out' parameter is not supported"
- with pytest.raises(ValueError, match=msg):
- idx.take(indices, out=indices)
-
- msg = "the 'mode' parameter is not supported"
- with pytest.raises(ValueError, match=msg):
- idx.take(indices, mode="clip")
-
@pytest.mark.parametrize(
"dtype, engine_type",
[
@@ -976,19 +638,10 @@ def test_engine_type(self, dtype, engine_type):
assert np.issubdtype(ci.codes.dtype, dtype)
assert isinstance(ci._engine, engine_type)
- @pytest.mark.parametrize(
- "data, categories",
- [
- (list("abcbca"), list("cab")),
- (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
- ],
- ids=["string", "interval"],
- )
- def test_map_str(self, data, categories, ordered_fixture):
- # GH 31202 - override base class since we want to maintain categorical/ordered
- index = CategoricalIndex(data, categories=categories, ordered=ordered_fixture)
- result = index.map(str)
- expected = CategoricalIndex(
- map(str, data), categories=map(str, categories), ordered=ordered_fixture
- )
- tm.assert_index_equal(result, expected)
+ def test_reindex_base(self):
+ # See test_reindex.py
+ pass
+
+ def test_map_str(self):
+ # See test_map.py
+ pass
diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py
new file mode 100644
index 0000000000000..507e38d9acac2
--- /dev/null
+++ b/pandas/tests/indexes/categorical/test_indexing.py
@@ -0,0 +1,250 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import CategoricalIndex, Index
+import pandas._testing as tm
+
+
+class TestTake:
+ def test_take_fill_value(self):
+ # GH 12631
+
+ # numeric category
+ idx = pd.CategoricalIndex([1, 2, 3], name="xxx")
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.CategoricalIndex([2, 1, 3], name="xxx")
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx")
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+ expected = pd.CategoricalIndex([2, 1, 3], name="xxx")
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # object category
+ idx = pd.CategoricalIndex(
+ list("CBA"), categories=list("ABC"), ordered=True, name="xxx"
+ )
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.CategoricalIndex(
+ list("BCA"), categories=list("ABC"), ordered=True, name="xxx"
+ )
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.CategoricalIndex(
+ ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx"
+ )
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+ expected = pd.CategoricalIndex(
+ list("BCA"), categories=list("ABC"), ordered=True, name="xxx"
+ )
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ msg = (
+ "When allow_fill=True and fill_value is not None, "
+ "all indices must be >= -1"
+ )
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_take_fill_value_datetime(self):
+
+ # datetime category
+ idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx")
+ idx = pd.CategoricalIndex(idx)
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.DatetimeIndex(
+ ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx"
+ )
+ expected = pd.CategoricalIndex(expected)
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx")
+ exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"])
+ expected = pd.CategoricalIndex(expected, categories=exp_cats)
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+ expected = pd.DatetimeIndex(
+ ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx"
+ )
+ expected = pd.CategoricalIndex(expected)
+ tm.assert_index_equal(result, expected)
+
+ msg = (
+ "When allow_fill=True and fill_value is not None, "
+ "all indices must be >= -1"
+ )
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_take_invalid_kwargs(self):
+ idx = pd.CategoricalIndex([1, 2, 3], name="foo")
+ indices = [1, 0, -1]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode="clip")
+
+
+class TestGetLoc:
+ def test_get_loc(self):
+ # GH 12531
+ cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc"))
+ idx1 = Index(list("abcde"))
+ assert cidx1.get_loc("a") == idx1.get_loc("a")
+ assert cidx1.get_loc("e") == idx1.get_loc("e")
+
+ for i in [cidx1, idx1]:
+ with pytest.raises(KeyError, match="'NOT-EXIST'"):
+ i.get_loc("NOT-EXIST")
+
+ # non-unique
+ cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc"))
+ idx2 = Index(list("aacded"))
+
+ # results in bool array
+ res = cidx2.get_loc("d")
+ tm.assert_numpy_array_equal(res, idx2.get_loc("d"))
+ tm.assert_numpy_array_equal(
+ res, np.array([False, False, False, True, False, True])
+ )
+ # unique element results in scalar
+ res = cidx2.get_loc("e")
+ assert res == idx2.get_loc("e")
+ assert res == 4
+
+ for i in [cidx2, idx2]:
+ with pytest.raises(KeyError, match="'NOT-EXIST'"):
+ i.get_loc("NOT-EXIST")
+
+ # non-unique, sliceable
+ cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc"))
+ idx3 = Index(list("aabbb"))
+
+ # results in slice
+ res = cidx3.get_loc("a")
+ assert res == idx3.get_loc("a")
+ assert res == slice(0, 2, None)
+
+ res = cidx3.get_loc("b")
+ assert res == idx3.get_loc("b")
+ assert res == slice(2, 5, None)
+
+ for i in [cidx3, idx3]:
+ with pytest.raises(KeyError, match="'c'"):
+ i.get_loc("c")
+
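+ # get_loc returns a scalar, a slice, or a boolean mask depending on uniqueness/monotonicity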
+ def test_get_loc_unique(self):
+ cidx = pd.CategoricalIndex(list("abc"))
+ result = cidx.get_loc("b")
+ assert result == 1
+
+ def test_get_loc_monotonic_nonunique(self):
+ cidx = pd.CategoricalIndex(list("abbc"))
+ result = cidx.get_loc("b")
+ expected = slice(1, 3, None)
+ assert result == expected
+
+ def test_get_loc_nonmonotonic_nonunique(self):
+ cidx = pd.CategoricalIndex(list("abcb"))
+ result = cidx.get_loc("b")
+ expected = np.array([False, True, False, True], dtype=bool)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestGetIndexer:
+ def test_get_indexer_base(self):
+ # Determined by cat ordering.
+ idx = CategoricalIndex(list("cab"), categories=list("cab"))
+ expected = np.arange(len(idx), dtype=np.intp)
+
+ actual = idx.get_indexer(idx)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ with pytest.raises(ValueError, match="Invalid fill method"):
+ idx.get_indexer(idx, method="invalid")
+
+ def test_get_indexer_non_unique(self):
+ np.random.seed(123456789)
+
+ ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
+ oidx = Index(np.array(ci))
+
+ for n in [1, 2, 5, len(ci)]:
+ finder = oidx[np.random.randint(0, len(ci), size=n)]
+ expected = oidx.get_indexer_non_unique(finder)[0]
+
+ actual = ci.get_indexer(finder)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ # see gh-17323
+ #
+ # Even when indexer is equal to the
+ # members in the index, we should
+ # respect duplicates instead of taking
+ # the fast-track path.
+ for finder in [list("aabbca"), list("aababca")]:
+ expected = oidx.get_indexer_non_unique(finder)[0]
+
+ actual = ci.get_indexer(finder)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ def test_get_indexer(self):
+
+ idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
+ idx2 = CategoricalIndex(list("abf"))
+
+ for indexer in [idx2, list("abf"), Index(list("abf"))]:
+ r1 = idx1.get_indexer(indexer)
+ tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
+
+ msg = (
+ "method='pad' and method='backfill' not implemented yet for "
+ "CategoricalIndex"
+ )
+ with pytest.raises(NotImplementedError, match=msg):
+ idx2.get_indexer(idx1, method="pad")
+ with pytest.raises(NotImplementedError, match=msg):
+ idx2.get_indexer(idx1, method="backfill")
+
+ msg = "method='nearest' not implemented yet for CategoricalIndex"
+ with pytest.raises(NotImplementedError, match=msg):
+ idx2.get_indexer(idx1, method="nearest")
diff --git a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py
new file mode 100644
index 0000000000000..943359a72e971
--- /dev/null
+++ b/pandas/tests/indexes/categorical/test_map.py
@@ -0,0 +1,95 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import CategoricalIndex, Index
+import pandas._testing as tm
+
+
+class TestMap:
+ @pytest.mark.parametrize(
+ "data, categories",
+ [
+ (list("abcbca"), list("cab")),
+ (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
+ ],
+ ids=["string", "interval"],
+ )
+ def test_map_str(self, data, categories, ordered_fixture):
+ # GH 31202 - override base class since we want to maintain categorical/ordered
+ index = CategoricalIndex(data, categories=categories, ordered=ordered_fixture)
+ result = index.map(str)
+ expected = CategoricalIndex(
+ map(str, data), categories=map(str, categories), ordered=ordered_fixture
+ )
+ tm.assert_index_equal(result, expected)
+
+ def test_map(self):
+ ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True)
+ result = ci.map(lambda x: x.lower())
+ exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True)
+ tm.assert_index_equal(result, exp)
+
+ ci = pd.CategoricalIndex(
+ list("ABABC"), categories=list("BAC"), ordered=False, name="XXX"
+ )
+ result = ci.map(lambda x: x.lower())
+ exp = pd.CategoricalIndex(
+ list("ababc"), categories=list("bac"), ordered=False, name="XXX"
+ )
+ tm.assert_index_equal(result, exp)
+
+ # GH 12766: Return an index not an array
+ tm.assert_index_equal(
+ ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX")
+ )
+
+ # change categories dtype
+ ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False)
+
+ def f(x):
+ return {"A": 10, "B": 20, "C": 30}.get(x)
+
+ result = ci.map(f)
+ exp = pd.CategoricalIndex(
+ [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False
+ )
+ tm.assert_index_equal(result, exp)
+
+ result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"]))
+ tm.assert_index_equal(result, exp)
+
+ result = ci.map({"A": 10, "B": 20, "C": 30})
+ tm.assert_index_equal(result, exp)
+
+ def test_map_with_categorical_series(self):
+ # GH 12756
+ a = pd.Index([1, 2, 3, 4])
+ b = pd.Series(["even", "odd", "even", "odd"], dtype="category")
+ c = pd.Series(["even", "odd", "even", "odd"])
+
+ exp = CategoricalIndex(["odd", "even", "odd", np.nan])
+ tm.assert_index_equal(a.map(b), exp)
+ exp = pd.Index(["odd", "even", "odd", np.nan])
+ tm.assert_index_equal(a.map(c), exp)
+
+ @pytest.mark.parametrize(
+ ("data", "f"),
+ (
+ ([1, 1, np.nan], pd.isna),
+ ([1, 2, np.nan], pd.isna),
+ ([1, 1, np.nan], {1: False}),
+ ([1, 2, np.nan], {1: False, 2: False}),
+ ([1, 1, np.nan], pd.Series([False, False])),
+ ([1, 2, np.nan], pd.Series([False, False, False])),
+ ),
+ )
+ def test_map_with_nan(self, data, f): # GH 24241
+ values = pd.Categorical(data)
+ result = values.map(f)
+ if data[1] == 1:
+ expected = pd.Categorical([False, False, np.nan])
+ tm.assert_categorical_equal(result, expected)
+ else:
+ expected = pd.Index([False, False, np.nan])
+ tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py
new file mode 100644
index 0000000000000..f59ddc42ce4e4
--- /dev/null
+++ b/pandas/tests/indexes/categorical/test_reindex.py
@@ -0,0 +1,53 @@
+import numpy as np
+
+from pandas import Categorical, CategoricalIndex, Index
+import pandas._testing as tm
+
+
+class TestReindex:
+ def test_reindex_dtype(self):
+ c = CategoricalIndex(["a", "b", "c", "a"])
+ res, indexer = c.reindex(["a", "c"])
+ tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
+
+ c = CategoricalIndex(["a", "b", "c", "a"])
+ res, indexer = c.reindex(Categorical(["a", "c"]))
+
+ exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
+
+ c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ res, indexer = c.reindex(["a", "c"])
+ exp = Index(["a", "a", "c"], dtype="object")
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
+
+ c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ res, indexer = c.reindex(Categorical(["a", "c"]))
+ exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
+
+ def test_reindex_duplicate_target(self):
+ # See GH25459
+ cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
+ res, indexer = cat.reindex(["a", "c", "c"])
+ exp = Index(["a", "c", "c"], dtype="object")
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
+
+ res, indexer = cat.reindex(
+ CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
+ )
+ exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
+
+ def test_reindex_empty_index(self):
+ # See GH16770
+ c = CategoricalIndex([])
+ res, indexer = c.reindex(["a", "b"])
+ tm.assert_index_equal(res, Index(["a", "b"]), exact=True)
+ tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp))
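
The new reindex tests encode a dtype rule worth calling out: the dtype of the target, not the source, decides the result. A sketch, assuming pandas at this revision:

```python
from pandas import Categorical, CategoricalIndex

c = CategoricalIndex(["a", "b", "c", "a"])

# A plain-list target yields an object-dtype Index
res, indexer = c.reindex(["a", "c"])
print(res)  # Index(['a', 'a', 'c'], dtype='object')

# A Categorical target preserves the categorical dtype (and its categories)
res, indexer = c.reindex(Categorical(["a", "c"]))
print(res)  # CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c'], ...)
```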
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 26d120619defc..da27057a783ab 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -167,6 +167,10 @@ def test_create_index_existing_name(self):
def test_numeric_compat(self):
idx = self.create_index()
+ # Check that this test does not cover the MultiIndex case; if/when it does,
+ # we can remove multi.test_compat.test_numeric_compat
+ assert not isinstance(idx, MultiIndex)
+
with pytest.raises(TypeError, match="cannot perform __mul__"):
idx * 1
with pytest.raises(TypeError, match="cannot perform __rmul__"):
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
index 68285d41bda70..1d1d371fcec1e 100644
--- a/pandas/tests/indexes/datetimes/test_constructors.py
+++ b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -951,16 +951,11 @@ def test_datetimeindex_constructor_misc(self):
assert len(idx1) == len(idx2)
assert idx1.freq == idx2.freq
+ def test_pass_datetimeindex_to_index(self):
+ # Bugs in #1396
+ rng = date_range("1/1/2000", "3/1/2000")
+ idx = Index(rng, dtype=object)
-def test_timedelta_constructor_identity():
- # Test for #30543
- expected = pd.Timedelta(np.timedelta64(1, "s"))
- result = pd.Timedelta(expected)
- assert result is expected
+ expected = Index(rng.to_pydatetime(), dtype=object)
-
-def test_timestamp_constructor_identity():
- # Test for #30543
- expected = pd.Timestamp("2017-01-01T12")
- result = pd.Timestamp(expected)
- assert result is expected
+ tm.assert_numpy_array_equal(idx.values, expected.values)
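
The relocated test above asserts that constructing an object-dtype `Index` from a `DatetimeIndex` boxes the values rather than keeping the datetime64 block. A sketch of the equivalence it checks:

```python
from pandas import Index, date_range

rng = date_range("1/1/2000", "3/1/2000")

idx = Index(rng, dtype=object)                       # boxed Timestamps
expected = Index(rng.to_pydatetime(), dtype=object)  # datetime.datetime objects

# Element-wise the two compare equal (Timestamp == datetime at the same instant)
assert (idx.values == expected.values).all()
```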
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index 2f954117f48d7..c358e72538788 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -344,6 +344,115 @@ def test_take_fill_value_with_timezone(self):
idx.take(np.array([1, -5]))
+class TestGetLoc:
+ @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"])
+ def test_get_loc_method_exact_match(self, method):
+ idx = pd.date_range("2000-01-01", periods=3)
+ assert idx.get_loc(idx[1], method) == 1
+ assert idx.get_loc(idx[1].to_pydatetime(), method) == 1
+ assert idx.get_loc(str(idx[1]), method) == 1
+
+ if method is not None:
+ assert idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1
+
+ def test_get_loc(self):
+ idx = pd.date_range("2000-01-01", periods=3)
+
+ assert idx.get_loc("2000-01-01", method="nearest") == 0
+ assert idx.get_loc("2000-01-01T12", method="nearest") == 1
+
+ assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1
+ assert (
+ idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D"))
+ == 1
+ )
+ assert (
+ idx.get_loc(
+ "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D")
+ )
+ == 1
+ )
+ assert (
+ idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1
+ )
+ with pytest.raises(ValueError, match="unit abbreviation w/o a number"):
+ idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo")
+ with pytest.raises(KeyError, match="'2000-01-01T03'"):
+ idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours")
+ with pytest.raises(
+ ValueError, match="tolerance size must match target index size"
+ ):
+ idx.get_loc(
+ "2000-01-01",
+ method="nearest",
+ tolerance=[
+ pd.Timedelta("1day").to_timedelta64(),
+ pd.Timedelta("1day").to_timedelta64(),
+ ],
+ )
+
+ assert idx.get_loc("2000", method="nearest") == slice(0, 3)
+ assert idx.get_loc("2000-01", method="nearest") == slice(0, 3)
+
+ assert idx.get_loc("1999", method="nearest") == 0
+ assert idx.get_loc("2001", method="nearest") == 2
+
+ with pytest.raises(KeyError, match="'1999'"):
+ idx.get_loc("1999", method="pad")
+ with pytest.raises(KeyError, match="'2001'"):
+ idx.get_loc("2001", method="backfill")
+
+ with pytest.raises(KeyError, match="'foobar'"):
+ idx.get_loc("foobar")
+ with pytest.raises(InvalidIndexError, match=r"slice\(None, 2, None\)"):
+ idx.get_loc(slice(2))
+
+ idx = pd.to_datetime(["2000-01-01", "2000-01-04"])
+ assert idx.get_loc("2000-01-02", method="nearest") == 0
+ assert idx.get_loc("2000-01-03", method="nearest") == 1
+ assert idx.get_loc("2000-01", method="nearest") == slice(0, 2)
+
+ # time indexing
+ idx = pd.date_range("2000-01-01", periods=24, freq="H")
+ tm.assert_numpy_array_equal(
+ idx.get_loc(time(12)), np.array([12]), check_dtype=False
+ )
+ tm.assert_numpy_array_equal(
+ idx.get_loc(time(12, 30)), np.array([]), check_dtype=False
+ )
+ with pytest.raises(NotImplementedError):
+ idx.get_loc(time(12, 30), method="pad")
+
+ def test_get_loc_nat(self):
+ # GH#20464
+ index = DatetimeIndex(["1/3/2000", "NaT"])
+ assert index.get_loc(pd.NaT) == 1
+
+ assert index.get_loc(None) == 1
+
+ assert index.get_loc(np.nan) == 1
+
+ assert index.get_loc(pd.NA) == 1
+
+ assert index.get_loc(np.datetime64("NaT")) == 1
+
+ with pytest.raises(KeyError, match="NaT"):
+ index.get_loc(np.timedelta64("NaT"))
+
+ @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)])
+ def test_get_loc_timedelta_invalid_key(self, key):
+ # GH#20464
+ dti = pd.date_range("1970-01-01", periods=10)
+ with pytest.raises(TypeError):
+ dti.get_loc(key)
+
+ def test_get_loc_reasonable_key_error(self):
+ # GH#1062
+ index = DatetimeIndex(["1/3/2000"])
+ with pytest.raises(KeyError, match="2000"):
+ index.get_loc("1/1/2000")
+
+
class TestDatetimeIndex:
@pytest.mark.parametrize(
"null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA]
@@ -639,84 +748,6 @@ def test_get_value(self):
result = dti.get_value(ser, key.to_datetime64())
assert result == 7
- def test_get_loc(self):
- idx = pd.date_range("2000-01-01", periods=3)
-
- for method in [None, "pad", "backfill", "nearest"]:
- assert idx.get_loc(idx[1], method) == 1
- assert idx.get_loc(idx[1].to_pydatetime(), method) == 1
- assert idx.get_loc(str(idx[1]), method) == 1
-
- if method is not None:
- assert (
- idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1
- )
-
- assert idx.get_loc("2000-01-01", method="nearest") == 0
- assert idx.get_loc("2000-01-01T12", method="nearest") == 1
-
- assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1
- assert (
- idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D"))
- == 1
- )
- assert (
- idx.get_loc(
- "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D")
- )
- == 1
- )
- assert (
- idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1
- )
- with pytest.raises(ValueError, match="unit abbreviation w/o a number"):
- idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo")
- with pytest.raises(KeyError, match="'2000-01-01T03'"):
- idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours")
- with pytest.raises(
- ValueError, match="tolerance size must match target index size"
- ):
- idx.get_loc(
- "2000-01-01",
- method="nearest",
- tolerance=[
- pd.Timedelta("1day").to_timedelta64(),
- pd.Timedelta("1day").to_timedelta64(),
- ],
- )
-
- assert idx.get_loc("2000", method="nearest") == slice(0, 3)
- assert idx.get_loc("2000-01", method="nearest") == slice(0, 3)
-
- assert idx.get_loc("1999", method="nearest") == 0
- assert idx.get_loc("2001", method="nearest") == 2
-
- with pytest.raises(KeyError, match="'1999'"):
- idx.get_loc("1999", method="pad")
- with pytest.raises(KeyError, match="'2001'"):
- idx.get_loc("2001", method="backfill")
-
- with pytest.raises(KeyError, match="'foobar'"):
- idx.get_loc("foobar")
- with pytest.raises(InvalidIndexError, match=r"slice\(None, 2, None\)"):
- idx.get_loc(slice(2))
-
- idx = pd.to_datetime(["2000-01-01", "2000-01-04"])
- assert idx.get_loc("2000-01-02", method="nearest") == 0
- assert idx.get_loc("2000-01-03", method="nearest") == 1
- assert idx.get_loc("2000-01", method="nearest") == slice(0, 2)
-
- # time indexing
- idx = pd.date_range("2000-01-01", periods=24, freq="H")
- tm.assert_numpy_array_equal(
- idx.get_loc(time(12)), np.array([12]), check_dtype=False
- )
- tm.assert_numpy_array_equal(
- idx.get_loc(time(12, 30)), np.array([]), check_dtype=False
- )
- with pytest.raises(NotImplementedError):
- idx.get_loc(time(12, 30), method="pad")
-
def test_get_indexer(self):
idx = pd.date_range("2000-01-01", periods=3)
exp = np.array([0, 1, 2], dtype=np.intp)
@@ -756,32 +787,3 @@ def test_get_indexer(self):
idx.get_indexer(target, "nearest", tolerance=tol_bad)
with pytest.raises(ValueError):
idx.get_indexer(idx[[0]], method="nearest", tolerance="foo")
-
- def test_reasonable_key_error(self):
- # GH#1062
- index = DatetimeIndex(["1/3/2000"])
- with pytest.raises(KeyError, match="2000"):
- index.get_loc("1/1/2000")
-
- @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)])
- def test_timedelta_invalid_key(self, key):
- # GH#20464
- dti = pd.date_range("1970-01-01", periods=10)
- with pytest.raises(TypeError):
- dti.get_loc(key)
-
- def test_get_loc_nat(self):
- # GH#20464
- index = DatetimeIndex(["1/3/2000", "NaT"])
- assert index.get_loc(pd.NaT) == 1
-
- assert index.get_loc(None) == 1
-
- assert index.get_loc(np.nan) == 1
-
- assert index.get_loc(pd.NA) == 1
-
- assert index.get_loc(np.datetime64("NaT")) == 1
-
- with pytest.raises(KeyError, match="NaT"):
- index.get_loc(np.timedelta64("NaT"))
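
For orientation, the consolidated `TestGetLoc` suite above exercises `get_loc` with approximate matching. A condensed sketch of the key cases, assuming pandas at this revision:

```python
import pandas as pd

idx = pd.date_range("2000-01-01", periods=3)

idx.get_loc("2000-01-01T12", method="nearest")                       # 1
idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day")    # 1

# Partial-string keys resolve to a slice covering the matching span
idx.get_loc("2000-01", method="nearest")                             # slice(0, 3)

# Out-of-tolerance lookups raise KeyError rather than returning a guess
idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours")  # KeyError
```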
diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py
index 340f53b2868bd..d0464698e3f24 100644
--- a/pandas/tests/indexes/datetimes/test_misc.py
+++ b/pandas/tests/indexes/datetimes/test_misc.py
@@ -12,15 +12,6 @@
class TestTimeSeries:
- def test_pass_datetimeindex_to_index(self):
- # Bugs in #1396
- rng = date_range("1/1/2000", "3/1/2000")
- idx = Index(rng, dtype=object)
-
- expected = Index(rng.to_pydatetime(), dtype=object)
-
- tm.assert_numpy_array_equal(idx.values, expected.values)
-
def test_range_edges(self):
# GH#13672
idx = pd.date_range(
diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py
index acaea4ff96ff5..67ebfcddf6c2d 100644
--- a/pandas/tests/indexes/multi/conftest.py
+++ b/pandas/tests/indexes/multi/conftest.py
@@ -49,12 +49,6 @@ def index_names():
return ["first", "second"]
-@pytest.fixture
-def holder():
- # the MultiIndex constructor used to base compatibility with pickle
- return MultiIndex
-
-
@pytest.fixture
def compat_props():
# a MultiIndex must have these properties associated with it
diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py
index e64511efd7ffb..a9e02934f27ab 100644
--- a/pandas/tests/indexes/multi/test_analytics.py
+++ b/pandas/tests/indexes/multi/test_analytics.py
@@ -146,83 +146,6 @@ def test_append_mixed_dtypes():
tm.assert_index_equal(res, exp)
-def test_take(idx):
- indexer = [4, 3, 0, 2]
- result = idx.take(indexer)
- expected = idx[indexer]
- assert result.equals(expected)
-
- # TODO: Remove Commented Code
- # if not isinstance(idx,
- # (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
- # GH 10791
- msg = "'MultiIndex' object has no attribute 'freq'"
- with pytest.raises(AttributeError, match=msg):
- idx.freq
-
-
-def test_take_invalid_kwargs(idx):
- idx = idx
- indices = [1, 2]
-
- msg = r"take\(\) got an unexpected keyword argument 'foo'"
- with pytest.raises(TypeError, match=msg):
- idx.take(indices, foo=2)
-
- msg = "the 'out' parameter is not supported"
- with pytest.raises(ValueError, match=msg):
- idx.take(indices, out=indices)
-
- msg = "the 'mode' parameter is not supported"
- with pytest.raises(ValueError, match=msg):
- idx.take(indices, mode="clip")
-
-
-def test_take_fill_value():
- # GH 12631
- vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]
- idx = pd.MultiIndex.from_product(vals, names=["str", "dt"])
-
- result = idx.take(np.array([1, 0, -1]))
- exp_vals = [
- ("A", pd.Timestamp("2011-01-02")),
- ("A", pd.Timestamp("2011-01-01")),
- ("B", pd.Timestamp("2011-01-02")),
- ]
- expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
- tm.assert_index_equal(result, expected)
-
- # fill_value
- result = idx.take(np.array([1, 0, -1]), fill_value=True)
- exp_vals = [
- ("A", pd.Timestamp("2011-01-02")),
- ("A", pd.Timestamp("2011-01-01")),
- (np.nan, pd.NaT),
- ]
- expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
- tm.assert_index_equal(result, expected)
-
- # allow_fill=False
- result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
- exp_vals = [
- ("A", pd.Timestamp("2011-01-02")),
- ("A", pd.Timestamp("2011-01-01")),
- ("B", pd.Timestamp("2011-01-02")),
- ]
- expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
- tm.assert_index_equal(result, expected)
-
- msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1"
- with pytest.raises(ValueError, match=msg):
- idx.take(np.array([1, 0, -2]), fill_value=True)
- with pytest.raises(ValueError, match=msg):
- idx.take(np.array([1, 0, -5]), fill_value=True)
-
- msg = "index -5 is out of bounds for( axis 0 with)? size 4"
- with pytest.raises(IndexError, match=msg):
- idx.take(np.array([1, -5]))
-
-
def test_iter(idx):
result = list(idx)
expected = [
diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py
index 545a7ddef29bb..9a76f0623eb31 100644
--- a/pandas/tests/indexes/multi/test_compat.py
+++ b/pandas/tests/indexes/multi/test_compat.py
@@ -112,8 +112,8 @@ def test_ndarray_compat_properties(idx, compat_props):
idx.values.nbytes
-def test_pickle_compat_construction(holder):
+def test_pickle_compat_construction():
# this is testing for pickle compat
# need an object to create with
with pytest.raises(TypeError, match="Must pass both levels and codes"):
- holder()
+ MultiIndex()
diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py
index 074072ae581b2..675a1e2e832f3 100644
--- a/pandas/tests/indexes/multi/test_get_set.py
+++ b/pandas/tests/indexes/multi/test_get_set.py
@@ -57,8 +57,6 @@ def test_get_value_duplicates():
)
assert index.get_loc("D") == slice(0, 3)
- with pytest.raises(KeyError, match=r"^'D'$"):
- index._engine.get_value(np.array([]), "D")
def test_get_level_values_all_na():
@@ -159,7 +157,7 @@ def test_set_levels_codes_directly(idx):
minor_codes = [(x + 1) % 1 for x in minor_codes]
new_codes = [major_codes, minor_codes]
- msg = "can't set attribute"
+ msg = "[Cc]an't set attribute"
with pytest.raises(AttributeError, match=msg):
idx.levels = new_levels
with pytest.raises(AttributeError, match=msg):
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index b08280a712642..21a4773fa3683 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -392,7 +392,7 @@ def test_get_loc_missing_nan():
# GH 8569
idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
assert isinstance(idx.get_loc(1), slice)
- with pytest.raises(KeyError, match=r"^3\.0$"):
+ with pytest.raises(KeyError, match=r"^3$"):
idx.get_loc(3)
with pytest.raises(KeyError, match=r"^nan$"):
idx.get_loc(np.nan)
diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py
index 50242c1cac549..bb40612b9a55a 100644
--- a/pandas/tests/indexes/multi/test_sorting.py
+++ b/pandas/tests/indexes/multi/test_sorting.py
@@ -1,3 +1,5 @@
+import random
+
import numpy as np
import pytest
@@ -9,8 +11,6 @@
def test_sortlevel(idx):
- import random
-
tuples = list(idx)
random.shuffle(tuples)
diff --git a/pandas/tests/indexes/multi/test_take.py b/pandas/tests/indexes/multi/test_take.py
new file mode 100644
index 0000000000000..85043ff8812af
--- /dev/null
+++ b/pandas/tests/indexes/multi/test_take.py
@@ -0,0 +1,82 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_take(idx):
+ indexer = [4, 3, 0, 2]
+ result = idx.take(indexer)
+ expected = idx[indexer]
+ assert result.equals(expected)
+
+ # FIXME: Remove Commented Code
+ # if not isinstance(idx,
+ # (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+ # GH 10791
+ msg = "'MultiIndex' object has no attribute 'freq'"
+ with pytest.raises(AttributeError, match=msg):
+ idx.freq
+
+
+def test_take_invalid_kwargs(idx):
+ indices = [1, 2]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode="clip")
+
+
+def test_take_fill_value():
+ # GH 12631
+ vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]
+ idx = pd.MultiIndex.from_product(vals, names=["str", "dt"])
+
+ result = idx.take(np.array([1, 0, -1]))
+ exp_vals = [
+ ("A", pd.Timestamp("2011-01-02")),
+ ("A", pd.Timestamp("2011-01-01")),
+ ("B", pd.Timestamp("2011-01-02")),
+ ]
+ expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ exp_vals = [
+ ("A", pd.Timestamp("2011-01-02")),
+ ("A", pd.Timestamp("2011-01-01")),
+ (np.nan, pd.NaT),
+ ]
+ expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+ exp_vals = [
+ ("A", pd.Timestamp("2011-01-02")),
+ ("A", pd.Timestamp("2011-01-01")),
+ ("B", pd.Timestamp("2011-01-02")),
+ ]
+ expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"])
+ tm.assert_index_equal(result, expected)
+
+ msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ msg = "index -5 is out of bounds for( axis 0 with)? size 4"
+ with pytest.raises(IndexError, match=msg):
+ idx.take(np.array([1, -5]))
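
The moved `take` tests hinge on how `-1` is interpreted: without `fill_value` it wraps around like NumPy, with `fill_value=True` it marks a missing entry. Sketched:

```python
import numpy as np
import pandas as pd

vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]
idx = pd.MultiIndex.from_product(vals, names=["str", "dt"])

idx.take(np.array([1, 0, -1]))                   # -1 wraps to the last entry
idx.take(np.array([1, 0, -1]), fill_value=True)  # -1 becomes (nan, NaT)
```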
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index 38514594efe09..fffc4a7562306 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -486,15 +486,17 @@ def test_get_value_datetime_hourly(self, freq):
assert ser.loc[ts2] == 7
def test_get_value_integer(self):
+ msg = "index 16801 is out of bounds for axis 0 with size 3"
dti = pd.date_range("2016-01-01", periods=3)
pi = dti.to_period("D")
ser = pd.Series(range(3), index=pi)
- with pytest.raises(IndexError, match="index out of bounds"):
+ with pytest.raises(IndexError, match=msg):
pi.get_value(ser, 16801)
+ msg = "index 46 is out of bounds for axis 0 with size 3"
pi2 = dti.to_period("Y") # duplicates, ordinals are all 46
ser2 = pd.Series(range(3), index=pi2)
- with pytest.raises(IndexError, match="index out of bounds"):
+ with pytest.raises(IndexError, match=msg):
pi2.get_value(ser2, 46)
def test_is_monotonic_increasing(self):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index e72963de09ab4..04af9b09bbf89 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -34,7 +34,6 @@
period_range,
)
import pandas._testing as tm
-from pandas.core.algorithms import safe_sort
from pandas.core.indexes.api import (
Index,
MultiIndex,
@@ -108,23 +107,6 @@ def test_constructor_copy(self, index):
# arr = np.array(5.)
# pytest.raises(Exception, arr.view, Index)
- def test_constructor_corner(self):
- # corner case
- msg = (
- r"Index\(\.\.\.\) must be called with a collection of some "
- "kind, 0 was passed"
- )
- with pytest.raises(TypeError, match=msg):
- Index(0)
-
- @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]])
- def test_construction_list_mixed_tuples(self, index_vals):
- # see gh-10697: if we are constructing from a mixed list of tuples,
- # make sure that we are independent of the sorting order.
- index = Index(index_vals)
- assert isinstance(index, Index)
- assert not isinstance(index, MultiIndex)
-
@pytest.mark.parametrize("na_value", [None, np.nan])
@pytest.mark.parametrize("vtype", [list, tuple, iter])
def test_construction_list_tuples_nan(self, na_value, vtype):
@@ -359,11 +341,6 @@ def test_constructor_simple_new(self, vals, dtype):
result = index._simple_new(index.values, dtype)
tm.assert_index_equal(result, index)
- def test_constructor_wrong_kwargs(self):
- # GH #19348
- with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"):
- Index([], foo="bar")
-
@pytest.mark.parametrize(
"vals",
[
@@ -554,12 +531,6 @@ def test_constructor_overflow_int64(self):
with pytest.raises(OverflowError, match=msg):
Index([np.iinfo(np.uint64).max - 1], dtype="int64")
- @pytest.mark.xfail(reason="see GH#21311: Index doesn't enforce dtype argument")
- def test_constructor_cast(self):
- msg = "could not convert string to float"
- with pytest.raises(ValueError, match=msg):
- Index(["a", "b", "c"], dtype=float)
-
@pytest.mark.parametrize(
"index",
[
@@ -1047,6 +1018,32 @@ def test_setops_disallow_true(self, method):
with pytest.raises(ValueError, match="The 'sort' keyword only takes"):
getattr(idx1, method)(idx2, sort=True)
+ def test_setops_preserve_object_dtype(self):
+ idx = pd.Index([1, 2, 3], dtype=object)
+ result = idx.intersection(idx[1:])
+ expected = idx[1:]
+ tm.assert_index_equal(result, expected)
+
+ # if other is not monotonic increasing, intersection goes through
+ # a different route
+ result = idx.intersection(idx[1:][::-1])
+ tm.assert_index_equal(result, expected)
+
+ result = idx._union(idx[1:], sort=None)
+ expected = idx
+ tm.assert_index_equal(result, expected)
+
+ result = idx.union(idx[1:], sort=None)
+ tm.assert_index_equal(result, expected)
+
+ # if other is not monotonic increasing, _union goes through
+ # a different route
+ result = idx._union(idx[1:][::-1], sort=None)
+ tm.assert_index_equal(result, expected)
+
+ result = idx.union(idx[1:][::-1], sort=None)
+ tm.assert_index_equal(result, expected)
+
def test_map_identity_mapping(self, indices):
# GH 12766
tm.assert_index_equal(indices, indices.map(lambda x: x))
@@ -2502,78 +2499,12 @@ def test_copy_name2(self):
assert index3.name == "NewName"
assert index3.names == ["NewName"]
- def test_union_base(self):
- index = self.create_index()
- first = index[3:]
- second = index[:5]
-
- result = first.union(second)
-
- expected = Index([0, 1, 2, "a", "b", "c"])
- tm.assert_index_equal(result, expected)
-
- @pytest.mark.parametrize("klass", [np.array, Series, list])
- def test_union_different_type_base(self, klass):
- # GH 10149
- index = self.create_index()
- first = index[3:]
- second = index[:5]
-
- result = first.union(klass(second.values))
-
- assert tm.equalContents(result, index)
-
def test_unique_na(self):
idx = pd.Index([2, np.nan, 2, 1], name="my_index")
expected = pd.Index([2, np.nan, 1], name="my_index")
result = idx.unique()
tm.assert_index_equal(result, expected)
- @pytest.mark.parametrize("sort", [None, False])
- def test_intersection_base(self, sort):
- # (same results for py2 and py3 but sortedness not tested elsewhere)
- index = self.create_index()
- first = index[:5]
- second = index[:3]
-
- expected = Index([0, 1, "a"]) if sort is None else Index([0, "a", 1])
- result = first.intersection(second, sort=sort)
- tm.assert_index_equal(result, expected)
-
- @pytest.mark.parametrize("klass", [np.array, Series, list])
- @pytest.mark.parametrize("sort", [None, False])
- def test_intersection_different_type_base(self, klass, sort):
- # GH 10149
- index = self.create_index()
- first = index[:5]
- second = index[:3]
-
- result = first.intersection(klass(second.values), sort=sort)
- assert tm.equalContents(result, second)
-
- @pytest.mark.parametrize("sort", [None, False])
- def test_difference_base(self, sort):
- # (same results for py2 and py3 but sortedness not tested elsewhere)
- index = self.create_index()
- first = index[:4]
- second = index[3:]
-
- result = first.difference(second, sort)
- expected = Index([0, "a", 1])
- if sort is None:
- expected = Index(safe_sort(expected))
- tm.assert_index_equal(result, expected)
-
- def test_symmetric_difference(self):
- # (same results for py2 and py3 but sortedness not tested elsewhere)
- index = self.create_index()
- first = index[:4]
- second = index[3:]
-
- result = first.symmetric_difference(second)
- expected = Index([0, 1, 2, "a", "c"])
- tm.assert_index_equal(result, expected)
-
def test_logical_compat(self):
index = self.create_index()
assert index.all() == index.values.all()
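
The new `test_setops_preserve_object_dtype` covers both the monotonic fast path and the general path. The invariant it protects, sketched:

```python
import pandas as pd

idx = pd.Index([1, 2, 3], dtype=object)

# Set operations keep the explicit object dtype instead of re-inferring int64
assert idx.intersection(idx[1:]).dtype == object
assert idx.union(idx[1:], sort=None).dtype == object
# Reversing the other operand exercises the non-monotonic route, same result
assert idx.intersection(idx[1:][::-1]).dtype == object
```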
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 992a91ad8a528..1b504ce99604d 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -385,7 +385,7 @@ def test_get_loc_missing_nan(self):
# GH 8569
idx = Float64Index([1, 2])
assert idx.get_loc(1) == 0
- with pytest.raises(KeyError, match=r"^3\.0$"):
+ with pytest.raises(KeyError, match=r"^3$"):
idx.get_loc(3)
with pytest.raises(KeyError, match="^nan$"):
idx.get_loc(np.nan)
diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py
index c15fa34283f21..7e75b5324445e 100644
--- a/pandas/tests/indexing/multiindex/test_getitem.py
+++ b/pandas/tests/indexing/multiindex/test_getitem.py
@@ -87,8 +87,8 @@ def test_series_getitem_returns_scalar(
(lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
(lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
(lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"),
- (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"),
- (lambda s: s[len(s)], IndexError, "index out of bounds"),
+ (lambda s: s.__getitem__(len(s)), IndexError, "is out of bounds"),
+ (lambda s: s[len(s)], IndexError, "is out of bounds"),
(
lambda s: s.iloc[len(s)],
IndexError,
diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py
index aebd1ad2573ed..1e641760f7e8d 100644
--- a/pandas/tests/indexing/multiindex/test_setitem.py
+++ b/pandas/tests/indexing/multiindex/test_setitem.py
@@ -414,6 +414,16 @@ def test_astype_assignment_with_dups(self):
df["A"] = df["A"].astype(np.float64)
tm.assert_index_equal(df.index, index)
+ def test_setitem_nonmonotonic(self):
+ # https://github.com/pandas-dev/pandas/issues/31449
+ index = pd.MultiIndex.from_tuples(
+ [("a", "c"), ("b", "x"), ("a", "d")], names=["l1", "l2"]
+ )
+ df = pd.DataFrame(data=[0, 1, 2], index=index, columns=["e"])
+ df.loc["a", "e"] = np.arange(99, 101, dtype="int64")
+ expected = pd.DataFrame({"e": [99, 1, 100]}, index=index)
+ tm.assert_frame_equal(df, expected)
+
def test_frame_setitem_view_direct(multiindex_dataframe_random_data):
# this works because we are modifying the underlying array
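
The regression test for GH 31449 is about partial-label assignment on a non-monotonic MultiIndex. A sketch of the behavior it locks in:

```python
import numpy as np
import pandas as pd

index = pd.MultiIndex.from_tuples(
    [("a", "c"), ("b", "x"), ("a", "d")], names=["l1", "l2"]
)
df = pd.DataFrame({"e": [0, 1, 2]}, index=index)

# "a" selects rows 0 and 2 even though they are not adjacent
df.loc["a", "e"] = np.arange(99, 101, dtype="int64")
print(df["e"].tolist())  # [99, 1, 100]
```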
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 8c8dece53277e..da935b1c911d0 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -83,8 +83,8 @@ def test_loc_scalar(self):
df.loc["d", "C"] = 10
msg = (
- r"cannot do label indexing on with these indexers \[1\] of "
+ "cannot do label indexing on CategoricalIndex with these "
+ r"indexers \[1\] of type int"
)
with pytest.raises(TypeError, match=msg):
df.loc[1]
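
The reworded message above now names the index class and the indexer type. A sketch of the failure mode, assuming a small frame indexed by a string `CategoricalIndex` (the fixture used in the test is not shown here):

```python
import pandas as pd

df = pd.DataFrame({"A": range(4)}, index=pd.CategoricalIndex(list("aabb")))

# An integer label against string categories is rejected with the new message
df.loc[1]
# TypeError: cannot do label indexing on CategoricalIndex with these
# indexers [1] of type int
```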
diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py
index 5530896a90941..8bb88cd9fd63a 100644
--- a/pandas/tests/indexing/test_floats.py
+++ b/pandas/tests/indexing/test_floats.py
@@ -22,16 +22,9 @@ def check(self, result, original, indexer, getitem):
tm.assert_almost_equal(result, expected)
- def test_scalar_error(self):
-
- # GH 4892
- # float_indexers should raise exceptions
- # on appropriate Index types & accessors
- # this duplicates the code below
- # but is specifically testing for the error
- # message
-
- for index in [
+ @pytest.mark.parametrize(
+ "index_func",
+ [
tm.makeStringIndex,
tm.makeUnicodeIndex,
tm.makeCategoricalIndex,
@@ -40,22 +33,31 @@ def test_scalar_error(self):
tm.makePeriodIndex,
tm.makeIntIndex,
tm.makeRangeIndex,
- ]:
+ ],
+ )
+ def test_scalar_error(self, index_func):
- i = index(5)
+ # GH 4892
+ # float_indexers should raise exceptions
+ # on appropriate Index types & accessors
+ # this duplicates the code below
+ # but is specifically testing for the error
+ # message
- s = Series(np.arange(len(i)), index=i)
+ i = index_func(5)
- msg = "Cannot index by location index"
- with pytest.raises(TypeError, match=msg):
- s.iloc[3.0]
+ s = Series(np.arange(len(i)), index=i)
- msg = (
- "cannot do positional indexing on {klass} with these "
- r"indexers \[3\.0\] of {kind}".format(klass=type(i), kind=str(float))
- )
- with pytest.raises(TypeError, match=msg):
- s.iloc[3.0] = 0
+ msg = "Cannot index by location index"
+ with pytest.raises(TypeError, match=msg):
+ s.iloc[3.0]
+
+ msg = (
+ "cannot do positional indexing on {klass} with these "
+ r"indexers \[3\.0\] of type float".format(klass=type(i).__name__)
+ )
+ with pytest.raises(TypeError, match=msg):
+ s.iloc[3.0] = 0
def test_scalar_non_numeric(self):
@@ -90,11 +92,11 @@ def test_scalar_non_numeric(self):
else:
error = TypeError
msg = (
- r"cannot do (label|index|positional) indexing "
+ r"cannot do (label|positional) indexing "
r"on {klass} with these indexers \[3\.0\] of "
- r"{kind}|"
+ r"type float|"
"Cannot index by location index with a "
- "non-integer key".format(klass=type(i), kind=str(float))
+ "non-integer key".format(klass=type(i).__name__)
)
with pytest.raises(error, match=msg):
idxr(s)[3.0]
@@ -107,13 +109,13 @@ def test_scalar_non_numeric(self):
"mixed",
}:
error = KeyError
- msg = r"^3$"
+ msg = r"^3\.0$"
else:
error = TypeError
msg = (
- r"cannot do (label|index) indexing "
+ r"cannot do label indexing "
r"on {klass} with these indexers \[3\.0\] of "
- r"{kind}".format(klass=type(i), kind=str(float))
+ r"type float".format(klass=type(i).__name__)
)
with pytest.raises(error, match=msg):
s.loc[3.0]
@@ -123,9 +125,9 @@ def test_scalar_non_numeric(self):
# setting with a float fails with iloc
msg = (
- r"cannot do (label|index|positional) indexing "
+ r"cannot do (label|positional) indexing "
r"on {klass} with these indexers \[3\.0\] of "
- r"{kind}".format(klass=type(i), kind=str(float))
+ r"type float".format(klass=type(i).__name__)
)
with pytest.raises(TypeError, match=msg):
s.iloc[3.0] = 0
@@ -160,9 +162,9 @@ def test_scalar_non_numeric(self):
s = Series(np.arange(len(i)), index=i)
s[3]
msg = (
- r"cannot do (label|index) indexing "
+ r"cannot do label indexing "
r"on {klass} with these indexers \[3\.0\] of "
- r"{kind}".format(klass=type(i), kind=str(float))
+ r"type float".format(klass=type(i).__name__)
)
with pytest.raises(TypeError, match=msg):
s[3.0]
@@ -179,15 +181,15 @@ def test_scalar_with_mixed(self):
msg = (
r"cannot do label indexing "
r"on {klass} with these indexers \[1\.0\] of "
- r"{kind}|"
+ r"type float|"
"Cannot index by location index with a non-integer key".format(
- klass=str(Index), kind=str(float)
+ klass=Index.__name__
)
)
with pytest.raises(TypeError, match=msg):
idxr(s2)[1.0]
- with pytest.raises(KeyError, match=r"^1$"):
+ with pytest.raises(KeyError, match=r"^1\.0$"):
s2.loc[1.0]
result = s2.loc["b"]
@@ -201,7 +203,7 @@ def test_scalar_with_mixed(self):
msg = (
r"cannot do label indexing "
r"on {klass} with these indexers \[1\.0\] of "
- r"{kind}".format(klass=str(Index), kind=str(float))
+ r"type float".format(klass=Index.__name__)
)
with pytest.raises(TypeError, match=msg):
idxr(s3)[1.0]
@@ -213,7 +215,7 @@ def test_scalar_with_mixed(self):
msg = "Cannot index by location index with a non-integer key"
with pytest.raises(TypeError, match=msg):
s3.iloc[1.0]
- with pytest.raises(KeyError, match=r"^1$"):
+ with pytest.raises(KeyError, match=r"^1\.0$"):
s3.loc[1.0]
result = s3.loc[1.5]
@@ -315,7 +317,7 @@ def test_scalar_float(self):
msg = (
r"cannot do positional indexing "
r"on {klass} with these indexers \[3\.0\] of "
- r"{kind}".format(klass=str(Float64Index), kind=str(float))
+ r"type float".format(klass=Float64Index.__name__)
)
with pytest.raises(TypeError, match=msg):
s2.iloc[3.0] = 0
@@ -344,9 +346,9 @@ def test_slice_non_numeric(self):
for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:
msg = (
- "cannot do slice indexing "
+ "cannot do positional indexing "
r"on {klass} with these indexers \[(3|4)\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s.iloc[l]
@@ -354,14 +356,10 @@ def test_slice_non_numeric(self):
for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]:
msg = (
- "cannot do slice indexing "
+ "cannot do (slice|positional) indexing "
r"on {klass} with these indexers "
r"\[(3|4)(\.0)?\] "
- r"of ({kind_float}|{kind_int})".format(
- klass=type(index),
- kind_float=str(float),
- kind_int=str(int),
- )
+ r"of type (float|int)".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
idxr(s)[l]
@@ -370,23 +368,19 @@ def test_slice_non_numeric(self):
for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:
msg = (
- "cannot do slice indexing "
+ "cannot do positional indexing "
r"on {klass} with these indexers \[(3|4)\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s.iloc[l] = 0
for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]:
msg = (
- "cannot do slice indexing "
+ "cannot do (slice|positional) indexing "
r"on {klass} with these indexers "
r"\[(3|4)(\.0)?\] "
- r"of ({kind_float}|{kind_int})".format(
- klass=type(index),
- kind_float=str(float),
- kind_int=str(int),
- )
+ r"of type (float|int)".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
idxr(s)[l] = 0
@@ -426,7 +420,7 @@ def test_slice_integer(self):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[(3|4)\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[l]
@@ -450,7 +444,7 @@ def test_slice_integer(self):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[-6\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[slice(-6.0, 6.0)]
@@ -476,7 +470,7 @@ def test_slice_integer(self):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[(2|3)\.5\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[l]
@@ -494,7 +488,7 @@ def test_slice_integer(self):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[(3|4)\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[l] = 0
@@ -515,9 +509,9 @@ def test_integer_positional_indexing(self):
klass = RangeIndex
msg = (
- "cannot do slice indexing "
+ "cannot do (slice|positional) indexing "
r"on {klass} with these indexers \[(2|4)\.0\] of "
- "{kind}".format(klass=str(klass), kind=str(float))
+ "type float".format(klass=klass.__name__)
)
with pytest.raises(TypeError, match=msg):
idxr(s)[l]
@@ -542,7 +536,7 @@ def f(idxr):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[(0|1)\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[l]
@@ -557,7 +551,7 @@ def f(idxr):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[-10\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[slice(-10.0, 10.0)]
@@ -576,7 +570,7 @@ def f(idxr):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[0\.5\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[l]
@@ -593,7 +587,7 @@ def f(idxr):
msg = (
"cannot do slice indexing "
r"on {klass} with these indexers \[(3|4)\.0\] of "
- "{kind}".format(klass=type(index), kind=str(float))
+ "type float".format(klass=type(index).__name__)
)
with pytest.raises(TypeError, match=msg):
s[l] = 0
@@ -666,11 +660,11 @@ def test_floating_misc(self):
# value not found (and no fallbacking at all)
# scalar integers
- with pytest.raises(KeyError, match=r"^4\.0$"):
+ with pytest.raises(KeyError, match=r"^4$"):
s.loc[4]
- with pytest.raises(KeyError, match=r"^4\.0$"):
+ with pytest.raises(KeyError, match=r"^4$"):
s.loc[4]
- with pytest.raises(KeyError, match=r"^4\.0$"):
+ with pytest.raises(KeyError, match=r"^4$"):
s[4]
# fancy floats/integers create the correct entry (as nan)
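
Most of the churn in this file is the error-message template moving from `str(float)` to the bare type name; the user-facing behavior is unchanged. Sketched:

```python
import numpy as np
import pandas as pd

s = pd.Series(np.arange(5), index=list("abcde"))

s.iloc[3]    # fine: positional integer lookup
s.iloc[3.0]  # TypeError: Cannot index by location index with a non-integer key
```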
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index d67259e8b7d40..08ea4c1579ef8 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -15,6 +15,44 @@
class TestiLoc(Base):
+ def test_iloc_getitem_int(self):
+ # integer
+ self.check_result(
+ "iloc",
+ 2,
+ "iloc",
+ 2,
+ typs=["labels", "mixed", "ts", "floats", "empty"],
+ fails=IndexError,
+ )
+
+ def test_iloc_getitem_neg_int(self):
+ # neg integer
+ self.check_result(
+ "iloc",
+ -1,
+ "iloc",
+ -1,
+ typs=["labels", "mixed", "ts", "floats", "empty"],
+ fails=IndexError,
+ )
+
+ def test_iloc_getitem_list_int(self):
+ self.check_result(
+ "iloc",
+ [0, 1, 2],
+ "iloc",
+ [0, 1, 2],
+ typs=["labels", "mixed", "ts", "floats", "empty"],
+ fails=IndexError,
+ )
+
+ # array of ints (GH5006), make sure that a single indexer is returning
+ # the correct type
+
+
+class TestiLoc2:
+ # TODO: better name, just separating out things that don't rely on the base class
def test_iloc_exceeds_bounds(self):
# GH6296
@@ -135,28 +173,6 @@ def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals):
with pytest.raises(IndexError, match=msg):
df.iloc[index_vals, column_vals]
- def test_iloc_getitem_int(self):
- # integer
- self.check_result(
- "iloc",
- 2,
- "iloc",
- 2,
- typs=["labels", "mixed", "ts", "floats", "empty"],
- fails=IndexError,
- )
-
- def test_iloc_getitem_neg_int(self):
- # neg integer
- self.check_result(
- "iloc",
- -1,
- "iloc",
- -1,
- typs=["labels", "mixed", "ts", "floats", "empty"],
- fails=IndexError,
- )
-
@pytest.mark.parametrize("dims", [1, 2])
def test_iloc_getitem_invalid_scalar(self, dims):
# GH 21982
@@ -183,19 +199,6 @@ def test_iloc_array_not_mutating_negative_indices(self):
df.iloc[:, array_with_neg_numbers]
tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy)
- def test_iloc_getitem_list_int(self):
- self.check_result(
- "iloc",
- [0, 1, 2],
- "iloc",
- [0, 1, 2],
- typs=["labels", "mixed", "ts", "floats", "empty"],
- fails=IndexError,
- )
-
- # array of ints (GH5006), make sure that a single indexer is returning
- # the correct type
-
def test_iloc_getitem_neg_int_can_reach_first_index(self):
# GH10547 and GH10779
# negative integers should be able to reach index 0
@@ -286,7 +289,9 @@ def test_iloc_getitem_slice_dups(self):
tm.assert_frame_equal(df.iloc[10:, 2:], df1)
def test_iloc_setitem(self):
- df = self.frame_ints
+ df = DataFrame(
+ np.random.randn(4, 4), index=np.arange(0, 8, 2), columns=np.arange(0, 12, 3)
+ )
df.iloc[1, 1] = 1
result = df.iloc[1, 1]
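
`test_iloc_setitem` now builds its frame inline instead of using the removed `frame_ints` fixture from the base class. The frame is chosen so positions and labels disagree, which is exactly what makes the test meaningful:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(4, 4), index=np.arange(0, 8, 2), columns=np.arange(0, 12, 3)
)

# iloc is purely positional: position (1, 1) is label (2, 3)
df.iloc[1, 1] = 1
assert df.iloc[1, 1] == 1
assert df.loc[2, 3] == 1
```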
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 1913caae93932..98940b64330b4 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -17,13 +17,13 @@
from pandas.core.generic import NDFrame
from pandas.core.indexers import validate_indices
from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice
-from pandas.tests.indexing.common import Base, _mklbl
+from pandas.tests.indexing.common import _mklbl
# ------------------------------------------------------------------------
# Indexing test cases
-class TestFancy(Base):
+class TestFancy:
""" pure get/set item & fancy indexing """
def test_setitem_ndarray_1d(self):
@@ -137,7 +137,7 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id):
r"Buffer has wrong number of dimensions \(expected 1, "
r"got 3\)|"
"'pandas._libs.interval.IntervalTree' object has no attribute "
- "'set_value'|" # AttributeError
+ "'get_loc'|" # AttributeError
"unhashable type: 'numpy.ndarray'|" # TypeError
"No matching signature found|" # TypeError
r"^\[\[\[|" # pandas.core.indexing.IndexingError
@@ -750,7 +750,7 @@ def test_index_type_coercion(self):
assert s2.index.is_object()
-class TestMisc(Base):
+class TestMisc:
def test_float_index_to_mixed(self):
df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
df["a"] = 10
@@ -875,21 +875,21 @@ def test_indexing_dtypes_on_empty(self):
assert df2.loc[:, "a"].dtype == np.int64
tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0])
- def test_range_in_series_indexing(self):
+ @pytest.mark.parametrize("size", [5, 999999, 1000000])
+ def test_range_in_series_indexing(self, size):
# range can cause an indexing error
# GH 11652
- for x in [5, 999999, 1000000]:
- s = Series(index=range(x), dtype=np.float64)
- s.loc[range(1)] = 42
- tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
+ s = Series(index=range(size), dtype=np.float64)
+ s.loc[range(1)] = 42
+ tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
- s.loc[range(2)] = 43
- tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
+ s.loc[range(2)] = 43
+ tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
- def test_non_reducing_slice(self):
- df = DataFrame([[0, 1], [2, 3]])
-
- slices = [
+ @pytest.mark.parametrize(
+ "slc",
+ [
+ # FIXME: don't leave commented-out
# pd.IndexSlice[:, :],
pd.IndexSlice[:, 1],
pd.IndexSlice[1, :],
@@ -902,10 +902,13 @@ def test_non_reducing_slice(self):
[0, 1],
np.array([0, 1]),
Series([0, 1]),
- ]
- for slice_ in slices:
- tslice_ = _non_reducing_slice(slice_)
- assert isinstance(df.loc[tslice_], DataFrame)
+ ],
+ )
+ def test_non_reducing_slice(self, slc):
+ df = DataFrame([[0, 1], [2, 3]])
+
+ tslice_ = _non_reducing_slice(slc)
+ assert isinstance(df.loc[tslice_], DataFrame)
def test_list_slice(self):
# like dataframe getitem
@@ -965,37 +968,37 @@ class TestSeriesNoneCoercion:
(["foo", "bar", "baz"], [None, "bar", "baz"]),
]
- def test_coercion_with_setitem(self):
- for start_data, expected_result in self.EXPECTED_RESULTS:
- start_series = Series(start_data)
- start_series[0] = None
+ @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+ def test_coercion_with_setitem(self, start_data, expected_result):
+ start_series = Series(start_data)
+ start_series[0] = None
- expected_series = Series(expected_result)
- tm.assert_series_equal(start_series, expected_series)
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
- def test_coercion_with_loc_setitem(self):
- for start_data, expected_result in self.EXPECTED_RESULTS:
- start_series = Series(start_data)
- start_series.loc[0] = None
+ @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+ def test_coercion_with_loc_setitem(self, start_data, expected_result):
+ start_series = Series(start_data)
+ start_series.loc[0] = None
- expected_series = Series(expected_result)
- tm.assert_series_equal(start_series, expected_series)
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
- def test_coercion_with_setitem_and_series(self):
- for start_data, expected_result in self.EXPECTED_RESULTS:
- start_series = Series(start_data)
- start_series[start_series == start_series[0]] = None
+ @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+ def test_coercion_with_setitem_and_series(self, start_data, expected_result):
+ start_series = Series(start_data)
+ start_series[start_series == start_series[0]] = None
- expected_series = Series(expected_result)
- tm.assert_series_equal(start_series, expected_series)
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
- def test_coercion_with_loc_and_series(self):
- for start_data, expected_result in self.EXPECTED_RESULTS:
- start_series = Series(start_data)
- start_series.loc[start_series == start_series[0]] = None
+ @pytest.mark.parametrize("start_data,expected_result", EXPECTED_RESULTS)
+ def test_coercion_with_loc_and_series(self, start_data, expected_result):
+ start_series = Series(start_data)
+ start_series.loc[start_series == start_series[0]] = None
- expected_series = Series(expected_result)
- tm.assert_series_equal(start_series, expected_series)
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
class TestDataframeNoneCoercion:
@@ -1012,31 +1015,35 @@ class TestDataframeNoneCoercion:
(["foo", "bar", "baz"], [None, "bar", "baz"]),
]
- def test_coercion_with_loc(self):
- for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
- start_dataframe = DataFrame({"foo": start_data})
- start_dataframe.loc[0, ["foo"]] = None
+ @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
+ def test_coercion_with_loc(self, expected):
+ start_data, expected_result = expected
+
+ start_dataframe = DataFrame({"foo": start_data})
+ start_dataframe.loc[0, ["foo"]] = None
+
+ expected_dataframe = DataFrame({"foo": expected_result})
+ tm.assert_frame_equal(start_dataframe, expected_dataframe)
+
+ @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
+ def test_coercion_with_setitem_and_dataframe(self, expected):
+ start_data, expected_result = expected
- expected_dataframe = DataFrame({"foo": expected_result})
- tm.assert_frame_equal(start_dataframe, expected_dataframe)
+ start_dataframe = DataFrame({"foo": start_data})
+ start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
- def test_coercion_with_setitem_and_dataframe(self):
- for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
- start_dataframe = DataFrame({"foo": start_data})
- start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
+ expected_dataframe = DataFrame({"foo": expected_result})
+ tm.assert_frame_equal(start_dataframe, expected_dataframe)
- expected_dataframe = DataFrame({"foo": expected_result})
- tm.assert_frame_equal(start_dataframe, expected_dataframe)
+ @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
+ def test_none_coercion_loc_and_dataframe(self, expected):
+ start_data, expected_result = expected
- def test_none_coercion_loc_and_dataframe(self):
- for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
- start_dataframe = DataFrame({"foo": start_data})
- start_dataframe.loc[
- start_dataframe["foo"] == start_dataframe["foo"][0]
- ] = None
+ start_dataframe = DataFrame({"foo": start_data})
+ start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
- expected_dataframe = DataFrame({"foo": expected_result})
- tm.assert_frame_equal(start_dataframe, expected_dataframe)
+ expected_dataframe = DataFrame({"foo": expected_result})
+ tm.assert_frame_equal(start_dataframe, expected_dataframe)
def test_none_coercion_mixed_dtypes(self):
start_dataframe = DataFrame(
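
The parametrized coercion tests above all revolve around one rule: what `None` turns into depends on the column dtype. A sketch (the string case comes from `EXPECTED_RESULTS`; the numeric case is standard pandas coercion):

```python
import pandas as pd

# object dtype stores None as-is
s = pd.Series(["foo", "bar", "baz"])
s[0] = None
print(s[0])         # None

# numeric dtypes coerce None to NaN, upcasting to float
s2 = pd.Series([1, 2, 3])
s2[0] = None
print(s2.tolist())  # [nan, 2.0, 3.0]
```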
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index b9dc96adfa738..3a726fb9923ee 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -13,85 +13,6 @@
class TestLoc(Base):
- def test_loc_getitem_dups(self):
- # GH 5678
- # repeated getitems on a dup index returning a ndarray
- df = DataFrame(
- np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)]
- )
- expected = df.loc["A", 0]
- result = df.loc[:, 0].loc["A"]
- tm.assert_series_equal(result, expected)
-
- def test_loc_getitem_dups2(self):
-
- # GH4726
- # dup indexing with iloc/loc
- df = DataFrame(
- [[1, 2, "foo", "bar", Timestamp("20130101")]],
- columns=["a", "a", "a", "a", "a"],
- index=[1],
- )
- expected = Series(
- [1, 2, "foo", "bar", Timestamp("20130101")],
- index=["a", "a", "a", "a", "a"],
- name=1,
- )
-
- result = df.iloc[0]
- tm.assert_series_equal(result, expected)
-
- result = df.loc[1]
- tm.assert_series_equal(result, expected)
-
- def test_loc_setitem_dups(self):
-
- # GH 6541
- df_orig = DataFrame(
- {
- "me": list("rttti"),
- "foo": list("aaade"),
- "bar": np.arange(5, dtype="float64") * 1.34 + 2,
- "bar2": np.arange(5, dtype="float64") * -0.34 + 2,
- }
- ).set_index("me")
-
- indexer = tuple(["r", ["bar", "bar2"]])
- df = df_orig.copy()
- df.loc[indexer] *= 2.0
- tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
-
- indexer = tuple(["r", "bar"])
- df = df_orig.copy()
- df.loc[indexer] *= 2.0
- assert df.loc[indexer] == 2.0 * df_orig.loc[indexer]
-
- indexer = tuple(["t", ["bar", "bar2"]])
- df = df_orig.copy()
- df.loc[indexer] *= 2.0
- tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
-
- def test_loc_setitem_slice(self):
- # GH10503
-
- # assigning the same type should not change the type
- df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")})
- ix = df1["a"] == 1
- newb1 = df1.loc[ix, "b"] + 1
- df1.loc[ix, "b"] = newb1
- expected = DataFrame(
- {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")}
- )
- tm.assert_frame_equal(df1, expected)
-
- # assigning a new type should get the inferred type
- df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
- ix = df1["a"] == 1
- newb2 = df2.loc[ix, "b"]
- df1.loc[ix, "b"] = newb2
- expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
- tm.assert_frame_equal(df2, expected)
-
def test_loc_getitem_int(self):
# int label
@@ -162,17 +83,6 @@ def test_loc_getitem_label_list_with_missing(self):
fails=KeyError,
)
- def test_getitem_label_list_with_missing(self):
- s = Series(range(3), index=["a", "b", "c"])
-
- # consistency
- with pytest.raises(KeyError, match="with any missing labels"):
- s[["a", "d"]]
-
- s = Series(range(3))
- with pytest.raises(KeyError, match="with any missing labels"):
- s[[0, 3]]
-
def test_loc_getitem_label_list_fails(self):
# fails
self.check_result(
@@ -196,6 +106,168 @@ def test_loc_getitem_bool(self):
self.check_result("loc", b, "loc", b, typs=["empty"], fails=IndexError)
+ def test_loc_getitem_label_slice(self):
+
+ # label slices (with ints)
+
+ # real label slices
+
+ # GH 14316
+
+ self.check_result(
+ "loc",
+ slice(1, 3),
+ "loc",
+ slice(1, 3),
+ typs=["labels", "mixed", "empty", "ts", "floats"],
+ fails=TypeError,
+ )
+
+ self.check_result(
+ "loc",
+ slice("20130102", "20130104"),
+ "loc",
+ slice("20130102", "20130104"),
+ typs=["ts"],
+ axes=1,
+ fails=TypeError,
+ )
+
+ self.check_result(
+ "loc",
+ slice(2, 8),
+ "loc",
+ slice(2, 8),
+ typs=["mixed"],
+ axes=0,
+ fails=TypeError,
+ )
+ self.check_result(
+ "loc",
+ slice(2, 8),
+ "loc",
+ slice(2, 8),
+ typs=["mixed"],
+ axes=1,
+ fails=KeyError,
+ )
+
+ self.check_result(
+ "loc",
+ slice(2, 4, 2),
+ "loc",
+ slice(2, 4, 2),
+ typs=["mixed"],
+ axes=0,
+ fails=TypeError,
+ )
+
+
+class TestLoc2:
+ # TODO: better name, just separating out things that don't rely on the base class
+
+ def test_loc_getitem_dups(self):
+ # GH 5678
+ # repeated getitems on a dup index returning a ndarray
+ df = DataFrame(
+ np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)]
+ )
+ expected = df.loc["A", 0]
+ result = df.loc[:, 0].loc["A"]
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_getitem_dups2(self):
+
+ # GH4726
+ # dup indexing with iloc/loc
+ df = DataFrame(
+ [[1, 2, "foo", "bar", Timestamp("20130101")]],
+ columns=["a", "a", "a", "a", "a"],
+ index=[1],
+ )
+ expected = Series(
+ [1, 2, "foo", "bar", Timestamp("20130101")],
+ index=["a", "a", "a", "a", "a"],
+ name=1,
+ )
+
+ result = df.iloc[0]
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc[1]
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_setitem_dups(self):
+
+ # GH 6541
+ df_orig = DataFrame(
+ {
+ "me": list("rttti"),
+ "foo": list("aaade"),
+ "bar": np.arange(5, dtype="float64") * 1.34 + 2,
+ "bar2": np.arange(5, dtype="float64") * -0.34 + 2,
+ }
+ ).set_index("me")
+
+ indexer = tuple(["r", ["bar", "bar2"]])
+ df = df_orig.copy()
+ df.loc[indexer] *= 2.0
+ tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+ indexer = tuple(["r", "bar"])
+ df = df_orig.copy()
+ df.loc[indexer] *= 2.0
+ assert df.loc[indexer] == 2.0 * df_orig.loc[indexer]
+
+ indexer = tuple(["t", ["bar", "bar2"]])
+ df = df_orig.copy()
+ df.loc[indexer] *= 2.0
+ tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+ def test_loc_setitem_slice(self):
+ # GH10503
+
+ # assigning the same type should not change the type
+ df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")})
+ ix = df1["a"] == 1
+ newb1 = df1.loc[ix, "b"] + 1
+ df1.loc[ix, "b"] = newb1
+ expected = DataFrame(
+ {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")}
+ )
+ tm.assert_frame_equal(df1, expected)
+
+ # assigning a new type should get the inferred type
+ df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
+ ix = df1["a"] == 1
+ newb2 = df2.loc[ix, "b"]
+ df1.loc[ix, "b"] = newb2
+ expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
+ tm.assert_frame_equal(df2, expected)
+
+ def test_loc_setitem_dtype(self):
+ # GH31340
+ df = DataFrame({"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]})
+ cols = ["a", "b", "c"]
+ df.loc[:, cols] = df.loc[:, cols].astype("float32")
+
+ expected = DataFrame(
+ {"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}, dtype="float32"
+ ) # id is inferred as object
+
+ tm.assert_frame_equal(df, expected)
+
+ def test_getitem_label_list_with_missing(self):
+ s = Series(range(3), index=["a", "b", "c"])
+
+ # consistency
+ with pytest.raises(KeyError, match="with any missing labels"):
+ s[["a", "d"]]
+
+ s = Series(range(3))
+ with pytest.raises(KeyError, match="with any missing labels"):
+ s[[0, 3]]
+
@pytest.mark.parametrize("index", [[True, False], [True, False, True, False]])
def test_loc_getitem_bool_diff_len(self, index):
# GH26658
@@ -297,62 +369,6 @@ def test_loc_getitem_list_with_fail(self):
with pytest.raises(KeyError, match="with any missing labels"):
s.loc[[2, 3]]
- def test_loc_getitem_label_slice(self):
-
- # label slices (with ints)
-
- # real label slices
-
- # GH 14316
-
- self.check_result(
- "loc",
- slice(1, 3),
- "loc",
- slice(1, 3),
- typs=["labels", "mixed", "empty", "ts", "floats"],
- fails=TypeError,
- )
-
- self.check_result(
- "loc",
- slice("20130102", "20130104"),
- "loc",
- slice("20130102", "20130104"),
- typs=["ts"],
- axes=1,
- fails=TypeError,
- )
-
- self.check_result(
- "loc",
- slice(2, 8),
- "loc",
- slice(2, 8),
- typs=["mixed"],
- axes=0,
- fails=TypeError,
- )
- self.check_result(
- "loc",
- slice(2, 8),
- "loc",
- slice(2, 8),
- typs=["mixed"],
- axes=1,
- fails=KeyError,
- )
-
- self.check_result(
- "loc",
- slice(2, 4, 2),
- "loc",
- slice(2, 4, 2),
- typs=["mixed"],
- axes=0,
- fails=TypeError,
- )
-
def test_loc_index(self):
# gh-17131
# a boolean index should index like a boolean numpy array
@@ -559,7 +575,7 @@ def test_loc_modify_datetime(self):
tm.assert_frame_equal(df, expected)
def test_loc_setitem_frame(self):
- df = self.frame_labels
+ df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD"))
result = df.iloc[0, 0]
diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py
index a567fb9b8ccc7..3622b12b853a4 100644
--- a/pandas/tests/indexing/test_scalar.py
+++ b/pandas/tests/indexing/test_scalar.py
@@ -65,6 +65,10 @@ def _check(f, func, values=False):
for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]:
_check(f, "at")
+
+class TestScalar2:
+    # TODO: Better name; just separating tests that don't need the Base class
+
def test_at_iat_coercion(self):
# as timestamp is not a tuple!
@@ -125,38 +129,79 @@ def test_imethods_with_dups(self):
result = df.iat[2, 0]
assert result == 2
- def test_at_to_fail(self):
+ def test_series_at_raises_type_error(self):
# at should not fallback
# GH 7814
- s = Series([1, 2, 3], index=list("abc"))
- result = s.at["a"]
+ # GH#31724 .at should match .loc
+ ser = Series([1, 2, 3], index=list("abc"))
+ result = ser.at["a"]
assert result == 1
+ result = ser.loc["a"]
+ assert result == 1
+
msg = (
- "At based indexing on an non-integer index can only have "
- "non-integer indexers"
+ "cannot do label indexing on Index "
+ r"with these indexers \[0\] of type int"
)
- with pytest.raises(ValueError, match=msg):
- s.at[0]
+ with pytest.raises(TypeError, match=msg):
+ ser.at[0]
+ with pytest.raises(TypeError, match=msg):
+ ser.loc[0]
+
+    def test_frame_at_raises_type_error(self):
+ # GH#31724 .at should match .loc
df = DataFrame({"A": [1, 2, 3]}, index=list("abc"))
result = df.at["a", "A"]
assert result == 1
- with pytest.raises(ValueError, match=msg):
+ result = df.loc["a", "A"]
+ assert result == 1
+
+ msg = (
+ "cannot do label indexing on Index "
+ r"with these indexers \[0\] of type int"
+ )
+ with pytest.raises(TypeError, match=msg):
df.at["a", 0]
+ with pytest.raises(TypeError, match=msg):
+ df.loc["a", 0]
- s = Series([1, 2, 3], index=[3, 2, 1])
- result = s.at[1]
+ def test_series_at_raises_key_error(self):
+ # GH#31724 .at should match .loc
+
+ ser = Series([1, 2, 3], index=[3, 2, 1])
+ result = ser.at[1]
+ assert result == 3
+ result = ser.loc[1]
assert result == 3
- msg = "At based indexing on an integer index can only have integer indexers"
- with pytest.raises(ValueError, match=msg):
- s.at["a"]
+
+ with pytest.raises(KeyError, match="a"):
+ ser.at["a"]
+ with pytest.raises(KeyError, match="a"):
+ # .at should match .loc
+ ser.loc["a"]
+
+ def test_frame_at_raises_key_error(self):
+ # GH#31724 .at should match .loc
df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1])
+
result = df.at[1, 0]
assert result == 3
- with pytest.raises(ValueError, match=msg):
+ result = df.loc[1, 0]
+ assert result == 3
+
+ with pytest.raises(KeyError, match="a"):
df.at["a", 0]
+ with pytest.raises(KeyError, match="a"):
+ df.loc["a", 0]
+
+ with pytest.raises(KeyError, match="a"):
+ df.at[1, "a"]
+ with pytest.raises(KeyError, match="a"):
+ df.loc[1, "a"]
+
+    # TODO: belongs somewhere else?
+ def test_getitem_list_missing_key(self):
# GH 13822, incorrect error string with non-unique columns when missing
# column is accessed
df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]})
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index f7b49ccb1a72d..91665a24fc4c5 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -1048,6 +1048,27 @@ def test_invalid_columns(self, path):
):
write_frame.to_excel(path, "test1", columns=["C", "D"])
+ @pytest.mark.parametrize(
+ "to_excel_index,read_excel_index_col",
+ [
+            (True, 0),  # Include the index when writing to the file
+            (False, None),  # Don't include the index when writing to the file
+ ],
+ )
+ def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col):
+ # GH 31677
+ write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]})
+ write_frame.to_excel(
+ path, "col_subset_bug", columns=["A", "B"], index=to_excel_index
+ )
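+        # only columns A and B were written; C must not appear when read back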
+
+ expected = write_frame[["A", "B"]]
+ read_frame = pd.read_excel(
+ path, "col_subset_bug", index_col=read_excel_index_col
+ )
+
+ tm.assert_frame_equal(expected, read_frame)
+
def test_comment_arg(self, path):
# see gh-18735
#
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index 7650561d3072d..bf7b98eb78f11 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -239,6 +239,15 @@ def test_repr_truncation(self):
with option_context("display.max_colwidth", max_len + 2):
assert "..." not in repr(df)
+ def test_repr_deprecation_negative_int(self):
+ # FIXME: remove in future version after deprecation cycle
+ # Non-regression test for:
+ # https://github.com/pandas-dev/pandas/issues/31532
+ width = get_option("display.max_colwidth")
+ with tm.assert_produces_warning(FutureWarning):
+ set_option("display.max_colwidth", -1)
+ set_option("display.max_colwidth", width)
+
def test_repr_chop_threshold(self):
df = DataFrame([[0.1, 0.5], [0.5, -0.1]])
pd.reset_option("display.chop_threshold") # default None
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 94d51589023c4..f2d35bfb3b5ae 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1662,3 +1662,22 @@ def test_json_multiindex(self, dataframe, expected):
series = dataframe.stack()
result = series.to_json(orient="index")
assert result == expected
+
+ def test_to_s3(self, s3_resource):
+ # GH 28375
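+        # s3_resource is a mocked S3 fixture (moto-backed in pandas' test suite)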
+ mock_bucket_name, target_file = "pandas-test", "test.json"
+ df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
+ df.to_json(f"s3://{mock_bucket_name}/{target_file}")
+ assert target_file in (
+            obj.key for obj in s3_resource.Bucket(mock_bucket_name).objects.all()
+ )
+
+ def test_json_pandas_na(self):
+ # GH 31615
+ result = pd.DataFrame([[pd.NA]]).to_json()
+ assert result == '{"0":{"0":null}}'
+
+ def test_json_pandas_nulls(self, nulls_fixture):
+ # GH 31615
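+        # nulls_fixture parametrizes over null-like scalars (e.g. None, np.nan, NaT, NA)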
+ result = pd.DataFrame([[nulls_fixture]]).to_json()
+ assert result == '{"0":{"0":null}}'
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 6c17f40b790ac..c19056d434ec3 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -2040,6 +2040,17 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
pass
+def test_read_csv_raises_on_header_prefix(all_parsers):
+ # gh-27394
+ parser = all_parsers
+ msg = "Argument prefix must be None if argument header is not None"
+
+ s = StringIO("0,1\n2,3")
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(s, header=0, prefix="_X")
+
+
def test_read_table_equivalency_to_read_csv(all_parsers):
# see gh-21948
# As of 0.25.0, read_table is undeprecated
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 406e7bedfd298..13f72a0414bac 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -141,6 +141,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
)
def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
# gh-23779: Python csv engine shouldn't error on files opened in binary.
+ # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
parser = all_parsers
fpath = os.path.join(csv_dir_path, fname)
@@ -154,6 +155,10 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
result = parser.read_csv(fb, encoding=encoding)
tm.assert_frame_equal(expected, result)
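+    # buffering=0 returns a raw, unbuffered io.FileIO object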
+ with open(fpath, mode="rb", buffering=0) as fb:
+ result = parser.read_csv(fb, encoding=encoding)
+ tm.assert_frame_equal(expected, result)
+
@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index b01b22e811ee3..073af758f0b29 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1516,3 +1516,15 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
assert except_out_dateutil == except_in_dateutil
assert result == expected
+
+
+@pytest.mark.parametrize("parse_dates", [["time", ], {"date": ["time", ]}])
+def test_missing_column(all_parsers, parse_dates):
+ """GH31251 column names provided in parse_dates could be missing."""
+ parser = all_parsers
+ content = StringIO("time,val\n2020-01-31,32\n")
+ msg = "Missing column provided to 'parse_dates': 'time'"
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(
+ content, sep=",", usecols=["val", ], parse_dates=parse_dates,
+ )
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index d7a21b27308e8..404f5a477187b 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -141,24 +141,7 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext):
pytest.importorskip(module)
path = os.path.join(HERE, "data", "does_not_exist." + fn_ext)
- msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext)
- msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'"
- msg3 = "Expected object or value"
- msg4 = "path_or_buf needs to be a string file path or file-like"
- msg5 = (
- fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: "
- fr"'.+does_not_exist\.{fn_ext}'"
- )
- msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'"
- msg7 = (
- fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'"
- )
- msg8 = fr"Failed to open local file.+does_not_exist\.{fn_ext}.?, error: .*"
-
- with pytest.raises(
- error_class,
- match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7}|{msg8})",
- ):
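+        # external_error_raised checks only the exception type, not its message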
+ with tm.external_error_raised(error_class):
reader(path)
@pytest.mark.parametrize(
@@ -184,24 +167,7 @@ def test_read_expands_user_home_dir(
path = os.path.join("~", "does_not_exist." + fn_ext)
monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x))
- msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist"
- msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'"
- msg3 = "Unexpected character found when decoding 'false'"
- msg4 = "path_or_buf needs to be a string file path or file-like"
- msg5 = (
- fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: "
- fr"'.+does_not_exist\.{fn_ext}'"
- )
- msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'"
- msg7 = (
- fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'"
- )
- msg8 = fr"Failed to open local file.+does_not_exist\.{fn_ext}.?, error: .*"
-
- with pytest.raises(
- error_class,
- match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7}|{msg8})",
- ):
+ with tm.external_error_raised(error_class):
reader(path)
@pytest.mark.parametrize(
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d51c712ed5abd..7ed8d8f22764c 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -564,6 +564,13 @@ def test_additional_extension_types(self, pa):
)
check_round_trip(df, pa)
+ @td.skip_if_no("pyarrow", min_version="0.14")
+ def test_timestamp_nanoseconds(self, pa):
+        # with parquet format version "2.0", pyarrow defaults to writing
+        # nanosecond timestamps, so this should work without error
+ df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)})
+ check_round_trip(df, pa, write_kwargs={"version": "2.0"})
+
class TestParquetFastParquet(Base):
@td.skip_if_no("fastparquet", min_version="0.3.2")
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 04fd4835469a9..78b630bb5ada1 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -60,9 +60,7 @@ def compare_element(result, expected, typ, version=None):
assert result == expected
assert result.freq == expected.freq
else:
- comparator = getattr(
- tm, "assert_{typ}_equal".format(typ=typ), tm.assert_almost_equal
- )
+ comparator = getattr(tm, f"assert_{typ}_equal", tm.assert_almost_equal)
comparator(result, expected)
@@ -77,7 +75,7 @@ def compare(data, vf, version):
# use a specific comparator
# if available
- comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
+ comparator = f"compare_{typ}_{dt}"
comparator = m.get(comparator, m["compare_element"])
comparator(result, expected, typ, version)
@@ -234,7 +232,7 @@ def test_legacy_sparse_warning(datapath):
@pytest.fixture
def get_random_path():
- return "__{}__.pickle".format(tm.rands(10))
+ return f"__{tm.rands(10)}__.pickle"
class TestCompression:
@@ -262,7 +260,7 @@ def compress_file(self, src_path, dest_path, compression):
elif compression == "xz":
f = _get_lzma_file(lzma)(dest_path, "w")
else:
- msg = "Unrecognized compression type: {}".format(compression)
+ msg = f"Unrecognized compression type: {compression}"
raise ValueError(msg)
if compression != "zip":
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index 9cd3ccbf9214e..e54f4784e9c4f 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -8,6 +8,7 @@
import pandas._config.config as cf
from pandas.compat.numpy import np_datetime64_compat
+import pandas.util._test_decorators as td
from pandas import Index, Period, Series, Timestamp, date_range
import pandas._testing as tm
@@ -59,6 +60,7 @@ def test_register_by_default(self):
call = [sys.executable, "-c", code]
assert subprocess.check_call(call) == 0
+ @td.skip_if_no("matplotlib", min_version="3.1.3")
def test_registering_no_warning(self):
plt = pytest.importorskip("matplotlib.pyplot")
s = Series(range(12), index=date_range("2017", periods=12))
@@ -66,9 +68,7 @@ def test_registering_no_warning(self):
# Set to the "warn" state, in case this isn't the first test run
register_matplotlib_converters()
- with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
- # GH#30588 DeprecationWarning from 2D indexing
- ax.plot(s.index, s.values)
+ ax.plot(s.index, s.values)
def test_pandas_plots_register(self):
pytest.importorskip("matplotlib.pyplot")
@@ -91,6 +91,7 @@ def test_matplotlib_formatters(self):
assert Timestamp not in units.registry
assert Timestamp in units.registry
+ @td.skip_if_no("matplotlib", min_version="3.1.3")
def test_option_no_warning(self):
pytest.importorskip("matplotlib.pyplot")
ctx = cf.option_context("plotting.matplotlib.register_converters", False)
@@ -100,15 +101,12 @@ def test_option_no_warning(self):
# Test without registering first, no warning
with ctx:
- # GH#30588 DeprecationWarning from 2D indexing on Index
- with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
- ax.plot(s.index, s.values)
+ ax.plot(s.index, s.values)
# Now test with registering
register_matplotlib_converters()
with ctx:
- with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
- ax.plot(s.index, s.values)
+ ax.plot(s.index, s.values)
def test_registry_resets(self):
units = pytest.importorskip("matplotlib.units")
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index 84d298cd7c6fe..979b89a87d843 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -43,19 +43,19 @@ def setup_method(self, method):
def teardown_method(self, method):
tm.close()
- # Ignore warning
- # ```
- # Converting to PeriodArray/Index representation will drop timezone information.
- # ```
- # which occurs for UTC-like timezones.
@pytest.mark.slow
- @pytest.mark.filterwarnings("ignore:msg:UserWarning")
def test_ts_plot_with_tz(self, tz_aware_fixture):
- # GH2877, GH17173
+ # GH2877, GH17173, GH31205, GH31580
tz = tz_aware_fixture
index = date_range("1/1/2011", periods=2, freq="H", tz=tz)
ts = Series([188.5, 328.25], index=index)
- _check_plot_works(ts.plot)
+ with tm.assert_produces_warning(None):
+ _check_plot_works(ts.plot)
+ ax = ts.plot()
+ xdata = list(ax.get_lines())[0].get_xdata()
+ # Check first and last points' labels are correct
+ assert (xdata[0].hour, xdata[0].minute) == (0, 0)
+ assert (xdata[-1].hour, xdata[-1].minute) == (1, 0)
def test_fontsize_set_correctly(self):
# For issue #8765
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index f9acf5b60a3cd..fd189c7435b29 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -371,10 +371,8 @@ def test_no_overlap_more_informative_error(self):
msg = (
"No common columns to perform merge on. "
- "Merge options: left_on={lon}, right_on={ron}, "
- "left_index={lidx}, right_index={ridx}".format(
- lon=None, ron=None, lidx=False, ridx=False
- )
+ f"Merge options: left_on={None}, right_on={None}, "
+ f"left_index={False}, right_index={False}"
)
with pytest.raises(MergeError, match=msg):
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
index 13b6f05ed304a..830e786fd1c6d 100644
--- a/pandas/tests/reshape/test_cut.py
+++ b/pandas/tests/reshape/test_cut.py
@@ -612,3 +612,16 @@ def test_cut_incorrect_labels(labels):
msg = "Bin labels must either be False, None or passed in as a list-like argument"
with pytest.raises(ValueError, match=msg):
cut(values, 4, labels=labels)
+
+
+@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
+@pytest.mark.parametrize("right", [True, False])
+@pytest.mark.parametrize("include_lowest", [True, False])
+def test_cut_nullable_integer(bins, right, include_lowest):
+ a = np.random.randint(0, 10, size=50).astype(float)
+ a[::2] = np.nan
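+    # cutting the nullable Int64 array should match cutting its float equivalent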
+ result = cut(
+ pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest
+ )
+ expected = cut(a, bins, right=right, include_lowest=include_lowest)
+ tm.assert_categorical_equal(result, expected)
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
index 95406a5ebf4f7..c436ab5d90578 100644
--- a/pandas/tests/reshape/test_qcut.py
+++ b/pandas/tests/reshape/test_qcut.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+import pandas as pd
from pandas import (
Categorical,
DatetimeIndex,
@@ -286,3 +287,14 @@ def test_qcut_bool_coercion_to_int(bins, box, compare):
expected = qcut(data_expected, bins, duplicates="drop")
result = qcut(data_result, bins, duplicates="drop")
compare(result, expected)
+
+
+@pytest.mark.parametrize("q", [2, 5, 10])
+def test_qcut_nullable_integer(q, any_nullable_int_dtype):
+ arr = pd.array(np.arange(100), dtype=any_nullable_int_dtype)
+ arr[::2] = pd.NA
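+    # qcut on the nullable array should match qcut on its float equivalent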
+
+ result = qcut(arr, q)
+ expected = qcut(arr.astype(float), q)
+
+ tm.assert_categorical_equal(result, expected)
diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py
index 357274e724c68..436810042186a 100644
--- a/pandas/tests/scalar/period/test_asfreq.py
+++ b/pandas/tests/scalar/period/test_asfreq.py
@@ -3,7 +3,7 @@
from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_code_map
from pandas.errors import OutOfBoundsDatetime
-from pandas import Period, offsets
+from pandas import Period, Timestamp, offsets
class TestFreqConversion:
@@ -656,6 +656,23 @@ def test_conv_secondly(self):
assert ival_S.asfreq("S") == ival_S
+ def test_conv_microsecond(self):
+ # GH#31475 Avoid floating point errors dropping the start_time to
+ # before the beginning of the Period
+ per = Period("2020-01-30 15:57:27.576166", freq="U")
+ assert per.ordinal == 1580399847576166
+
+ start = per.start_time
+ expected = Timestamp("2020-01-30 15:57:27.576166")
+ assert start == expected
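+        # "U"-freq ordinals count microseconds since the epoch; .value is in ns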
+ assert start.value == per.ordinal * 1000
+
+ per2 = Period("2300-01-01", "us")
+ with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"):
+ per2.start_time
+ with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"):
+ per2.end_time
+
def test_asfreq_mult(self):
# normal freq to mult freq
p = Period(freq="A", year=2007)
diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index bbc81e0dbb6e6..995d47c1473be 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -925,7 +925,7 @@ def test_properties_secondly(self):
class TestPeriodField:
def test_get_period_field_array_raises_on_out_of_range(self):
- msg = "Buffer dtype mismatch, expected 'int64_t' but got 'double'"
+ msg = "Buffer dtype mismatch, expected 'const int64_t' but got 'double'"
with pytest.raises(ValueError, match=msg):
libperiod.get_period_field_arr(-1, np.empty(1), 0)
diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py
index ae1e84576c092..25c9fc19981be 100644
--- a/pandas/tests/scalar/timedelta/test_constructors.py
+++ b/pandas/tests/scalar/timedelta/test_constructors.py
@@ -274,3 +274,10 @@ def test_td_constructor_on_nanoseconds(constructed_td, conversion):
def test_td_constructor_value_error():
with pytest.raises(TypeError):
Timedelta(nanoseconds="abc")
+
+
+def test_timedelta_constructor_identity():
+ # Test for #30543
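+    # constructing a Timedelta from a Timedelta should return the same object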
+ expected = Timedelta(np.timedelta64(1, "s"))
+ result = Timedelta(expected)
+ assert result is expected
diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py
index e1d965bbb14e9..9cdbeb6ab4845 100644
--- a/pandas/tests/scalar/timedelta/test_timedelta.py
+++ b/pandas/tests/scalar/timedelta/test_timedelta.py
@@ -821,3 +821,16 @@ def test_resolution_deprecated(self):
def test_truthiness(value, expected):
# https://github.com/pandas-dev/pandas/issues/21484
assert bool(value) is expected
+
+
+def test_timedelta_attribute_precision():
+ # GH 31354
+ td = Timedelta(1552211999999999872, unit="ns")
+ result = td.days * 86400
+ result += td.seconds
+ result *= 1000000
+ result += td.microseconds
+ result *= 1000
+ result += td.nanoseconds
+ expected = td.value
+ assert result == expected
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
new file mode 100644
index 0000000000000..737a85faa4c9b
--- /dev/null
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -0,0 +1,552 @@
+import calendar
+from datetime import datetime, timedelta
+
+import dateutil.tz
+from dateutil.tz import tzutc
+import numpy as np
+import pytest
+import pytz
+
+from pandas.errors import OutOfBoundsDatetime
+
+from pandas import Period, Timedelta, Timestamp, compat
+
+from pandas.tseries import offsets
+
+
+class TestTimestampConstructors:
+ def test_constructor(self):
+ base_str = "2014-07-01 09:00"
+ base_dt = datetime(2014, 7, 1, 9)
+ base_expected = 1_404_205_200_000_000_000
+
+ # confirm base representation is correct
+ assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected
+
+ tests = [
+ (base_str, base_dt, base_expected),
+ (
+ "2014-07-01 10:00",
+ datetime(2014, 7, 1, 10),
+ base_expected + 3600 * 1_000_000_000,
+ ),
+ (
+ "2014-07-01 09:00:00.000008000",
+ datetime(2014, 7, 1, 9, 0, 0, 8),
+ base_expected + 8000,
+ ),
+ (
+ "2014-07-01 09:00:00.000000005",
+ Timestamp("2014-07-01 09:00:00.000000005"),
+ base_expected + 5,
+ ),
+ ]
+
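+        # (tz, hours offset from UTC) pairs; offsets reflect DST for July 2014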
+ timezones = [
+ (None, 0),
+ ("UTC", 0),
+ (pytz.utc, 0),
+ ("Asia/Tokyo", 9),
+ ("US/Eastern", -4),
+ ("dateutil/US/Pacific", -7),
+ (pytz.FixedOffset(-180), -3),
+ (dateutil.tz.tzoffset(None, 18000), 5),
+ ]
+
+ for date_str, date, expected in tests:
+ for result in [Timestamp(date_str), Timestamp(date)]:
+                # without a timezone
+ assert result.value == expected
+
+                # re-creation shouldn't affect the internal value
+ result = Timestamp(result)
+ assert result.value == expected
+
+ # with timezone
+ for tz, offset in timezones:
+ for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]:
+ expected_tz = expected - offset * 3600 * 1_000_000_000
+ assert result.value == expected_tz
+
+ # should preserve tz
+ result = Timestamp(result)
+ assert result.value == expected_tz
+
+ # should convert to UTC
+ if tz is not None:
+ result = Timestamp(result).tz_convert("UTC")
+ else:
+ result = Timestamp(result, tz="UTC")
+ expected_utc = expected - offset * 3600 * 1_000_000_000
+ assert result.value == expected_utc
+
+ def test_constructor_with_stringoffset(self):
+ # GH 7833
+ base_str = "2014-07-01 11:00:00+02:00"
+ base_dt = datetime(2014, 7, 1, 9)
+ base_expected = 1_404_205_200_000_000_000
+
+ # confirm base representation is correct
+ assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected
+
+ tests = [
+ (base_str, base_expected),
+ ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1_000_000_000),
+ ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000),
+ ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5),
+ ]
+
+ timezones = [
+ (None, 0),
+ ("UTC", 0),
+ (pytz.utc, 0),
+ ("Asia/Tokyo", 9),
+ ("US/Eastern", -4),
+ ("dateutil/US/Pacific", -7),
+ (pytz.FixedOffset(-180), -3),
+ (dateutil.tz.tzoffset(None, 18000), 5),
+ ]
+
+ for date_str, expected in tests:
+ for result in [Timestamp(date_str)]:
+                # without a timezone
+ assert result.value == expected
+
+                # re-creation shouldn't affect the internal value
+ result = Timestamp(result)
+ assert result.value == expected
+
+ # with timezone
+ for tz, offset in timezones:
+ result = Timestamp(date_str, tz=tz)
+ expected_tz = expected
+ assert result.value == expected_tz
+
+ # should preserve tz
+ result = Timestamp(result)
+ assert result.value == expected_tz
+
+ # should convert to UTC
+ result = Timestamp(result).tz_convert("UTC")
+ expected_utc = expected
+ assert result.value == expected_utc
+
+ # This should be 2013-11-01 05:00 in UTC
+ # converted to Chicago tz
+ result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago")
+ assert result.value == Timestamp("2013-11-01 05:00").value
+ expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ # This should be 2013-11-01 05:00 in UTC
+ # converted to Tokyo tz (+09:00)
+ result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo")
+ assert result.value == Timestamp("2013-11-01 05:00").value
+ expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')"
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ # GH11708
+ # This should be 2015-11-18 10:00 in UTC
+ # converted to Asia/Katmandu
+ result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu")
+ assert result.value == Timestamp("2015-11-18 10:00").value
+ expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')"
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ # This should be 2015-11-18 10:00 in UTC
+ # converted to Asia/Kolkata
+ result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata")
+ assert result.value == Timestamp("2015-11-18 10:00").value
+ expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')"
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ def test_constructor_invalid(self):
+ with pytest.raises(TypeError, match="Cannot convert input"):
+ Timestamp(slice(2))
+ with pytest.raises(ValueError, match="Cannot convert Period"):
+ Timestamp(Period("1000-01-01"))
+
+ def test_constructor_invalid_tz(self):
+ # GH#17690
+ with pytest.raises(TypeError, match="must be a datetime.tzinfo"):
+ Timestamp("2017-10-22", tzinfo="US/Eastern")
+
+ with pytest.raises(ValueError, match="at most one of"):
+ Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC")
+
+ with pytest.raises(ValueError, match="Invalid frequency:"):
+ # GH#5168
+            # case where the user passes tz positionally rather than as a
+            # keyword, so it gets interpreted as a `freq`
+ Timestamp("2012-01-01", "US/Pacific")
+
+ def test_constructor_strptime(self):
+ # GH25016
+ # Test support for Timestamp.strptime
+ fmt = "%Y%m%d-%H%M%S-%f%z"
+ ts = "20190129-235348-000001+0000"
+ with pytest.raises(NotImplementedError):
+ Timestamp.strptime(ts, fmt)
+
+ def test_constructor_tz_or_tzinfo(self):
+ # GH#17943, GH#17690, GH#5168
+ stamps = [
+ Timestamp(year=2017, month=10, day=22, tz="UTC"),
+ Timestamp(year=2017, month=10, day=22, tzinfo=pytz.utc),
+ Timestamp(year=2017, month=10, day=22, tz=pytz.utc),
+ Timestamp(datetime(2017, 10, 22), tzinfo=pytz.utc),
+ Timestamp(datetime(2017, 10, 22), tz="UTC"),
+ Timestamp(datetime(2017, 10, 22), tz=pytz.utc),
+ ]
+ assert all(ts == stamps[0] for ts in stamps)
+
+ def test_constructor_positional(self):
+ # see gh-10758
+ with pytest.raises(TypeError):
+ Timestamp(2000, 1)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 0, 1)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 13, 1)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 1, 0)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 1, 32)
+
+ # see gh-11630
+ assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112"))
+ assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr(
+ Timestamp("2015-11-12 01:02:03.999999")
+ )
+
+ def test_constructor_keyword(self):
+ # GH 10758
+ with pytest.raises(TypeError):
+ Timestamp(year=2000, month=1)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=0, day=1)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=13, day=1)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=1, day=0)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=1, day=32)
+
+ assert repr(Timestamp(year=2015, month=11, day=12)) == repr(
+ Timestamp("20151112")
+ )
+
+ assert repr(
+ Timestamp(
+ year=2015,
+ month=11,
+ day=12,
+ hour=1,
+ minute=2,
+ second=3,
+ microsecond=999999,
+ )
+ ) == repr(Timestamp("2015-11-12 01:02:03.999999"))
+
+ def test_constructor_fromordinal(self):
+ base = datetime(2000, 1, 1)
+
+ ts = Timestamp.fromordinal(base.toordinal(), freq="D")
+ assert base == ts
+ assert ts.freq == "D"
+ assert base.toordinal() == ts.toordinal()
+
+ ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern")
+ assert Timestamp("2000-01-01", tz="US/Eastern") == ts
+ assert base.toordinal() == ts.toordinal()
+
+ # GH#3042
+ dt = datetime(2011, 4, 16, 0, 0)
+ ts = Timestamp.fromordinal(dt.toordinal())
+ assert ts.to_pydatetime() == dt
+
+ # with a tzinfo
+ stamp = Timestamp("2011-4-16", tz="US/Eastern")
+ dt_tz = stamp.to_pydatetime()
+ ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern")
+ assert ts.to_pydatetime() == dt_tz
+
+ @pytest.mark.parametrize(
+ "result",
+ [
+ Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1),
+ Timestamp(
+ year=2000,
+ month=1,
+ day=2,
+ hour=3,
+ minute=4,
+ second=5,
+ microsecond=6,
+ nanosecond=1,
+ ),
+ Timestamp(
+ year=2000,
+ month=1,
+ day=2,
+ hour=3,
+ minute=4,
+ second=5,
+ microsecond=6,
+ nanosecond=1,
+ tz="UTC",
+ ),
+ Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None),
+ Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC),
+ ],
+ )
+ def test_constructor_nanosecond(self, result):
+ # GH 18898
+ expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz)
+ expected = expected + Timedelta(nanoseconds=1)
+ assert result == expected
+
+ @pytest.mark.parametrize("z", ["Z0", "Z00"])
+ def test_constructor_invalid_Z0_isostring(self, z):
+ # GH 8910
+ with pytest.raises(ValueError):
+ Timestamp("2014-11-02 01:00{}".format(z))
+
+ @pytest.mark.parametrize(
+ "arg",
+ [
+ "year",
+ "month",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "microsecond",
+ "nanosecond",
+ ],
+ )
+ def test_invalid_date_kwarg_with_string_input(self, arg):
+ kwarg = {arg: 1}
+ with pytest.raises(ValueError):
+ Timestamp("2010-10-10 12:59:59.999999999", **kwarg)
+
+ def test_out_of_bounds_integer_value(self):
+ # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError
+ with pytest.raises(OutOfBoundsDatetime):
+ Timestamp(Timestamp.max.value * 2)
+ with pytest.raises(OutOfBoundsDatetime):
+ Timestamp(Timestamp.min.value * 2)
+
+ def test_out_of_bounds_value(self):
+ one_us = np.timedelta64(1).astype("timedelta64[us]")
+
+ # By definition we can't go out of bounds in [ns], so we
+ # convert the datetime64s to [us] so we can go out of bounds
+ min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]")
+ max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]")
+
+ # No error for the min/max datetimes
+ Timestamp(min_ts_us)
+ Timestamp(max_ts_us)
+
+ # One us less than the minimum is an error
+ with pytest.raises(ValueError):
+ Timestamp(min_ts_us - one_us)
+
+ # One us more than the maximum is an error
+ with pytest.raises(ValueError):
+ Timestamp(max_ts_us + one_us)
+
+ def test_out_of_bounds_string(self):
+ with pytest.raises(ValueError):
+ Timestamp("1676-01-01")
+ with pytest.raises(ValueError):
+ Timestamp("2263-01-01")
+
+ def test_barely_out_of_bounds(self):
+ # GH#19529
+ # GH#19382 close enough to bounds that dropping nanos would result
+ # in an in-bounds datetime
+ with pytest.raises(OutOfBoundsDatetime):
+ Timestamp("2262-04-11 23:47:16.854775808")
+
+ def test_bounds_with_different_units(self):
+ out_of_bounds_dates = ("1677-09-21", "2262-04-12")
+
+ time_units = ("D", "h", "m", "s", "ms", "us")
+
+ for date_string in out_of_bounds_dates:
+ for unit in time_units:
+ dt64 = np.datetime64(date_string, unit)
+ with pytest.raises(ValueError):
+ Timestamp(dt64)
+
+ in_bounds_dates = ("1677-09-23", "2262-04-11")
+
+ for date_string in in_bounds_dates:
+ for unit in time_units:
+ dt64 = np.datetime64(date_string, unit)
+ Timestamp(dt64)
+
+ def test_min_valid(self):
+ # Ensure that Timestamp.min is a valid Timestamp
+ Timestamp(Timestamp.min)
+
+ def test_max_valid(self):
+ # Ensure that Timestamp.max is a valid Timestamp
+ Timestamp(Timestamp.max)
+
+ def test_now(self):
+ # GH#9000
+ ts_from_string = Timestamp("now")
+ ts_from_method = Timestamp.now()
+ ts_datetime = datetime.now()
+
+ ts_from_string_tz = Timestamp("now", tz="US/Eastern")
+ ts_from_method_tz = Timestamp.now(tz="US/Eastern")
+
+ # Check that the delta between the times is less than 1s (arbitrarily
+ # small)
+ delta = Timedelta(seconds=1)
+ assert abs(ts_from_method - ts_from_string) < delta
+ assert abs(ts_datetime - ts_from_method) < delta
+ assert abs(ts_from_method_tz - ts_from_string_tz) < delta
+ assert (
+ abs(
+ ts_from_string_tz.tz_localize(None)
+ - ts_from_method_tz.tz_localize(None)
+ )
+ < delta
+ )
+
+ def test_today(self):
+ ts_from_string = Timestamp("today")
+ ts_from_method = Timestamp.today()
+ ts_datetime = datetime.today()
+
+ ts_from_string_tz = Timestamp("today", tz="US/Eastern")
+ ts_from_method_tz = Timestamp.today(tz="US/Eastern")
+
+ # Check that the delta between the times is less than 1s (arbitrarily
+ # small)
+ delta = Timedelta(seconds=1)
+ assert abs(ts_from_method - ts_from_string) < delta
+ assert abs(ts_datetime - ts_from_method) < delta
+ assert abs(ts_from_method_tz - ts_from_string_tz) < delta
+ assert (
+ abs(
+ ts_from_string_tz.tz_localize(None)
+ - ts_from_method_tz.tz_localize(None)
+ )
+ < delta
+ )
+
+ @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")])
+ def test_disallow_setting_tz(self, tz):
+ # GH 3746
+ ts = Timestamp("2010")
+ with pytest.raises(AttributeError):
+ ts.tz = tz
+
+ @pytest.mark.parametrize("offset", ["+0300", "+0200"])
+ def test_construct_timestamp_near_dst(self, offset):
+ # GH 20854
+        expected = Timestamp(f"2016-10-30 03:00:00{offset}", tz="Europe/Helsinki")
+ result = Timestamp(expected).tz_convert("Europe/Helsinki")
+ assert result == expected
+
+ @pytest.mark.parametrize(
+ "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"]
+ )
+ def test_construct_with_different_string_format(self, arg):
+ # GH 12064
+ result = Timestamp(arg)
+ expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540))
+ assert result == expected
+
+ def test_construct_timestamp_preserve_original_frequency(self):
+ # GH 22311
+ result = Timestamp(Timestamp("2010-08-08", freq="D")).freq
+ expected = offsets.Day()
+ assert result == expected
+
+ def test_constructor_invalid_frequency(self):
+ # GH 22311
+ with pytest.raises(ValueError, match="Invalid frequency:"):
+ Timestamp("2012-01-01", freq=[])
+
+ @pytest.mark.parametrize("box", [datetime, Timestamp])
+ def test_raise_tz_and_tzinfo_in_datetime_input(self, box):
+ # GH 23579
+ kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc}
+ with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"):
+ Timestamp(box(**kwargs), tz="US/Pacific")
+ with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"):
+ Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific"))
+
+ def test_dont_convert_dateutil_utc_to_pytz_utc(self):
+ result = Timestamp(datetime(2018, 1, 1), tz=tzutc())
+ expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc())
+ assert result == expected
+
+ def test_constructor_subclassed_datetime(self):
+ # GH 25851
+ # ensure that subclassed datetime works for
+ # Timestamp creation
+ class SubDatetime(datetime):
+ pass
+
+ data = SubDatetime(2000, 1, 1)
+ result = Timestamp(data)
+ expected = Timestamp(2000, 1, 1)
+ assert result == expected
+
+ @pytest.mark.skipif(
+ not compat.PY38,
+ reason="datetime.fromisocalendar was added in Python version 3.8",
+ )
+ def test_constructor_fromisocalendar(self):
+ # GH 30395
+ expected_timestamp = Timestamp("2000-01-03 00:00:00")
+ expected_stdlib = datetime.fromisocalendar(2000, 1, 1)
+ result = Timestamp.fromisocalendar(2000, 1, 1)
+ assert result == expected_timestamp
+ assert result == expected_stdlib
+ assert isinstance(result, Timestamp)
+
+
+def test_constructor_ambiguous_dst():
+    # GH 24329
+    # Make sure that calling the Timestamp constructor on a Timestamp
+    # created from an ambiguous time doesn't change Timestamp.value
+ ts = Timestamp(1382835600000000000, tz="dateutil/Europe/London")
+ expected = ts.value
+ result = Timestamp(ts).value
+ assert result == expected
+
+
+@pytest.mark.parametrize("epoch", [1552211999999999872, 1552211999999999999])
+def test_constructor_before_dst_switch(epoch):
+    # GH 31043
+    # Make sure that calling the Timestamp constructor on a time just before
+    # a DST switch doesn't lead to a nonexistent time or a change in value
+ ts = Timestamp(epoch, tz="dateutil/America/Los_Angeles")
+ result = ts.tz.dst(ts)
+ expected = timedelta(seconds=0)
+ assert Timestamp(ts).value == epoch
+ assert result == expected
+
+
+def test_timestamp_constructor_identity():
+ # Test for #30543
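+    # passing a Timestamp to the constructor should return the identical object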
+ expected = Timestamp("2017-01-01T12")
+ result = Timestamp(expected)
+ assert result is expected
diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py
index 692eb6cd8bc43..cee7ac450e411 100644
--- a/pandas/tests/scalar/timestamp/test_timestamp.py
+++ b/pandas/tests/scalar/timestamp/test_timestamp.py
@@ -2,11 +2,9 @@
import calendar
from datetime import datetime, timedelta
-from distutils.version import LooseVersion
import locale
import unicodedata
-import dateutil
from dateutil.tz import tzutc
import numpy as np
import pytest
@@ -14,12 +12,10 @@
from pytz import timezone, utc
from pandas._libs.tslibs.timezones import dateutil_gettz as gettz, get_timezone
-import pandas.compat as compat
from pandas.compat.numpy import np_datetime64_compat
-from pandas.errors import OutOfBoundsDatetime
import pandas.util._test_decorators as td
-from pandas import NaT, Period, Timedelta, Timestamp
+from pandas import NaT, Timedelta, Timestamp
import pandas._testing as tm
from pandas.tseries import offsets
@@ -198,513 +194,6 @@ def test_resolution(self):
assert Timestamp.resolution == Timedelta(nanoseconds=1)
-class TestTimestampConstructors:
- def test_constructor(self):
- base_str = "2014-07-01 09:00"
- base_dt = datetime(2014, 7, 1, 9)
- base_expected = 1_404_205_200_000_000_000
-
- # confirm base representation is correct
- assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected
-
- tests = [
- (base_str, base_dt, base_expected),
- (
- "2014-07-01 10:00",
- datetime(2014, 7, 1, 10),
- base_expected + 3600 * 1_000_000_000,
- ),
- (
- "2014-07-01 09:00:00.000008000",
- datetime(2014, 7, 1, 9, 0, 0, 8),
- base_expected + 8000,
- ),
- (
- "2014-07-01 09:00:00.000000005",
- Timestamp("2014-07-01 09:00:00.000000005"),
- base_expected + 5,
- ),
- ]
-
- timezones = [
- (None, 0),
- ("UTC", 0),
- (pytz.utc, 0),
- ("Asia/Tokyo", 9),
- ("US/Eastern", -4),
- ("dateutil/US/Pacific", -7),
- (pytz.FixedOffset(-180), -3),
- (dateutil.tz.tzoffset(None, 18000), 5),
- ]
-
- for date_str, date, expected in tests:
- for result in [Timestamp(date_str), Timestamp(date)]:
- # only with timestring
- assert result.value == expected
-
- # re-creation shouldn't affect to internal value
- result = Timestamp(result)
- assert result.value == expected
-
- # with timezone
- for tz, offset in timezones:
- for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]:
- expected_tz = expected - offset * 3600 * 1_000_000_000
- assert result.value == expected_tz
-
- # should preserve tz
- result = Timestamp(result)
- assert result.value == expected_tz
-
- # should convert to UTC
- if tz is not None:
- result = Timestamp(result).tz_convert("UTC")
- else:
- result = Timestamp(result, tz="UTC")
- expected_utc = expected - offset * 3600 * 1_000_000_000
- assert result.value == expected_utc
-
- def test_constructor_with_stringoffset(self):
- # GH 7833
- base_str = "2014-07-01 11:00:00+02:00"
- base_dt = datetime(2014, 7, 1, 9)
- base_expected = 1_404_205_200_000_000_000
-
- # confirm base representation is correct
- assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected
-
- tests = [
- (base_str, base_expected),
- ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1_000_000_000),
- ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000),
- ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5),
- ]
-
- timezones = [
- (None, 0),
- ("UTC", 0),
- (pytz.utc, 0),
- ("Asia/Tokyo", 9),
- ("US/Eastern", -4),
- ("dateutil/US/Pacific", -7),
- (pytz.FixedOffset(-180), -3),
- (dateutil.tz.tzoffset(None, 18000), 5),
- ]
-
- for date_str, expected in tests:
- for result in [Timestamp(date_str)]:
- # only with timestring
- assert result.value == expected
-
- # re-creation shouldn't affect to internal value
- result = Timestamp(result)
- assert result.value == expected
-
- # with timezone
- for tz, offset in timezones:
- result = Timestamp(date_str, tz=tz)
- expected_tz = expected
- assert result.value == expected_tz
-
- # should preserve tz
- result = Timestamp(result)
- assert result.value == expected_tz
-
- # should convert to UTC
- result = Timestamp(result).tz_convert("UTC")
- expected_utc = expected
- assert result.value == expected_utc
-
- # This should be 2013-11-01 05:00 in UTC
- # converted to Chicago tz
- result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago")
- assert result.value == Timestamp("2013-11-01 05:00").value
- expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa
- assert repr(result) == expected
- assert result == eval(repr(result))
-
- # This should be 2013-11-01 05:00 in UTC
- # converted to Tokyo tz (+09:00)
- result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo")
- assert result.value == Timestamp("2013-11-01 05:00").value
- expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')"
- assert repr(result) == expected
- assert result == eval(repr(result))
-
- # GH11708
- # This should be 2015-11-18 10:00 in UTC
- # converted to Asia/Katmandu
- result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu")
- assert result.value == Timestamp("2015-11-18 10:00").value
- expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')"
- assert repr(result) == expected
- assert result == eval(repr(result))
-
- # This should be 2015-11-18 10:00 in UTC
- # converted to Asia/Kolkata
- result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata")
- assert result.value == Timestamp("2015-11-18 10:00").value
- expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')"
- assert repr(result) == expected
- assert result == eval(repr(result))
-
- def test_constructor_invalid(self):
- with pytest.raises(TypeError, match="Cannot convert input"):
- Timestamp(slice(2))
- with pytest.raises(ValueError, match="Cannot convert Period"):
- Timestamp(Period("1000-01-01"))
-
- def test_constructor_invalid_tz(self):
- # GH#17690
- with pytest.raises(TypeError, match="must be a datetime.tzinfo"):
- Timestamp("2017-10-22", tzinfo="US/Eastern")
-
- with pytest.raises(ValueError, match="at most one of"):
- Timestamp("2017-10-22", tzinfo=utc, tz="UTC")
-
- with pytest.raises(ValueError, match="Invalid frequency:"):
- # GH#5168
- # case where user tries to pass tz as an arg, not kwarg, gets
- # interpreted as a `freq`
- Timestamp("2012-01-01", "US/Pacific")
-
- def test_constructor_strptime(self):
- # GH25016
- # Test support for Timestamp.strptime
- fmt = "%Y%m%d-%H%M%S-%f%z"
- ts = "20190129-235348-000001+0000"
- with pytest.raises(NotImplementedError):
- Timestamp.strptime(ts, fmt)
-
- def test_constructor_tz_or_tzinfo(self):
- # GH#17943, GH#17690, GH#5168
- stamps = [
- Timestamp(year=2017, month=10, day=22, tz="UTC"),
- Timestamp(year=2017, month=10, day=22, tzinfo=utc),
- Timestamp(year=2017, month=10, day=22, tz=utc),
- Timestamp(datetime(2017, 10, 22), tzinfo=utc),
- Timestamp(datetime(2017, 10, 22), tz="UTC"),
- Timestamp(datetime(2017, 10, 22), tz=utc),
- ]
- assert all(ts == stamps[0] for ts in stamps)
-
- def test_constructor_positional(self):
- # see gh-10758
- with pytest.raises(TypeError):
- Timestamp(2000, 1)
- with pytest.raises(ValueError):
- Timestamp(2000, 0, 1)
- with pytest.raises(ValueError):
- Timestamp(2000, 13, 1)
- with pytest.raises(ValueError):
- Timestamp(2000, 1, 0)
- with pytest.raises(ValueError):
- Timestamp(2000, 1, 32)
-
- # see gh-11630
- assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112"))
- assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr(
- Timestamp("2015-11-12 01:02:03.999999")
- )
-
- def test_constructor_keyword(self):
- # GH 10758
- with pytest.raises(TypeError):
- Timestamp(year=2000, month=1)
- with pytest.raises(ValueError):
- Timestamp(year=2000, month=0, day=1)
- with pytest.raises(ValueError):
- Timestamp(year=2000, month=13, day=1)
- with pytest.raises(ValueError):
- Timestamp(year=2000, month=1, day=0)
- with pytest.raises(ValueError):
- Timestamp(year=2000, month=1, day=32)
-
- assert repr(Timestamp(year=2015, month=11, day=12)) == repr(
- Timestamp("20151112")
- )
-
- assert repr(
- Timestamp(
- year=2015,
- month=11,
- day=12,
- hour=1,
- minute=2,
- second=3,
- microsecond=999999,
- )
- ) == repr(Timestamp("2015-11-12 01:02:03.999999"))
-
- def test_constructor_fromordinal(self):
- base = datetime(2000, 1, 1)
-
- ts = Timestamp.fromordinal(base.toordinal(), freq="D")
- assert base == ts
- assert ts.freq == "D"
- assert base.toordinal() == ts.toordinal()
-
- ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern")
- assert Timestamp("2000-01-01", tz="US/Eastern") == ts
- assert base.toordinal() == ts.toordinal()
-
- # GH#3042
- dt = datetime(2011, 4, 16, 0, 0)
- ts = Timestamp.fromordinal(dt.toordinal())
- assert ts.to_pydatetime() == dt
-
- # with a tzinfo
- stamp = Timestamp("2011-4-16", tz="US/Eastern")
- dt_tz = stamp.to_pydatetime()
- ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern")
- assert ts.to_pydatetime() == dt_tz
-
- @pytest.mark.parametrize(
- "result",
- [
- Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1),
- Timestamp(
- year=2000,
- month=1,
- day=2,
- hour=3,
- minute=4,
- second=5,
- microsecond=6,
- nanosecond=1,
- ),
- Timestamp(
- year=2000,
- month=1,
- day=2,
- hour=3,
- minute=4,
- second=5,
- microsecond=6,
- nanosecond=1,
- tz="UTC",
- ),
- Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None),
- Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC),
- ],
- )
- def test_constructor_nanosecond(self, result):
- # GH 18898
- expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz)
- expected = expected + Timedelta(nanoseconds=1)
- assert result == expected
-
- @pytest.mark.parametrize("z", ["Z0", "Z00"])
- def test_constructor_invalid_Z0_isostring(self, z):
- # GH 8910
- with pytest.raises(ValueError):
- Timestamp("2014-11-02 01:00{}".format(z))
-
- @pytest.mark.parametrize(
- "arg",
- [
- "year",
- "month",
- "day",
- "hour",
- "minute",
- "second",
- "microsecond",
- "nanosecond",
- ],
- )
- def test_invalid_date_kwarg_with_string_input(self, arg):
- kwarg = {arg: 1}
- with pytest.raises(ValueError):
- Timestamp("2010-10-10 12:59:59.999999999", **kwarg)
-
- def test_out_of_bounds_integer_value(self):
- # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError
- with pytest.raises(OutOfBoundsDatetime):
- Timestamp(Timestamp.max.value * 2)
- with pytest.raises(OutOfBoundsDatetime):
- Timestamp(Timestamp.min.value * 2)
-
- def test_out_of_bounds_value(self):
- one_us = np.timedelta64(1).astype("timedelta64[us]")
-
- # By definition we can't go out of bounds in [ns], so we
- # convert the datetime64s to [us] so we can go out of bounds
- min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]")
- max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]")
-
- # No error for the min/max datetimes
- Timestamp(min_ts_us)
- Timestamp(max_ts_us)
-
- # One us less than the minimum is an error
- with pytest.raises(ValueError):
- Timestamp(min_ts_us - one_us)
-
- # One us more than the maximum is an error
- with pytest.raises(ValueError):
- Timestamp(max_ts_us + one_us)
-
- def test_out_of_bounds_string(self):
- with pytest.raises(ValueError):
- Timestamp("1676-01-01")
- with pytest.raises(ValueError):
- Timestamp("2263-01-01")
-
- def test_barely_out_of_bounds(self):
- # GH#19529
- # GH#19382 close enough to bounds that dropping nanos would result
- # in an in-bounds datetime
- with pytest.raises(OutOfBoundsDatetime):
- Timestamp("2262-04-11 23:47:16.854775808")
-
- def test_bounds_with_different_units(self):
- out_of_bounds_dates = ("1677-09-21", "2262-04-12")
-
- time_units = ("D", "h", "m", "s", "ms", "us")
-
- for date_string in out_of_bounds_dates:
- for unit in time_units:
- dt64 = np.datetime64(date_string, unit)
- with pytest.raises(ValueError):
- Timestamp(dt64)
-
- in_bounds_dates = ("1677-09-23", "2262-04-11")
-
- for date_string in in_bounds_dates:
- for unit in time_units:
- dt64 = np.datetime64(date_string, unit)
- Timestamp(dt64)
-
- def test_min_valid(self):
- # Ensure that Timestamp.min is a valid Timestamp
- Timestamp(Timestamp.min)
-
- def test_max_valid(self):
- # Ensure that Timestamp.max is a valid Timestamp
- Timestamp(Timestamp.max)
-
- def test_now(self):
- # GH#9000
- ts_from_string = Timestamp("now")
- ts_from_method = Timestamp.now()
- ts_datetime = datetime.now()
-
- ts_from_string_tz = Timestamp("now", tz="US/Eastern")
- ts_from_method_tz = Timestamp.now(tz="US/Eastern")
-
- # Check that the delta between the times is less than 1s (arbitrarily
- # small)
- delta = Timedelta(seconds=1)
- assert abs(ts_from_method - ts_from_string) < delta
- assert abs(ts_datetime - ts_from_method) < delta
- assert abs(ts_from_method_tz - ts_from_string_tz) < delta
- assert (
- abs(
- ts_from_string_tz.tz_localize(None)
- - ts_from_method_tz.tz_localize(None)
- )
- < delta
- )
-
- def test_today(self):
- ts_from_string = Timestamp("today")
- ts_from_method = Timestamp.today()
- ts_datetime = datetime.today()
-
- ts_from_string_tz = Timestamp("today", tz="US/Eastern")
- ts_from_method_tz = Timestamp.today(tz="US/Eastern")
-
- # Check that the delta between the times is less than 1s (arbitrarily
- # small)
- delta = Timedelta(seconds=1)
- assert abs(ts_from_method - ts_from_string) < delta
- assert abs(ts_datetime - ts_from_method) < delta
- assert abs(ts_from_method_tz - ts_from_string_tz) < delta
- assert (
- abs(
- ts_from_string_tz.tz_localize(None)
- - ts_from_method_tz.tz_localize(None)
- )
- < delta
- )
-
- @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")])
- def test_disallow_setting_tz(self, tz):
- # GH 3746
- ts = Timestamp("2010")
- with pytest.raises(AttributeError):
- ts.tz = tz
-
- @pytest.mark.parametrize("offset", ["+0300", "+0200"])
- def test_construct_timestamp_near_dst(self, offset):
- # GH 20854
- expected = Timestamp(
- "2016-10-30 03:00:00{}".format(offset), tz="Europe/Helsinki"
- )
- result = Timestamp(expected).tz_convert("Europe/Helsinki")
- assert result == expected
-
- @pytest.mark.parametrize(
- "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"]
- )
- def test_construct_with_different_string_format(self, arg):
- # GH 12064
- result = Timestamp(arg)
- expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540))
- assert result == expected
-
- def test_construct_timestamp_preserve_original_frequency(self):
- # GH 22311
- result = Timestamp(Timestamp("2010-08-08", freq="D")).freq
- expected = offsets.Day()
- assert result == expected
-
- def test_constructor_invalid_frequency(self):
- # GH 22311
- with pytest.raises(ValueError, match="Invalid frequency:"):
- Timestamp("2012-01-01", freq=[])
-
- @pytest.mark.parametrize("box", [datetime, Timestamp])
- def test_raise_tz_and_tzinfo_in_datetime_input(self, box):
- # GH 23579
- kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": utc}
- with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"):
- Timestamp(box(**kwargs), tz="US/Pacific")
- with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"):
- Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific"))
-
- def test_dont_convert_dateutil_utc_to_pytz_utc(self):
- result = Timestamp(datetime(2018, 1, 1), tz=tzutc())
- expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc())
- assert result == expected
-
- def test_constructor_subclassed_datetime(self):
- # GH 25851
- # ensure that subclassed datetime works for
- # Timestamp creation
- class SubDatetime(datetime):
- pass
-
- data = SubDatetime(2000, 1, 1)
- result = Timestamp(data)
- expected = Timestamp(2000, 1, 1)
- assert result == expected
-
- @pytest.mark.skipif(
- not compat.PY38,
- reason="datetime.fromisocalendar was added in Python version 3.8",
- )
- def test_constructor_fromisocalendar(self):
- # GH 30395
- expected_timestamp = Timestamp("2000-01-03 00:00:00")
- expected_stdlib = datetime.fromisocalendar(2000, 1, 1)
- result = Timestamp.fromisocalendar(2000, 1, 1)
- assert result == expected_timestamp
- assert result == expected_stdlib
- assert isinstance(result, Timestamp)
-
-
class TestTimestamp:
def test_tz(self):
tstr = "2014-02-01 09:00"
@@ -1075,34 +564,3 @@ def test_dt_subclass_add_timedelta(lh, rh):
result = lh + rh
expected = SubDatetime(2000, 1, 1, 1)
assert result == expected
-
-
-def test_constructor_ambigous_dst():
- # GH 24329
- # Make sure that calling Timestamp constructor
- # on Timestamp created from ambiguous time
- # doesn't change Timestamp.value
- ts = Timestamp(1382835600000000000, tz="dateutil/Europe/London")
- expected = ts.value
- result = Timestamp(ts).value
- assert result == expected
-
-
-@pytest.mark.xfail(
- LooseVersion(compat._optional._get_version(dateutil)) < LooseVersion("2.7.0"),
- reason="dateutil moved to Timedelta.total_seconds() in 2.7.0",
-)
-@pytest.mark.parametrize("epoch", [1552211999999999872, 1552211999999999999])
-def test_constructor_before_dst_switch(epoch):
- # GH 31043
- # Make sure that calling Timestamp constructor
- # on time just before DST switch doesn't lead to
- # nonexistent time or value change
- # Works only with dateutil >= 2.7.0 as dateutil overrid
- # pandas.Timedelta.total_seconds with
- # datetime.timedelta.total_seconds before
- ts = Timestamp(epoch, tz="dateutil/US/Pacific")
- result = ts.tz.dst(ts)
- expected = timedelta(seconds=0)
- assert Timestamp(ts).value == epoch
- assert result == expected
diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py
index 16a29d10eb414..28f3c0f7429f8 100644
--- a/pandas/tests/series/indexing/test_boolean.py
+++ b/pandas/tests/series/indexing/test_boolean.py
@@ -1,10 +1,7 @@
import numpy as np
import pytest
-from pandas.core.dtypes.common import is_integer
-
-import pandas as pd
-from pandas import Index, Series, Timestamp, date_range, isna
+from pandas import Index, Series
import pandas._testing as tm
from pandas.core.indexing import IndexingError
@@ -136,492 +133,3 @@ def test_get_set_boolean_different_order(string_series):
sel = string_series[ordered > 0]
exp = string_series[string_series > 0]
tm.assert_series_equal(sel, exp)
-
-
-def test_where_unsafe_int(sint_dtype):
- s = Series(np.arange(10), dtype=sint_dtype)
- mask = s < 5
-
- s[mask] = range(2, 7)
- expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype=sint_dtype)
-
- tm.assert_series_equal(s, expected)
-
-
-def test_where_unsafe_float(float_dtype):
- s = Series(np.arange(10), dtype=float_dtype)
- mask = s < 5
-
- s[mask] = range(2, 7)
- data = list(range(2, 7)) + list(range(5, 10))
- expected = Series(data, dtype=float_dtype)
-
- tm.assert_series_equal(s, expected)
-
-
-@pytest.mark.parametrize(
- "dtype,expected_dtype",
- [
- (np.int8, np.float64),
- (np.int16, np.float64),
- (np.int32, np.float64),
- (np.int64, np.float64),
- (np.float32, np.float32),
- (np.float64, np.float64),
- ],
-)
-def test_where_unsafe_upcast(dtype, expected_dtype):
- # see gh-9743
- s = Series(np.arange(10), dtype=dtype)
- values = [2.5, 3.5, 4.5, 5.5, 6.5]
- mask = s < 5
- expected = Series(values + list(range(5, 10)), dtype=expected_dtype)
- s[mask] = values
- tm.assert_series_equal(s, expected)
-
-
-def test_where_unsafe():
- # see gh-9731
- s = Series(np.arange(10), dtype="int64")
- values = [2.5, 3.5, 4.5, 5.5]
-
- mask = s > 5
- expected = Series(list(range(6)) + values, dtype="float64")
-
- s[mask] = values
- tm.assert_series_equal(s, expected)
-
- # see gh-3235
- s = Series(np.arange(10), dtype="int64")
- mask = s < 5
- s[mask] = range(2, 7)
- expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64")
- tm.assert_series_equal(s, expected)
- assert s.dtype == expected.dtype
-
- s = Series(np.arange(10), dtype="int64")
- mask = s > 5
- s[mask] = [0] * 4
- expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64")
- tm.assert_series_equal(s, expected)
-
- s = Series(np.arange(10))
- mask = s > 5
-
- msg = "cannot assign mismatch length to masked array"
- with pytest.raises(ValueError, match=msg):
- s[mask] = [5, 4, 3, 2, 1]
-
- with pytest.raises(ValueError, match=msg):
- s[mask] = [0] * 5
-
- # dtype changes
- s = Series([1, 2, 3, 4])
- result = s.where(s > 2, np.nan)
- expected = Series([np.nan, np.nan, 3, 4])
- tm.assert_series_equal(result, expected)
-
- # GH 4667
- # setting with None changes dtype
- s = Series(range(10)).astype(float)
- s[8] = None
- result = s[8]
- assert isna(result)
-
- s = Series(range(10)).astype(float)
- s[s > 8] = None
- result = s[isna(s)]
- expected = Series(np.nan, index=[9])
- tm.assert_series_equal(result, expected)
-
-
-def test_where():
- s = Series(np.random.randn(5))
- cond = s > 0
-
- rs = s.where(cond).dropna()
- rs2 = s[cond]
- tm.assert_series_equal(rs, rs2)
-
- rs = s.where(cond, -s)
- tm.assert_series_equal(rs, s.abs())
-
- rs = s.where(cond)
- assert s.shape == rs.shape
- assert rs is not s
-
- # test alignment
- cond = Series([True, False, False, True, False], index=s.index)
- s2 = -(s.abs())
-
- expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index)
- rs = s2.where(cond[:3])
- tm.assert_series_equal(rs, expected)
-
- expected = s2.abs()
- expected.iloc[0] = s2[0]
- rs = s2.where(cond[:3], -s2)
- tm.assert_series_equal(rs, expected)
-
-
-def test_where_error():
- s = Series(np.random.randn(5))
- cond = s > 0
-
- msg = "Array conditional must be same shape as self"
- with pytest.raises(ValueError, match=msg):
- s.where(1)
- with pytest.raises(ValueError, match=msg):
- s.where(cond[:3].values, -s)
-
- # GH 2745
- s = Series([1, 2])
- s[[True, False]] = [0, 1]
- expected = Series([0, 2])
- tm.assert_series_equal(s, expected)
-
- # failures
- msg = "cannot assign mismatch length to masked array"
- with pytest.raises(ValueError, match=msg):
- s[[True, False]] = [0, 2, 3]
- msg = (
- "NumPy boolean array indexing assignment cannot assign 0 input "
- "values to the 1 output values where the mask is true"
- )
- with pytest.raises(ValueError, match=msg):
- s[[True, False]] = []
-
-
-@pytest.mark.parametrize("klass", [list, tuple, np.array, Series])
-def test_where_array_like(klass):
- # see gh-15414
- s = Series([1, 2, 3])
- cond = [False, True, True]
- expected = Series([np.nan, 2, 3])
-
- result = s.where(klass(cond))
- tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "cond",
- [
- [1, 0, 1],
- Series([2, 5, 7]),
- ["True", "False", "True"],
- [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")],
- ],
-)
-def test_where_invalid_input(cond):
- # see gh-15414: only boolean arrays accepted
- s = Series([1, 2, 3])
- msg = "Boolean array expected for the condition"
-
- with pytest.raises(ValueError, match=msg):
- s.where(cond)
-
- msg = "Array conditional must be same shape as self"
- with pytest.raises(ValueError, match=msg):
- s.where([True])
-
-
-def test_where_ndframe_align():
- msg = "Array conditional must be same shape as self"
- s = Series([1, 2, 3])
-
- cond = [True]
- with pytest.raises(ValueError, match=msg):
- s.where(cond)
-
- expected = Series([1, np.nan, np.nan])
-
- out = s.where(Series(cond))
- tm.assert_series_equal(out, expected)
-
- cond = np.array([False, True, False, True])
- with pytest.raises(ValueError, match=msg):
- s.where(cond)
-
- expected = Series([np.nan, 2, np.nan])
-
- out = s.where(Series(cond))
- tm.assert_series_equal(out, expected)
-
-
-def test_where_setitem_invalid():
- # GH 2702
- # make sure correct exceptions are raised on invalid list assignment
-
- msg = "cannot set using a {} indexer with a different length than the value"
-
- # slice
- s = Series(list("abc"))
-
- with pytest.raises(ValueError, match=msg.format("slice")):
- s[0:3] = list(range(27))
-
- s[0:3] = list(range(3))
- expected = Series([0, 1, 2])
- tm.assert_series_equal(s.astype(np.int64), expected)
-
- # slice with step
- s = Series(list("abcdef"))
-
- with pytest.raises(ValueError, match=msg.format("slice")):
- s[0:4:2] = list(range(27))
-
- s = Series(list("abcdef"))
- s[0:4:2] = list(range(2))
- expected = Series([0, "b", 1, "d", "e", "f"])
- tm.assert_series_equal(s, expected)
-
- # neg slices
- s = Series(list("abcdef"))
-
- with pytest.raises(ValueError, match=msg.format("slice")):
- s[:-1] = list(range(27))
-
- s[-3:-1] = list(range(2))
- expected = Series(["a", "b", "c", 0, 1, "f"])
- tm.assert_series_equal(s, expected)
-
- # list
- s = Series(list("abc"))
-
- with pytest.raises(ValueError, match=msg.format("list-like")):
- s[[0, 1, 2]] = list(range(27))
-
- s = Series(list("abc"))
-
- with pytest.raises(ValueError, match=msg.format("list-like")):
- s[[0, 1, 2]] = list(range(2))
-
- # scalar
- s = Series(list("abc"))
- s[0] = list(range(10))
- expected = Series([list(range(10)), "b", "c"])
- tm.assert_series_equal(s, expected)
-
-
-@pytest.mark.parametrize("size", range(2, 6))
-@pytest.mark.parametrize(
- "mask", [[True, False, False, False, False], [True, False], [False]]
-)
-@pytest.mark.parametrize(
- "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min]
-)
-# Test numpy arrays, lists and tuples as the input to be
-# broadcast
-@pytest.mark.parametrize(
- "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)]
-)
-def test_broadcast(size, mask, item, box):
- selection = np.resize(mask, size)
-
- data = np.arange(size, dtype=float)
-
- # Construct the expected series by taking the source
- # data or item based on the selection
- expected = Series(
- [item if use_item else data[i] for i, use_item in enumerate(selection)]
- )
-
- s = Series(data)
- s[selection] = box(item)
- tm.assert_series_equal(s, expected)
-
- s = Series(data)
- result = s.where(~selection, box(item))
- tm.assert_series_equal(result, expected)
-
- s = Series(data)
- result = s.mask(selection, box(item))
- tm.assert_series_equal(result, expected)
-
-
-def test_where_inplace():
- s = Series(np.random.randn(5))
- cond = s > 0
-
- rs = s.copy()
-
- rs.where(cond, inplace=True)
- tm.assert_series_equal(rs.dropna(), s[cond])
- tm.assert_series_equal(rs, s.where(cond))
-
- rs = s.copy()
- rs.where(cond, -s, inplace=True)
- tm.assert_series_equal(rs, s.where(cond, -s))
-
-
-def test_where_dups():
- # GH 4550
- # where crashes with dups in index
- s1 = Series(list(range(3)))
- s2 = Series(list(range(3)))
- comb = pd.concat([s1, s2])
- result = comb.where(comb < 2)
- expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2])
- tm.assert_series_equal(result, expected)
-
- # GH 4548
- # inplace updating not working with dups
- comb[comb < 1] = 5
- expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2])
- tm.assert_series_equal(comb, expected)
-
- comb[comb < 2] += 10
- expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2])
- tm.assert_series_equal(comb, expected)
-
-
-def test_where_numeric_with_string():
- # GH 9280
- s = pd.Series([1, 2, 3])
- w = s.where(s > 1, "X")
-
- assert not is_integer(w[0])
- assert is_integer(w[1])
- assert is_integer(w[2])
- assert isinstance(w[0], str)
- assert w.dtype == "object"
-
- w = s.where(s > 1, ["X", "Y", "Z"])
- assert not is_integer(w[0])
- assert is_integer(w[1])
- assert is_integer(w[2])
- assert isinstance(w[0], str)
- assert w.dtype == "object"
-
- w = s.where(s > 1, np.array(["X", "Y", "Z"]))
- assert not is_integer(w[0])
- assert is_integer(w[1])
- assert is_integer(w[2])
- assert isinstance(w[0], str)
- assert w.dtype == "object"
-
-
-def test_where_timedelta_coerce():
- s = Series([1, 2], dtype="timedelta64[ns]")
- expected = Series([10, 10])
- mask = np.array([False, False])
-
- rs = s.where(mask, [10, 10])
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, 10)
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, 10.0)
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, [10.0, 10.0])
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, [10.0, np.nan])
- expected = Series([10, None], dtype="object")
- tm.assert_series_equal(rs, expected)
-
-
-def test_where_datetime_conversion():
- s = Series(date_range("20130102", periods=2))
- expected = Series([10, 10])
- mask = np.array([False, False])
-
- rs = s.where(mask, [10, 10])
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, 10)
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, 10.0)
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, [10.0, 10.0])
- tm.assert_series_equal(rs, expected)
-
- rs = s.where(mask, [10.0, np.nan])
- expected = Series([10, None], dtype="object")
- tm.assert_series_equal(rs, expected)
-
- # GH 15701
- timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"]
- s = Series([pd.Timestamp(t) for t in timestamps])
- rs = s.where(Series([False, True]))
- expected = Series([pd.NaT, s[1]])
- tm.assert_series_equal(rs, expected)
-
-
-def test_where_dt_tz_values(tz_naive_fixture):
- ser1 = pd.Series(
- pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture)
- )
- ser2 = pd.Series(
- pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture)
- )
- mask = pd.Series([True, True, False])
- result = ser1.where(mask, ser2)
- exp = pd.Series(
- pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture)
- )
- tm.assert_series_equal(exp, result)
-
-
-def test_mask():
- # compare with tested results in test_where
- s = Series(np.random.randn(5))
- cond = s > 0
-
- rs = s.where(~cond, np.nan)
- tm.assert_series_equal(rs, s.mask(cond))
-
- rs = s.where(~cond)
- rs2 = s.mask(cond)
- tm.assert_series_equal(rs, rs2)
-
- rs = s.where(~cond, -s)
- rs2 = s.mask(cond, -s)
- tm.assert_series_equal(rs, rs2)
-
- cond = Series([True, False, False, True, False], index=s.index)
- s2 = -(s.abs())
- rs = s2.where(~cond[:3])
- rs2 = s2.mask(cond[:3])
- tm.assert_series_equal(rs, rs2)
-
- rs = s2.where(~cond[:3], -s2)
- rs2 = s2.mask(cond[:3], -s2)
- tm.assert_series_equal(rs, rs2)
-
- msg = "Array conditional must be same shape as self"
- with pytest.raises(ValueError, match=msg):
- s.mask(1)
- with pytest.raises(ValueError, match=msg):
- s.mask(cond[:3].values, -s)
-
- # dtype changes
- s = Series([1, 2, 3, 4])
- result = s.mask(s > 2, np.nan)
- expected = Series([1, 2, np.nan, np.nan])
- tm.assert_series_equal(result, expected)
-
- # see gh-21891
- s = Series([1, 2])
- res = s.mask([True, False])
-
- exp = Series([np.nan, 2])
- tm.assert_series_equal(res, exp)
-
-
-def test_mask_inplace():
- s = Series(np.random.randn(5))
- cond = s > 0
-
- rs = s.copy()
- rs.mask(cond, inplace=True)
- tm.assert_series_equal(rs.dropna(), s[~cond])
- tm.assert_series_equal(rs, s.mask(cond))
-
- rs = s.copy()
- rs.mask(cond, -s, inplace=True)
- tm.assert_series_equal(rs, s.mask(cond, -s))
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index 77085ef547690..acaa9de88a836 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -1,4 +1,5 @@
from datetime import datetime, timedelta
+import re
import numpy as np
import pytest
@@ -147,7 +148,6 @@ def test_frame_datetime64_duplicated():
def test_getitem_setitem_datetime_tz_pytz():
from pytz import timezone as tz
- from pandas import date_range
N = 50
# testing with timezone, GH #2785
@@ -188,8 +188,6 @@ def test_getitem_setitem_datetime_tz_dateutil():
lambda x: tzutc() if x == "UTC" else gettz(x)
) # handle special case for utc in dateutil
- from pandas import date_range
-
N = 50
# testing with timezone, GH #2785
@@ -372,7 +370,6 @@ def test_getitem_median_slice_bug():
def test_datetime_indexing():
- from pandas import date_range
index = date_range("1/1/2000", "1/7/2000")
index = index.repeat(3)
@@ -380,7 +377,7 @@ def test_datetime_indexing():
s = Series(len(index), index=index)
stamp = Timestamp("1/8/2000")
- with pytest.raises(KeyError, match=r"^947289600000000000$"):
+ with pytest.raises(KeyError, match=re.escape(repr(stamp))):
s[stamp]
s[stamp] = 0
assert s[stamp] == 0
@@ -389,7 +386,7 @@ def test_datetime_indexing():
s = Series(len(index), index=index)
s = s[::-1]
- with pytest.raises(KeyError, match=r"^947289600000000000$"):
+ with pytest.raises(KeyError, match=re.escape(repr(stamp))):
s[stamp]
s[stamp] = 0
assert s[stamp] == 0
@@ -495,8 +492,9 @@ def test_duplicate_dates_indexing(dups):
expected = Series(np.where(mask, 0, ts), index=ts.index)
tm.assert_series_equal(cp, expected)
- with pytest.raises(KeyError, match=r"^947116800000000000$"):
- ts[datetime(2000, 1, 6)]
+ key = datetime(2000, 1, 6)
+ with pytest.raises(KeyError, match=re.escape(repr(key))):
+ ts[key]
# new index
ts[datetime(2000, 1, 6)] = 0
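The matcher changes above (raw nanosecond integers replaced by `re.escape(repr(stamp))`) track KeyErrors now carrying the missing key's repr, which contains regex metacharacters. A minimal sketch of the pattern, reusing the updated test's own index and stamp:

```python
import re

import pytest
from pandas import Series, Timestamp, date_range

index = date_range("1/1/2000", "1/7/2000").repeat(3)
ser = Series(len(index), index=index)

stamp = Timestamp("1/8/2000")
# repr(stamp) includes parentheses and quotes, which are regex
# metacharacters; re.escape turns the repr into a literal pattern.
with pytest.raises(KeyError, match=re.escape(repr(stamp))):
    ser[stamp]
```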
diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py
new file mode 100644
index 0000000000000..438b61ed203a3
--- /dev/null
+++ b/pandas/tests/series/indexing/test_get.py
@@ -0,0 +1,134 @@
+import numpy as np
+
+import pandas as pd
+from pandas import Series
+
+
+def test_get():
+ # GH 6383
+ s = Series(
+ np.array(
+ [
+ 43,
+ 48,
+ 60,
+ 48,
+ 50,
+ 51,
+ 50,
+ 45,
+ 57,
+ 48,
+ 56,
+ 45,
+ 51,
+ 39,
+ 55,
+ 43,
+ 54,
+ 52,
+ 51,
+ 54,
+ ]
+ )
+ )
+
+ result = s.get(25, 0)
+ expected = 0
+ assert result == expected
+
+ s = Series(
+ np.array(
+ [
+ 43,
+ 48,
+ 60,
+ 48,
+ 50,
+ 51,
+ 50,
+ 45,
+ 57,
+ 48,
+ 56,
+ 45,
+ 51,
+ 39,
+ 55,
+ 43,
+ 54,
+ 52,
+ 51,
+ 54,
+ ]
+ ),
+ index=pd.Float64Index(
+ [
+ 25.0,
+ 36.0,
+ 49.0,
+ 64.0,
+ 81.0,
+ 100.0,
+ 121.0,
+ 144.0,
+ 169.0,
+ 196.0,
+ 1225.0,
+ 1296.0,
+ 1369.0,
+ 1444.0,
+ 1521.0,
+ 1600.0,
+ 1681.0,
+ 1764.0,
+ 1849.0,
+ 1936.0,
+ ]
+ ),
+ )
+
+ result = s.get(25, 0)
+ expected = 43
+ assert result == expected
+
+ # GH 7407
+ # with a boolean accessor
+ df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3})
+ vc = df.i.value_counts()
+ result = vc.get(99, default="Missing")
+ assert result == "Missing"
+
+ vc = df.b.value_counts()
+ result = vc.get(False, default="Missing")
+ assert result == 3
+
+ result = vc.get(True, default="Missing")
+ assert result == "Missing"
+
+
+def test_get_nan():
+ # GH 8569
+ s = pd.Float64Index(range(10)).to_series()
+ assert s.get(np.nan) is None
+ assert s.get(np.nan, default="Missing") == "Missing"
+
+
+def test_get_nan_multiple():
+ # GH 8569
+ # ensure that fixing "test_get_nan" above hasn't broken get
+ # with multiple elements
+ s = pd.Float64Index(range(10)).to_series()
+
+ idx = [2, 30]
+ assert s.get(idx) is None
+
+ idx = [2, np.nan]
+ assert s.get(idx) is None
+
+ # GH 17295 - all missing keys
+ idx = [20, 30]
+ assert s.get(idx) is None
+
+ idx = [np.nan, np.nan]
+ assert s.get(idx) is None
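For quick reference, the dict-like semantics the new `test_get.py` exercises, condensed into a sketch (the values are abbreviated from the test data above):

```python
import numpy as np
import pandas as pd

ser = pd.Series([43, 48, 60], index=pd.Float64Index([25.0, 36.0, 49.0]))

assert ser.get(25, 0) == 43        # present label -> stored value
assert ser.get(81, 0) == 0         # missing label -> the default
assert ser.get(np.nan) is None     # NaN key -> None (GH 8569)
assert ser.get([20, 30]) is None   # all keys missing -> None (GH 17295)
```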
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index 18dbd22b73b35..fa5c75d5e4ad9 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -17,10 +17,9 @@
def test_basic_indexing():
s = Series(np.random.randn(5), index=["a", "b", "a", "a", "b"])
- msg = "index out of bounds"
+ msg = "index 5 is out of bounds for axis 0 with size 5"
with pytest.raises(IndexError, match=msg):
s[5]
- msg = "index 5 is out of bounds for axis 0 with size 5"
with pytest.raises(IndexError, match=msg):
s[5] = 0
@@ -29,7 +28,6 @@ def test_basic_indexing():
s = s.sort_index()
- msg = r"index out of bounds|^5$"
with pytest.raises(IndexError, match=msg):
s[5]
msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$"
@@ -165,11 +163,12 @@ def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1):
def test_getitem_out_of_bounds(datetime_series):
# don't segfault, GH #495
- msg = "index out of bounds"
+ msg = r"index \d+ is out of bounds for axis 0 with size \d+"
with pytest.raises(IndexError, match=msg):
datetime_series[len(datetime_series)]
# GH #917
+ msg = r"index -\d+ is out of bounds for axis 0 with size \d+"
s = Series([], dtype=object)
with pytest.raises(IndexError, match=msg):
s[-1]
@@ -430,7 +429,7 @@ def test_basic_getitem_setitem_corner(datetime_series):
@pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"])
def test_setitem_with_tz(tz):
orig = pd.Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz))
- assert orig.dtype == "datetime64[ns, {0}]".format(tz)
+ assert orig.dtype == f"datetime64[ns, {tz}]"
# scalar
s = orig.copy()
@@ -457,7 +456,7 @@ def test_setitem_with_tz(tz):
[pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)],
index=[1, 2],
)
- assert vals.dtype == "datetime64[ns, {0}]".format(tz)
+ assert vals.dtype == f"datetime64[ns, {tz}]"
s[[1, 2]] = vals
exp = pd.Series(
@@ -482,7 +481,7 @@ def test_setitem_with_tz_dst():
# GH XXX
tz = "US/Eastern"
orig = pd.Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz))
- assert orig.dtype == "datetime64[ns, {0}]".format(tz)
+ assert orig.dtype == f"datetime64[ns, {tz}]"
# scalar
s = orig.copy()
@@ -509,7 +508,7 @@ def test_setitem_with_tz_dst():
[pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)],
index=[1, 2],
)
- assert vals.dtype == "datetime64[ns, {0}]".format(tz)
+ assert vals.dtype == f"datetime64[ns, {tz}]"
s[[1, 2]] = vals
exp = pd.Series(
@@ -883,41 +882,6 @@ def test_pop():
tm.assert_series_equal(k, expected)
-def test_take():
- s = Series([-1, 5, 6, 2, 4])
-
- actual = s.take([1, 3, 4])
- expected = Series([5, 2, 4], index=[1, 3, 4])
- tm.assert_series_equal(actual, expected)
-
- actual = s.take([-1, 3, 4])
- expected = Series([4, 2, 4], index=[4, 3, 4])
- tm.assert_series_equal(actual, expected)
-
- msg = "index {} is out of bounds for( axis 0 with)? size 5"
- with pytest.raises(IndexError, match=msg.format(10)):
- s.take([1, 10])
- with pytest.raises(IndexError, match=msg.format(5)):
- s.take([2, 5])
-
-
-def test_take_categorical():
- # https://github.com/pandas-dev/pandas/issues/20664
- s = Series(pd.Categorical(["a", "b", "c"]))
- result = s.take([-2, -2, 0])
- expected = Series(
- pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0]
- )
- tm.assert_series_equal(result, expected)
-
-
-def test_head_tail(string_series):
- tm.assert_series_equal(string_series.head(), string_series[:5])
- tm.assert_series_equal(string_series.head(0), string_series[0:0])
- tm.assert_series_equal(string_series.tail(), string_series[-5:])
- tm.assert_series_equal(string_series.tail(0), string_series[0:0])
-
-
def test_uint_drop(any_int_dtype):
# see GH18311
# assigning series.loc[0] = 4 changed series.dtype to int
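The message updates in this file reflect numpy's own `IndexError` text now propagating instead of pandas' old generic "index out of bounds". A small sketch of the asserted behaviour, reusing the test's series:

```python
import numpy as np
import pytest
from pandas import Series

ser = Series(np.random.randn(5), index=["a", "b", "a", "a", "b"])

# Position 5 is past the end of a length-5 Series, so numpy's message
# surfaces verbatim.
msg = "index 5 is out of bounds for axis 0 with size 5"
with pytest.raises(IndexError, match=msg):
    ser[5]
```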
diff --git a/pandas/tests/series/indexing/test_mask.py b/pandas/tests/series/indexing/test_mask.py
new file mode 100644
index 0000000000000..dc4fb530dbb52
--- /dev/null
+++ b/pandas/tests/series/indexing/test_mask.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pytest
+
+from pandas import Series
+import pandas._testing as tm
+
+
+def test_mask():
+ # compare with tested results in test_where
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.where(~cond, np.nan)
+ tm.assert_series_equal(rs, s.mask(cond))
+
+ rs = s.where(~cond)
+ rs2 = s.mask(cond)
+ tm.assert_series_equal(rs, rs2)
+
+ rs = s.where(~cond, -s)
+ rs2 = s.mask(cond, -s)
+ tm.assert_series_equal(rs, rs2)
+
+ cond = Series([True, False, False, True, False], index=s.index)
+ s2 = -(s.abs())
+ rs = s2.where(~cond[:3])
+ rs2 = s2.mask(cond[:3])
+ tm.assert_series_equal(rs, rs2)
+
+ rs = s2.where(~cond[:3], -s2)
+ rs2 = s2.mask(cond[:3], -s2)
+ tm.assert_series_equal(rs, rs2)
+
+ msg = "Array conditional must be same shape as self"
+ with pytest.raises(ValueError, match=msg):
+ s.mask(1)
+ with pytest.raises(ValueError, match=msg):
+ s.mask(cond[:3].values, -s)
+
+ # dtype changes
+ s = Series([1, 2, 3, 4])
+ result = s.mask(s > 2, np.nan)
+ expected = Series([1, 2, np.nan, np.nan])
+ tm.assert_series_equal(result, expected)
+
+ # see gh-21891
+ s = Series([1, 2])
+ res = s.mask([True, False])
+
+ exp = Series([np.nan, 2])
+ tm.assert_series_equal(res, exp)
+
+
+def test_mask_inplace():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.copy()
+ rs.mask(cond, inplace=True)
+ tm.assert_series_equal(rs.dropna(), s[~cond])
+ tm.assert_series_equal(rs, s.mask(cond))
+
+ rs = s.copy()
+ rs.mask(cond, -s, inplace=True)
+ tm.assert_series_equal(rs, s.mask(cond, -s))
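The module's premise, stated compactly: `mask` is the boolean complement of `where`. A minimal sketch of that identity (the data here is arbitrary):

```python
import numpy as np
from pandas import Series
import pandas._testing as tm

ser = Series(np.arange(5, dtype="float64"))
cond = ser > 2

# mask hides values where cond is True; where keeps values where the
# negated condition is True -- the two results are identical.
tm.assert_series_equal(ser.mask(cond), ser.where(~cond))
tm.assert_series_equal(ser.mask(cond, -ser), ser.where(~cond, -ser))
```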
diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py
index 3684ca00c2f17..7e73e6366438b 100644
--- a/pandas/tests/series/indexing/test_numeric.py
+++ b/pandas/tests/series/indexing/test_numeric.py
@@ -1,141 +1,10 @@
import numpy as np
import pytest
-import pandas as pd
from pandas import DataFrame, Index, Series
import pandas._testing as tm
-def test_get():
- # GH 6383
- s = Series(
- np.array(
- [
- 43,
- 48,
- 60,
- 48,
- 50,
- 51,
- 50,
- 45,
- 57,
- 48,
- 56,
- 45,
- 51,
- 39,
- 55,
- 43,
- 54,
- 52,
- 51,
- 54,
- ]
- )
- )
-
- result = s.get(25, 0)
- expected = 0
- assert result == expected
-
- s = Series(
- np.array(
- [
- 43,
- 48,
- 60,
- 48,
- 50,
- 51,
- 50,
- 45,
- 57,
- 48,
- 56,
- 45,
- 51,
- 39,
- 55,
- 43,
- 54,
- 52,
- 51,
- 54,
- ]
- ),
- index=pd.Float64Index(
- [
- 25.0,
- 36.0,
- 49.0,
- 64.0,
- 81.0,
- 100.0,
- 121.0,
- 144.0,
- 169.0,
- 196.0,
- 1225.0,
- 1296.0,
- 1369.0,
- 1444.0,
- 1521.0,
- 1600.0,
- 1681.0,
- 1764.0,
- 1849.0,
- 1936.0,
- ]
- ),
- )
-
- result = s.get(25, 0)
- expected = 43
- assert result == expected
-
- # GH 7407
- # with a boolean accessor
- df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3})
- vc = df.i.value_counts()
- result = vc.get(99, default="Missing")
- assert result == "Missing"
-
- vc = df.b.value_counts()
- result = vc.get(False, default="Missing")
- assert result == 3
-
- result = vc.get(True, default="Missing")
- assert result == "Missing"
-
-
-def test_get_nan():
- # GH 8569
- s = pd.Float64Index(range(10)).to_series()
- assert s.get(np.nan) is None
- assert s.get(np.nan, default="Missing") == "Missing"
-
-
-def test_get_nan_multiple():
- # GH 8569
- # ensure that fixing "test_get_nan" above hasn't broken get
- # with multiple elements
- s = pd.Float64Index(range(10)).to_series()
-
- idx = [2, 30]
- assert s.get(idx) is None
-
- idx = [2, np.nan]
- assert s.get(idx) is None
-
- # GH 17295 - all missing keys
- idx = [20, 30]
- assert s.get(idx) is None
-
- idx = [np.nan, np.nan]
- assert s.get(idx) is None
-
-
def test_delitem():
# GH 5542
# should delete the item inplace
@@ -202,10 +71,9 @@ def test_slice_float64():
def test_getitem_negative_out_of_bounds():
s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
- msg = "index out of bounds"
+ msg = "index -11 is out of bounds for axis 0 with size 10"
with pytest.raises(IndexError, match=msg):
s[-11]
- msg = "index -11 is out of bounds for axis 0 with size 10"
with pytest.raises(IndexError, match=msg):
s[-11] = "foo"
@@ -260,9 +128,8 @@ def test_setitem_float_labels():
def test_slice_float_get_set(datetime_series):
msg = (
- r"cannot do slice indexing on with these indexers \[{key}\] "
- r"of "
+ "cannot do slice indexing on DatetimeIndex with these indexers "
+ r"\[{key}\] of type float"
)
with pytest.raises(TypeError, match=msg.format(key=r"4\.0")):
datetime_series[4.0:10.0]
diff --git a/pandas/tests/series/indexing/test_take.py b/pandas/tests/series/indexing/test_take.py
new file mode 100644
index 0000000000000..9368d49e5ff2b
--- /dev/null
+++ b/pandas/tests/series/indexing/test_take.py
@@ -0,0 +1,33 @@
+import pytest
+
+import pandas as pd
+from pandas import Series
+import pandas._testing as tm
+
+
+def test_take():
+ ser = Series([-1, 5, 6, 2, 4])
+
+ actual = ser.take([1, 3, 4])
+ expected = Series([5, 2, 4], index=[1, 3, 4])
+ tm.assert_series_equal(actual, expected)
+
+ actual = ser.take([-1, 3, 4])
+ expected = Series([4, 2, 4], index=[4, 3, 4])
+ tm.assert_series_equal(actual, expected)
+
+ msg = "index {} is out of bounds for( axis 0 with)? size 5"
+ with pytest.raises(IndexError, match=msg.format(10)):
+ ser.take([1, 10])
+ with pytest.raises(IndexError, match=msg.format(5)):
+ ser.take([2, 5])
+
+
+def test_take_categorical():
+ # https://github.com/pandas-dev/pandas/issues/20664
+ ser = Series(pd.Categorical(["a", "b", "c"]))
+ result = ser.take([-2, -2, 0])
+ expected = Series(
+ pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0]
+ )
+ tm.assert_series_equal(result, expected)
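As a usage note on the relocated tests: `take` is purely positional, negative positions count from the end, and the original labels travel with the taken values. A sketch using the test's series:

```python
from pandas import Series

ser = Series([-1, 5, 6, 2, 4])

print(ser.take([1, 3, 4]).tolist())  # [5, 2, 4], index [1, 3, 4]
print(ser.take([-1, -2]).tolist())   # [4, 2], index [4, 3]
```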
diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py
new file mode 100644
index 0000000000000..9703f5afaf689
--- /dev/null
+++ b/pandas/tests/series/indexing/test_where.py
@@ -0,0 +1,437 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_integer
+
+import pandas as pd
+from pandas import Series, Timestamp, date_range, isna
+import pandas._testing as tm
+
+
+def test_where_unsafe_int(sint_dtype):
+ s = Series(np.arange(10), dtype=sint_dtype)
+ mask = s < 5
+
+ s[mask] = range(2, 7)
+ expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype=sint_dtype)
+
+ tm.assert_series_equal(s, expected)
+
+
+def test_where_unsafe_float(float_dtype):
+ s = Series(np.arange(10), dtype=float_dtype)
+ mask = s < 5
+
+ s[mask] = range(2, 7)
+ data = list(range(2, 7)) + list(range(5, 10))
+ expected = Series(data, dtype=float_dtype)
+
+ tm.assert_series_equal(s, expected)
+
+
+@pytest.mark.parametrize(
+ "dtype,expected_dtype",
+ [
+ (np.int8, np.float64),
+ (np.int16, np.float64),
+ (np.int32, np.float64),
+ (np.int64, np.float64),
+ (np.float32, np.float32),
+ (np.float64, np.float64),
+ ],
+)
+def test_where_unsafe_upcast(dtype, expected_dtype):
+ # see gh-9743
+ s = Series(np.arange(10), dtype=dtype)
+ values = [2.5, 3.5, 4.5, 5.5, 6.5]
+ mask = s < 5
+ expected = Series(values + list(range(5, 10)), dtype=expected_dtype)
+ s[mask] = values
+ tm.assert_series_equal(s, expected)
+
+
+def test_where_unsafe():
+ # see gh-9731
+ s = Series(np.arange(10), dtype="int64")
+ values = [2.5, 3.5, 4.5, 5.5]
+
+ mask = s > 5
+ expected = Series(list(range(6)) + values, dtype="float64")
+
+ s[mask] = values
+ tm.assert_series_equal(s, expected)
+
+ # see gh-3235
+ s = Series(np.arange(10), dtype="int64")
+ mask = s < 5
+ s[mask] = range(2, 7)
+ expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64")
+ tm.assert_series_equal(s, expected)
+ assert s.dtype == expected.dtype
+
+ s = Series(np.arange(10), dtype="int64")
+ mask = s > 5
+ s[mask] = [0] * 4
+ expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64")
+ tm.assert_series_equal(s, expected)
+
+ s = Series(np.arange(10))
+ mask = s > 5
+
+ msg = "cannot assign mismatch length to masked array"
+ with pytest.raises(ValueError, match=msg):
+ s[mask] = [5, 4, 3, 2, 1]
+
+ with pytest.raises(ValueError, match=msg):
+ s[mask] = [0] * 5
+
+ # dtype changes
+ s = Series([1, 2, 3, 4])
+ result = s.where(s > 2, np.nan)
+ expected = Series([np.nan, np.nan, 3, 4])
+ tm.assert_series_equal(result, expected)
+
+ # GH 4667
+ # setting with None changes dtype
+ s = Series(range(10)).astype(float)
+ s[8] = None
+ result = s[8]
+ assert isna(result)
+
+ s = Series(range(10)).astype(float)
+ s[s > 8] = None
+ result = s[isna(s)]
+ expected = Series(np.nan, index=[9])
+ tm.assert_series_equal(result, expected)
+
+
+def test_where():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.where(cond).dropna()
+ rs2 = s[cond]
+ tm.assert_series_equal(rs, rs2)
+
+ rs = s.where(cond, -s)
+ tm.assert_series_equal(rs, s.abs())
+
+ rs = s.where(cond)
+ assert s.shape == rs.shape
+ assert rs is not s
+
+ # test alignment
+ cond = Series([True, False, False, True, False], index=s.index)
+ s2 = -(s.abs())
+
+ expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index)
+ rs = s2.where(cond[:3])
+ tm.assert_series_equal(rs, expected)
+
+ expected = s2.abs()
+ expected.iloc[0] = s2[0]
+ rs = s2.where(cond[:3], -s2)
+ tm.assert_series_equal(rs, expected)
+
+
+def test_where_error():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ msg = "Array conditional must be same shape as self"
+ with pytest.raises(ValueError, match=msg):
+ s.where(1)
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond[:3].values, -s)
+
+ # GH 2745
+ s = Series([1, 2])
+ s[[True, False]] = [0, 1]
+ expected = Series([0, 2])
+ tm.assert_series_equal(s, expected)
+
+ # failures
+ msg = "cannot assign mismatch length to masked array"
+ with pytest.raises(ValueError, match=msg):
+ s[[True, False]] = [0, 2, 3]
+ msg = (
+ "NumPy boolean array indexing assignment cannot assign 0 input "
+ "values to the 1 output values where the mask is true"
+ )
+ with pytest.raises(ValueError, match=msg):
+ s[[True, False]] = []
+
+
+@pytest.mark.parametrize("klass", [list, tuple, np.array, Series])
+def test_where_array_like(klass):
+ # see gh-15414
+ s = Series([1, 2, 3])
+ cond = [False, True, True]
+ expected = Series([np.nan, 2, 3])
+
+ result = s.where(klass(cond))
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "cond",
+ [
+ [1, 0, 1],
+ Series([2, 5, 7]),
+ ["True", "False", "True"],
+ [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")],
+ ],
+)
+def test_where_invalid_input(cond):
+ # see gh-15414: only boolean arrays accepted
+ s = Series([1, 2, 3])
+ msg = "Boolean array expected for the condition"
+
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond)
+
+ msg = "Array conditional must be same shape as self"
+ with pytest.raises(ValueError, match=msg):
+ s.where([True])
+
+
+def test_where_ndframe_align():
+ msg = "Array conditional must be same shape as self"
+ s = Series([1, 2, 3])
+
+ cond = [True]
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond)
+
+ expected = Series([1, np.nan, np.nan])
+
+ out = s.where(Series(cond))
+ tm.assert_series_equal(out, expected)
+
+ cond = np.array([False, True, False, True])
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond)
+
+ expected = Series([np.nan, 2, np.nan])
+
+ out = s.where(Series(cond))
+ tm.assert_series_equal(out, expected)
+
+
+def test_where_setitem_invalid():
+ # GH 2702
+ # make sure correct exceptions are raised on invalid list assignment
+
+ msg = "cannot set using a {} indexer with a different length than the value"
+
+ # slice
+ s = Series(list("abc"))
+
+ with pytest.raises(ValueError, match=msg.format("slice")):
+ s[0:3] = list(range(27))
+
+ s[0:3] = list(range(3))
+ expected = Series([0, 1, 2])
+ tm.assert_series_equal(s.astype(np.int64), expected)
+
+ # slice with step
+ s = Series(list("abcdef"))
+
+ with pytest.raises(ValueError, match=msg.format("slice")):
+ s[0:4:2] = list(range(27))
+
+ s = Series(list("abcdef"))
+ s[0:4:2] = list(range(2))
+ expected = Series([0, "b", 1, "d", "e", "f"])
+ tm.assert_series_equal(s, expected)
+
+ # neg slices
+ s = Series(list("abcdef"))
+
+ with pytest.raises(ValueError, match=msg.format("slice")):
+ s[:-1] = list(range(27))
+
+ s[-3:-1] = list(range(2))
+ expected = Series(["a", "b", "c", 0, 1, "f"])
+ tm.assert_series_equal(s, expected)
+
+ # list
+ s = Series(list("abc"))
+
+ with pytest.raises(ValueError, match=msg.format("list-like")):
+ s[[0, 1, 2]] = list(range(27))
+
+ s = Series(list("abc"))
+
+ with pytest.raises(ValueError, match=msg.format("list-like")):
+ s[[0, 1, 2]] = list(range(2))
+
+ # scalar
+ s = Series(list("abc"))
+ s[0] = list(range(10))
+ expected = Series([list(range(10)), "b", "c"])
+ tm.assert_series_equal(s, expected)
+
+
+@pytest.mark.parametrize("size", range(2, 6))
+@pytest.mark.parametrize(
+ "mask", [[True, False, False, False, False], [True, False], [False]]
+)
+@pytest.mark.parametrize(
+ "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min]
+)
+# Test numpy arrays, lists and tuples as the input to be
+# broadcast
+@pytest.mark.parametrize(
+ "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)]
+)
+def test_broadcast(size, mask, item, box):
+ selection = np.resize(mask, size)
+
+ data = np.arange(size, dtype=float)
+
+ # Construct the expected series by taking the source
+ # data or item based on the selection
+ expected = Series(
+ [item if use_item else data[i] for i, use_item in enumerate(selection)]
+ )
+
+ s = Series(data)
+ s[selection] = box(item)
+ tm.assert_series_equal(s, expected)
+
+ s = Series(data)
+ result = s.where(~selection, box(item))
+ tm.assert_series_equal(result, expected)
+
+ s = Series(data)
+ result = s.mask(selection, box(item))
+ tm.assert_series_equal(result, expected)
+
+
+def test_where_inplace():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.copy()
+
+ rs.where(cond, inplace=True)
+ tm.assert_series_equal(rs.dropna(), s[cond])
+ tm.assert_series_equal(rs, s.where(cond))
+
+ rs = s.copy()
+ rs.where(cond, -s, inplace=True)
+ tm.assert_series_equal(rs, s.where(cond, -s))
+
+
+def test_where_dups():
+ # GH 4550
+ # where crashes with dups in index
+ s1 = Series(list(range(3)))
+ s2 = Series(list(range(3)))
+ comb = pd.concat([s1, s2])
+ result = comb.where(comb < 2)
+ expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2])
+ tm.assert_series_equal(result, expected)
+
+ # GH 4548
+ # inplace updating not working with dups
+ comb[comb < 1] = 5
+ expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2])
+ tm.assert_series_equal(comb, expected)
+
+ comb[comb < 2] += 10
+ expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2])
+ tm.assert_series_equal(comb, expected)
+
+
+def test_where_numeric_with_string():
+ # GH 9280
+ s = pd.Series([1, 2, 3])
+ w = s.where(s > 1, "X")
+
+ assert not is_integer(w[0])
+ assert is_integer(w[1])
+ assert is_integer(w[2])
+ assert isinstance(w[0], str)
+ assert w.dtype == "object"
+
+ w = s.where(s > 1, ["X", "Y", "Z"])
+ assert not is_integer(w[0])
+ assert is_integer(w[1])
+ assert is_integer(w[2])
+ assert isinstance(w[0], str)
+ assert w.dtype == "object"
+
+ w = s.where(s > 1, np.array(["X", "Y", "Z"]))
+ assert not is_integer(w[0])
+ assert is_integer(w[1])
+ assert is_integer(w[2])
+ assert isinstance(w[0], str)
+ assert w.dtype == "object"
+
+
+def test_where_timedelta_coerce():
+ s = Series([1, 2], dtype="timedelta64[ns]")
+ expected = Series([10, 10])
+ mask = np.array([False, False])
+
+ rs = s.where(mask, [10, 10])
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10)
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10.0)
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, 10.0])
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, np.nan])
+ expected = Series([10, None], dtype="object")
+ tm.assert_series_equal(rs, expected)
+
+
+def test_where_datetime_conversion():
+ s = Series(date_range("20130102", periods=2))
+ expected = Series([10, 10])
+ mask = np.array([False, False])
+
+ rs = s.where(mask, [10, 10])
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10)
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10.0)
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, 10.0])
+ tm.assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, np.nan])
+ expected = Series([10, None], dtype="object")
+ tm.assert_series_equal(rs, expected)
+
+ # GH 15701
+ timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"]
+ s = Series([pd.Timestamp(t) for t in timestamps])
+ rs = s.where(Series([False, True]))
+ expected = Series([pd.NaT, s[1]])
+ tm.assert_series_equal(rs, expected)
+
+
+def test_where_dt_tz_values(tz_naive_fixture):
+ ser1 = pd.Series(
+ pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture)
+ )
+ ser2 = pd.Series(
+ pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture)
+ )
+ mask = pd.Series([True, True, False])
+ result = ser1.where(mask, ser2)
+ exp = pd.Series(
+ pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture)
+ )
+ tm.assert_series_equal(exp, result)
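Among the moved tests, the upcast rule from gh-9743 is the easiest to trip over in practice; a condensed sketch, with int64 chosen as a representative input dtype:

```python
import numpy as np
from pandas import Series

ser = Series(np.arange(10), dtype=np.int64)
ser[ser < 5] = [2.5, 3.5, 4.5, 5.5, 6.5]

# Masked assignment of floats upcasts the integer Series to float64.
assert ser.dtype == np.float64
assert ser.tolist()[:5] == [2.5, 3.5, 4.5, 5.5, 6.5]
```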
diff --git a/pandas/tests/series/indexing/test_xs.py b/pandas/tests/series/indexing/test_xs.py
new file mode 100644
index 0000000000000..43458ca2ebeb2
--- /dev/null
+++ b/pandas/tests/series/indexing/test_xs.py
@@ -0,0 +1,17 @@
+import numpy as np
+
+import pandas as pd
+
+
+def test_xs_datetimelike_wrapping():
+ # GH#31630 a case where we shouldn't wrap datetime64 in Timestamp
+ arr = pd.date_range("2016-01-01", periods=3)._data._data
+
+ ser = pd.Series(arr, dtype=object)
+ for i in range(len(ser)):
+ ser.iloc[i] = arr[i]
+ assert ser.dtype == object
+ assert isinstance(ser[0], np.datetime64)
+
+ result = ser.xs(0)
+ assert isinstance(result, np.datetime64)
diff --git a/pandas/tests/series/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
similarity index 100%
rename from pandas/tests/series/test_convert_dtypes.py
rename to pandas/tests/series/methods/test_convert_dtypes.py
diff --git a/pandas/tests/series/methods/test_head_tail.py b/pandas/tests/series/methods/test_head_tail.py
new file mode 100644
index 0000000000000..d9f8d85eda350
--- /dev/null
+++ b/pandas/tests/series/methods/test_head_tail.py
@@ -0,0 +1,8 @@
+import pandas._testing as tm
+
+
+def test_head_tail(string_series):
+ tm.assert_series_equal(string_series.head(), string_series[:5])
+ tm.assert_series_equal(string_series.head(0), string_series[0:0])
+ tm.assert_series_equal(string_series.tail(), string_series[-5:])
+ tm.assert_series_equal(string_series.tail(0), string_series[0:0])
diff --git a/pandas/tests/series/test_reshaping.py b/pandas/tests/series/methods/test_unstack.py
similarity index 100%
rename from pandas/tests/series/test_reshaping.py
rename to pandas/tests/series/methods/test_unstack.py
diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py
index 239353d3955b4..4cb471597b67a 100644
--- a/pandas/tests/series/test_combine_concat.py
+++ b/pandas/tests/series/test_combine_concat.py
@@ -4,7 +4,7 @@
import pytest
import pandas as pd
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, to_datetime
import pandas._testing as tm
@@ -252,7 +252,6 @@ def test_concat_empty_series_dtypes(self):
assert result.dtype == expected
def test_combine_first_dt64(self):
- from pandas.core.tools.datetimes import to_datetime
s0 = to_datetime(Series(["2010", np.NaN]))
s1 = to_datetime(Series([np.NaN, "2011"]))
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 640cd8faf6811..b377ca2869bd3 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -2534,3 +2534,29 @@ def test_sort_ascending_list(self):
result = s.sort_index(level=["third", "first"], ascending=[False, True])
expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "keys, expected",
+ [
+ (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]),
+ (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]),
+ ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]),
+ ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]),
+ ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]),
+ ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]),
+ ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]),
+ ],
+ )
+ @pytest.mark.parametrize("dim", ["index", "columns"])
+ def test_multilevel_index_loc_order(self, dim, keys, expected):
+ # GH 22797
+ # Try to respect order of keys given for MultiIndex.loc
+ kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]}
+        df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs)
+ exp_index = MultiIndex.from_arrays(expected)
+ if dim == "index":
+ res = df.loc[keys, :]
+ tm.assert_index_equal(res.index, exp_index)
+ elif dim == "columns":
+ res = df.loc[:, keys]
+ tm.assert_index_equal(res.columns, exp_index)
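A sketch of the GH 22797 behaviour the new test pins down: `.loc` on a MultiIndex returns rows in the caller's key order, not the index's stored order (frame shape borrowed from the test):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.arange(25).reshape(5, 5),
    index=[["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]],
)

# Keys requested as ["b", "a"] come back b-first, a-second.
res = df.loc[["b", "a"], :]
print(list(res.index))  # [('b', 1), ('b', 2), ('a', 1), ('a', 2)]
```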
diff --git a/pandas/tests/tseries/frequencies/test_to_offset.py b/pandas/tests/tseries/frequencies/test_to_offset.py
index b6069c446160d..beaefe9109e91 100644
--- a/pandas/tests/tseries/frequencies/test_to_offset.py
+++ b/pandas/tests/tseries/frequencies/test_to_offset.py
@@ -86,7 +86,7 @@ def test_to_offset_invalid(freqstr):
# We escape string because some of our
# inputs contain regex special characters.
- msg = re.escape("Invalid frequency: {freqstr}".format(freqstr=freqstr))
+ msg = re.escape(f"Invalid frequency: {freqstr}")
with pytest.raises(ValueError, match=msg):
frequencies.to_offset(freqstr)
diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py
index 6a19adef728e4..8860e6fe272ce 100644
--- a/pandas/tests/util/test_util.py
+++ b/pandas/tests/util/test_util.py
@@ -76,3 +76,8 @@ def test_rng_context():
with tm.RNGContext(1):
assert np.random.randn() == expected1
assert np.random.randn() == expected0
+
+
+def test_external_error_raised():
+ with tm.external_error_raised(TypeError):
+ raise TypeError("Should not check this error message, so it will pass")
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index 2801a2bf9c371..fdfa436ce6536 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -43,7 +43,8 @@ def get_sys_info() -> List[Tuple[str, Optional[Union[str, int]]]]:
("python-bits", struct.calcsize("P") * 8),
("OS", f"{sysname}"),
("OS-release", f"{release}"),
- # ("Version", "{version}".format(version=version)),
+        # FIXME: don't leave commented-out
+ # ("Version", f"{version}"),
("machine", f"{machine}"),
("processor", f"{processor}"),
("byteorder", f"{sys.byteorder}"),
@@ -114,14 +115,13 @@ def show_versions(as_json=False):
else:
maxlen = max(len(x) for x in deps)
- tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen)
print("\nINSTALLED VERSIONS")
print("------------------")
for k, stat in sys_info:
- print(tpl.format(k=k, stat=stat))
+ print(f"{{k:<{maxlen}}}: {{stat}}")
print("")
for k, stat in deps_blob:
- print(tpl.format(k=k, stat=stat))
+ print(f"{{k:<{maxlen}}}: {{stat}}")
def main() -> int:
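With the f-strings fixed as above, the nested-brace form is worth spelling out: single braces interpolate, doubled braces print literally, and an inner expression can supply the field width. A standalone sketch with made-up version strings:

```python
deps = [("pandas", "1.0.1"), ("numpy", "1.18.1")]
maxlen = max(len(k) for k, _ in deps)

for k, stat in deps:
    # {k:<{maxlen}} left-aligns k in a field whose width comes from the
    # nested {maxlen}; writing f"{{k}}" would print the literal text "{k}".
    print(f"{k:<{maxlen}}: {stat}")
# pandas: 1.0.1
# numpy : 1.18.1
```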
diff --git a/setup.cfg b/setup.cfg
index cf931f52489a8..c298aa652824c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -138,9 +138,6 @@ ignore_errors=True
[mypy-pandas.tests.extension.decimal.test_decimal]
ignore_errors=True
-[mypy-pandas.tests.extension.json.array]
-ignore_errors=True
-
[mypy-pandas.tests.extension.json.test_json]
ignore_errors=True
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index ef0b2a0270a0b..83eb152c9d944 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -35,15 +35,7 @@ navbar:
- name: "Getting started"
target: /getting_started.html
- name: "Documentation"
- target:
- - name: "User guide"
- target: /docs/user_guide/index.html
- - name: "API reference"
- target: /docs/reference/index.html
- - name: "Release notes"
- target: /docs/whatsnew/index.html
- - name: "Older versions"
- target: https://pandas.pydata.org/pandas-docs/version/
+ target: /docs/
- name: "Community"
target:
- name: "Blog"
diff --git a/web/pandas/index.html b/web/pandas/index.html
index fedb0b0c5f712..83d0f48197033 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -63,7 +63,7 @@ With the support of:
{% if releases %}
Latest version: {{ releases[0].name }}