table
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 180de1df53f9e..0b4ddaaa8a42a 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -88,19 +88,13 @@ or a list/array of strings:
pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"])
-The ``unit`` keyword argument specifies the unit of the Timedelta if the input
-is numeric:
+The ``unit`` keyword argument specifies the unit of the Timedelta:
.. ipython:: python
pd.to_timedelta(np.arange(5), unit="s")
pd.to_timedelta(np.arange(5), unit="d")
-.. warning::
- If a string or array of strings is passed as an input then the ``unit`` keyword
- argument will be ignored. If a string without units is passed then the default
- unit of nanoseconds is assumed.
-
.. _timedeltas.limitations:
Timedelta limitations
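As a minimal sketch of the ``unit`` keyword covered in the ``timedeltas.rst`` hunk above (assuming a recent pandas release with NumPy available; the values and comments are illustrative only):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Numeric input: every value is interpreted in the requested unit
    pd.to_timedelta(np.arange(5), unit="s")   # 0 through 4 seconds
    pd.to_timedelta(np.arange(5), unit="d")   # 0 through 4 days

    # String input already carries its own units and is parsed as written
    pd.to_timedelta("1 days 06:05:01.00003")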
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
index 3e533cbadc5f7..f7e219ab23e39 100644
--- a/doc/source/user_guide/window.rst
+++ b/doc/source/user_guide/window.rst
@@ -37,14 +37,14 @@ pandas supports 4 types of windowing operations:
#. Expanding window: Accumulating window over the values.
#. Exponentially Weighted window: Accumulating and exponentially weighted window over the values.
-============================= ================= =========================== =========================== ======================== =================================== ===========================
-Concept Method Returned Object Supports time-based windows Supports chained groupby Supports table method Supports online operations
-============================= ================= =========================== =========================== ======================== =================================== ===========================
-Rolling window ``rolling`` ``Rolling`` Yes Yes Yes (as of version 1.3) No
-Weighted window ``rolling`` ``Window`` No No No No
-Expanding window ``expanding`` ``Expanding`` No Yes Yes (as of version 1.3) No
-Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) No Yes (as of version 1.3)
-============================= ================= =========================== =========================== ======================== =================================== ===========================
+============================= ================= =========================== =========================== ======================== ===================================
+Concept Method Returned Object Supports time-based windows Supports chained groupby Supports table method
+============================= ================= =========================== =========================== ======================== ===================================
+Rolling window ``rolling`` ``Rolling`` Yes Yes Yes (as of version 1.3)
+Weighted window ``rolling`` ``Window`` No No No
+Expanding window ``expanding`` ``Expanding`` No Yes Yes (as of version 1.3)
+Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) No
+============================= ================= =========================== =========================== ======================== ===================================
As noted above, some operations support specifying a window based on a time offset:
@@ -98,26 +98,6 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei
df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])
df.rolling(2, method="table", min_periods=0).apply(weighted_mean, raw=True, engine="numba") # noqa:E501
-.. versionadded:: 1.3
-
-Some windowing operations also support an ``online`` method after constructing a windowing object
-which returns a new object that supports passing in new :class:`DataFrame` or :class:`Series` objects
-to continue the windowing calculation with the new values (i.e. online calculations).
-
-The methods on this new windowing objects must call the aggregation method first to "prime" the initial
-state of the online calculation. Then, new :class:`DataFrame` or :class:`Series` objects can be passed in
-the ``update`` argument to continue the windowing calculation.
-
-.. ipython:: python
-
- df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])
- df.ewm(0.5).mean()
-
-.. ipython:: python
-
- online_ewm = df.head(2).ewm(0.5).online()
- online_ewm.mean()
- online_ewm.mean(update=df.tail(1))
All windowing operations support a ``min_periods`` argument that dictates the minimum amount of
non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``.
@@ -262,24 +242,26 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other
.. code-block:: ipython
In [2]: from pandas.api.indexers import BaseIndexer
-
- In [3]: class CustomIndexer(BaseIndexer):
- ...: def get_window_bounds(self, num_values, min_periods, center, closed):
- ...: start = np.empty(num_values, dtype=np.int64)
- ...: end = np.empty(num_values, dtype=np.int64)
- ...: for i in range(num_values):
- ...: if self.use_expanding[i]:
- ...: start[i] = 0
- ...: end[i] = i + 1
- ...: else:
- ...: start[i] = i
- ...: end[i] = i + self.window_size
- ...: return start, end
-
- In [4]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
-
- In [5]: df.rolling(indexer).sum()
- Out[5]:
+ ...:
+ ...: class CustomIndexer(BaseIndexer):
+ ...:
+ ...: def get_window_bounds(self, num_values, min_periods, center, closed):
+ ...: start = np.empty(num_values, dtype=np.int64)
+ ...: end = np.empty(num_values, dtype=np.int64)
+ ...: for i in range(num_values):
+ ...: if self.use_expanding[i]:
+ ...: start[i] = 0
+ ...: end[i] = i + 1
+ ...: else:
+ ...: start[i] = i
+ ...: end[i] = i + self.window_size
+ ...: return start, end
+ ...:
+
+ In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+ In [4]: df.rolling(indexer).sum()
+ Out[4]:
values
0 0.0
1 1.0
@@ -363,21 +345,45 @@ Numba engine
Additionally, :meth:`~Rolling.apply` can leverage `Numba
`__
if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying
``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
-See :ref:`enhancing performance with Numba
` for general usage of the arguments and performance considerations.
-
Numba will be applied in potentially two routines:
#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
#. The engine will JIT the for loop where the apply function is applied to each window.
+.. versionadded:: 1.3.0
+
+``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments.
+
The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
`numba.jit decorator `__.
These keyword arguments will be applied to *both* the passed function (if a standard Python function)
-and the apply for loop over each window.
+and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported,
+and their default values are set to ``False``, ``True`` and ``False`` respectively.
-.. versionadded:: 1.3.0
+.. note::
-``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments.
+ In terms of performance, **the first time a function is run using the Numba engine will be slow**
+ as Numba will have some function compilation overhead. However, the compiled functions are cached,
+ and subsequent calls will be fast. In general, the Numba engine is performant with
+ a larger amount of data points (e.g. 1+ million).
+
+.. code-block:: ipython
+
+ In [1]: data = pd.Series(range(1_000_000))
+
+ In [2]: roll = data.rolling(10)
+
+ In [3]: def f(x):
+ ...: return np.sum(x) + 5
+ # Run the first time, compilation time will affect performance
+ In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225, E999
+ 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
+ # Function is cached and performance will improve
+ In [5]: %timeit roll.apply(f, engine='numba', raw=True)
+ 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+ In [6]: %timeit roll.apply(f, engine='cython', raw=True)
+ 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
.. _window.cov_corr:
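As a hedged sketch of the ``engine``/``engine_kwargs`` usage described in the ``window.rst`` hunks above (assuming the optional ``numba`` dependency is installed; the option values shown are simply the defaults named in the text):

.. code-block:: python

    import numpy as np
    import pandas as pd

    data = pd.Series(range(1_000_000))
    roll = data.rolling(10)

    def f(x):
        return np.sum(x) + 5

    # Only nogil, nopython and parallel are forwarded to numba.jit; they apply
    # both to ``f`` and to the loop that evaluates it over each window.
    kwargs = {"nopython": True, "nogil": False, "parallel": False}
    roll.apply(f, engine="numba", engine_kwargs=kwargs, raw=True)

    # The built-in window aggregations accept the same arguments (pandas >= 1.3)
    roll.mean(engine="numba", engine_kwargs=kwargs)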
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 1f04eb6f68ae8..986cf43b80494 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -10,21 +10,12 @@ This is the list of changes to pandas between each release. For full details,
see the `commit logs `_. For install and
upgrade instructions, see :ref:`install`.
-Version 1.4
------------
-
-.. toctree::
- :maxdepth: 2
-
- v1.4.0
-
Version 1.3
-----------
.. toctree::
:maxdepth: 2
- v1.3.1
v1.3.0
Version 1.2
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 03dfe475475a1..b87274307431b 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -338,20 +338,19 @@ maps labels to their new names along the default axis, is allowed to be passed b
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: df = pd.DataFrame([[1]])
- In [2]: df.rename({0: 1}, {0: 2})
- Out[2]:
+ >>> df = pd.DataFrame([[1]])
+ >>> df.rename({0: 1}, {0: 2})
FutureWarning: ...Use named arguments to resolve ambiguity...
2
1 1
*pandas 1.0.0*
-.. code-block:: ipython
+.. code-block:: python
- In [3]: df.rename({0: 1}, {0: 2})
+ >>> df.rename({0: 1}, {0: 2})
Traceback (most recent call last):
...
TypeError: rename() takes from 1 to 2 positional arguments but 3 were given
@@ -360,28 +359,26 @@ Note that errors will now be raised when conflicting or potentially ambiguous ar
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [4]: df.rename({0: 1}, index={0: 2})
- Out[4]:
+ >>> df.rename({0: 1}, index={0: 2})
0
1 1
- In [5]: df.rename(mapper={0: 1}, index={0: 2})
- Out[5]:
+ >>> df.rename(mapper={0: 1}, index={0: 2})
0
2 1
*pandas 1.0.0*
-.. code-block:: ipython
+.. code-block:: python
- In [6]: df.rename({0: 1}, index={0: 2})
+ >>> df.rename({0: 1}, index={0: 2})
Traceback (most recent call last):
...
TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns'
- In [7]: df.rename(mapper={0: 1}, index={0: 2})
+ >>> df.rename(mapper={0: 1}, index={0: 2})
Traceback (most recent call last):
...
TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns'
@@ -408,12 +405,12 @@ Extended verbose info output for :class:`~pandas.DataFrame`
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: df = pd.DataFrame({"int_col": [1, 2, 3],
+ >>> df = pd.DataFrame({"int_col": [1, 2, 3],
... "text_col": ["a", "b", "c"],
... "float_col": [0.0, 0.1, 0.2]})
- In [2]: df.info(verbose=True)
+ >>> df.info(verbose=True)
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
@@ -443,16 +440,14 @@ Extended verbose info output for :class:`~pandas.DataFrame`
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: pd.array(["a", None])
- Out[1]:
+ >>> pd.array(["a", None])
['a', None]
Length: 2, dtype: object
- In [2]: pd.array([1, None])
- Out[2]:
+ >>> pd.array([1, None])
[1, None]
Length: 2, dtype: object
@@ -475,17 +470,15 @@ As a reminder, you can specify the ``dtype`` to disable all inference.
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: a = pd.array([1, 2, None], dtype="Int64")
- In [2]: a
- Out[2]:
+ >>> a = pd.array([1, 2, None], dtype="Int64")
+ >>> a
[1, 2, NaN]
Length: 3, dtype: Int64
- In [3]: a[2]
- Out[3]:
+ >>> a[2]
nan
*pandas 1.0.0*
@@ -506,10 +499,9 @@ will now raise.
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: np.asarray(a, dtype="float")
- Out[1]:
+ >>> np.asarray(a, dtype="float")
array([ 1., 2., nan])
*pandas 1.0.0*
@@ -533,10 +525,9 @@ will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: pd.Series(a).sum(skipna=False)
- Out[1]:
+ >>> pd.Series(a).sum(skipna=False)
nan
*pandas 1.0.0*
@@ -552,10 +543,9 @@ integer dtype for the values.
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
- Out[1]:
+ >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
dtype('int64')
*pandas 1.0.0*
@@ -575,17 +565,15 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]: a = pd.array([1, 2, None], dtype="Int64")
- In [2]: a
- Out[2]:
+ >>> a = pd.array([1, 2, None], dtype="Int64")
+ >>> a
[1, 2, NaN]
Length: 3, dtype: Int64
- In [3]: a > 1
- Out[3]:
+ >>> a > 1
array([False, True, False])
*pandas 1.0.0*
@@ -652,10 +640,9 @@ scalar values in the result are instances of the extension dtype's scalar type.
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1]> df.resample("2D").agg(lambda x: 'a').A.dtype
- Out[1]:
+ >>> df.resample("2D").agg(lambda x: 'a').A.dtype
CategoricalDtype(categories=['a', 'b'], ordered=False)
*pandas 1.0.0*
@@ -670,10 +657,9 @@ depending on how the results are cast back to the original dtype.
*pandas 0.25.x*
-.. code-block:: ipython
+.. code-block:: python
- In [1] df.resample("2D").agg(lambda x: 'c')
- Out[1]:
+ >>> df.resample("2D").agg(lambda x: 'c')
A
0 NaN
@@ -885,10 +871,10 @@ matplotlib directly rather than :meth:`~DataFrame.plot`.
To use pandas formatters with a matplotlib plot, specify
-.. code-block:: ipython
+.. code-block:: python
- In [1]: import pandas as pd
- In [2]: pd.options.plotting.matplotlib.register_converters = True
+ >>> import pandas as pd
+ >>> pd.options.plotting.matplotlib.register_converters = True
Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters
automatically. The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot``
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
index 34e28eab6d4bf..bfe30d52e2aff 100644
--- a/doc/source/whatsnew/v1.2.1.rst
+++ b/doc/source/whatsnew/v1.2.1.rst
@@ -52,23 +52,20 @@ DataFrame / Series combination) would ignore the indices, only match
the inputs by shape, and use the index/columns of the first DataFrame for
the result:
-.. code-block:: ipython
+.. code-block:: python
- In [1]: df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1])
- In [2]: df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2])
- In [3]: df1
- Out[3]:
+ >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1])
+ ... df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2])
+ >>> df1
a b
0 1 3
1 2 4
- In [4]: df2
- Out[4]:
+ >>> df2
a b
1 1 3
2 2 4
- In [5]: np.add(df1, df2)
- Out[5]:
+ >>> np.add(df1, df2)
a b
0 2 6
1 4 8
@@ -76,10 +73,9 @@ the result:
This contrasts with how other pandas operations work, which first align
the inputs:
-.. code-block:: ipython
+.. code-block:: python
- In [6]: df1 + df2
- Out[6]:
+ >>> df1 + df2
a b
0 NaN NaN
1 3.0 7.0
@@ -98,10 +94,9 @@ objects (eg ``np.add(s1, s2)``) already aligns and continues to do so.
To avoid the warning and keep the current behaviour of ignoring the indices,
convert one of the arguments to a NumPy array:
-.. code-block:: ipython
+.. code-block:: python
- In [7]: np.add(df1, np.asarray(df2))
- Out[7]:
+ >>> np.add(df1, np.asarray(df2))
a b
0 2 6
1 4 8
@@ -109,11 +104,10 @@ convert one of the arguments to a NumPy array:
To obtain the future behaviour and silence the warning, you can align manually
before passing the arguments to the ufunc:
-.. code-block:: ipython
+.. code-block:: python
- In [8]: df1, df2 = df1.align(df2)
- In [9]: np.add(df1, df2)
- Out[9]:
+ >>> df1, df2 = df1.align(df2)
+ >>> np.add(df1, df2)
a b
0 NaN NaN
1 3.0 7.0
diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst
index d3ceb2b919b5d..d0af23b48b1f7 100644
--- a/doc/source/whatsnew/v1.2.5.rst
+++ b/doc/source/whatsnew/v1.2.5.rst
@@ -1,7 +1,7 @@
.. _whatsnew_125:
-What's new in 1.2.5 (June 22, 2021)
------------------------------------
+What's new in 1.2.5 (May ??, 2021)
+----------------------------------
These are the changes in pandas 1.2.5. See :ref:`release` for a full changelog
including other versions of pandas.
@@ -14,12 +14,32 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
-- Fixed regression in :func:`concat` between two :class:`DataFrame` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
+- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`)
-- Fixed regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`)
-- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`)
-- Fixed regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
-- Fixed regression in :meth:`DataFrame.astype` with ``dtype=str`` failing to convert ``NaN`` in categorical columns (:issue:`41797`)
+- Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`)
+- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`)
+- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
+
+.. ---------------------------------------------------------------------------
+
+
+.. _whatsnew_125.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_125.other:
+
+Other
+~~~~~
+
+-
+-
.. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index ed66861efad93..e2b923812a211 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -1,7 +1,7 @@
.. _whatsnew_130:
-What's new in 1.3.0 (July 2, 2021)
-----------------------------------
+What's new in 1.3.0 (??)
+------------------------
These are the changes in pandas 1.3.0. See :ref:`release` for a full changelog
including other versions of pandas.
@@ -124,7 +124,7 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404
- The methods :meth:`.Styler.highlight_null`, :meth:`.Styler.highlight_min`, and :meth:`.Styler.highlight_max` now allow custom CSS highlighting instead of the default background coloring (:issue:`40242`)
- :meth:`.Styler.apply` now accepts functions that return an ``ndarray`` when ``axis=None``, making it now consistent with the ``axis=0`` and ``axis=1`` behavior (:issue:`39359`)
- When incorrectly formatted CSS is given via :meth:`.Styler.apply` or :meth:`.Styler.applymap`, an error is now raised upon rendering (:issue:`39660`)
- - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTeX escaping (:issue:`40388`, :issue:`41619`)
+ - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTex escaping (:issue:`40388`, :issue:`41619`)
- :meth:`.Styler.background_gradient` has gained the argument ``gmap`` to supply a specific gradient map for shading (:issue:`22727`)
- :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`)
- Added the method :meth:`.Styler.highlight_between` (:issue:`39821`)
@@ -136,9 +136,8 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404
- Many features of the :class:`.Styler` class are now either partially or fully usable on a DataFrame with a non-unique indexes or columns (:issue:`41143`)
- One has greater control of the display through separate sparsification of the index or columns using the :ref:`new styler options `, which are also usable via :func:`option_context` (:issue:`41142`)
- Added the option ``styler.render.max_elements`` to avoid browser overload when styling large DataFrames (:issue:`40712`)
- - Added the method :meth:`.Styler.to_latex` (:issue:`21673`, :issue:`42320`), which also allows some limited CSS conversion (:issue:`40731`)
+ - Added the method :meth:`.Styler.to_latex` (:issue:`21673`)
- Added the method :meth:`.Styler.to_html` (:issue:`13379`)
- - Added the method :meth:`.Styler.set_sticky` to make index and column headers permanently visible in scrolling HTML frames (:issue:`29072`)
.. _whatsnew_130.enhancements.dataframe_honors_copy_with_dict:
@@ -240,19 +239,17 @@ For example:
Other enhancements
^^^^^^^^^^^^^^^^^^
-- :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, and :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`15095`, :issue:`38995`)
-- :class:`.ExponentialMovingWindow` now support a ``online`` method that can perform ``mean`` calculations in an online fashion. See :ref:`Window Overview ` (:issue:`41673`)
+- :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, and :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire DataFrame. See :ref:`Window Overview ` for performance and functional benefits (:issue:`15095`, :issue:`38995`)
- Added :meth:`MultiIndex.dtypes` (:issue:`37062`)
- Added ``end`` and ``end_day`` options for the ``origin`` argument in :meth:`DataFrame.resample` (:issue:`37804`)
- Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`)
- Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods ` (:issue:`15969`)
- :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
-- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when ``usecols`` is not specified (:issue:`21768`)
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
- Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
- :func:`read_excel` can now auto-detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`)
-- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behavior of append mode when writing to existing sheets (:issue:`40230`)
+- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`)
- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`)
- :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
- :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
@@ -269,16 +266,12 @@ Other enhancements
- :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
-- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising with ``object`` data containing ``pd.NA`` even when ``skipna=True`` (:issue:`37501`)
- :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
-- :meth:`Series.between` can now accept ``left`` or ``right`` as arguments to ``inclusive`` to include only the left or right boundary (:issue:`40245`)
-- :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`)
-- :meth:`DataFrame.sample` now accepts the ``ignore_index`` argument to reset the index after sampling, similar to :meth:`DataFrame.drop_duplicates` and :meth:`DataFrame.sort_values` (:issue:`38581`)
.. ---------------------------------------------------------------------------
@@ -307,7 +300,7 @@ As an example of this, given:
original = pd.Series(cat)
unique = original.unique()
-*Previous behavior*:
+*pandas < 1.3.0*:
.. code-block:: ipython
@@ -317,7 +310,7 @@ As an example of this, given:
In [2]: original.dtype == unique.dtype
False
-*New behavior*:
+*pandas >= 1.3.0*
.. ipython:: python
@@ -339,7 +332,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first`
df2
combined = df1.combine_first(df2)
-*Previous behavior*:
+*pandas 1.2.x*
.. code-block:: ipython
@@ -350,7 +343,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first`
C float64
dtype: object
-*New behavior*:
+*pandas 1.3.0*
.. ipython:: python
@@ -373,7 +366,7 @@ values as measured by ``np.allclose``. Now no such casting occurs.
df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]})
df
-*Previous behavior*:
+*pandas 1.2.x*
.. code-block:: ipython
@@ -383,7 +376,7 @@ values as measured by ``np.allclose``. Now no such casting occurs.
key
1 True 2
-*New behavior*:
+*pandas 1.3.0*
.. ipython:: python
@@ -401,7 +394,7 @@ Now, these methods will always return a float dtype. (:issue:`41137`)
df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]})
-*Previous behavior*:
+*pandas 1.2.x*
.. code-block:: ipython
@@ -410,7 +403,7 @@ Now, these methods will always return a float dtype. (:issue:`41137`)
a b c
0 True 1 1.0
-*New behavior*:
+*pandas 1.3.0*
.. ipython:: python
@@ -434,7 +427,7 @@ insert the values into the existing data rather than create an entirely new arra
In both the new and old behavior, the data in ``values`` is overwritten, but in
the old behavior the dtype of ``df["A"]`` changed to ``int64``.
-*Previous behavior*:
+*pandas 1.2.x*
.. code-block:: ipython
@@ -449,7 +442,7 @@ the old behavior the dtype of ``df["A"]`` changed to ``int64``.
In pandas 1.3.0, ``df`` continues to share data with ``values``
-*New behavior*:
+*pandas 1.3.0*
.. ipython:: python
@@ -476,7 +469,7 @@ never casting to the dtypes of the existing arrays.
In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing
array backing ``df``:
-*Previous behavior*:
+*pandas 1.2.x*
.. code-block:: ipython
@@ -486,7 +479,7 @@ array backing ``df``:
In the new behavior, we get a new array, and retain an integer-dtyped ``5``:
-*New behavior*:
+*pandas 1.3.0*
.. ipython:: python
@@ -509,7 +502,7 @@ casts to ``dtype=object`` (:issue:`38709`)
ser2 = orig.copy()
ser2.iloc[1] = 2.0
-*Previous behavior*:
+*pandas 1.2.x*
.. code-block:: ipython
@@ -525,7 +518,7 @@ casts to ``dtype=object`` (:issue:`38709`)
1 2.0
dtype: object
-*New behavior*:
+*pandas 1.3.0*
.. ipython:: python
@@ -643,7 +636,7 @@ If installed, we now require:
+-----------------+-----------------+----------+---------+
| pytest (dev) | 6.0 | | X |
+-----------------+-----------------+----------+---------+
-| mypy (dev) | 0.812 | | X |
+| mypy (dev) | 0.800 | | X |
+-----------------+-----------------+----------+---------+
| setuptools | 38.6.0 | | X |
+-----------------+-----------------+----------+---------+
@@ -707,8 +700,6 @@ Other API changes
- Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as `turbodbc `_ (:issue:`36893`)
- Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`)
- :meth:`ExtensionDtype.construct_array_type` is now a required method instead of an optional one for :class:`ExtensionDtype` subclasses (:issue:`24860`)
-- Calling ``hash`` on non-hashable pandas objects will now raise ``TypeError`` with the built-in error message (e.g. ``unhashable type: 'Series'``). Previously it would raise a custom message such as ``'Series' objects are mutable, thus they cannot be hashed``. Furthermore, ``isinstance(, abc.collections.Hashable)`` will now return ``False`` (:issue:`40013`)
-- :meth:`.Styler.from_custom_template` now has two new arguments for template names, and removed the old ``name``, due to template inheritance having been introducing for better parsing (:issue:`42053`). Subclassing modifications to Styler attributes are also needed.
.. _whatsnew_130.api_breaking.build:
@@ -722,6 +713,64 @@ Build
Deprecations
~~~~~~~~~~~~
+- Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`)
+- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`)
+- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`)
+- Deprecated the :meth:`astype` method of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to convert to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`)
+- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`)
+- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`)
+- Deprecated comparison of :class:`Timestamp` objects with ``datetime.date`` objects. Instead of e.g. ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`)
+- Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`)
+- Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`)
+- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`)
+- Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`)
+- Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`)
+- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
+- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`)
+- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
+- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`)
+- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
+- Deprecated using :func:`merge`, :meth:`DataFrame.merge`, and :meth:`DataFrame.join` on a different number of levels (:issue:`34862`)
+- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
+- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
+- Deprecated the ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` and will be removed in a future version (:issue:`37643`)
+- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`)
+- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`)
+- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`)
+- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`)
+- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`)
+- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`)
+- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`)
+- Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. ``freq.is_month_start(ts)`` (:issue:`15146`)
+- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
+- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`)
+- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`)
+- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
+- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
+- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`)
+- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`):
+ - :func:`concat` (other than ``objs``)
+ - :func:`read_csv` (other than ``filepath_or_buffer``)
+ - :func:`read_table` (other than ``filepath_or_buffer``)
+ - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``)
+ - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates`
+ - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop`
+ - :meth:`DataFrame.dropna` and :meth:`Series.dropna`
+ - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill`
+ - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``)
+ - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``)
+ - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``)
+ - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index`
+ - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``)
+ - :meth:`DataFrame.set_index` (other than ``keys``)
+ - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index`
+ - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values`
+ - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``)
+ - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``)
+ - :meth:`MultiIndex.codes` (except for ``codes``)
+ - :meth:`MultiIndex.set_levels` (except for ``levels``)
+ - :meth:`Resampler.interpolate` (other than ``method``)
+
.. _whatsnew_130.deprecations.nuisance_columns:
@@ -790,8 +839,6 @@ For example:
1 2
2 12
-*Future behavior*:
-
.. code-block:: ipython
In [5]: gb.prod(numeric_only=False)
@@ -804,72 +851,6 @@ For example:
1 2
2 12
-.. _whatsnew_130.deprecations.other:
-
-Other Deprecations
-^^^^^^^^^^^^^^^^^^
-- Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`)
-- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`)
-- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`)
-- Deprecated the :meth:`astype` method of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to convert to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`)
-- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`)
-- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`)
-- Deprecated comparison of :class:`Timestamp` objects with ``datetime.date`` objects. Instead of e.g. ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`)
-- Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`)
-- Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`)
-- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`)
-- Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`)
-- Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`)
-- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
-- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favor of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`)
-- Deprecated :meth:`.Styler.where` in favor of using an alternative formulation with :meth:`Styler.applymap` (:issue:`40821`)
-- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
-- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`)
-- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
-- Deprecated using :func:`merge`, :meth:`DataFrame.merge`, and :meth:`DataFrame.join` on a different number of levels (:issue:`34862`)
-- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
-- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
-- Deprecated the ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` and will be removed in a future version (:issue:`37643`)
-- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`)
-- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`)
-- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`)
-- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`)
-- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`)
-- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`)
-- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`)
-- Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. ``freq.is_month_start(ts)`` (:issue:`15146`)
-- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
-- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`)
-- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`)
-- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
-- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
-- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`)
-- Deprecated boolean arguments of ``inclusive`` in :meth:`Series.between` to have ``{"left", "right", "neither", "both"}`` as standard argument values (:issue:`40628`)
-- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`):
-
- - :func:`concat` (other than ``objs``)
- - :func:`read_csv` (other than ``filepath_or_buffer``)
- - :func:`read_table` (other than ``filepath_or_buffer``)
- - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``)
- - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates`
- - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop`
- - :meth:`DataFrame.dropna` and :meth:`Series.dropna`
- - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill`
- - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``)
- - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``)
- - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``)
- - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index`
- - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``)
- - :meth:`DataFrame.set_index` (other than ``keys``)
- - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index`
- - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values`
- - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``)
- - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``)
- - :meth:`MultiIndex.codes` (except for ``codes``)
- - :meth:`MultiIndex.set_levels` (except for ``levels``)
- - :meth:`Resampler.interpolate` (other than ``method``)
-
-
.. ---------------------------------------------------------------------------
@@ -891,7 +872,7 @@ Performance improvements
- Performance improvement in :class:`.Styler` where render times are more than 50% reduced and now matches :meth:`DataFrame.to_html` (:issue:`39972` :issue:`39952`, :issue:`40425`)
- The method :meth:`.Styler.set_td_classes` is now as performant as :meth:`.Styler.apply` and :meth:`.Styler.applymap`, and even more so in some cases (:issue:`40453`)
- Performance improvement in :meth:`.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
-- Performance improvement in :meth:`.GroupBy.apply` when requiring the Python fallback implementation (:issue:`40176`)
+- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
- Performance improvement in the conversion of a PyArrow Boolean array to a pandas nullable Boolean array (:issue:`41051`)
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
- Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`)
@@ -923,7 +904,6 @@ Datetimelike
- Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise a ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`)
- Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`)
- Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
-- Bug in :meth:`DatetimeIndex.intersection` giving incorrect results with non-Tick frequencies with ``n != 1`` (:issue:`42104`)
- Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
- Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`)
- Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`)
@@ -931,7 +911,6 @@ Datetimelike
- Bug in :meth:`Timedelta.round`, :meth:`Timedelta.floor`, :meth:`Timedelta.ceil` for values near the implementation bounds of :class:`Timedelta` (:issue:`38964`)
- Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`)
- Bug in :func:`infer_freq` incorrectly fails to infer 'H' frequency of :class:`DatetimeIndex` if the latter has a timezone and crosses DST boundaries (:issue:`39556`)
-- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
Timedelta
^^^^^^^^^
@@ -961,9 +940,6 @@ Numeric
- Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`)
- Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`)
- Bug in :meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`)
-- Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`)
-- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`)
-- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)
Conversion
^^^^^^^^^^
@@ -979,12 +955,6 @@ Conversion
- Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`)
- Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`)
- Bug in :class:`DataFrame` construction when given a two-dimensional object-dtype ``np.ndarray`` of :class:`Period` or :class:`Interval` objects failing to cast to :class:`PeriodDtype` or :class:`IntervalDtype`, respectively (:issue:`41812`)
-- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
-- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`)
-- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`)
-- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`)
-- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`)
-- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`)
Strings
^^^^^^^
@@ -1005,7 +975,6 @@ Indexing
^^^^^^^^
- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`)
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
-- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`41934`)
- Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`)
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
- Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`)
@@ -1037,17 +1006,12 @@ Indexing
- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contained duplicates (:issue:`40096`)
- Bug in :meth:`DataFrame.loc.__getitem__` with :class:`MultiIndex` casting to float when at least one index column has float dtype and we retrieve a scalar (:issue:`41369`)
- Bug in :meth:`DataFrame.loc` incorrectly matching non-Boolean index elements (:issue:`20432`)
-- Bug in indexing with ``np.nan`` on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` incorrectly raising ``KeyError`` when ``np.nan`` keys are present (:issue:`41933`)
- Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`)
-- Bug in :meth:`DataFrame.at` with a :class:`CategoricalIndex` returning incorrect results when passed integer keys (:issue:`41846`)
- Bug in :meth:`DataFrame.loc` returning a :class:`MultiIndex` in the wrong order if an indexer has duplicates (:issue:`40978`)
- Bug in :meth:`DataFrame.__setitem__` raising a ``TypeError`` when using a ``str`` subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`)
- Bug in :meth:`PeriodIndex.get_loc` failing to raise a ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`)
- Bug ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`)
- Bug in :meth:`Index.get_indexer` failing to raise ``ValueError`` in some cases with invalid ``method``, ``limit``, or ``tolerance`` arguments (:issue:`41918`)
-- Bug when slicing a :class:`Series` or :class:`DataFrame` with a :class:`TimedeltaIndex` when passing an invalid string raising ``ValueError`` instead of a ``TypeError`` (:issue:`41821`)
-- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
-- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
Missing
^^^^^^^
@@ -1056,7 +1020,6 @@ Missing
- Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`)
- Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`)
-- Bug in :meth:`Series.isin` and :meth:`MultiIndex.isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`)
MultiIndex
^^^^^^^^^^
@@ -1064,7 +1027,6 @@ MultiIndex
- Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in the result (:issue:`38623`)
- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` contained ``NaN`` even when they are differently ordered (:issue:`38439`)
- Bug in :meth:`MultiIndex.intersection` always returning an empty result when intersecting with :class:`CategoricalIndex` (:issue:`38653`)
-- Bug in :meth:`MultiIndex.difference` incorrectly raising ``TypeError`` when indexes contain non-sortable entries (:issue:`41915`)
- Bug in :meth:`MultiIndex.reindex` raising a ``ValueError`` when used on an empty :class:`MultiIndex` and indexing only a specific level (:issue:`41170`)
- Bug in :meth:`MultiIndex.reindex` raising ``TypeError`` when reindexing against a flat :class:`Index` (:issue:`41707`)
@@ -1104,7 +1066,6 @@ I/O
- Bug in the conversion from PyArrow to pandas (e.g. for reading Parquet) with nullable dtypes and a PyArrow array whose data buffer size is not a multiple of the dtype size (:issue:`40896`)
- Bug in :func:`read_excel` would raise an error when pandas could not determine the file type even though the user specified the ``engine`` argument (:issue:`41225`)
- Bug in :func:`read_clipboard` copying from an Excel file shifting values into the wrong column if there are null values in the first column (:issue:`41108`)
-- Bug in :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` raising a ``TypeError`` when trying to append a string column to an incompatible column (:issue:`41897`)
Period
^^^^^^
@@ -1164,8 +1125,6 @@ Groupby/resample/rolling
- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`)
- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
-- Bug in :class:`DataFrameGroupBy` methods ``agg``, ``transform``, ``sum``, ``bfill``, ``ffill``, ``pad``, ``pct_change``, ``shift``, ``ohlc`` dropping ``.columns.names`` (:issue:`41497`)
-
Reshaping
^^^^^^^^^
@@ -1188,8 +1147,6 @@ Reshaping
- Bug in :func:`to_datetime` raising an error when the input sequence contained unhashable items (:issue:`39756`)
- Bug in :meth:`Series.explode` preserving the index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
- Bug in :func:`to_datetime` raising a ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`)
-- Bug in :meth:`Series.unstack` and :meth:`DataFrame.unstack` with object-dtype values containing timezone-aware datetime objects incorrectly raising ``TypeError`` (:issue:`41875`)
-- Bug in :meth:`DataFrame.melt` raising ``InvalidIndexError`` when :class:`DataFrame` has duplicate columns used as ``value_vars`` (:issue:`41951`)
Sparse
^^^^^^
@@ -1217,14 +1174,24 @@ Styler
Other
^^^^^
+- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
+- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`)
+- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`)
+- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
- Bug in :meth:`Series.where` with numeric dtype and ``other=None`` not casting to ``nan`` (:issue:`39761`)
+- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
- Bug in :func:`.assert_series_equal`, :func:`.assert_frame_equal`, :func:`.assert_index_equal` and :func:`.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
- Bug in :func:`.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`)
- Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, and :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`)
- Bug in :func:`show_versions` where console JSON output was not proper JSON (:issue:`39701`)
- pandas can now compile on z/OS when using `xlc `_ (:issue:`35826`)
-- Bug in :func:`pandas.util.hash_pandas_object` not recognizing ``hash_key``, ``encoding`` and ``categorize`` when the input object type is a :class:`DataFrame` (:issue:`41404`)
+- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raising a ``ValueError`` when called on an empty DataFrame (:issue:`40393`)
+- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation functions fail to produce results (:issue:`33634`)
+- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)
+- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
+- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`)
+- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`)
.. ---------------------------------------------------------------------------
@@ -1233,4 +1200,4 @@ Other
Contributors
~~~~~~~~~~~~
-.. contributors:: v1.2.5..v1.3.0
+.. contributors:: v1.2.4..v1.3.0|HEAD
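The ``Index.where``/``Index.putmask`` entry retained above describes an equivalence that a minimal sketch can illustrate; the index, mask, and fill value below are illustrative only, not taken from the changelog::

    import numpy as np
    import pandas as pd

    idx = pd.Index([1, 2, 3, 4])
    mask = np.array([True, False, True, False])

    # Keep values where ``mask`` is True, otherwise use -1; per the entry above,
    # the where() and putmask(~mask) spellings are expected to agree.
    assert idx.where(mask, -1).equals(idx.putmask(~mask, -1))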
diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst
deleted file mode 100644
index 2ce146660f98c..0000000000000
--- a/doc/source/whatsnew/v1.3.1.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-.. _whatsnew_131:
-
-What's new in 1.3.1 (July ??, 2021)
------------------------------------
-
-These are the changes in pandas 1.3.1. See :ref:`release` for a full changelog
-including other versions of pandas.
-
-{{ header }}
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_131.regressions:
-
-Fixed regressions
-~~~~~~~~~~~~~~~~~
-- Pandas could not be built on PyPy (:issue:`42355`)
-- :class:`DataFrame` constructed with an older version of pandas could not be unpickled (:issue:`42345`)
-- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42248`)
-- Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`)
-- Fixed regression in :meth:`DataFrame.astype` changing the order of noncontiguous data (:issue:`42396`)
-- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`)
-- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when the ``orient`` argument is one of "records", "dict", or "split" (:issue:`42352`)
-- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`)
-- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
-- Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`)
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_131.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-- Fixed bug in :meth:`DataFrame.transpose` dropping values when the DataFrame had an Extension Array dtype and a duplicate index (:issue:`42380`)
-- Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`)
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_131.other:
-
-Other
-~~~~~
--
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_131.contributors:
-
-Contributors
-~~~~~~~~~~~~
-
-.. contributors:: v1.3.0..v1.3.1|HEAD
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
deleted file mode 100644
index 68f1c78688b1d..0000000000000
--- a/doc/source/whatsnew/v1.4.0.rst
+++ /dev/null
@@ -1,295 +0,0 @@
-.. _whatsnew_140:
-
-What's new in 1.4.0 (??)
-------------------------
-
-These are the changes in pandas 1.4.0. See :ref:`release` for a full changelog
-including other versions of pandas.
-
-{{ header }}
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.enhancements:
-
-Enhancements
-~~~~~~~~~~~~
-
-.. _whatsnew_140.enhancements.enhancement1:
-
-enhancement1
-^^^^^^^^^^^^
-
-.. _whatsnew_140.enhancements.enhancement2:
-
-enhancement2
-^^^^^^^^^^^^
-
-.. _whatsnew_140.enhancements.other:
-
-Other enhancements
-^^^^^^^^^^^^^^^^^^
-- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
-- :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
-- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
-- :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`)
-- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`42273`)
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.notable_bug_fixes:
-
-Notable bug fixes
-~~~~~~~~~~~~~~~~~
-
-These are bug fixes that might have notable behavior changes.
-
-.. _whatsnew_140.notable_bug_fixes.notable_bug_fix1:
-
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
-
-.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2:
-
-notable_bug_fix2
-^^^^^^^^^^^^^^^^
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.api_breaking:
-
-Backwards incompatible API changes
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. _whatsnew_140.api_breaking.deps:
-
-Increased minimum versions for dependencies
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Some minimum supported versions of dependencies were updated.
-If installed, we now require:
-
-+-----------------+-----------------+----------+---------+
-| Package | Minimum Version | Required | Changed |
-+=================+=================+==========+=========+
-| numpy | 1.18.5 | X | X |
-+-----------------+-----------------+----------+---------+
-| pytz | 2020.1 | X | X |
-+-----------------+-----------------+----------+---------+
-| python-dateutil | 2.8.1 | X | X |
-+-----------------+-----------------+----------+---------+
-| bottleneck | 1.3.1 | | X |
-+-----------------+-----------------+----------+---------+
-| numexpr | 2.7.1 | | X |
-+-----------------+-----------------+----------+---------+
-| pytest (dev) | 6.0 | | |
-+-----------------+-----------------+----------+---------+
-| mypy (dev) | 0.910 | | X |
-+-----------------+-----------------+----------+---------+
-
-For `optional libraries `_ the general recommendation is to use the latest version.
-The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
-Optional libraries below the lowest tested version may still work, but are not considered supported.
-
-+-----------------+-----------------+---------+
-| Package | Minimum Version | Changed |
-+=================+=================+=========+
-| beautifulsoup4 | 4.8.2 | X |
-+-----------------+-----------------+---------+
-| fastparquet | 0.4.0 | |
-+-----------------+-----------------+---------+
-| fsspec | 0.7.4 | |
-+-----------------+-----------------+---------+
-| gcsfs | 0.6.0 | |
-+-----------------+-----------------+---------+
-| lxml | 4.5.0 | X |
-+-----------------+-----------------+---------+
-| matplotlib | 3.3.2 | X |
-+-----------------+-----------------+---------+
-| numba | 0.50.1 | X |
-+-----------------+-----------------+---------+
-| openpyxl | 3.0.2 | X |
-+-----------------+-----------------+---------+
-| pyarrow | 0.17.0 | |
-+-----------------+-----------------+---------+
-| pymysql | 0.10.1 | X |
-+-----------------+-----------------+---------+
-| pytables | 3.6.1 | X |
-+-----------------+-----------------+---------+
-| s3fs | 0.4.0 | |
-+-----------------+-----------------+---------+
-| scipy | 1.4.1 | X |
-+-----------------+-----------------+---------+
-| sqlalchemy | 1.3.11 | X |
-+-----------------+-----------------+---------+
-| tabulate | 0.8.7 | |
-+-----------------+-----------------+---------+
-| xarray | 0.15.1 | X |
-+-----------------+-----------------+---------+
-| xlrd | 2.0.1 | X |
-+-----------------+-----------------+---------+
-| xlsxwriter | 1.2.2 | X |
-+-----------------+-----------------+---------+
-| xlwt | 1.3.0 | |
-+-----------------+-----------------+---------+
-| pandas-gbq | 0.14.0 | X |
-+-----------------+-----------------+---------+
-
-See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
-
-.. _whatsnew_140.api_breaking.other:
-
-Other API changes
-^^^^^^^^^^^^^^^^^
-- :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`)
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.deprecations:
-
-Deprecations
-~~~~~~~~~~~~
-- Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`)
-- Deprecated ``method`` argument in :meth:`Index.get_loc`, use ``index.get_indexer([label], method=...)`` instead (:issue:`42269`)
-- Deprecated treating integer keys in :meth:`Series.__setitem__` as positional when the index is a :class:`Float64Index` not containing the key, an :class:`IntervalIndex` with no entries containing the key, or a :class:`MultiIndex` with a leading :class:`Float64Index` level not containing the key (:issue:`33469`)
-- Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`)
-- Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`)
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.performance:
-
-Performance improvements
-~~~~~~~~~~~~~~~~~~~~~~~~
-- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
-- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-
-Categorical
-^^^^^^^^^^^
-- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
-- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`)
-- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
-- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
-- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
--
-
-Datetimelike
-^^^^^^^^^^^^
-- Bug in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`)
-- Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`)
--
-
-Timedelta
-^^^^^^^^^
--
--
-
-Timezones
-^^^^^^^^^
--
--
-
-Numeric
-^^^^^^^
-- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
-- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`)
--
-
-Conversion
-^^^^^^^^^^
-- Bug in :class:`UInt64Index` constructor when passing a list containing both positive integers small enough to cast to int64 and integers too large to hold in int64 (:issue:`42201`)
--
-
-Strings
-^^^^^^^
--
--
-
-Interval
-^^^^^^^^
--
--
-
-Indexing
-^^^^^^^^
-- Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` when the object's Index has a length greater than one but only one unique value (:issue:`42365`)
-- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
-- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
-- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
--
-
-Missing
-^^^^^^^
--
--
-
-MultiIndex
-^^^^^^^^^^
-- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
-- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
--
-
-I/O
-^^^
-- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
-- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
-- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
--
-
-Period
-^^^^^^
--
--
-
-Plotting
-^^^^^^^^
--
--
-
-Groupby/resample/rolling
-^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
--
-
-Reshaping
-^^^^^^^^^
--
--
-
-Sparse
-^^^^^^
--
--
-
-ExtensionArray
-^^^^^^^^^^^^^^
--
--
-
-Styler
-^^^^^^
--
--
-
-Other
-^^^^^
-
-.. ***DO NOT USE THIS SECTION***
-
--
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_140.contributors:
-
-Contributors
-~~~~~~~~~~~~
diff --git a/environment.yml b/environment.yml
index 9396210da3635..788b88ef16ad6 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,9 +3,9 @@ channels:
- conda-forge
dependencies:
# required
- - numpy>=1.18.5
+ - numpy>=1.17.3
- python=3.8
- - python-dateutil>=2.8.1
+ - python-dateutil>=2.7.3
- pytz
# benchmarks
@@ -24,7 +24,7 @@ dependencies:
- flake8-bugbear=21.3.2 # used by flake8, find likely bugs
- flake8-comprehensions=3.1.0 # used by flake8, linting of unnecessary comprehensions
- isort>=5.2.1 # check that imports are in the right order
- - mypy=0.910
+ - mypy=0.812
- pre-commit>=2.9.2
- pycodestyle # used by flake8
- pyupgrade
@@ -55,12 +55,12 @@ dependencies:
# testing
- boto3
- botocore>=1.11
- - hypothesis>=5.5.3
+ - hypothesis>=3.82
- moto # mock S3
- flask
- - pytest>=6.0
+ - pytest>=5.0.1
- pytest-cov
- - pytest-xdist>=1.31
+ - pytest-xdist>=1.21
- pytest-asyncio
- pytest-instafail
@@ -71,24 +71,24 @@ dependencies:
# unused (required indirectly may be?)
- ipywidgets
- nbformat
- - notebook>=6.0.3
+ - notebook>=5.7.5
- pip
# optional
- blosc
- - bottleneck>=1.3.1
+ - bottleneck>=1.2.1
- ipykernel
- ipython>=7.11.1
- jinja2 # pandas.Styler
- - matplotlib>=3.3.2 # pandas.plotting, Series.plot, DataFrame.plot
- - numexpr>=2.7.1
- - scipy>=1.4.1
- - numba>=0.50.1
+ - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot
+ - numexpr>=2.7.0
+ - scipy>=1.2
+ - numba>=0.46.0
# optional for io
# ---------------
# pd.read_html
- - beautifulsoup4>=4.8.2
+ - beautifulsoup4>=4.6.0
- html5lib
- lxml
@@ -99,13 +99,14 @@ dependencies:
- xlwt
- odfpy
- - fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet
+ - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
- python-snappy # required by pyarrow
- - pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf
+ - pyqt>=5.9.2 # pandas.read_clipboard
+ - pytables>=3.5.1 # pandas.read_hdf, DataFrame.to_hdf
- s3fs>=0.4.0 # file IO when using 's3://...' path
- - fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
+ - fsspec>=0.7.4 # for generic remote file operations
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
- xarray # DataFrame.to_xarray
@@ -117,7 +118,3 @@ dependencies:
- git+https://github.com/pydata/pydata-sphinx-theme.git@master
- numpydoc < 1.2 # 2021-02-09 1.2dev breaking CI
- pandas-dev-flaker==0.2.0
- - types-python-dateutil
- - types-PyMySQL
- - types-pytz
- - types-setuptools
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 43f05617584cc..db4043686bcbb 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -19,7 +19,10 @@
del hard_dependencies, dependency, missing_dependencies
# numpy compat
-from pandas.compat import is_numpy_dev as _is_numpy_dev
+from pandas.compat import (
+ np_version_under1p18 as _np_version_under1p18,
+ is_numpy_dev as _is_numpy_dev,
+)
try:
from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index ed48ff7ae08c6..be3498dc0829b 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -157,7 +157,7 @@ def _describe_option(pat: str = "", _print_desc: bool = True):
if len(keys) == 0:
raise OptionError("No such keys(s)")
- s = "\n".join([_build_option_description(k) for k in keys])
+ s = "\n".join(_build_option_description(k) for k in keys)
if _print_desc:
print(s)
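The hunk above only swaps a list comprehension for a generator expression inside ``str.join``; a tiny standalone sketch of the two equivalent forms (the sample keys are made up)::

    keys = ["display.width", "display.max_rows"]

    # Both spellings build the same newline-joined string.
    assert "\n".join([k.upper() for k in keys]) == "\n".join(k.upper() for k in keys)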
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
index 9da5534c51321..d0f664c323a89 100644
--- a/pandas/_libs/algos.pyi
+++ b/pandas/_libs/algos.pyi
@@ -123,7 +123,7 @@ def is_monotonic(
def rank_1d(
values: np.ndarray, # ndarray[rank_t, ndim=1]
- labels: np.ndarray | None = ..., # const int64_t[:]=None
+ labels: np.ndarray, # const int64_t[:]
is_datetimelike: bool = ...,
ties_method=...,
ascending: bool = ...,
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 172f2bfb49160..03f4ce273de6e 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -389,8 +389,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
int64_t nobs = 0
bint no_nans
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
+ const int64_t[:] labels_n, labels_nobs
N, K = (mat).shape
+ # For compatibility when calling rank_1d
+ labels_n = np.zeros(N, dtype=np.int64)
# Handle the edge case where we know all results will be nan
# to keep conditional logic inside loop simpler
@@ -409,7 +412,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
maskedx = np.empty(N, dtype=np.float64)
maskedy = np.empty(N, dtype=np.float64)
for i in range(K):
- ranked_mat[:, i] = rank_1d(mat[:, i])
+ ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n)
with nogil:
for xi in range(K):
@@ -448,8 +451,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
with gil:
# We need to slice back to nobs because rank_1d will
# require arrays of nobs length
- rankedx = rank_1d(np.asarray(maskedx)[:nobs])
- rankedy = rank_1d(np.asarray(maskedy)[:nobs])
+ labels_nobs = np.zeros(nobs, dtype=np.int64)
+ rankedx = rank_1d(np.array(maskedx)[:nobs],
+ labels=labels_nobs)
+ rankedy = rank_1d(np.array(maskedy)[:nobs],
+ labels=labels_nobs)
for i in range(nobs):
maskedx[i] = rankedx[i]
maskedy[i] = rankedy[i]
@@ -512,6 +518,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
int64_t total_discordant = 0
float64_t kendall_tau
int64_t n_obs
+ const intp_t[:] labels_n
N, K = (mat).shape
@@ -519,9 +526,11 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
mask = np.isfinite(mat)
ranked_mat = np.empty((N, K), dtype=np.float64)
+ # For compatibility when calling rank_1d
+ labels_n = np.zeros(N, dtype=np.intp)
for i in range(K):
- ranked_mat[:, i] = rank_1d(mat[:, i])
+ ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
for xi in range(K):
sorted_idxs = ranked_mat[:, xi].argsort()
@@ -922,37 +931,11 @@ ctypedef fused rank_t:
int64_t
-cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
- """
- Return the value we'll use to represent missing values when sorting depending
- on if we'd like missing values to end up at the top/bottom. (The second parameter
- is unused, but needed for fused type specialization)
- """
- if rank_nans_highest:
- if rank_t is object:
- return Infinity()
- elif rank_t is int64_t:
- return util.INT64_MAX
- elif rank_t is uint64_t:
- return util.UINT64_MAX
- else:
- return np.inf
- else:
- if rank_t is object:
- return NegInfinity()
- elif rank_t is int64_t:
- return NPY_NAT
- elif rank_t is uint64_t:
- return 0
- else:
- return -np.inf
-
-
@cython.wraparound(False)
@cython.boundscheck(False)
def rank_1d(
ndarray[rank_t, ndim=1] values,
- const intp_t[:] labels=None,
+ const intp_t[:] labels,
bint is_datetimelike=False,
ties_method="average",
bint ascending=True,
@@ -965,10 +948,10 @@ def rank_1d(
Parameters
----------
values : array of rank_t values to be ranked
- labels : np.ndarray[np.intp] or None
+ labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`. If not called
- from a groupby operation, will be None.
+ from a groupby operation, will be an array of 0's
is_datetimelike : bool, default False
True if `values` contains datetime-like entries.
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
@@ -997,7 +980,7 @@ def rank_1d(
ndarray[rank_t, ndim=1] masked_vals
rank_t[:] masked_vals_memview
uint8_t[:] mask
- bint keep_na, nans_rank_highest, check_labels, check_mask
+ bint keep_na, check_labels, check_mask
rank_t nan_fill_val
tiebreak = tiebreakers[ties_method]
@@ -1008,15 +991,14 @@ def rank_1d(
keep_na = na_option == 'keep'
N = len(values)
- if labels is not None:
- # TODO Cython 3.0: cast won't be necessary (#2992)
- assert len(labels) == N
+ # TODO Cython 3.0: cast won't be necessary (#2992)
+ assert len(labels) == N
out = np.empty(N)
grp_sizes = np.ones(N, dtype=np.int64)
- # If we don't care about labels, can short-circuit later label
+ # If all 0 labels, can short-circuit later label
# comparisons
- check_labels = labels is not None
+ check_labels = np.any(labels)
# For cases where a mask is not possible, we can avoid mask checks
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
@@ -1044,15 +1026,27 @@ def rank_1d(
# If descending, fill with highest value since descending
# will flip the ordering to still end up with lowest rank.
# Symmetric logic applies to `na_option == 'bottom'`
- nans_rank_highest = ascending ^ (na_option == 'top')
- nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
- if nans_rank_highest:
- order = [masked_vals, mask]
+ if ascending ^ (na_option == 'top'):
+ if rank_t is object:
+ nan_fill_val = Infinity()
+ elif rank_t is int64_t:
+ nan_fill_val = np.iinfo(np.int64).max
+ elif rank_t is uint64_t:
+ nan_fill_val = np.iinfo(np.uint64).max
+ else:
+ nan_fill_val = np.inf
+ order = (masked_vals, mask, labels)
else:
- order = [masked_vals, ~(np.asarray(mask))]
+ if rank_t is object:
+ nan_fill_val = NegInfinity()
+ elif rank_t is int64_t:
+ nan_fill_val = NPY_NAT
+ elif rank_t is uint64_t:
+ nan_fill_val = 0
+ else:
+ nan_fill_val = -np.inf
- if check_labels:
- order.append(labels)
+ order = (masked_vals, ~(np.array(mask, copy=False)), labels)
np.putmask(masked_vals, mask, nan_fill_val)
# putmask doesn't accept a memoryview, so we assign as a separate step
@@ -1071,18 +1065,22 @@ def rank_1d(
rank_sorted_1d(
out,
grp_sizes,
+ labels,
lexsort_indexer,
masked_vals_memview,
mask,
- check_mask=check_mask,
- N=N,
- tiebreak=tiebreak,
- keep_na=keep_na,
- pct=pct,
- labels=labels,
+ tiebreak,
+ check_mask,
+ check_labels,
+ keep_na,
+ N,
)
+ if pct:
+ for i in range(N):
+ if grp_sizes[i] != 0:
+ out[i] = out[i] / grp_sizes[i]
- return np.asarray(out)
+ return np.array(out)
@cython.wraparound(False)
@@ -1090,18 +1088,16 @@ def rank_1d(
cdef void rank_sorted_1d(
float64_t[::1] out,
int64_t[::1] grp_sizes,
+ const intp_t[:] labels,
const intp_t[:] sort_indexer,
# Can make const with cython3 (https://github.com/cython/cython/issues/3222)
rank_t[:] masked_vals,
const uint8_t[:] mask,
+ TiebreakEnumType tiebreak,
bint check_mask,
+ bint check_labels,
+ bint keep_na,
Py_ssize_t N,
- TiebreakEnumType tiebreak=TIEBREAK_AVERAGE,
- bint keep_na=True,
- bint pct=False,
- # https://github.com/cython/cython/issues/1630, only trailing arguments can
- # currently be omitted for cdef functions, which is why we keep this at the end
- const intp_t[:] labels=None,
) nogil:
"""
See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
@@ -1112,36 +1108,33 @@ cdef void rank_sorted_1d(
out : float64_t[::1]
Array to store computed ranks
grp_sizes : int64_t[::1]
- Array to store group counts, only used if pct=True. Should only be None
- if labels is None.
+ Array to store group counts.
+ labels : See rank_1d.__doc__
sort_indexer : intp_t[:]
Array of indices which sorts masked_vals
masked_vals : rank_t[:]
The values input to rank_1d, with missing values replaced by fill values
mask : uint8_t[:]
- Array where entries are True if the value is missing, False otherwise.
- check_mask : bool
+ Array where entries are True if the value is missing, False otherwise
+ tiebreak : TiebreakEnumType
+ See rank_1d.__doc__ for the different modes
+ check_mask : bint
If False, assumes the mask is all False to skip mask indexing
+ check_labels : bint
+ If False, assumes all labels are the same to skip group handling logic
+ keep_na : bint
+ Whether or not to keep nulls
N : Py_ssize_t
The number of elements to rank. Note: it is not always true that
N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
- tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
- See rank_1d.__doc__ for the different modes
- keep_na : bool, default True
- Whether or not to keep nulls
- pct : bool, default False
- Compute percentage rank of data within each group
- labels : See rank_1d.__doc__, default None. None implies all labels are the same.
"""
cdef:
Py_ssize_t i, j, dups=0, sum_ranks=0,
Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
- bint at_end, next_val_diff, group_changed, check_labels
+ bint at_end, next_val_diff, group_changed
int64_t grp_size
- check_labels = labels is not None
-
# Loop over the length of the value array
# each incremental i value can be looked up in the lexsort_indexer
# array that we sorted previously, which gives us the location of
@@ -1349,11 +1342,6 @@ cdef void rank_sorted_1d(
grp_start = i + 1
grp_vals_seen = 1
- if pct:
- for i in range(N):
- if grp_sizes[i] != 0:
- out[i] = out[i] / grp_sizes[i]
-
def rank_2d(
ndarray[rank_t, ndim=2] in_arr,
@@ -1368,28 +1356,26 @@ def rank_2d(
Fast NaN-friendly version of ``scipy.stats.rankdata``.
"""
cdef:
- Py_ssize_t k, n, col
- float64_t[::1, :] out # Column-major so columns are contiguous
- int64_t[::1] grp_sizes
+ Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
+ Py_ssize_t infs
+ ndarray[float64_t, ndim=2] ranks
ndarray[rank_t, ndim=2] values
- rank_t[:, :] masked_vals
- intp_t[:, :] sort_indexer
- uint8_t[:, :] mask
- TiebreakEnumType tiebreak
- bint check_mask, keep_na, nans_rank_highest
- rank_t nan_fill_val
+ ndarray[intp_t, ndim=2] argsort_indexer
+ ndarray[uint8_t, ndim=2] mask
+ rank_t val, nan_value
+ float64_t count, sum_ranks = 0.0
+ int tiebreak = 0
+ int64_t idx
+ bint check_mask, condition, keep_na
tiebreak = tiebreakers[ties_method]
- if tiebreak == TIEBREAK_FIRST:
- if not ascending:
- tiebreak = TIEBREAK_FIRST_DESCENDING
keep_na = na_option == 'keep'
# For cases where a mask is not possible, we can avoid mask checks
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
- if axis == 1:
+ if axis == 0:
values = np.asarray(in_arr).T.copy()
else:
values = np.asarray(in_arr).copy()
@@ -1398,62 +1384,120 @@ def rank_2d(
if values.dtype != np.object_:
values = values.astype('O')
- nans_rank_highest = ascending ^ (na_option == 'top')
if check_mask:
- nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
+ if ascending ^ (na_option == 'top'):
+ if rank_t is object:
+ nan_value = Infinity()
+ elif rank_t is float64_t:
+ nan_value = np.inf
+
+ # int64 and datetimelike
+ else:
+ nan_value = np.iinfo(np.int64).max
+
+ else:
+ if rank_t is object:
+ nan_value = NegInfinity()
+ elif rank_t is float64_t:
+ nan_value = -np.inf
+
+ # int64 and datetimelike
+ else:
+ nan_value = NPY_NAT
if rank_t is object:
- mask = missing.isnaobj2d(values).view(np.uint8)
+ mask = missing.isnaobj2d(values)
elif rank_t is float64_t:
- mask = np.isnan(values).view(np.uint8)
+ mask = np.isnan(values)
# int64 and datetimelike
else:
- mask = (values == NPY_NAT).view(np.uint8)
- np.putmask(values, mask, nan_fill_val)
- else:
- mask = np.zeros_like(values, dtype=np.uint8)
+ mask = values == NPY_NAT
- if nans_rank_highest:
- order = (values, mask)
+ np.putmask(values, mask, nan_value)
else:
- order = (values, ~np.asarray(mask))
+ mask = np.zeros_like(values, dtype=bool)
n, k = (values).shape
- out = np.empty((n, k), dtype='f8', order='F')
- grp_sizes = np.ones(n, dtype=np.int64)
+ ranks = np.empty((n, k), dtype='f8')
- # lexsort is slower, so only use if we need to worry about the mask
- if check_mask:
- sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ argsort_indexer = values.argsort(axis=1, kind='mergesort')
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
else:
- kind = "stable" if ties_method == "first" else None
- sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)
+ argsort_indexer = values.argsort(1)
if not ascending:
- sort_indexer = sort_indexer[::-1, :]
+ argsort_indexer = argsort_indexer[:, ::-1]
- # putmask doesn't accept a memoryview, so we assign in a separate step
- masked_vals = values
- with nogil:
- for col in range(k):
- rank_sorted_1d(
- out[:, col],
- grp_sizes,
- sort_indexer[:, col],
- masked_vals[:, col],
- mask[:, col],
- check_mask=check_mask,
- N=n,
- tiebreak=tiebreak,
- keep_na=keep_na,
- pct=pct,
- )
-
- if axis == 1:
- return np.asarray(out.T)
+ values = _take_2d(values, argsort_indexer)
+
+ for i in range(n):
+ dups = sum_ranks = infs = 0
+
+ total_tie_count = 0
+ count = 0.0
+ for j in range(k):
+ val = values[i, j]
+ idx = argsort_indexer[i, j]
+ if keep_na and check_mask and mask[i, idx]:
+ ranks[i, idx] = NaN
+ infs += 1
+ continue
+
+ count += 1.0
+
+ sum_ranks += (j - infs) + 1
+ dups += 1
+
+ if rank_t is object:
+ condition = (
+ j == k - 1 or
+ are_diff(values[i, j + 1], val) or
+ (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
+ )
+ else:
+ condition = (
+ j == k - 1 or
+ values[i, j + 1] != val or
+ (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
+ )
+
+ if condition:
+ if tiebreak == TIEBREAK_AVERAGE:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsort_indexer[i, z]] = j - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsort_indexer[i, z]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ if rank_t is object:
+ raise ValueError('first not supported for non-numeric data')
+ else:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsort_indexer[i, z]] = z + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsort_indexer[i, z]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ ranks[i, :] /= total_tie_count
+ else:
+ ranks[i, :] /= count
+ if axis == 0:
+ return ranks.T
else:
- return np.asarray(out)
+ return ranks
ctypedef fused diff_t:
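The ``rank_1d``/``rank_2d`` hunks above are internal Cython, but the behavior they back is visible through ``Series.rank``; a rough sketch of the tie-breaking and ``na_option`` handling those hunks touch (the sample values are illustrative)::

    import numpy as np
    import pandas as pd

    s = pd.Series([3.0, np.nan, 1.0, 3.0])

    s.rank(method="average")   # ties share the average rank; NaN stays NaN (na_option="keep")
    s.rank(method="first")     # ties broken by order of appearance
    s.rank(na_option="top")    # NaN assigned the lowest rank, matching the nan_fill_val logic above
    s.rank(pct=True)           # ranks divided by the number of valid values, as in the pct branch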
diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
index ec041c03b05e1..11679fc432edc 100644
--- a/pandas/_libs/algos_take_helper.pxi.in
+++ b/pandas/_libs/algos_take_helper.pxi.in
@@ -9,6 +9,31 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
# ----------------------------------------------------------------------
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_intp_intp(
+ const intp_t[:] values,
+ const intp_t[:] indexer,
+ intp_t[::1] out,
+ intp_t fill_value=-1,
+):
+ cdef:
+ Py_ssize_t i, n, idx
+ intp_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
{{py:
# c_type_in, c_type_out
@@ -109,33 +134,32 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
cdef:
Py_ssize_t i, j, k, n, idx
{{c_type_out}} fv
- {{if c_type_in == c_type_out != "object"}}
- const {{c_type_out}} *v
- {{c_type_out}} *o
- {{endif}}
n = len(indexer)
k = values.shape[1]
fv = fill_value
- {{if c_type_in == c_type_out != "object"}}
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof({{c_type_out}}) and
- sizeof({{c_type_out}}) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, (sizeof({{c_type_out}}) * k))
- return
- {{endif}}
+ IF {{True if c_type_in == c_type_out != "object" else False}}:
+ cdef:
+ const {{c_type_out}} *v
+ {{c_type_out}} *o
+
+ # GH#3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof({{c_type_out}}) and
+ sizeof({{c_type_out}}) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, (sizeof({{c_type_out}}) * k))
+ return
for i in range(n):
idx = indexer[i]
@@ -220,3 +244,33 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
{{endfor}}
+
+# ----------------------------------------------------------------------
+# take_2d internal function
+# ----------------------------------------------------------------------
+
+ctypedef fused take_t:
+ float64_t
+ uint64_t
+ int64_t
+ object
+
+
+cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx):
+ cdef:
+ Py_ssize_t i, j, N, K
+ ndarray[intp_t, ndim=2, cast=True] indexer = idx
+ ndarray[take_t, ndim=2] result
+
+ N, K = (values).shape
+
+ if take_t is object:
+ # evaluated at compile-time
+ result = values.copy()
+ else:
+ result = np.empty_like(values)
+
+ for i in range(N):
+ for j in range(K):
+ result[i, j] = values[i, indexer[i, j]]
+ return result
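A NumPy-level sketch of the take-with-fill semantics that the ``take_1d_intp_intp`` helper above implements, where an indexer entry of ``-1`` maps to the fill value (the sample arrays are made up)::

    import numpy as np

    values = np.array([10, 20, 30], dtype=np.intp)
    indexer = np.array([2, -1, 0], dtype=np.intp)
    fill_value = -1

    out = values[np.clip(indexer, 0, None)]  # gather, temporarily treating -1 as index 0
    out[indexer == -1] = fill_value          # then overwrite the missing slots
    # out -> array([30, -1, 10])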
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 951703e04d5a3..5a1b98b190dbc 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -228,5 +228,3 @@ def ismember(
arr: np.ndarray,
values: np.ndarray,
) -> np.ndarray: ... # np.ndarray[bool]
-def object_hash(obj) -> int: ...
-def objects_are_equal(a, b) -> bool: ...
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 132435701bddb..7df3f69337643 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -34,8 +34,6 @@ from pandas._libs.khash cimport (
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_needed_n_buckets,
- kh_python_hash_equal,
- kh_python_hash_func,
kh_str_t,
khcomplex64_t,
khcomplex128_t,
@@ -48,14 +46,6 @@ def get_hashtable_trace_domain():
return KHASH_TRACE_DOMAIN
-def object_hash(obj):
- return kh_python_hash_func(obj)
-
-
-def objects_are_equal(a, b):
- return kh_python_hash_equal(a, b)
-
-
cdef int64_t NPY_NAT = util.get_nat()
SIZE_HINT_LIMIT = (1 << 20) + 7
diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
index 3feefe7ac8ff4..d6fac14d3ee6e 100644
--- a/pandas/_libs/internals.pyi
+++ b/pandas/_libs/internals.pyi
@@ -1,7 +1,6 @@
from typing import (
Iterator,
Sequence,
- final,
overload,
)
@@ -51,12 +50,10 @@ class SharedBlock:
class NumpyBlock(SharedBlock):
values: np.ndarray
- @final
def getitem_block_index(self: T, slicer: slice) -> T: ...
class NDArrayBackedBlock(SharedBlock):
values: NDArrayBackedExtensionArray
- @final
def getitem_block_index(self: T, slicer: slice) -> T: ...
class Block(SharedBlock): ...
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index ba59c50142550..6c1ca3deba047 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -517,6 +517,7 @@ cdef class NumpyBlock(SharedBlock):
# set placement and ndim
self.values = values
+ # @final # not useful in cython, but we _would_ annotate with @final
cpdef NumpyBlock getitem_block_index(self, slice slicer):
"""
Perform __getitem__-like specialized to slicing along index.
@@ -539,6 +540,7 @@ cdef class NDArrayBackedBlock(SharedBlock):
# set placement and ndim
self.values = values
+ # @final # not useful in cython, but we _would_ annotate with @final
cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
"""
Perform __getitem__-like specialized to slicing along index.
@@ -567,12 +569,7 @@ cdef class BlockManager:
public bint _known_consolidated, _is_consolidated
public ndarray _blknos, _blklocs
- def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
- # None as defaults for unpickling GH#42345
- if blocks is None:
- # This adds 1-2 microseconds to DataFrame(np.array([]))
- return
-
+ def __cinit__(self, blocks, axes, verify_integrity=True):
if isinstance(blocks, list):
# Backward compat for e.g. pyarrow
blocks = tuple(blocks)
@@ -583,8 +580,12 @@ cdef class BlockManager:
# Populate known_consolidate, blknos, and blklocs lazily
self._known_consolidated = False
self._is_consolidated = False
- self._blknos = None
- self._blklocs = None
+ # error: Incompatible types in assignment (expression has type "None",
+ # variable has type "ndarray")
+ self._blknos = None # type: ignore[assignment]
+ # error: Incompatible types in assignment (expression has type "None",
+ # variable has type "ndarray")
+ self._blklocs = None # type: ignore[assignment]
# -------------------------------------------------------------------
# Pickle
diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
index eefa16d23f576..b69b89c0de019 100644
--- a/pandas/_libs/join.pyx
+++ b/pandas/_libs/join.pyx
@@ -20,22 +20,27 @@ from numpy cimport (
cnp.import_array()
-from pandas._libs.algos import groupsort_indexer
+from pandas._libs.algos import (
+ groupsort_indexer,
+ take_1d_int64_int64,
+ take_1d_intp_intp,
+)
-@cython.wraparound(False)
@cython.boundscheck(False)
def inner_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
- intp_t[::1] left_sorter, right_sorter
- intp_t[::1] left_count, right_count
- intp_t[::1] left_indexer, right_indexer
+ ndarray[intp_t] left_sorter, right_sorter
+ ndarray[intp_t] left_count, right_count
+ ndarray[intp_t] left_indexer, right_indexer
intp_t lc, rc
- Py_ssize_t left_pos = 0, right_pos = 0, position = 0
+ Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
Py_ssize_t offset
+ # NA group in location 0
+
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
@@ -48,13 +53,14 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
if rc > 0 and lc > 0:
count += lc * rc
+ # exclude the NA group
+ left_pos = left_count[0]
+ right_pos = right_count[0]
+
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)
with nogil:
- # exclude the NA group
- left_pos = left_count[0]
- right_pos = right_count[0]
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
@@ -69,27 +75,24 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
left_pos += lc
right_pos += rc
- # Will overwrite left/right indexer with the result
- _get_result_indexer(left_sorter, left_indexer)
- _get_result_indexer(right_sorter, right_indexer)
-
- return np.asarray(left_indexer), np.asarray(right_indexer)
+ return (_get_result_indexer(left_sorter, left_indexer),
+ _get_result_indexer(right_sorter, right_indexer))
-@cython.wraparound(False)
@cython.boundscheck(False)
def left_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups, bint sort=True):
cdef:
Py_ssize_t i, j, k, count = 0
- ndarray[intp_t] rev
- intp_t[::1] left_count, right_count
- intp_t[::1] left_sorter, right_sorter
- intp_t[::1] left_indexer, right_indexer
+ ndarray[intp_t] left_count, right_count
+ ndarray[intp_t] rev, left_sorter, right_sorter
+ ndarray[intp_t] left_indexer, right_indexer
intp_t lc, rc
- Py_ssize_t left_pos = 0, right_pos = 0, position = 0
+ Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
Py_ssize_t offset
+ # NA group in location 0
+
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
@@ -101,13 +104,14 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
else:
count += left_count[i]
+ # exclude the NA group
+ left_pos = left_count[0]
+ right_pos = right_count[0]
+
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)
with nogil:
- # exclude the NA group
- left_pos = left_count[0]
- right_pos = right_count[0]
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
@@ -127,38 +131,40 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
left_pos += lc
right_pos += rc
- # Will overwrite left/right indexer with the result
- _get_result_indexer(left_sorter, left_indexer)
- _get_result_indexer(right_sorter, right_indexer)
+ left_indexer = _get_result_indexer(left_sorter, left_indexer)
+ right_indexer = _get_result_indexer(right_sorter, right_indexer)
if not sort: # if not asked to sort, revert to original order
- if len(left) == len(left_indexer):
+ # cast to avoid build warning GH#26757
+ if len(left) == len(left_indexer):
# no multiple matches for any row on the left
# this is a short-cut to avoid groupsort_indexer
# otherwise, the `else` path also works in this case
rev = np.empty(len(left), dtype=np.intp)
- rev.put(np.asarray(left_sorter), np.arange(len(left)))
+ rev.put(left_sorter, np.arange(len(left)))
else:
rev, _ = groupsort_indexer(left_indexer, len(left))
- return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev)
- else:
- return np.asarray(left_indexer), np.asarray(right_indexer)
+ right_indexer = right_indexer.take(rev)
+ left_indexer = left_indexer.take(rev)
+
+ return left_indexer, right_indexer
-@cython.wraparound(False)
@cython.boundscheck(False)
def full_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
- intp_t[::1] left_sorter, right_sorter
- intp_t[::1] left_count, right_count
- intp_t[::1] left_indexer, right_indexer
+ ndarray[intp_t] left_sorter, right_sorter
+ ndarray[intp_t] left_count, right_count
+ ndarray[intp_t] left_indexer, right_indexer
intp_t lc, rc
intp_t left_pos = 0, right_pos = 0
Py_ssize_t offset, position = 0
+ # NA group in location 0
+
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
@@ -173,13 +179,14 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
else:
count += lc + rc
+ # exclude the NA group
+ left_pos = left_count[0]
+ right_pos = right_count[0]
+
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)
with nogil:
- # exclude the NA group
- left_pos = left_count[0]
- right_pos = right_count[0]
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
@@ -204,33 +211,24 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
left_pos += lc
right_pos += rc
- # Will overwrite left/right indexer with the result
- _get_result_indexer(left_sorter, left_indexer)
- _get_result_indexer(right_sorter, right_indexer)
-
- return np.asarray(left_indexer), np.asarray(right_indexer)
+ return (_get_result_indexer(left_sorter, left_indexer),
+ _get_result_indexer(right_sorter, right_indexer))
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) nogil:
- """NOTE: overwrites indexer with the result to avoid allocating another array"""
- cdef:
- Py_ssize_t i, n, idx
-
+cdef ndarray[intp_t] _get_result_indexer(
+ ndarray[intp_t] sorter, ndarray[intp_t] indexer
+):
if len(sorter) > 0:
# cython-only equivalent to
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
- n = indexer.shape[0]
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- indexer[i] = -1
- else:
- indexer[i] = sorter[idx]
+ res = np.empty(len(indexer), dtype=np.intp)
+ take_1d_intp_intp(sorter, indexer, res, -1)
else:
# length-0 case
- indexer[:] = -1
+ res = np.empty(len(indexer), dtype=np.intp)
+ res[:] = -1
+
+ return res
def ffill_indexer(const intp_t[:] indexer) -> np.ndarray:
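The ``inner_join``/``left_outer_join``/``full_outer_join`` helpers reworked above sit underneath ``pd.merge``; a quick illustration of the three join kinds on integer keys (the frames are illustrative)::

    import pandas as pd

    left = pd.DataFrame({"key": [1, 2, 2], "lval": ["a", "b", "c"]})
    right = pd.DataFrame({"key": [2, 3], "rval": ["x", "y"]})

    pd.merge(left, right, on="key", how="inner")  # only key 2 survives
    pd.merge(left, right, on="key", how="left")   # key 1 kept, rval filled with NaN
    pd.merge(left, right, on="key", how="outer")  # keys 1, 2 and 3 all appear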
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
index b9c18d6c86039..ba805e9ff1251 100644
--- a/pandas/_libs/khash.pxd
+++ b/pandas/_libs/khash.pxd
@@ -41,9 +41,6 @@ cdef extern from "khash_python.h":
bint are_equivalent_float32_t \
"kh_floats_hash_equal" (float32_t a, float32_t b) nogil
- uint32_t kh_python_hash_func(object key)
- bint kh_python_hash_equal(object a, object b)
-
ctypedef struct kh_pymap_t:
khuint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 5be50f16af003..3f4623638c70e 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -25,9 +25,6 @@ class NoDefault(Enum): ...
no_default: NoDefault
-i8max: int
-u8max: int
-
def item_from_zerodim(val: object) -> object: ...
def infer_dtype(value: object, skipna: bool = True) -> str: ...
def is_iterator(obj: object) -> bool: ...
@@ -51,7 +48,6 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ...
def is_float_array(values: np.ndarray, skipna: bool = False): ...
def is_integer_array(values: np.ndarray, skipna: bool = False): ...
def is_bool_array(values: np.ndarray, skipna: bool = False): ...
-def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ...
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ...
def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ...
def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ...
@@ -220,7 +216,8 @@ def array_equivalent_object(
left: np.ndarray, # object[:]
right: np.ndarray, # object[:]
) -> bool: ...
-def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
+def has_infs_f8(arr: np.ndarray) -> bool: ... # const float64_t[:]
+def has_infs_f4(arr: np.ndarray) -> bool: ... # const float32_t[:]
def get_reverse_indexer(
indexer: np.ndarray, # const intp_t[:]
length: int,
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 4ab2497be94d5..0aec7e5e5a363 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -25,7 +25,6 @@ from cpython.tuple cimport (
PyTuple_New,
PyTuple_SET_ITEM,
)
-from cython cimport floating
PyDateTime_IMPORT
@@ -119,10 +118,6 @@ cdef:
float64_t NaN = np.NaN
-# python-visible
-i8max = INT64_MAX
-u8max = UINT64_MAX
-
@cython.wraparound(False)
@cython.boundscheck(False)
@@ -520,22 +515,36 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
@cython.wraparound(False)
@cython.boundscheck(False)
-# Can add const once https://github.com/cython/cython/issues/1772 resolved
-def has_infs(floating[:] arr) -> bool:
+def has_infs_f4(const float32_t[:] arr) -> bool:
cdef:
Py_ssize_t i, n = len(arr)
- floating inf, neginf, val
- bint ret = False
+ float32_t inf, neginf, val
inf = np.inf
neginf = -inf
- with nogil:
- for i in range(n):
- val = arr[i]
- if val == inf or val == neginf:
- ret = True
- break
- return ret
+
+ for i in range(n):
+ val = arr[i]
+ if val == inf or val == neginf:
+ return True
+ return False
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def has_infs_f8(const float64_t[:] arr) -> bool:
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ float64_t inf, neginf, val
+
+ inf = np.inf
+ neginf = -inf
+
+ for i in range(n):
+ val = arr[i]
+ if val == inf or val == neginf:
+ return True
+ return False
def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len):
@@ -703,14 +712,6 @@ cpdef ndarray[object] ensure_string_array(
Py_ssize_t i = 0, n = len(arr)
if hasattr(arr, "to_numpy"):
-
- if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]:
- # dtype check to exclude DataFrame
- # GH#41409 TODO: not a great place for this
- out = arr.astype(str).astype(object)
- out[arr.isna()] = na_value
- return out
-
arr = arr.to_numpy()
elif not isinstance(arr, np.ndarray):
arr = np.array(arr, dtype="object")
@@ -893,13 +894,12 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
return counts
-@cython.wraparound(False)
-@cython.boundscheck(False)
def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
cdef:
Py_ssize_t i, group_size, n, start
intp_t lab
- int64_t[::1] starts, ends
+ object slobj
+ ndarray[int64_t] starts, ends
n = len(labels)
@@ -908,20 +908,19 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
start = 0
group_size = 0
- with nogil:
- for i in range(n):
- lab = labels[i]
- if lab < 0:
- start += 1
- else:
- group_size += 1
- if i == n - 1 or lab != labels[i + 1]:
- starts[lab] = start
- ends[lab] = start + group_size
- start += group_size
- group_size = 0
+ for i in range(n):
+ lab = labels[i]
+ if lab < 0:
+ start += 1
+ else:
+ group_size += 1
+ if i == n - 1 or lab != labels[i + 1]:
+ starts[lab] = start
+ ends[lab] = start + group_size
+ start += group_size
+ group_size = 0
- return np.asarray(starts), np.asarray(ends)
+ return starts, ends
def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys,
@@ -1700,7 +1699,7 @@ cdef class Validator:
if not self.is_valid(values[i]):
return False
- return True
+ return self.finalize_validate()
@cython.wraparound(False)
@cython.boundscheck(False)
@@ -1713,7 +1712,7 @@ cdef class Validator:
if not self.is_valid_skipna(values[i]):
return False
- return True
+ return self.finalize_validate_skipna()
cdef bint is_valid(self, object value) except -1:
return self.is_value_typed(value)
@@ -1731,6 +1730,18 @@ cdef class Validator:
cdef bint is_array_typed(self) except -1:
return False
+ cdef inline bint finalize_validate(self):
+ return True
+
+ cdef bint finalize_validate_skipna(self):
+ """
+ If we _only_ saw non-dtype-specific NA values, even if they are valid
+ for this dtype, we do not infer this dtype.
+ """
+ # TODO(phillipc): Remove the existing validate methods and replace them
+ # with the skipna versions upon full deprecation of skipna=False
+ return True
+
@cython.internal
cdef class BoolValidator(Validator):
@@ -1882,14 +1893,14 @@ cdef bint is_bytes_array(ndarray values, bint skipna=False):
@cython.internal
cdef class TemporalValidator(Validator):
cdef:
- bint all_generic_na
+ Py_ssize_t generic_null_count
def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
bint skipna=False):
self.n = n
self.dtype = dtype
self.skipna = skipna
- self.all_generic_na = True
+ self.generic_null_count = 0
cdef inline bint is_valid(self, object value) except -1:
return self.is_value_typed(value) or self.is_valid_null(value)
@@ -1902,16 +1913,15 @@ cdef class TemporalValidator(Validator):
cdef:
bint is_typed_null = self.is_valid_null(value)
bint is_generic_null = value is None or util.is_nan(value)
- if not is_generic_null:
- self.all_generic_na = False
+ self.generic_null_count += is_typed_null and is_generic_null
return self.is_value_typed(value) or is_typed_null or is_generic_null
- cdef bint _validate_skipna(self, ndarray values) except -1:
+ cdef inline bint finalize_validate_skipna(self):
"""
If we _only_ saw non-dtype-specific NA values, even if they are valid
for this dtype, we do not infer this dtype.
"""
- return Validator._validate_skipna(self, values) and not self.all_generic_na
+ return self.generic_null_count != self.n
@cython.internal
@@ -2955,28 +2965,6 @@ def to_object_array_tuples(rows: object) -> np.ndarray:
return result
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
- cdef:
- Py_ssize_t i, n = len(keys)
- object val
- ndarray[object] output = np.empty(n, dtype='O')
-
- if n == 0:
- # kludge, for Series
- return np.empty(0, dtype='f8')
-
- for i in range(n):
- val = keys[i]
- if val in mapping:
- output[i] = mapping[val]
- else:
- output[i] = default
-
- return maybe_convert_objects(output)
-
-
def is_bool_list(obj: list) -> bool:
"""
Check if this list contains only bool or np.bool_ objects.
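For orientation, the bookkeeping done by generate_slices on sorted group labels can be sketched in plain Python/NumPy as below. This is an illustrative rewrite of the hunk above, not pandas API; negative labels mark rows that belong to no group.

    import numpy as np

    def generate_slices_py(labels, ngroups):
        # For sorted group labels, record the [start, end) offsets of each
        # group; rows with a negative label are skipped.
        starts = np.zeros(ngroups, dtype=np.int64)
        ends = np.zeros(ngroups, dtype=np.int64)
        start = group_size = 0
        n = len(labels)
        for i, lab in enumerate(labels):
            if lab < 0:
                start += 1
            else:
                group_size += 1
                if i == n - 1 or lab != labels[i + 1]:
                    starts[lab] = start
                    ends[lab] = start + group_size
                    start += group_size
                    group_size = 0
        return starts, ends

    # e.g. labels [-1, 0, 0, 1, 1, 1] -> starts [1, 3], ends [3, 6]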
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 3655d6efad66e..7d7074988e5f0 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -108,7 +108,6 @@ from pandas.core.dtypes.common import (
is_object_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.inference import is_dict_like
cdef:
float64_t INF = np.inf
@@ -690,7 +689,6 @@ cdef class TextReader:
count = counts.get(name, 0)
if (
self.dtype is not None
- and is_dict_like(self.dtype)
and self.dtype.get(old_name) is not None
and self.dtype.get(name) is None
):
@@ -1280,8 +1278,6 @@ cdef class TextReader:
# generate extra (bogus) headers if there are more columns than headers
if j >= len(self.header[0]):
return j
- elif self.has_mi_columns:
- return tuple(header_row[j] for header_row in self.header)
else:
return self.header[0][j]
else:
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index c0fca76ef701e..aee018262e3a6 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -163,198 +163,28 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
#define kh_exist_complex128(h, k) (kh_exist(h, k))
-// NaN-floats should be in the same equivalency class, see GH 22119
-int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
- return (
- Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
- Py_IS_NAN(PyFloat_AS_DOUBLE(b))
- )
- ||
- ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) );
-}
-
-
-// NaNs should be in the same equivalency class, see GH 41836
-// PyObject_RichCompareBool for complexobjects has a different behavior
-// needs to be replaced
-int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
- return (
- Py_IS_NAN(a->cval.real) &&
- Py_IS_NAN(b->cval.real) &&
- Py_IS_NAN(a->cval.imag) &&
- Py_IS_NAN(b->cval.imag)
- )
- ||
- (
- Py_IS_NAN(a->cval.real) &&
- Py_IS_NAN(b->cval.real) &&
- a->cval.imag == b->cval.imag
- )
- ||
- (
- a->cval.real == b->cval.real &&
- Py_IS_NAN(a->cval.imag) &&
- Py_IS_NAN(b->cval.imag)
- )
- ||
- (
- a->cval.real == b->cval.real &&
- a->cval.imag == b->cval.imag
- );
-}
-
-int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);
-
-
-// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN),
-// which treats NaNs as equivalent
-// see GH 41836
-int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
- Py_ssize_t i;
-
- if (Py_SIZE(a) != Py_SIZE(b)) {
- return 0;
- }
-
- for (i = 0; i < Py_SIZE(a); ++i) {
- if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) {
- return 0;
- }
- }
- return 1;
-}
-
-
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
- if (a == b) {
- return 1;
- }
- if (Py_TYPE(a) == Py_TYPE(b)) {
- // special handling for some built-in types which could have NaNs
- // as we would like to have them equivalent, but the usual
- // PyObject_RichCompareBool would return False
- if (PyFloat_CheckExact(a)) {
- return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
- }
- if (PyComplex_CheckExact(a)) {
- return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
- }
- if (PyTuple_CheckExact(a)) {
- return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
- }
- // frozenset isn't yet supported
- }
-
int result = PyObject_RichCompareBool(a, b, Py_EQ);
if (result < 0) {
PyErr_Clear();
return 0;
}
- return result;
-}
-
-
-Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
- //Since Python3.10, nan is no longer has hash 0
- if (Py_IS_NAN(val)) {
- return 0;
- }
-#if PY_VERSION_HEX < 0x030A0000
- return _Py_HashDouble(val);
-#else
- return _Py_HashDouble(NULL, val);
-#endif
-}
-
-
-Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
- return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
-}
-
-
-#define _PandasHASH_IMAG 1000003UL
-
-// replaces _Py_HashDouble with _Pandas_HashDouble
-Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
- Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
- Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
- if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
- return -1;
- }
- Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash;
- if (combined == (Py_uhash_t)-1) {
- return -2;
- }
- return (Py_hash_t)combined;
-}
-
-
-khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
-
-//we could use any hashing algorithm, this is the original CPython's for tuples
-
-#if SIZEOF_PY_UHASH_T > 4
-#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
-#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
-#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
-#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
-#else
-#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
-#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
-#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
-#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
-#endif
-
-Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
- Py_ssize_t i, len = Py_SIZE(key);
- PyObject **item = key->ob_item;
-
- Py_uhash_t acc = _PandasHASH_XXPRIME_5;
- for (i = 0; i < len; i++) {
- Py_uhash_t lane = kh_python_hash_func(item[i]);
- if (lane == (Py_uhash_t)-1) {
- return -1;
- }
- acc += lane * _PandasHASH_XXPRIME_2;
- acc = _PandasHASH_XXROTATE(acc);
- acc *= _PandasHASH_XXPRIME_1;
- }
-
- /* Add input length, mangled to keep the historical value of hash(()). */
- acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
-
- if (acc == (Py_uhash_t)-1) {
- return 1546275796;
+ if (result == 0) { // still could be two NaNs
+ return PyFloat_CheckExact(a) &&
+ PyFloat_CheckExact(b) &&
+ Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
+ Py_IS_NAN(PyFloat_AS_DOUBLE(b));
}
- return acc;
+ return result;
}
-khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
- Py_hash_t hash;
+khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
// For PyObject_Hash holds:
// hash(0.0) == 0 == hash(-0.0)
- // yet for different nan-objects different hash-values
- // are possible
- if (PyFloat_CheckExact(key)) {
- // we cannot use kh_float64_hash_func
- // becase float(k) == k holds for any int-object k
- // and kh_float64_hash_func doesn't respect it
- hash = floatobject_hash((PyFloatObject*)key);
- }
- else if (PyComplex_CheckExact(key)) {
- // we cannot use kh_complex128_hash_func
- // becase complex(k,0) == k holds for any int-object k
- // and kh_complex128_hash_func doesn't respect it
- hash = complexobject_hash((PyComplexObject*)key);
- }
- else if (PyTuple_CheckExact(key)) {
- hash = tupleobject_hash((PyTupleObject*)key);
- }
- else {
- hash = PyObject_Hash(key);
- }
-
+ // hash(X) == 0 if X is a NaN-value
+ // so it is OK to use it directly for doubles
+ Py_hash_t hash = PyObject_Hash(key);
if (hash == -1) {
PyErr_Clear();
return 0;
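A rough Python model of the comparison kept on the "+" side above; the removed C helpers extended the same NaN-as-equal idea to complex numbers and tuples. This is a sketch of the intent, not the actual C code path.

    import math

    def pyobject_cmp_py(a, b):
        # Fall back to ordinary equality, but treat two NaN floats as equal
        # so that distinct NaN objects collapse onto one hash-table key.
        try:
            if a == b:
                return True
        except Exception:
            return False
        return (
            isinstance(a, float) and isinstance(b, float)
            and math.isnan(a) and math.isnan(b)
        )

    assert pyobject_cmp_py(float("nan"), float("nan"))  # plain == would be False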
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
index f79ffd2d425c4..415bdf74db80a 100644
--- a/pandas/_libs/tslibs/dtypes.pyx
+++ b/pandas/_libs/tslibs/dtypes.pyx
@@ -27,11 +27,6 @@ cdef class PeriodDtypeBase:
# See also: libperiod.get_freq_group
return (self._dtype_code // 1000) * 1000
- @property
- def resolution(self) -> "Resolution":
- fgc = self.freq_group_code
- return Resolution.from_freq_group(FreqGroup(fgc))
-
@property
def date_offset(self):
"""
@@ -264,14 +259,6 @@ class Resolution(Enum):
return cls.from_attrname(attr_name)
- @classmethod
- def from_freq_group(cls, freq_group: FreqGroup) -> "Resolution":
- abbrev = _reverse_period_code_map[freq_group.value].split("-")[0]
- if abbrev == "B":
- return cls.RESO_DAY
- attrname = _abbrev_to_attrnames[abbrev]
- return cls.from_attrname(attrname)
-
cdef dict _reso_str_map = {
Resolution.RESO_NS.value: "nanosecond",
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 6596aebc1892e..ac7447420596a 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -1152,13 +1152,12 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta):
"""
Standard kind of date increment used for a date range.
- Works exactly like the keyword argument form of relativedelta.
- Note that the positional argument form of relativedelata is not
- supported. Use of the keyword n is discouraged-- you would be better
+ Works exactly like relativedelta in terms of the keyword args you
+ pass in, use of the keyword n is discouraged-- you would be better
off specifying n in the keywords you use, but regardless it is
there for you. n is needed for DateOffset subclasses.
- DateOffset works as follows. Each offset specify a set of dates
+ DateOffset work as follows. Each offset specify a set of dates
that conform to the DateOffset. For example, Bday defines this
set to be the set of dates that are weekdays (M-F). To test if a
date is in the set of a DateOffset dateOffset we can use the
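The docstring reworded here describes keyword-style relativedelta arithmetic; a small usage illustration of that behavior:

    import pandas as pd

    ts = pd.Timestamp("2021-01-31")
    ts + pd.DateOffset(months=1)         # Timestamp('2021-02-28 00:00:00')
    ts + pd.DateOffset(days=2, hours=3)  # Timestamp('2021-02-02 03:00:00')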
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 212e40b30848a..9892671f5c18c 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -470,7 +470,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default,
except ValueError:
pass
- if date_len == 6 and freq == 'M':
+ if date_len == 6 and (freq == 'M' or
+ getattr(freq, 'rule_code', None) == 'M'):
year = int(date_string[:4])
month = int(date_string[4:6])
try:
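The extra guard on the "+" side also accepts an offset object whose rule_code is 'M', not just the literal string 'M'; the affected path is parsing of 6-digit YYYYMM strings. A hedged illustration (exact behavior depends on which side of this hunk is active):

    import pandas as pd

    # With a monthly frequency, a 6-digit YYYYMM string is split into
    # year and month by the branch above.
    pd.Period("202106", freq="M")  # expected: Period('2021-06', 'M')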
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index e4e9df5176459..edd3b58867e87 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -129,13 +129,6 @@ cdef inline object create_timestamp_from_ts(int64_t value,
return ts_base
-def _unpickle_timestamp(value, freq, tz):
- # GH#41949 dont warn on unpickle if we have a freq
- ts = Timestamp(value, tz=tz)
- ts._set_freq(freq)
- return ts
-
-
# ----------------------------------------------------------------------
def integer_op_not_supported(obj):
@@ -732,7 +725,7 @@ cdef class _Timestamp(ABCTimestamp):
def __reduce__(self):
object_state = self.value, self._freq, self.tzinfo
- return (_unpickle_timestamp, object_state)
+ return (Timestamp, object_state)
# -----------------------------------------------------------------
# Rendering Methods
@@ -1329,19 +1322,6 @@ class Timestamp(_Timestamp):
"the tz parameter. Use tz_convert instead.")
tzobj = maybe_get_tz(tz)
- if tzobj is not None and is_datetime64_object(ts_input):
- # GH#24559, GH#42288 In the future we will treat datetime64 as
- # wall-time (consistent with DatetimeIndex)
- warnings.warn(
- "In a future version, when passing a np.datetime64 object and "
- "a timezone to Timestamp, the datetime64 will be interpreted "
- "as a wall time, not a UTC time. To interpret as a UTC time, "
- "use `Timestamp(dt64).tz_localize('UTC').tz_convert(tz)`",
- FutureWarning,
- stacklevel=1,
- )
- # Once this deprecation is enforced, we can do
- # return Timestamp(ts_input).tz_localize(tzobj)
ts = convert_to_tsobject(ts_input, tzobj, unit, 0, 0, nanosecond or 0)
if ts.value == NPY_NAT:
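__reduce__ drives pickling, so on either side of this hunk a round trip should preserve both the value and the timezone; a quick sanity check:

    import pickle
    import pandas as pd

    ts = pd.Timestamp("2021-06-01 12:00", tz="Europe/Berlin")
    assert pickle.loads(pickle.dumps(ts)) == ts
    assert pickle.loads(pickle.dumps(ts)).tz is not None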
diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py
index 5f01996d0390d..5153118e9b142 100644
--- a/pandas/_testing/_warnings.py
+++ b/pandas/_testing/_warnings.py
@@ -106,7 +106,6 @@ def _assert_caught_expected_warning(
"""Assert that there was the expected warning among the caught warnings."""
saw_warning = False
matched_message = False
- unmatched_messages = []
for actual_warning in caught_warnings:
if issubclass(actual_warning.category, expected_warning):
@@ -117,11 +116,8 @@ def _assert_caught_expected_warning(
):
_assert_raised_with_correct_stacklevel(actual_warning)
- if match is not None:
- if re.search(match, str(actual_warning.message)):
- matched_message = True
- else:
- unmatched_messages.append(actual_warning.message)
+ if match is not None and re.search(match, str(actual_warning.message)):
+ matched_message = True
if not saw_warning:
raise AssertionError(
@@ -132,8 +128,7 @@ def _assert_caught_expected_warning(
if match and not matched_message:
raise AssertionError(
f"Did not see warning {repr(expected_warning.__name__)} "
- f"matching '{match}'. The emitted warning messages are "
- f"{unmatched_messages}"
+ f"matching {match}"
)
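This helper backs pandas._testing.assert_produces_warning; when match is given, at least one emitted message must match the pattern. Minimal usage:

    import warnings
    import pandas._testing as tm

    with tm.assert_produces_warning(FutureWarning, match="will be removed"):
        warnings.warn("this argument will be removed", FutureWarning)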
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index d0957b1814213..1942e07d1b562 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -107,7 +107,6 @@ def assert_almost_equal(
FutureWarning,
stacklevel=2,
)
- # https://github.com/python/mypy/issues/7642
# error: Argument 1 to "_get_tol_from_less_precise" has incompatible
# type "Union[bool, int, NoDefault]"; expected "Union[bool, int]"
rtol = atol = _get_tol_from_less_precise(
@@ -315,16 +314,18 @@ def _check_types(left, right, obj="Index") -> None:
return
assert_class_equal(left, right, exact=exact, obj=obj)
- assert_attr_equal("inferred_type", left, right, obj=obj)
# Skip exact dtype checking when `check_categorical` is False
- if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
- if check_categorical:
- assert_attr_equal("dtype", left, right, obj=obj)
+ if check_categorical:
+ assert_attr_equal("dtype", left, right, obj=obj)
+ if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
assert_index_equal(left.categories, right.categories, exact=exact)
- return
- assert_attr_equal("dtype", left, right, obj=obj)
+ # allow string-like to have different inferred_types
+ if left.inferred_type in ("string"):
+ assert right.inferred_type in ("string")
+ else:
+ assert_attr_equal("inferred_type", left, right, obj=obj)
def _get_ilevel_values(index, level):
# accept level number only
@@ -341,7 +342,6 @@ def _get_ilevel_values(index, level):
FutureWarning,
stacklevel=2,
)
- # https://github.com/python/mypy/issues/7642
# error: Argument 1 to "_get_tol_from_less_precise" has incompatible
# type "Union[bool, int, NoDefault]"; expected "Union[bool, int]"
rtol = atol = _get_tol_from_less_precise(
@@ -437,8 +437,6 @@ def assert_class_equal(left, right, exact: bool | str = True, obj="Input"):
"""
Checks classes are equal.
"""
- from pandas.core.indexes.numeric import NumericIndex
-
__tracebackhide__ = True
def repr_class(x):
@@ -448,16 +446,17 @@ def repr_class(x):
return type(x).__name__
- if type(left) == type(right):
- return
-
if exact == "equiv":
- # accept equivalence of NumericIndex (sub-)classes
- if isinstance(left, NumericIndex) and isinstance(right, NumericIndex):
- return
-
- msg = f"{obj} classes are different"
- raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
+ if type(left) != type(right):
+ # allow equivalence of Int64Index/RangeIndex
+ types = {type(left).__name__, type(right).__name__}
+ if len(types - {"Int64Index", "RangeIndex"}):
+ msg = f"{obj} classes are not equivalent"
+ raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
+ elif exact:
+ if type(left) != type(right):
+ msg = f"{obj} classes are different"
+ raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
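With exact="equiv" (the default for assert_index_equal), integer index classes are treated as interchangeable, which is what the class-equality logic above implements in both variants. A minimal illustration:

    import pandas as pd
    import pandas._testing as tm

    # RangeIndex and an equal integer Index pass under "equiv";
    # exact=True would raise on the class difference.
    tm.assert_index_equal(pd.RangeIndex(3), pd.Index([0, 1, 2]), exact="equiv")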
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 6583a9f60ee15..12d23786c3387 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -21,7 +21,6 @@
Dict,
Hashable,
List,
- Literal,
Mapping,
Optional,
Sequence,
@@ -37,9 +36,11 @@
# and use a string literal forward reference to it in subsequent types
# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
if TYPE_CHECKING:
- from typing import TypedDict
-
- import numpy.typing as npt
+ from typing import (
+ Literal,
+ TypedDict,
+ final,
+ )
from pandas._libs import (
Period,
@@ -72,7 +73,8 @@
from pandas.io.formats.format import EngFormatter
from pandas.tseries.offsets import DateOffset
else:
- npt: Any = None
+ # typing.final does not exist until py38
+ final = lambda x: x
# typing.TypedDict does not exist until py38
TypedDict = dict
@@ -99,6 +101,12 @@
]
Timezone = Union[str, tzinfo]
+# FrameOrSeriesUnion means either a DataFrame or a Series. E.g.
+# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series
+# is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed
+# in, either a DataFrame or a Series is returned.
+FrameOrSeriesUnion = Union["DataFrame", "Series"]
+
# FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is
# used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a
# Series is passed into a function, a Series is always returned and if a DataFrame is
@@ -115,14 +123,6 @@
Frequency = Union[str, "DateOffset"]
Axes = Collection[Any]
-RandomState = Union[
- int,
- ArrayLike,
- np.random.Generator,
- np.random.BitGenerator,
- np.random.RandomState,
-]
-
# dtypes
NpDtype = Union[str, np.dtype]
Dtype = Union[
@@ -193,7 +193,10 @@
]
# Arguments for fillna()
-FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
+if TYPE_CHECKING:
+ FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
+else:
+ FillnaOptions = str
# internals
Manager = Union[
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 3233de8e3b6d1..369832e9bc05c 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -16,6 +16,7 @@
is_numpy_dev,
np_array_datetime64_compat,
np_datetime64_compat,
+ np_version_under1p18,
np_version_under1p19,
np_version_under1p20,
)
@@ -26,6 +27,7 @@
pa_version_under4p0,
)
+PY38 = sys.version_info >= (3, 8)
PY39 = sys.version_info >= (3, 9)
PY310 = sys.version_info >= (3, 10)
PYPY = platform.python_implementation() == "PyPy"
@@ -149,6 +151,7 @@ def get_lzma_file(lzma):
"is_numpy_dev",
"np_array_datetime64_compat",
"np_datetime64_compat",
+ "np_version_under1p18",
"np_version_under1p19",
"np_version_under1p20",
"pa_version_under1p0",
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 651729cd0ad44..941c59592dbbd 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -10,30 +10,30 @@
# Update install.rst when updating versions!
VERSIONS = {
- "bs4": "4.8.2",
- "bottleneck": "1.3.1",
+ "bs4": "4.6.0",
+ "bottleneck": "1.2.1",
"fsspec": "0.7.4",
"fastparquet": "0.4.0",
"gcsfs": "0.6.0",
- "lxml.etree": "4.5.0",
- "matplotlib": "3.3.2",
- "numexpr": "2.7.1",
- "odfpy": "1.4.1",
- "openpyxl": "3.0.2",
- "pandas_gbq": "0.14.0",
+ "lxml.etree": "4.3.0",
+ "matplotlib": "2.2.3",
+ "numexpr": "2.7.0",
+ "odfpy": "1.3.0",
+ "openpyxl": "3.0.0",
+ "pandas_gbq": "0.12.0",
"pyarrow": "0.17.0",
"pytest": "6.0",
"pyxlsb": "1.0.6",
"s3fs": "0.4.0",
- "scipy": "1.4.1",
- "sqlalchemy": "1.3.11",
- "tables": "3.6.1",
+ "scipy": "1.2.0",
+ "sqlalchemy": "1.3.0",
+ "tables": "3.5.1",
"tabulate": "0.8.7",
- "xarray": "0.15.1",
- "xlrd": "2.0.1",
+ "xarray": "0.12.3",
+ "xlrd": "1.2.0",
"xlwt": "1.3.0",
- "xlsxwriter": "1.2.2",
- "numba": "0.50.1",
+ "xlsxwriter": "1.0.2",
+ "numba": "0.46.0",
}
# A mapping from import name to package name (on PyPI) for packages where
diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py
index 5b87257651a2d..619713f28ee2d 100644
--- a/pandas/compat/numpy/__init__.py
+++ b/pandas/compat/numpy/__init__.py
@@ -9,10 +9,11 @@
# numpy versioning
_np_version = np.__version__
_nlv = Version(_np_version)
+np_version_under1p18 = _nlv < Version("1.18")
np_version_under1p19 = _nlv < Version("1.19")
np_version_under1p20 = _nlv < Version("1.20")
is_numpy_dev = _nlv.dev is not None
-_min_numpy_ver = "1.18.5"
+_min_numpy_ver = "1.17.3"
if _nlv < Version(_min_numpy_ver):
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index df9f3d07ce7fd..177dfee0c03ab 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -8,11 +8,14 @@
from textwrap import dedent
from typing import (
TYPE_CHECKING,
- Literal,
Union,
cast,
)
-from warnings import warn
+from warnings import (
+ catch_warnings,
+ simplefilter,
+ warn,
+)
import numpy as np
@@ -26,8 +29,8 @@
AnyArrayLike,
ArrayLike,
DtypeObj,
+ FrameOrSeriesUnion,
Scalar,
- npt,
)
from pandas.util._decorators import doc
@@ -81,6 +84,7 @@
from pandas.core.indexers import validate_indices
if TYPE_CHECKING:
+ from typing import Literal
from pandas import (
Categorical,
@@ -136,11 +140,7 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
return np.asarray(values).view("uint8"), values.dtype
else:
# i.e. all-bool Categorical, BooleanArray
- try:
- return np.asarray(values).astype("uint8", copy=False), values.dtype
- except TypeError:
- # GH#42107 we have pd.NAs present
- return np.asarray(values), values.dtype
+ return np.asarray(values).astype("uint8", copy=False), values.dtype
elif is_integer_dtype(values.dtype):
return np.asarray(values), values.dtype
@@ -155,10 +155,12 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
return np.asarray(values), values.dtype
elif is_complex_dtype(values.dtype):
- # Incompatible return value type (got "Tuple[Union[Any, ExtensionArray,
- # ndarray[Any, Any]], Union[Any, ExtensionDtype]]", expected
- # "Tuple[ndarray[Any, Any], Union[dtype[Any], ExtensionDtype]]")
- return values, values.dtype # type: ignore[return-value]
+ # ignore the fact that we are casting to float
+ # which discards complex parts
+ with catch_warnings():
+ simplefilter("ignore", np.ComplexWarning)
+ values = ensure_float64(values)
+ return values, np.dtype("float64")
# datetimelike
elif needs_i8_conversion(values.dtype):
@@ -240,8 +242,6 @@ def _ensure_arraylike(values) -> ArrayLike:
_hashtables = {
- "complex128": htable.Complex128HashTable,
- "complex64": htable.Complex64HashTable,
"float64": htable.Float64HashTable,
"float32": htable.Float32HashTable,
"uint64": htable.UInt64HashTable,
@@ -529,9 +529,9 @@ def factorize_array(
size_hint: int | None = None,
na_value=None,
mask: np.ndarray | None = None,
-) -> tuple[npt.NDArray[np.intp], np.ndarray]:
+) -> tuple[np.ndarray, np.ndarray]:
"""
- Factorize a numpy array to codes and uniques.
+ Factorize an array-like to codes and uniques.
This doesn't do any coercion of types or unboxing before factorization.
@@ -910,7 +910,7 @@ def duplicated(
Parameters
----------
- values : nd.array, ExtensionArray or Series
+ values : ndarray-like
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
@@ -1008,6 +1008,7 @@ def rank(
if values.ndim == 1:
ranks = algos.rank_1d(
values,
+ labels=np.zeros(len(values), dtype=np.intp),
is_datetimelike=is_datetimelike,
ties_method=method,
ascending=ascending,
@@ -1091,19 +1092,18 @@ def checked_add_with_arr(
# it is negative, we then check whether its sum with the element in
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
# error as well.
- i8max = lib.i8max
- i8min = iNaT
-
mask1 = b2 > 0
mask2 = b2 < 0
if not mask1.any():
- to_raise = ((i8min - b2 > arr) & not_nan).any()
+ to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
elif not mask2.any():
- to_raise = ((i8max - b2 < arr) & not_nan).any()
+ to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
else:
- to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or (
- (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2]
+ to_raise = (
+ (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]
+ ).any() or (
+ (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
).any()
if to_raise:
@@ -1188,10 +1188,13 @@ def _get_score(at):
if is_scalar(q):
return _get_score(q)
-
- q = np.asarray(q, np.float64)
- result = [_get_score(x) for x in q]
- return np.array(result, dtype=np.float64)
+ else:
+ q = np.asarray(q, np.float64)
+ result = [_get_score(x) for x in q]
+ # error: Incompatible types in assignment (expression has type
+ # "ndarray", variable has type "List[Any]")
+ result = np.array(result, dtype=np.float64) # type: ignore[assignment]
+ return result
# --------------- #
@@ -1208,7 +1211,7 @@ def __init__(self, obj, n: int, keep: str):
if self.keep not in ("first", "last", "all"):
raise ValueError('keep must be either "first", "last" or "all"')
- def compute(self, method: str) -> DataFrame | Series:
+ def compute(self, method: str) -> FrameOrSeriesUnion:
raise NotImplementedError
def nlargest(self):
@@ -1412,8 +1415,8 @@ def take(
Parameters
----------
- arr : array-like or scalar value
- Non array-likes (sequences/scalars without a dtype) are coerced
+ arr : sequence
+ Non array-likes (sequences without a dtype) are coerced
to an ndarray.
indices : sequence of integers
Indices to be taken.
@@ -1523,17 +1526,17 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray:
Parameters
----------
- arr: np.ndarray, ExtensionArray, Series
+ arr: array-like
Input array. If `sorter` is None, then it must be sorted in
ascending order, otherwise `sorter` must be an array of indices
that sort it.
- value : array-like or scalar
+ value : array_like
Values to insert into `arr`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
If 'right', return the last such index. If there is no suitable
index, return either 0 or N (where N is the length of `self`).
- sorter : 1-D array-like, optional
+ sorter : 1-D array_like, optional
Optional array of integer indices that sort array a into ascending
order. They are typically the result of argsort.
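The overflow test in checked_add_with_arr reduces to a bounds comparison against the int64 limits; a minimal sketch of the positive-addend case used above:

    import numpy as np

    arr = np.array([np.iinfo(np.int64).max - 1], dtype=np.int64)
    b = np.int64(5)
    # Adding b would overflow int64 exactly when i64max - b < arr.
    would_overflow = bool((np.iinfo(np.int64).max - b < arr).any())
    assert would_overflow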
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index ff3fc30b870dc..388c1881afed7 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -7,7 +7,6 @@
Any,
Dict,
Hashable,
- Iterable,
Iterator,
List,
cast,
@@ -26,6 +25,7 @@
AggObjType,
Axis,
FrameOrSeries,
+ FrameOrSeriesUnion,
)
from pandas.util._decorators import cache_readonly
@@ -137,10 +137,10 @@ def f(x):
self.f: AggFuncType = f
@abc.abstractmethod
- def apply(self) -> DataFrame | Series:
+ def apply(self) -> FrameOrSeriesUnion:
pass
- def agg(self) -> DataFrame | Series | None:
+ def agg(self) -> FrameOrSeriesUnion | None:
"""
Provide an implementation for the aggregators.
@@ -171,7 +171,7 @@ def agg(self) -> DataFrame | Series | None:
# caller can react
return None
- def transform(self) -> DataFrame | Series:
+ def transform(self) -> FrameOrSeriesUnion:
"""
Transform a DataFrame or Series.
@@ -252,7 +252,7 @@ def transform_dict_like(self, func):
func = self.normalize_dictlike_arg("transform", obj, func)
- results: dict[Hashable, DataFrame | Series] = {}
+ results: dict[Hashable, FrameOrSeriesUnion] = {}
failed_names = []
all_type_errors = True
for name, how in func.items():
@@ -283,7 +283,7 @@ def transform_dict_like(self, func):
)
return concat(results, axis=1)
- def transform_str_or_callable(self, func) -> DataFrame | Series:
+ def transform_str_or_callable(self, func) -> FrameOrSeriesUnion:
"""
Compute transform in the case of a string or callable func
"""
@@ -305,7 +305,7 @@ def transform_str_or_callable(self, func) -> DataFrame | Series:
except Exception:
return func(obj, *args, **kwargs)
- def agg_list_like(self) -> DataFrame | Series:
+ def agg_list_like(self) -> FrameOrSeriesUnion:
"""
Compute aggregation in the case of a list-like argument.
@@ -348,7 +348,6 @@ def agg_list_like(self) -> DataFrame | Series:
# multiples
else:
- indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
try:
@@ -370,9 +369,7 @@ def agg_list_like(self) -> DataFrame | Series:
raise
else:
results.append(new_res)
- indices.append(index)
-
- keys = selected_obj.columns.take(indices)
+ keys.append(col)
# if we are empty
if not len(results):
@@ -402,7 +399,7 @@ def agg_list_like(self) -> DataFrame | Series:
)
return concatenated.reindex(full_ordered_index, copy=False)
- def agg_dict_like(self) -> DataFrame | Series:
+ def agg_dict_like(self) -> FrameOrSeriesUnion:
"""
Compute aggregation in the case of a dict-like argument.
@@ -410,7 +407,6 @@ def agg_dict_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
- from pandas import Index
from pandas.core.reshape.concat import concat
obj = self.obj
@@ -444,20 +440,11 @@ def agg_dict_like(self) -> DataFrame | Series:
# combine results
if all(is_ndframe):
- keys_to_use: Iterable[Hashable]
keys_to_use = [k for k in keys if not results[k].empty]
# Have to check, if at least one DataFrame is not empty.
keys_to_use = keys_to_use if keys_to_use != [] else keys
- if selected_obj.ndim == 2:
- # keys are columns, so we can preserve names
- ktu = Index(keys_to_use)
- ktu._set_names(selected_obj.columns.names)
- keys_to_use = ktu
-
axis = 0 if isinstance(obj, ABCSeries) else 1
- result = concat(
- {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
- )
+ result = concat({k: results[k] for k in keys_to_use}, axis=axis)
elif any(is_ndframe):
# There is a mix of NDFrames and scalars
raise ValueError(
@@ -480,7 +467,7 @@ def agg_dict_like(self) -> DataFrame | Series:
return result
- def apply_str(self) -> DataFrame | Series:
+ def apply_str(self) -> FrameOrSeriesUnion:
"""
Compute apply in case of a string.
@@ -505,7 +492,7 @@ def apply_str(self) -> DataFrame | Series:
raise ValueError(f"Operation {f} does not support axis=1")
return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)
- def apply_multiple(self) -> DataFrame | Series:
+ def apply_multiple(self) -> FrameOrSeriesUnion:
"""
Compute apply in case of a list-like or dict-like.
@@ -517,7 +504,7 @@ def apply_multiple(self) -> DataFrame | Series:
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
def normalize_dictlike_arg(
- self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict
+ self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
) -> AggFuncTypeDict:
"""
Handler for dict-like argument.
@@ -630,7 +617,7 @@ def series_generator(self) -> Iterator[Series]:
@abc.abstractmethod
def wrap_results_for_axis(
self, results: ResType, res_index: Index
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
pass
# ---------------------------------------------------------------
@@ -651,7 +638,7 @@ def values(self):
def dtypes(self) -> Series:
return self.obj.dtypes
- def apply(self) -> DataFrame | Series:
+ def apply(self) -> FrameOrSeriesUnion:
"""compute the results"""
# dispatch to agg
if is_list_like(self.f):
@@ -825,7 +812,7 @@ def apply_series_generator(self) -> tuple[ResType, Index]:
return results, res_index
- def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
+ def wrap_results(self, results: ResType, res_index: Index) -> FrameOrSeriesUnion:
from pandas import Series
# see if we can infer the results
@@ -848,14 +835,14 @@ def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series
return result
- def apply_str(self) -> DataFrame | Series:
+ def apply_str(self) -> FrameOrSeriesUnion:
# Caller is responsible for checking isinstance(self.f, str)
# TODO: GH#39993 - Avoid special-casing by replacing with lambda
if self.f == "size":
# Special-cased because DataFrame.size returns a single scalar
obj = self.obj
value = obj.shape[self.axis]
- return obj._constructor_sliced(value, index=self.agg_axis)
+ return obj._constructor_sliced(value, index=self.agg_axis, name="size")
return super().apply_str()
@@ -879,7 +866,7 @@ def result_columns(self) -> Index:
def wrap_results_for_axis(
self, results: ResType, res_index: Index
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
"""return the results for the rows"""
if self.result_type == "reduce":
@@ -962,9 +949,9 @@ def result_columns(self) -> Index:
def wrap_results_for_axis(
self, results: ResType, res_index: Index
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
"""return the results for the columns"""
- result: DataFrame | Series
+ result: FrameOrSeriesUnion
# we have requested to expand
if self.result_type == "expand":
@@ -1018,7 +1005,7 @@ def __init__(
kwargs=kwargs,
)
- def apply(self) -> DataFrame | Series:
+ def apply(self) -> FrameOrSeriesUnion:
obj = self.obj
if len(obj) == 0:
@@ -1069,13 +1056,17 @@ def apply_empty_result(self) -> Series:
obj, method="apply"
)
- def apply_standard(self) -> DataFrame | Series:
+ def apply_standard(self) -> FrameOrSeriesUnion:
f = self.f
obj = self.obj
with np.errstate(all="ignore"):
if isinstance(f, np.ufunc):
- return f(obj)
+ # error: Argument 1 to "__call__" of "ufunc" has incompatible type
+ # "Series"; expected "Union[Union[int, float, complex, str, bytes,
+ # generic], Sequence[Union[int, float, complex, str, bytes, generic]],
+ # Sequence[Sequence[Any]], _SupportsArray]"
+ return f(obj) # type: ignore[arg-type]
# row-wise access
if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"):
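The agg/transform plumbing changed in this file is what serves list-like and dict-like aggregation and row-wise apply calls such as:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    df.agg({"a": "sum", "b": ["min", "max"]})  # dict-like -> agg_dict_like
    df.agg(["sum", "mean"])                    # list-like -> agg_list_like
    df.apply(lambda row: row["a"] + row["b"], axis=1)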
diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py
index df4407067b131..e800f5ac748ec 100644
--- a/pandas/core/array_algos/replace.py
+++ b/pandas/core/array_algos/replace.py
@@ -45,21 +45,21 @@ def compare_or_regex_search(
a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: np.ndarray
) -> ArrayLike | bool:
"""
- Compare two array-like inputs of the same shape or two scalar values
+ Compare two array_like inputs of the same shape or two scalar values
Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.
Parameters
----------
- a : array-like
+ a : array_like
b : scalar or regex pattern
regex : bool
mask : np.ndarray[bool]
Returns
-------
- mask : array-like of bool
+ mask : array_like of bool
"""
if isna(b):
return ~mask
diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py
index 3909875e5660a..cac9fcd40fa52 100644
--- a/pandas/core/arrays/_ranges.py
+++ b/pandas/core/arrays/_ranges.py
@@ -6,7 +6,6 @@
import numpy as np
-from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
BaseOffset,
OutOfBoundsDatetime,
@@ -104,7 +103,7 @@ def _generate_range_overflow_safe(
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
- i64max = np.uint64(i8max)
+ i64max = np.uint64(np.iinfo(np.int64).max)
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
with np.errstate(over="raise"):
@@ -181,7 +180,7 @@ def _generate_range_overflow_safe_signed(
# error: Incompatible types in assignment (expression has type
# "unsignedinteger[_64Bit]", variable has type "signedinteger[_64Bit]")
result = np.uint64(endpoint) + np.uint64(addend) # type: ignore[assignment]
- i64max = np.uint64(i8max)
+ i64max = np.uint64(np.iinfo(np.int64).max)
assert result > i64max
if result <= i64max + np.uint64(stride):
# error: Incompatible return value type (got "unsignedinteger", expected
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index b362769f50fa8..a6d1986937d2b 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -14,7 +14,6 @@
Any,
Callable,
Iterator,
- Literal,
Sequence,
TypeVar,
cast,
@@ -73,6 +72,7 @@
)
if TYPE_CHECKING:
+ from typing import Literal
class ExtensionArraySupportsAnyAll("ExtensionArray"):
def any(self, *, skipna: bool = True) -> bool:
@@ -826,13 +826,13 @@ def searchsorted(self, value, side="left", sorter=None):
Parameters
----------
- value : array-like, list or scalar
- Value(s) to insert into `self`.
+ value : array_like
+ Values to insert into `self`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
If 'right', return the last such index. If there is no suitable
index, return either 0 or N (where N is the length of `self`).
- sorter : 1-D array-like, optional
+ sorter : 1-D array_like, optional
Optional array of integer indices that sort array a into ascending
order. They are typically the result of argsort.
@@ -1296,10 +1296,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
"""
raise TypeError(f"cannot perform {name} with type {self.dtype}")
- # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
- # Incompatible types in assignment (expression has type "None", base class
- # "object" defined the type as "Callable[[object], int]")
- __hash__: None # type: ignore[assignment]
+ def __hash__(self) -> int:
+ raise TypeError(f"unhashable type: {repr(type(self).__name__)}")
# ------------------------------------------------------------------------
# Non-Optimized Default Methods
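The searchsorted contract documented here mirrors numpy's; for an extension array the call pattern looks like:

    import pandas as pd

    arr = pd.array([1, 2, 4, 4, 6], dtype="Int64")
    arr.searchsorted(4)                # 2: leftmost suitable position
    arr.searchsorted(4, side="right")  # 4
    arr.searchsorted([0, 5])           # array([0, 4])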
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index b46679c2fca18..ecc45357db8c1 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -26,7 +26,6 @@
NaT,
algos as libalgos,
hashtable as htable,
- lib,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import no_default
@@ -37,7 +36,6 @@
Ordered,
Scalar,
Shape,
- npt,
type_t,
)
from pandas.compat.numpy import function as nv
@@ -525,7 +523,6 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
try:
new_cats = np.asarray(self.categories)
new_cats = new_cats.astype(dtype=dtype, copy=copy)
- fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype))
except (
TypeError, # downstream error msg for CategoricalIndex is misleading
ValueError,
@@ -533,9 +530,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
raise ValueError(msg)
- result = take_nd(
- new_cats, ensure_platform_int(self._codes), fill_value=fill_value
- )
+ result = take_nd(new_cats, ensure_platform_int(self._codes))
return result
@@ -1394,14 +1389,17 @@ def map(self, mapper):
# -------------------------------------------------------------
# Validators; ideally these can be de-duplicated
- def _validate_setitem_value(self, value):
- if not is_hashable(value):
- # wrap scalars and hashable-listlikes in list
- return self._validate_listlike(value)
+ def _validate_searchsorted_value(self, value):
+ # searchsorted is very performance sensitive. By converting codes
+ # to same dtype as self.codes, we get much faster performance.
+ if is_scalar(value):
+ codes = self._unbox_scalar(value)
else:
- return self._validate_scalar(value)
-
- _validate_searchsorted_value = _validate_setitem_value
+ locs = [self.categories.get_loc(x) for x in value]
+ # error: Incompatible types in assignment (expression has type
+ # "ndarray", variable has type "int")
+ codes = np.array(locs, dtype=self.codes.dtype) # type: ignore[assignment]
+ return codes
def _validate_scalar(self, fill_value):
"""
@@ -1427,8 +1425,8 @@ def _validate_scalar(self, fill_value):
fill_value = self._unbox_scalar(fill_value)
else:
raise TypeError(
- "Cannot setitem on a Categorical with a new "
- f"category ({fill_value}), set the categories first"
+ f"'fill_value={fill_value}' is not present "
+ "in this Categorical's categories"
)
return fill_value
@@ -2013,14 +2011,13 @@ def __getitem__(self, key):
deprecate_ndim_indexing(result)
return result
- def _validate_listlike(self, value):
- # NB: here we assume scalar-like tuples have already been excluded
+ def _validate_setitem_value(self, value):
value = extract_array(value, extract_numpy=True)
# require identical categories set
if isinstance(value, Categorical):
if not is_dtype_equal(self.dtype, value.dtype):
- raise TypeError(
+ raise ValueError(
"Cannot set a Categorical with another, "
"without identical categories"
)
@@ -2028,23 +2025,26 @@ def _validate_listlike(self, value):
value = self._encode_with_my_categories(value)
return value._codes
+ # wrap scalars and hashable-listlikes in list
+ rvalue = value if not is_hashable(value) else [value]
+
from pandas import Index
# tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
- to_add = Index(value, tupleize_cols=False).difference(self.categories)
+ to_add = Index(rvalue, tupleize_cols=False).difference(self.categories)
# no assignments of values not in categories, but it's always ok to set
# something to np.nan
if len(to_add) and not isna(to_add).all():
- raise TypeError(
+ raise ValueError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
)
- codes = self.categories.get_indexer(value)
+ codes = self.categories.get_indexer(rvalue)
return codes.astype(self._ndarray.dtype, copy=False)
- def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
+ def _reverse_indexer(self) -> dict[Hashable, np.ndarray]:
"""
Compute the inverse of a categorical, returning
a dict of categories -> indexers.
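The setitem validation above enforces that only existing categories (or a missing value) may be assigned; note that the two sides of this hunk raise different exception types for a new category, so the example below hedges by catching both:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "a"], categories=["a", "b"])
    cat[0] = "b"       # ok: existing category
    cat[1] = None      # ok: setting a missing value is always allowed
    try:
        cat[2] = "c"   # "c" is not among the categories
    except (TypeError, ValueError):
        # exception class depends on which side of the hunk is in effect
        pass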
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 8b5bda4629506..08cb12a1373bb 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -9,7 +9,6 @@
TYPE_CHECKING,
Any,
Callable,
- Literal,
Sequence,
TypeVar,
Union,
@@ -122,6 +121,7 @@
from pandas.tseries import frequencies
if TYPE_CHECKING:
+ from typing import Literal
from pandas.core.arrays import (
DatetimeArray,
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 8513bbb044e83..92a906e9fd8b0 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -8,7 +8,6 @@
)
from typing import (
TYPE_CHECKING,
- Literal,
cast,
overload,
)
@@ -82,6 +81,7 @@
)
if TYPE_CHECKING:
+ from typing import Literal
from pandas import DataFrame
from pandas.core.arrays import (
@@ -509,11 +509,6 @@ def _check_compatible_with(self, other, setitem: bool = False):
# Descriptive Properties
def _box_func(self, x) -> Timestamp | NaTType:
- if isinstance(x, np.datetime64):
- # GH#42228
- # Argument 1 to "signedinteger" has incompatible type "datetime64";
- # expected "Union[SupportsInt, Union[str, bytes], SupportsIndex]"
- x = np.int64(x) # type: ignore[arg-type]
ts = Timestamp(x, tz=self.tz)
# Non-overlapping identity check (left operand type: "Timestamp",
# right operand type: "NaTType")
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index dd45029336f63..2318cae004c5a 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -64,7 +64,6 @@
from pandas.core.algorithms import (
isin,
take,
- unique,
value_counts,
)
from pandas.core.arrays.base import (
@@ -1611,29 +1610,6 @@ def _combined(self) -> ArrayLike:
comb = np.concatenate([left, right], axis=1)
return comb
- def _from_combined(self, combined: np.ndarray) -> IntervalArray:
- """
- Create a new IntervalArray with our dtype from a 1D complex128 ndarray.
- """
- nc = combined.view("i8").reshape(-1, 2)
-
- dtype = self._left.dtype
- if needs_i8_conversion(dtype):
- new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype)
- new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype)
- else:
- new_left = nc[:, 0].view(dtype)
- new_right = nc[:, 1].view(dtype)
- return self._shallow_copy(left=new_left, right=new_right)
-
- def unique(self) -> IntervalArray:
- # Invalid index type "Tuple[slice, int]" for "Union[ExtensionArray,
- # ndarray[Any, Any]]"; expected type "Union[int, integer[Any], slice,
- # Sequence[int], ndarray[Any, Any]]"
- nc = unique(self._combined.view("complex128")[:, 0]) # type: ignore[index]
- nc = nc[:, None]
- return self._from_combined(nc)
-
def _maybe_convert_platform_interval(values) -> ArrayLike:
"""
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 3a152bd5889b7..d274501143916 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -403,21 +403,15 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]
from pandas.core.arrays import BooleanArray
- # algorithms.isin will eventually convert values to an ndarray, so no extra
- # cost to doing it here first
- values_arr = np.asarray(values)
- result = isin(self._data, values_arr)
-
+ result = isin(self._data, values)
if self._hasna:
- values_have_NA = is_object_dtype(values_arr.dtype) and any(
- val is self.dtype.na_value for val in values_arr
- )
-
- # For now, NA does not propagate so set result according to presence of NA,
- # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
- result[self._mask] = values_have_NA
-
- mask = np.zeros_like(self, dtype=bool)
+ if libmissing.NA in values:
+ result += self._mask
+ else:
+ result *= np.invert(self._mask)
+ # error: No overload variant of "zeros_like" matches argument types
+ # "BaseMaskedArray", "Type[bool]"
+ mask = np.zeros_like(self, dtype=bool) # type: ignore[call-overload]
return BooleanArray(result, mask, copy=False)
def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
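Both variants of isin shown here agree on the common cases: positions holding NA come out True exactly when pd.NA is among the lookup values. A small illustration, assuming a nullable integer array:

    import pandas as pd

    arr = pd.array([1, 2, pd.NA], dtype="Int64")
    arr.isin([2])         # [False, True, False]
    arr.isin([2, pd.NA])  # the NA slot becomes True because pd.NA is a value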
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 471ee295ebd2f..04db06ee9fb66 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -341,9 +341,7 @@ def freq(self) -> BaseOffset:
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
if dtype == "i8":
return self.asi8
- # error: Non-overlapping equality check (left operand type: "Optional[Union[str,
- # dtype[Any]]]", right operand type: "Type[bool]")
- elif dtype == bool: # type: ignore[comparison-overlap]
+ elif dtype == bool:
return ~self._isnan
# This will raise TypeError for non-object dtypes
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index b1cfcbd69a30b..7d3917203d7b6 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -226,7 +226,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray):
Parameters
----------
- data : array-like or scalar
+ data : array-like
A dense array of values to store in the SparseArray. This may contain
`fill_value`.
sparse_index : SparseIndex, optional
@@ -1448,7 +1448,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
)
- result = getattr(ufunc, method)(*(np.asarray(x) for x in inputs), **kwargs)
+ result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs)
if out:
if len(out) == 1:
out = out[0]
@@ -1463,7 +1463,11 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
return type(self)(result)
def __abs__(self):
- return np.abs(self)
+ # error: Argument 1 to "__call__" of "ufunc" has incompatible type
+ # "SparseArray"; expected "Union[Union[int, float, complex, str, bytes,
+ # generic], Sequence[Union[int, float, complex, str, bytes, generic]],
+ # Sequence[Sequence[Any]], _SupportsArray]"
+ return np.abs(self) # type: ignore[arg-type]
# ------------------------------------------------------------------------
# Ops
diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py
index f399d3230d897..7ebda1f17ba56 100644
--- a/pandas/core/arrays/sparse/scipy_sparse.py
+++ b/pandas/core/arrays/sparse/scipy_sparse.py
@@ -58,7 +58,7 @@ def _get_label_to_i_dict(labels, sort_labels=False):
return {k: i for i, k in enumerate(labels)}
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
- ilabels = list(zip(*(index._get_level_values(i) for i in subset)))
+ ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels)
labels_to_i = Series(labels_to_i)
if len(subset) > 1:
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 4d380c6831071..ae7e1a1062cfb 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -10,10 +10,8 @@
Any,
Generic,
Hashable,
- Literal,
TypeVar,
cast,
- final,
)
import numpy as np
@@ -21,11 +19,12 @@
import pandas._libs.lib as lib
from pandas._typing import (
ArrayLike,
+ Dtype,
DtypeObj,
FrameOrSeries,
IndexLabel,
Shape,
- npt,
+ final,
)
from pandas.compat import PYPY
from pandas.compat.numpy import function as nv
@@ -65,6 +64,7 @@
import pandas.core.nanops as nanops
if TYPE_CHECKING:
+ from typing import Literal
from pandas import Categorical
@@ -411,7 +411,7 @@ def array(self) -> ExtensionArray:
def to_numpy(
self,
- dtype: npt.DTypeLike | None = None,
+ dtype: Dtype | None = None,
copy: bool = False,
na_value=lib.no_default,
**kwargs,
@@ -510,16 +510,8 @@ def to_numpy(
"""
if is_extension_array_dtype(self.dtype):
# error: Too many arguments for "to_numpy" of "ExtensionArray"
-
- # error: Argument 1 to "to_numpy" of "ExtensionArray" has incompatible type
- # "Optional[Union[dtype[Any], None, type, _SupportsDType[dtype[Any]], str,
- # Union[Tuple[Any, int], Tuple[Any, Union[SupportsIndex,
- # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]";
- # expected "Optional[Union[ExtensionDtype, Union[str, dtype[Any]],
- # Type[str], Type[float], Type[int], Type[complex], Type[bool],
- # Type[object]]]"
return self.array.to_numpy( # type: ignore[call-arg]
- dtype, copy=copy, na_value=na_value, **kwargs # type: ignore[arg-type]
+ dtype, copy=copy, na_value=na_value, **kwargs
)
elif kwargs:
bad_keys = list(kwargs.keys())[0]
@@ -527,7 +519,12 @@ def to_numpy(
f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
)
- result = np.asarray(self._values, dtype=dtype)
+ # error: Argument "dtype" to "asarray" has incompatible type
+ # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], Type[int],
+ # Type[complex], Type[bool], Type[object], None]"; expected "Union[dtype[Any],
+ # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
+ # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
+ result = np.asarray(self._values, dtype=dtype) # type: ignore[arg-type]
# TODO(GH-24345): Avoid potential double copy
if copy or na_value is not lib.no_default:
result = result.copy()
@@ -1094,7 +1091,6 @@ def _memory_usage(self, deep: bool = False) -> int:
are not components of the array if deep=False or if used on PyPy
"""
if hasattr(self.array, "memory_usage"):
- # https://github.com/python/mypy/issues/1424
# error: "ExtensionArray" has no attribute "memory_usage"
return self.array.memory_usage(deep=deep) # type: ignore[attr-defined]
@@ -1137,13 +1133,13 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
Parameters
----------
- value : array-like or scalar
+ value : array_like
Values to insert into `self`.
side : {{'left', 'right'}}, optional
If 'left', the index of the first suitable location found is given.
If 'right', return the last such index. If there is no suitable
index, return either 0 or N (where N is the length of `self`).
- sorter : 1-D array-like, optional
+ sorter : 1-D array_like, optional
Optional array of integer indices that sort `self` into ascending
order. They are typically the result of ``np.argsort``.
diff --git a/pandas/core/common.py b/pandas/core/common.py
index b32614577393d..183607ebb489d 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -21,7 +21,6 @@
Iterable,
Iterator,
cast,
- overload,
)
import warnings
@@ -30,12 +29,11 @@
from pandas._libs import lib
from pandas._typing import (
AnyArrayLike,
- ArrayLike,
NpDtype,
- RandomState,
Scalar,
T,
)
+from pandas.compat import np_version_under1p18
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
@@ -145,11 +143,7 @@ def is_bool_indexer(key: Any) -> bool:
return True
elif isinstance(key, list):
# check if np.array(key).dtype would be bool
- if len(key) > 0:
- if type(key) is not list:
- # GH#42461 cython will raise TypeError if we pass a subclass
- key = list(key)
- return lib.is_bool_list(key)
+ return len(key) > 0 and lib.is_bool_list(key)
return False
@@ -394,70 +388,44 @@ def standardize_mapping(into):
return into
-@overload
-def random_state(state: np.random.Generator) -> np.random.Generator:
- ...
-
-
-@overload
-def random_state(
- state: int | ArrayLike | np.random.BitGenerator | np.random.RandomState | None,
-) -> np.random.RandomState:
- ...
-
-
-def random_state(state: RandomState | None = None):
+def random_state(state=None):
"""
Helper function for processing random_state arguments.
Parameters
----------
- state : int, array-like, BitGenerator, Generator, np.random.RandomState, None.
+ state : int, array-like, BitGenerator (NumPy>=1.17), np.random.RandomState, None.
If receives an int, array-like, or BitGenerator, passes to
np.random.RandomState() as seed.
- If receives an np.random RandomState or Generator, just returns that unchanged.
+ If receives an np.random.RandomState object, just returns object.
If receives `None`, returns np.random.
If receives anything else, raises an informative ValueError.
.. versionchanged:: 1.1.0
- array-like and BitGenerator object now passed to np.random.RandomState()
- as seed
+ array-like and BitGenerator (for NumPy>=1.18) object now passed to
+ np.random.RandomState() as seed
Default None.
Returns
-------
- np.random.RandomState or np.random.Generator. If state is None, returns np.random
+ np.random.RandomState
"""
if (
is_integer(state)
or is_array_like(state)
- or isinstance(state, np.random.BitGenerator)
+ or (not np_version_under1p18 and isinstance(state, np.random.BitGenerator))
):
- # error: Argument 1 to "RandomState" has incompatible type "Optional[Union[int,
- # Union[ExtensionArray, ndarray[Any, Any]], Generator, RandomState]]"; expected
- # "Union[None, Union[Union[_SupportsArray[dtype[Union[bool_, integer[Any]]]],
- # Sequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]],
- # Sequence[Sequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]]],
- # Sequence[Sequence[Sequence[_SupportsArray[dtype[Union[bool_,
- # integer[Any]]]]]]],
- # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[Union[bool_,
- # integer[Any]]]]]]]]], Union[bool, int, Sequence[Union[bool, int]],
- # Sequence[Sequence[Union[bool, int]]], Sequence[Sequence[Sequence[Union[bool,
- # int]]]], Sequence[Sequence[Sequence[Sequence[Union[bool, int]]]]]]],
- # BitGenerator]"
- return np.random.RandomState(state) # type: ignore[arg-type]
+ return np.random.RandomState(state)
elif isinstance(state, np.random.RandomState):
return state
- elif isinstance(state, np.random.Generator):
- return state
elif state is None:
return np.random
else:
raise ValueError(
- "random_state must be an integer, array-like, a BitGenerator, Generator, "
+ "random_state must be an integer, array-like, a BitGenerator, "
"a numpy RandomState, or None"
)
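random_state above is the helper behind the random_state argument of the sampling APIs; either form below is accepted on both sides of this hunk (Generator support is what differs between them):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": range(10)})
    df.sample(3, random_state=42)                          # int seed
    df.sample(3, random_state=np.random.RandomState(42))   # RandomState object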
diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py
index ec3548c9efc6c..62732402dbeea 100644
--- a/pandas/core/computation/engines.py
+++ b/pandas/core/computation/engines.py
@@ -37,7 +37,7 @@ def _check_ne_builtin_clash(expr: Expr) -> None:
overlap = names & _ne_builtins
if overlap:
- s = ", ".join([repr(x) for x in overlap])
+ s = ", ".join(repr(x) for x in overlap)
raise NumExprClobberingError(
f'Variables in expression "{expr}" overlap with builtins: ({s})'
)
diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py
index 5e000116d19f2..b0f817d2c1ff3 100644
--- a/pandas/core/computation/parsing.py
+++ b/pandas/core/computation/parsing.py
@@ -57,7 +57,7 @@ def create_valid_python_identifier(name: str) -> str:
}
)
- name = "".join([special_characters_replacements.get(char, char) for char in name])
+ name = "".join(special_characters_replacements.get(char, char) for char in name)
name = "BACKTICK_QUOTED_STRING_" + name
if not name.isidentifier():
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index ad76a76a954b1..f733a5c43dfb3 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -226,7 +226,11 @@ def stringify(value):
if v not in metadata:
result = -1
else:
- result = metadata.searchsorted(v, side="left")
+ # error: Incompatible types in assignment (expression has type
+ # "Union[Any, ndarray]", variable has type "int")
+ result = metadata.searchsorted( # type: ignore[assignment]
+ v, side="left"
+ )
return TermValue(result, result, "integer")
elif kind == "integer":
v = int(float(v))
@@ -575,7 +579,7 @@ def __init__(
else:
w = _validate_where(w)
where[idx] = w
- _where = " & ".join([f"({w})" for w in com.flatten(where)])
+ _where = " & ".join(f"({w})" for w in com.flatten(where))
else:
# _validate_where ensures we otherwise have a string
_where = where
diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py
index 426cd8fd81f28..09067e7eba6e5 100644
--- a/pandas/core/computation/scope.py
+++ b/pandas/core/computation/scope.py
@@ -50,7 +50,7 @@ def _raw_hex_id(obj) -> str:
"""Return the padded hexadecimal id of ``obj``."""
# interpret as a pointer since that's really what id returns
packed = struct.pack("@P", id(obj))
- return "".join([_replacer(x) for x in packed])
+ return "".join(_replacer(x) for x in packed)
DEFAULT_GLOBALS = {
diff --git a/pandas/core/describe.py b/pandas/core/describe.py
index fd45da4a3ccc7..dfb18b2c40698 100644
--- a/pandas/core/describe.py
+++ b/pandas/core/describe.py
@@ -11,9 +11,7 @@
)
from typing import (
TYPE_CHECKING,
- Any,
Callable,
- Hashable,
Sequence,
cast,
)
@@ -22,7 +20,11 @@
import numpy as np
from pandas._libs.tslibs import Timestamp
-from pandas._typing import FrameOrSeries
+from pandas._typing import (
+ FrameOrSeries,
+ FrameOrSeriesUnion,
+ Hashable,
+)
from pandas.util._validators import validate_percentile
from pandas.core.dtypes.common import (
@@ -49,7 +51,7 @@ def describe_ndframe(
include: str | Sequence[str] | None,
exclude: str | Sequence[str] | None,
datetime_is_numeric: bool,
- percentiles: Sequence[float] | np.ndarray | None,
+ percentiles: Sequence[float] | None,
) -> FrameOrSeries:
"""Describe series or dataframe.
@@ -105,12 +107,12 @@ class NDFrameDescriberAbstract(ABC):
Whether to treat datetime dtypes as numeric.
"""
- def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool):
+ def __init__(self, obj: FrameOrSeriesUnion, datetime_is_numeric: bool):
self.obj = obj
self.datetime_is_numeric = datetime_is_numeric
@abstractmethod
- def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
+ def describe(self, percentiles: Sequence[float]) -> FrameOrSeriesUnion:
"""Do describe either series or dataframe.
Parameters
@@ -125,7 +127,7 @@ class SeriesDescriber(NDFrameDescriberAbstract):
obj: Series
- def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
+ def describe(self, percentiles: Sequence[float]) -> Series:
describe_func = select_describe_func(
self.obj,
self.datetime_is_numeric,
@@ -164,7 +166,7 @@ def __init__(
super().__init__(obj, datetime_is_numeric=datetime_is_numeric)
- def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
+ def describe(self, percentiles: Sequence[float]) -> DataFrame:
data = self._select_data()
ldesc: list[Series] = []
@@ -386,11 +388,8 @@ def select_describe_func(
return describe_categorical_1d
-def refine_percentiles(
- percentiles: Sequence[float] | np.ndarray | None,
-) -> np.ndarray[Any, np.dtype[np.float64]]:
- """
- Ensure that percentiles are unique and sorted.
+def refine_percentiles(percentiles: Sequence[float] | None) -> Sequence[float]:
+ """Ensure that percentiles are unique and sorted.
Parameters
----------
@@ -398,7 +397,9 @@ def refine_percentiles(
The percentiles to include in the output.
"""
if percentiles is None:
- return np.array([0.25, 0.5, 0.75])
+ # error: Incompatible return value type (got "ndarray", expected
+ # "Sequence[float]")
+ return np.array([0.25, 0.5, 0.75]) # type: ignore[return-value]
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)
@@ -410,7 +411,9 @@ def refine_percentiles(
if 0.5 not in percentiles:
percentiles.append(0.5)
- percentiles = np.asarray(percentiles)
+ # error: Incompatible types in assignment (expression has type "ndarray", variable
+ # has type "Optional[Sequence[float]]")
+ percentiles = np.asarray(percentiles) # type: ignore[assignment]
# sort and check for duplicates
unique_pcts = np.unique(percentiles)
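(Aside.) For readers skimming the ``describe.py`` hunks, this is roughly what ``refine_percentiles`` does after the retyping above; the helper name and the omission of the bounds check (``validate_percentile``) are simplifications of mine, not pandas API:

import numpy as np

def refine_percentiles_sketch(percentiles=None):
    if percentiles is None:
        # default quartiles
        return np.array([0.25, 0.5, 0.75])
    percentiles = list(percentiles)
    if 0.5 not in percentiles:
        # describe() always reports the median
        percentiles.append(0.5)
    unique = np.unique(percentiles)  # sorted and de-duplicated
    if len(unique) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    return unique

refine_percentiles_sketch([0.9, 0.1])  # array([0.1, 0.5, 0.9])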
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index 5b7dadac5d914..e52b318c0b4f7 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -12,7 +12,6 @@
import numpy as np
-from pandas._libs.hashtable import object_hash
from pandas._typing import (
DtypeObj,
type_t,
@@ -129,9 +128,7 @@ def __eq__(self, other: Any) -> bool:
return False
def __hash__(self) -> int:
- # for python>=3.10, different nan objects have different hashes
- # we need to avoid that und thus use hash function with old behavior
- return object_hash(tuple(getattr(self, attr) for attr in self._metadata))
+ return hash(tuple(getattr(self, attr) for attr in self._metadata))
def __ne__(self, other: Any) -> bool:
return not self.__eq__(other)
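(Aside.) The ``__hash__`` hunk trades the identity-stable ``object_hash`` for the builtin ``hash`` of the ``_metadata`` tuple. The NaN caveat mentioned in the removed comment is easy to reproduce on Python >= 3.10, where NaN hashes are derived from object identity:

a = float("nan")
b = float("nan")
# True on Python < 3.10 (both hash to 0); usually False on >= 3.10.
print(hash(a) == hash(b))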
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 4f4276ceddcf9..433d45d94167d 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -15,7 +15,6 @@
TYPE_CHECKING,
Any,
Sized,
- TypeVar,
cast,
overload,
)
@@ -58,6 +57,7 @@
is_complex_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float,
@@ -92,6 +92,7 @@
)
if TYPE_CHECKING:
+ from typing import Literal
from pandas.core.arrays import (
DatetimeArray,
@@ -106,8 +107,6 @@
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max
-NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
-
def maybe_convert_platform(
values: list | tuple | range | np.ndarray | ExtensionArray,
@@ -180,7 +179,9 @@ def maybe_box_native(value: Scalar) -> Scalar:
-------
scalar or Series
"""
- if is_float(value):
+ if is_datetime_or_timedelta_dtype(value):
+ value = maybe_box_datetimelike(value)
+ elif is_float(value):
# error: Argument 1 to "float" has incompatible type
# "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]";
# expected "Union[SupportsFloat, _SupportsIndex, str]"
@@ -192,8 +193,6 @@ def maybe_box_native(value: Scalar) -> Scalar:
value = int(value) # type: ignore[arg-type]
elif is_bool(value):
value = bool(value)
- elif isinstance(value, (np.datetime64, np.timedelta64)):
- value = maybe_box_datetimelike(value)
return value
@@ -660,10 +659,7 @@ def _ensure_dtype_type(value, dtype: np.dtype):
object
"""
# Start with exceptions in which we do _not_ cast to numpy types
-
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[object_]")
- if dtype == np.object_: # type: ignore[comparison-overlap]
+ if dtype == np.object_:
return value
# Note: before we get here we have already excluded isna(value)
@@ -779,21 +775,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj,
return dtype, val
-def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
- """
- Convert datetimelike-keyed dicts to a Timestamp-keyed dict.
-
- Parameters
- ----------
- d: dict-like object
-
- Returns
- -------
- dict
- """
- return {maybe_box_datetimelike(key): value for key, value in d.items()}
-
-
def infer_dtype_from_array(
arr, pandas_dtype: bool = False
) -> tuple[DtypeObj, ArrayLike]:
@@ -885,10 +866,10 @@ def maybe_infer_dtype_type(element):
def maybe_upcast(
- values: NumpyArrayT,
+ values: np.ndarray,
fill_value: Scalar = np.nan,
copy: bool = False,
-) -> tuple[NumpyArrayT, Scalar]:
+) -> tuple[np.ndarray, Scalar]:
"""
Provide explicit type promotion and coercion.
@@ -1092,11 +1073,14 @@ def astype_nansafe(
The dtype was a datetime64/timedelta64 dtype, but it had no unit.
"""
if arr.ndim > 1:
- flat = arr.ravel()
+ # Make sure we are doing non-copy ravel and reshape.
+ flags = arr.flags
+ flat = arr.ravel("K")
result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna)
+ order: Literal["C", "F"] = "F" if flags.f_contiguous else "C"
# error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no
# attribute "reshape"
- return result.reshape(arr.shape) # type: ignore[union-attr]
+ return result.reshape(arr.shape, order=order) # type: ignore[union-attr]
# We get here with 0-dim from sparse
arr = np.atleast_1d(arr)
@@ -1109,10 +1093,7 @@ def astype_nansafe(
raise ValueError("dtype must be np.dtype or ExtensionDtype")
if arr.dtype.kind in ["m", "M"] and (
- issubclass(dtype.type, str)
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[object]")
- or dtype == object # type: ignore[comparison-overlap]
+ issubclass(dtype.type, str) or dtype == object
):
from pandas.core.construction import ensure_wrapped_if_datetimelike
@@ -1123,9 +1104,7 @@ def astype_nansafe(
return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
elif is_datetime64_dtype(arr):
- # Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- if dtype == np.int64: # type: ignore[comparison-overlap]
+ if dtype == np.int64:
warnings.warn(
f"casting {arr.dtype} values to int64 with .astype(...) "
"is deprecated and will raise in a future version. "
@@ -1145,9 +1124,7 @@ def astype_nansafe(
raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")
elif is_timedelta64_dtype(arr):
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- if dtype == np.int64: # type: ignore[comparison-overlap]
+ if dtype == np.int64:
warnings.warn(
f"casting {arr.dtype} values to int64 with .astype(...) "
"is deprecated and will raise in a future version. "
@@ -1421,9 +1398,10 @@ def convert_dtypes(
if is_string_dtype(inferred_dtype):
if not convert_string:
- return input_array.dtype
+ inferred_dtype = input_array.dtype
else:
- return pandas_dtype("string")
+ inferred_dtype = pandas_dtype("string")
+ return inferred_dtype
if convert_integer:
target_int_dtype = pandas_dtype("Int64")
@@ -1476,9 +1454,7 @@ def convert_dtypes(
else:
return input_array.dtype
- # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
- # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
- return inferred_dtype # type: ignore[return-value]
+ return inferred_dtype
def maybe_infer_to_datetimelike(
@@ -1855,9 +1831,7 @@ def construct_2d_arraylike_from_scalar(
if dtype.kind in ["m", "M"]:
value = maybe_unbox_datetimelike_tz_deprecation(value, dtype, stacklevel=4)
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[object]")
- elif dtype == object: # type: ignore[comparison-overlap]
+ elif dtype == object:
if isinstance(value, (np.timedelta64, np.datetime64)):
# calling np.array below would cast to pytimedelta/pydatetime
out = np.empty(shape, dtype=object)
@@ -2232,9 +2206,7 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
return tipo.kind == "b"
return lib.is_bool(element)
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[object]")
- elif dtype == object: # type: ignore[comparison-overlap]
+ elif dtype == object:
return True
elif dtype.kind == "S":
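(Aside.) The ``astype_nansafe`` hunk relies on ``ravel("K")`` being copy-free for contiguous input and on reshaping back with the matching memory order. A small NumPy check of that assumption:

import numpy as np

arr = np.asfortranarray(np.arange(6).reshape(2, 3))
flat = arr.ravel("K")                      # memory-order ravel, no copy here
assert np.shares_memory(flat, arr)
order = "F" if arr.flags.f_contiguous else "C"
assert (flat.reshape(arr.shape, order=order) == arr).all()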
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 08287cc296006..34b9a3f1f14ad 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -20,6 +20,7 @@
from pandas._typing import (
ArrayLike,
DtypeObj,
+ Optional,
)
from pandas.core.dtypes.base import _registry as registry
@@ -162,7 +163,7 @@ def is_object_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -316,7 +317,7 @@ def is_datetime64_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -349,7 +350,7 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -390,7 +391,7 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -424,7 +425,7 @@ def is_period_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -460,7 +461,7 @@ def is_interval_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -498,7 +499,7 @@ def is_categorical_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype to check.
Returns
@@ -534,7 +535,7 @@ def is_string_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -635,7 +636,7 @@ def is_any_int_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -680,7 +681,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -732,7 +733,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -784,7 +785,7 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -828,7 +829,7 @@ def is_int64_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -878,7 +879,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -920,7 +921,7 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -970,7 +971,7 @@ def is_timedelta64_ns_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -999,7 +1000,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -1039,7 +1040,7 @@ def is_numeric_v_string_like(a: ArrayLike, b):
Parameters
----------
- a : array-like, scalar
+ a : array-like
The first object to check.
b : array-like, scalar
The second object to check.
@@ -1146,7 +1147,7 @@ def needs_i8_conversion(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -1190,7 +1191,7 @@ def is_numeric_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -1234,7 +1235,7 @@ def is_float_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -1266,7 +1267,7 @@ def is_bool_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -1337,7 +1338,7 @@ def is_extension_type(arr) -> bool:
Parameters
----------
- arr : array-like, scalar
+ arr : array-like
The array-like to check.
Returns
@@ -1405,7 +1406,7 @@ def is_1d_only_ea_obj(obj: Any) -> bool:
)
-def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool:
+def is_1d_only_ea_dtype(dtype: Optional[DtypeObj]) -> bool:
"""
Analogue to is_extension_array_dtype but excluding DatetimeTZDtype.
"""
@@ -1470,7 +1471,7 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
return registry.find(dtype) is not None
-def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool:
+def is_ea_or_datetimelike_dtype(dtype: Optional[DtypeObj]) -> bool:
"""
Check for ExtensionDtype, datetime64 dtype, or timedelta64 dtype.
@@ -1489,7 +1490,7 @@ def is_complex_dtype(arr_or_dtype) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array or dtype to check.
Returns
@@ -1546,7 +1547,7 @@ def get_dtype(arr_or_dtype) -> DtypeObj:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype object whose dtype we want to extract.
Returns
@@ -1580,7 +1581,7 @@ def _is_dtype_type(arr_or_dtype, condition) -> bool:
Parameters
----------
- arr_or_dtype : array-like or dtype
+ arr_or_dtype : array-like
The array-like or dtype object whose dtype we want to extract.
condition : callable[Union[np.dtype, ExtensionDtypeType]]
@@ -1714,7 +1715,7 @@ def _validate_date_like_dtype(dtype) -> None:
)
-def validate_all_hashable(*args, error_name: str | None = None) -> None:
+def validate_all_hashable(*args, error_name: Optional[str] = None) -> None:
"""
Return None if all args are hashable, else raise a TypeError.
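(Aside.) The docstring edits above narrow "array-like or dtype" to "array-like", but behaviourally these introspection helpers accept both forms, e.g. via the public ``pandas.api.types`` re-exports:

import numpy as np
import pandas as pd

assert pd.api.types.is_datetime64_dtype(np.dtype("datetime64[ns]"))                  # a dtype
assert pd.api.types.is_datetime64_dtype(pd.Series(pd.to_datetime(["2021-01-01"])))   # an array-like
assert not pd.api.types.is_integer_dtype("float64")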
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index d54a3047a3ab9..91b9bdd564676 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -27,7 +27,6 @@
Hashable,
Iterable,
Iterator,
- Literal,
Sequence,
cast,
overload,
@@ -59,19 +58,18 @@
FillnaOptions,
FloatFormatType,
FormattersType,
+ FrameOrSeriesUnion,
Frequency,
IndexKeyFunc,
IndexLabel,
Level,
+ NpDtype,
PythonFuncType,
Renamer,
Scalar,
StorageOptions,
Suffixes,
- TimedeltaConvertibleTypes,
- TimestampConvertibleTypes,
ValueKeyFunc,
- npt,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
@@ -210,6 +208,12 @@
import pandas.plotting
if TYPE_CHECKING:
+ from typing import Literal
+
+ from pandas._typing import (
+ TimedeltaConvertibleTypes,
+ TimestampConvertibleTypes,
+ )
from pandas.core.groupby.generic import DataFrameGroupBy
from pandas.core.resample import Resampler
@@ -643,7 +647,7 @@ def __init__(
elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
# i.e. numpy structured array
- data = cast(np.ndarray, data)
+
mgr = rec_array_to_mgr(
data,
index,
@@ -1358,7 +1362,7 @@ def dot(self, other: Series) -> Series:
def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
...
- def dot(self, other: AnyArrayLike | DataFrame | Series) -> DataFrame | Series:
+ def dot(self, other: AnyArrayLike | FrameOrSeriesUnion) -> FrameOrSeriesUnion:
"""
Compute the matrix multiplication between the DataFrame and other.
@@ -1474,13 +1478,13 @@ def __matmul__(self, other: Series) -> Series:
@overload
def __matmul__(
- self, other: AnyArrayLike | DataFrame | Series
- ) -> DataFrame | Series:
+ self, other: AnyArrayLike | FrameOrSeriesUnion
+ ) -> FrameOrSeriesUnion:
...
def __matmul__(
- self, other: AnyArrayLike | DataFrame | Series
- ) -> DataFrame | Series:
+ self, other: AnyArrayLike | FrameOrSeriesUnion
+ ) -> FrameOrSeriesUnion:
"""
Matrix multiplication using binary `@` operator in Python>=3.5.
"""
@@ -1589,7 +1593,7 @@ def from_dict(
def to_numpy(
self,
- dtype: npt.DTypeLike | None = None,
+ dtype: NpDtype | None = None,
copy: bool = False,
na_value=lib.no_default,
) -> np.ndarray:
@@ -2275,7 +2279,9 @@ def to_records(
if dtype_mapping is None:
formats.append(v.dtype)
elif isinstance(dtype_mapping, (type, np.dtype, str)):
- formats.append(dtype_mapping)
+ # error: Argument 1 to "append" of "list" has incompatible type
+ # "Union[type, dtype, str]"; expected "dtype"
+ formats.append(dtype_mapping) # type: ignore[arg-type]
else:
element = "row" if i < index_len else "column"
msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
@@ -2537,7 +2543,8 @@ def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None:
| 0 | elk | dog |
+----+------------+------------+
| 1 | pig | quetzal |
- +----+------------+------------+""",
+ +----+------------+------------+
+ """,
)
def to_markdown(
self,
@@ -3338,8 +3345,8 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:
values = self.values
new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
- result = type(self)._from_arrays(
- new_values, index=self.columns, columns=self.index
+ result = self._constructor(
+ dict(zip(self.index, new_values)), index=self.columns
)
else:
@@ -3452,7 +3459,7 @@ def __getitem__(self, key):
else:
if is_iterator(key):
key = list(key)
- indexer = self.columns._get_indexer_strict(key, "columns")[1]
+ indexer = self.loc._get_listlike_indexer(key, axis=1)[1]
# take() does not accept boolean indexers
if getattr(indexer, "dtype", None) == bool:
@@ -3546,11 +3553,6 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar:
Returns
-------
scalar
-
- Notes
- -----
- Assumes that both `self.index._index_as_unique` and
- `self.columns._index_as_unique`; Caller is responsible for checking.
"""
if takeable:
series = self._ixs(col, axis=1)
@@ -3559,17 +3561,20 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar:
series = self._get_item_cache(col)
engine = self.index._engine
- if not isinstance(self.index, MultiIndex):
- # CategoricalIndex: Trying to use the engine fastpath may give incorrect
- # results if our categories are integers that dont match our codes
- # IntervalIndex: IntervalTree has no get_loc
- row = self.index.get_loc(index)
- return series._values[row]
+ try:
+ loc = engine.get_loc(index)
+ return series._values[loc]
+ except KeyError:
+ # GH 20629
+ if self.index.nlevels > 1:
+ # partial indexing forbidden
+ raise
- # For MultiIndex going through engine effectively restricts us to
- # same-length tuples; see test_get_set_value_no_partial_indexing
- loc = engine.get_loc(index)
- return series._values[loc]
+ # we cannot handle direct indexing
+ # use positional
+ col = self.columns.get_loc(col)
+ index = self.index.get_loc(index)
+ return self._get_value(index, col, takeable=True)
def __setitem__(self, key, value):
key = com.apply_if_callable(key, self)
@@ -3804,7 +3809,8 @@ def _set_value(
return
series = self._get_item_cache(col)
- loc = self.index.get_loc(index)
+ engine = self.index._engine
+ loc = engine.get_loc(index)
validate_numeric_casting(series.dtype, value)
series._values[loc] = value
@@ -5274,28 +5280,45 @@ def shift(
axis = self._get_axis_number(axis)
ncols = len(self.columns)
- if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0:
- # We will infer fill_value to match the closest column
- # Use a column that we know is valid for our column's dtype GH#38434
- label = self.columns[0]
+ if (
+ axis == 1
+ and periods != 0
+ and ncols > 0
+ and (fill_value is lib.no_default or len(self._mgr.arrays) > 1)
+ ):
+ # Exclude single-array-with-fill_value case so we issue a FutureWarning
+ # if an integer is passed with datetimelike dtype GH#31971
+ from pandas import concat
+
+ # tail: the data that is still in our shifted DataFrame
+ if periods > 0:
+ tail = self.iloc[:, :-periods]
+ else:
+ tail = self.iloc[:, -periods:]
+ # pin a simple Index to avoid costly casting
+ tail.columns = range(len(tail.columns))
+
+ if fill_value is not lib.no_default:
+ # GH#35488
+ # TODO(EA2D): with 2D EAs we could construct other directly
+ ser = Series(fill_value, index=self.index)
+ else:
+ # We infer fill_value to match the closest column
+ if periods > 0:
+ ser = self.iloc[:, 0].shift(len(self))
+ else:
+ ser = self.iloc[:, -1].shift(len(self))
+
+ width = min(abs(periods), ncols)
+ other = concat([ser] * width, axis=1)
if periods > 0:
- result = self.iloc[:, :-periods]
- for col in range(min(ncols, abs(periods))):
- # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, 0].shift(len(self))
- result.insert(0, label, filler, allow_duplicates=True)
+ result = concat([other, tail], axis=1)
else:
- result = self.iloc[:, -periods:]
- for col in range(min(ncols, abs(periods))):
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, -1].shift(len(self))
- result.insert(
- len(result.columns), label, filler, allow_duplicates=True
- )
+ result = concat([tail, other], axis=1)
+ result = cast(DataFrame, result)
result.columns = self.columns.copy()
return result
@@ -6158,10 +6181,7 @@ def f(vals) -> tuple[np.ndarray, int]:
return labels.astype("i8", copy=False), len(shape)
if subset is None:
- # https://github.com/pandas-dev/pandas/issues/28770
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "Sequence[Any]")
- subset = self.columns # type: ignore[assignment]
+ subset = self.columns
elif (
not np.iterable(subset)
or isinstance(subset, str)
@@ -6171,7 +6191,7 @@ def f(vals) -> tuple[np.ndarray, int]:
subset = (subset,)
# needed for mypy since can't narrow types using np.iterable
- subset = cast(Sequence, subset)
+ subset = cast(Iterable, subset)
# Verify all columns in subset exist in the queried dataframe
# Otherwise, raise a KeyError, same as if you try to __getitem__ with a
@@ -6238,7 +6258,6 @@ def sort_values( # type: ignore[override]
keys, orders=ascending, na_position=na_position, key=key
)
elif len(by):
- # len(by) == 1
by = by[0]
k = self._get_label_or_level_values(by, axis=axis)
@@ -6720,16 +6739,23 @@ def nsmallest(self, n, columns, keep: str = "first") -> DataFrame:
self, n=n, keep=keep, columns=columns
).nsmallest()
- @doc(
- Series.swaplevel,
- klass=_shared_doc_kwargs["klass"],
- extra_params=dedent(
- """axis : {0 or 'index', 1 or 'columns'}, default 0
+ def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
+ """
+ Swap levels i and j in a MultiIndex on a particular axis.
+
+ Parameters
+ ----------
+ i, j : int or str
+ Levels of the indices to be swapped. Can pass level name as string.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
The axis to swap levels on. 0 or 'index' for row-wise, 1 or
- 'columns' for column-wise."""
- ),
- examples=dedent(
- """Examples
+ 'columns' for column-wise.
+
+ Returns
+ -------
+ DataFrame
+
+ Examples
--------
>>> df = pd.DataFrame(
... {"Grade": ["A", "B", "A", "C"]},
@@ -6778,10 +6804,8 @@ def nsmallest(self, n, columns, keep: str = "first") -> DataFrame:
History Final exam January A
Geography Final exam February B
History Coursework March A
- Geography Coursework April C"""
- ),
- )
- def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
+ Geography Coursework April C
+ """
result = self.copy()
axis = self._get_axis_number(axis)
@@ -7611,7 +7635,6 @@ def groupby(
raise TypeError("You have to supply one of 'by' and 'level'")
axis = self._get_axis_number(axis)
- # https://github.com/python/mypy/issues/7642
# error: Argument "squeeze" to "DataFrameGroupBy" has incompatible type
# "Union[bool, NoDefault]"; expected "bool"
return DataFrameGroupBy(
@@ -8121,11 +8144,7 @@ def stack(self, level: Level = -1, dropna: bool = True):
return result.__finalize__(self, method="stack")
- def explode(
- self,
- column: str | tuple | list[str | tuple],
- ignore_index: bool = False,
- ) -> DataFrame:
+ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
"""
Transform each element of a list-like to a row, replicating index values.
@@ -8133,15 +8152,8 @@ def explode(
Parameters
----------
- column : str or tuple or list thereof
- Column(s) to explode.
- For multiple columns, specify a non-empty list with each element
- be str or tuple, and all specified columns their list-like data
- on same row of the frame must have matching length.
-
- .. versionadded:: 1.3.0
- Multi-column explode
-
+ column : str or tuple
+ Column to explode.
ignore_index : bool, default False
If True, the resulting index will be labeled 0, 1, …, n - 1.
@@ -8156,10 +8168,7 @@ def explode(
Raises
------
ValueError :
- * If columns of the frame are not unique.
- * If specified columns to explode is empty list.
- * If specified columns to explode have not matching count of
- elements rowwise in the frame.
+ if columns of the frame are not unique.
See Also
--------
@@ -8178,69 +8187,32 @@ def explode(
Examples
--------
- >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
- ... 'B': 1,
- ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
+ >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
>>> df
- A B C
- 0 [0, 1, 2] 1 [a, b, c]
- 1 foo 1 NaN
- 2 [] 1 []
- 3 [3, 4] 1 [d, e]
-
- Single-column explode.
+ A B
+ 0 [1, 2, 3] 1
+ 1 foo 1
+ 2 [] 1
+ 3 [3, 4] 1
>>> df.explode('A')
- A B C
- 0 0 1 [a, b, c]
- 0 1 1 [a, b, c]
- 0 2 1 [a, b, c]
- 1 foo 1 NaN
- 2 NaN 1 []
- 3 3 1 [d, e]
- 3 4 1 [d, e]
-
- Multi-column explode.
-
- >>> df.explode(list('AC'))
- A B C
- 0 0 1 a
- 0 1 1 b
- 0 2 1 c
- 1 foo 1 NaN
- 2 NaN 1 NaN
- 3 3 1 d
- 3 4 1 e
- """
+ A B
+ 0 1 1
+ 0 2 1
+ 0 3 1
+ 1 foo 1
+ 2 NaN 1
+ 3 3 1
+ 3 4 1
+ """
+ if not (is_scalar(column) or isinstance(column, tuple)):
+ raise ValueError("column must be a scalar")
if not self.columns.is_unique:
raise ValueError("columns must be unique")
- columns: list[str | tuple]
- if is_scalar(column) or isinstance(column, tuple):
- assert isinstance(column, (str, tuple))
- columns = [column]
- elif isinstance(column, list) and all(
- map(lambda c: is_scalar(c) or isinstance(c, tuple), column)
- ):
- if not column:
- raise ValueError("column must be nonempty")
- if len(column) > len(set(column)):
- raise ValueError("column must be unique")
- columns = column
- else:
- raise ValueError("column must be a scalar, tuple, or list thereof")
-
df = self.reset_index(drop=True)
- if len(columns) == 1:
- result = df[columns[0]].explode()
- else:
- mylen = lambda x: len(x) if is_list_like(x) else -1
- counts0 = self[columns[0]].apply(mylen)
- for c in columns[1:]:
- if not all(counts0 == self[c].apply(mylen)):
- raise ValueError("columns must have matching element counts")
- result = DataFrame({c: df[c].explode() for c in columns})
- result = df.drop(columns, axis=1).join(result)
+ result = df[column].explode()
+ result = df.drop([column], axis=1).join(result)
if ignore_index:
result.index = ibase.default_index(len(result))
else:
@@ -8429,8 +8401,8 @@ def _gotitem(
self,
key: IndexLabel,
ndim: int,
- subset: DataFrame | Series | None = None,
- ) -> DataFrame | Series:
+ subset: FrameOrSeriesUnion | None = None,
+ ) -> FrameOrSeriesUnion:
"""
Sub-classes to define. Return a sliced object.
@@ -8959,7 +8931,7 @@ def append(
def join(
self,
- other: DataFrame | Series,
+ other: FrameOrSeriesUnion,
on: IndexLabel | None = None,
how: str = "left",
lsuffix: str = "",
@@ -9089,7 +9061,7 @@ def join(
def _join_compat(
self,
- other: DataFrame | Series,
+ other: FrameOrSeriesUnion,
on: IndexLabel | None = None,
how: str = "left",
lsuffix: str = "",
@@ -9159,7 +9131,7 @@ def _join_compat(
@Appender(_merge_doc, indents=2)
def merge(
self,
- right: DataFrame | Series,
+ right: FrameOrSeriesUnion,
how: str = "inner",
on: IndexLabel | None = None,
left_on: IndexLabel | None = None,
@@ -9788,12 +9760,8 @@ def _reduce(
FutureWarning,
stacklevel=5,
)
- # Non-copy equivalent to
- # cols = self.columns[~dtype_is_dt]
- # self = self[cols]
- predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
- mgr = self._mgr._get_data_subset(predicate)
- self = type(self)(mgr)
+ cols = self.columns[~dtype_is_dt]
+ self = self[cols]
# TODO: Make other agg func handle axis=None properly GH#21597
axis = self._get_axis_number(axis)
@@ -10756,7 +10724,7 @@ def _from_nested_dict(data) -> collections.defaultdict:
return new_data
-def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
+def _reindex_for_setitem(value: FrameOrSeriesUnion, index: Index) -> ArrayLike:
# reindex if necessary
if value.index.equals(index) or not len(index):
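(Aside.) Among the ``frame.py`` hunks, the ``shift`` rework is the most behaviour-laden; the basic user-facing contract for ``axis=1`` is easy to eyeball:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
print(df.shift(periods=1, axis=1))                # leading column becomes NaN
print(df.shift(periods=1, axis=1, fill_value=0))  # leading column filled with 0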
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index c63aeb736d16a..99e4888d08be6 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -14,11 +14,9 @@
AnyStr,
Callable,
Hashable,
- Literal,
Mapping,
Sequence,
cast,
- final,
overload,
)
import warnings
@@ -48,14 +46,14 @@
JSONSerializable,
Level,
Manager,
- RandomState,
+ NpDtype,
Renamer,
StorageOptions,
T,
TimedeltaConvertibleTypes,
TimestampConvertibleTypes,
ValueKeyFunc,
- npt,
+ final,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
@@ -138,7 +136,6 @@
from pandas.core.missing import find_valid_index
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
-import pandas.core.sample as sample
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
@@ -156,6 +153,7 @@
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
+ from typing import Literal
from pandas._libs.tslibs import BaseOffset
@@ -1152,7 +1150,7 @@ def rename(
]
raise KeyError(f"{missing_labels} not found in axis")
- new_index = ax._transform_index(f, level=level)
+ new_index = ax._transform_index(f, level)
result._set_axis_nocheck(new_index, axis=axis_no, inplace=True)
result._clear_item_cache()
@@ -1875,10 +1873,11 @@ def _drop_labels_or_levels(self, keys, axis: int = 0):
# ----------------------------------------------------------------------
# Iteration
- # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
- # Incompatible types in assignment (expression has type "None", base class
- # "object" defined the type as "Callable[[object], int]")
- __hash__: None # type: ignore[assignment]
+ def __hash__(self) -> int:
+ raise TypeError(
+ f"{repr(type(self).__name__)} objects are mutable, "
+ f"thus they cannot be hashed"
+ )
def __iter__(self):
"""
@@ -1988,7 +1987,7 @@ def empty(self) -> bool_t:
# GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
__array_priority__ = 1000
- def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
+ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
return np.asarray(self._values, dtype=dtype)
def __array_wrap__(
@@ -3765,12 +3764,12 @@ class animal locomotion
self._consolidate_inplace()
if isinstance(index, MultiIndex):
- loc, new_index = index._get_loc_level(key, level=0)
- if not drop_level:
- if lib.is_integer(loc):
- new_index = index[loc : loc + 1]
- else:
- new_index = index[loc]
+ try:
+ loc, new_index = index._get_loc_level(
+ key, level=0, drop_level=drop_level
+ )
+ except TypeError as e:
+ raise TypeError(f"Expected label or tuple of labels, got {key}") from e
else:
loc = index.get_loc(key)
@@ -5143,13 +5142,12 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries:
@final
def sample(
self: FrameOrSeries,
- n: int | None = None,
- frac: float | None = None,
- replace: bool_t = False,
+ n=None,
+ frac=None,
+ replace=False,
weights=None,
- random_state: RandomState | None = None,
- axis: Axis | None = None,
- ignore_index: bool_t = False,
+ random_state=None,
+ axis=None,
) -> FrameOrSeries:
"""
Return a random sample of items from an axis of object.
@@ -5178,27 +5176,19 @@ def sample(
If weights do not sum to 1, they will be normalized to sum to 1.
Missing values in the weights column will be treated as zero.
Infinite values not allowed.
- random_state : int, array-like, BitGenerator, np.random.RandomState,
- np.random.Generator, optional. If int, array-like, or BitGenerator, seed for
- random number generator. If np.random.RandomState or np.random.Generator,
- use as given.
+ random_state : int, array-like, BitGenerator, np.random.RandomState, optional
+ If int, array-like, or BitGenerator (NumPy>=1.17), seed for
+ random number generator
+ If np.random.RandomState, use as numpy RandomState object.
.. versionchanged:: 1.1.0
- array-like and BitGenerator object now passed to np.random.RandomState()
- as seed
-
- .. versionchanged:: 1.4.0
-
- np.random.Generator objects now accepted
+ array-like and BitGenerator (for NumPy>=1.17) object now passed to
+ np.random.RandomState() as seed
axis : {0 or ‘index’, 1 or ‘columns’, None}, default None
Axis to sample. Accepts axis number or name. Default is stat axis
for given data type (0 for Series and DataFrames).
- ignore_index : bool, default False
- If True, the resulting index will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 1.3.0
Returns
-------
@@ -5275,26 +5265,92 @@ def sample(
axis = self._stat_axis_number
axis = self._get_axis_number(axis)
- obj_len = self.shape[axis]
+ axis_length = self.shape[axis]
# Process random_state argument
rs = com.random_state(random_state)
- size = sample.process_sampling_size(n, frac, replace)
- if size is None:
- assert frac is not None
- size = round(frac * obj_len)
-
+ # Check weights for compliance
if weights is not None:
- weights = sample.preprocess_weights(self, weights, axis)
- sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
- result = self.take(sampled_indices, axis=axis)
+ # If a series, align with frame
+ if isinstance(weights, ABCSeries):
+ weights = weights.reindex(self.axes[axis])
- if ignore_index:
- result.index = ibase.default_index(len(result))
+ # Strings acceptable if a dataframe and axis = 0
+ if isinstance(weights, str):
+ if isinstance(self, ABCDataFrame):
+ if axis == 0:
+ try:
+ weights = self[weights]
+ except KeyError as err:
+ raise KeyError(
+ "String passed to weights not a valid column"
+ ) from err
+ else:
+ raise ValueError(
+ "Strings can only be passed to "
+ "weights when sampling from rows on "
+ "a DataFrame"
+ )
+ else:
+ raise ValueError(
+ "Strings cannot be passed as weights "
+ "when sampling from a Series."
+ )
- return result
+ if isinstance(self, ABCSeries):
+ func = self._constructor
+ else:
+ func = self._constructor_sliced
+ weights = func(weights, dtype="float64")
+
+ if len(weights) != axis_length:
+ raise ValueError(
+ "Weights and axis to be sampled must be of same length"
+ )
+
+ if (weights == np.inf).any() or (weights == -np.inf).any():
+ raise ValueError("weight vector may not include `inf` values")
+
+ if (weights < 0).any():
+ raise ValueError("weight vector many not include negative values")
+
+ # If weights contain NaN, set those entries to zero.
+ weights = weights.fillna(0)
+
+ # Renormalize if the weights don't sum to 1
+ if weights.sum() != 1:
+ if weights.sum() != 0:
+ weights = weights / weights.sum()
+ else:
+ raise ValueError("Invalid weights: weights sum to zero")
+
+ weights = weights._values
+
+ # If no frac or n, default to n=1.
+ if n is None and frac is None:
+ n = 1
+ elif frac is not None and frac > 1 and not replace:
+ raise ValueError(
+ "Replace has to be set to `True` when "
+ "upsampling the population `frac` > 1."
+ )
+ elif frac is None and n % 1 != 0:
+ raise ValueError("Only integers accepted as `n` values")
+ elif n is None and frac is not None:
+ n = round(frac * axis_length)
+ elif frac is not None:
+ raise ValueError("Please enter a value for `frac` OR `n`, not both")
+
+ # Check for negative sizes
+ if n < 0:
+ raise ValueError(
+ "A negative number of rows requested. Please provide positive value."
+ )
+
+ locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
+ return self.take(locs, axis=axis)
@final
@doc(klass=_shared_doc_kwargs["klass"])
@@ -7229,11 +7285,11 @@ def clip(
Parameters
----------
- lower : float or array-like, default None
+ lower : float or array_like, default None
Minimum threshold value. All values below this
threshold will be set to it. A missing
threshold (e.g `NA`) will not clip the value.
- upper : float or array-like, default None
+ upper : float or array_like, default None
Maximum threshold value. All values above this
threshold will be set to it. A missing
threshold (e.g `NA`) will not clip the value.
@@ -7833,8 +7889,8 @@ def resample(
Pass a custom function via ``apply``
- >>> def custom_resampler(arraylike):
- ... return np.sum(arraylike) + 5
+ >>> def custom_resampler(array_like):
+ ... return np.sum(array_like) + 5
...
>>> series.resample('3T').apply(custom_resampler)
2000-01-01 00:00:00 8
@@ -9382,7 +9438,7 @@ def truncate(
if before is not None and after is not None and before > after:
raise ValueError(f"Truncate: {after} must be after {before}")
- if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
+ if len(ax) > 1 and ax.is_monotonic_decreasing:
before, after = after, before
slicer = [slice(None, None)] * self._AXIS_LEN
@@ -9695,9 +9751,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
2 6 30 -30
3 7 40 -50
"""
- # error: Incompatible return value type (got "ndarray[Any, dtype[Any]]",
- # expected "FrameOrSeries")
- return np.abs(self) # type: ignore[return-value]
+ # error: Argument 1 to "__call__" of "ufunc" has incompatible type
+ # "FrameOrSeries"; expected "Union[Union[int, float, complex, str, bytes,
+ # generic], Sequence[Union[int, float, complex, str, bytes, generic]],
+ # Sequence[Sequence[Any]], _SupportsArray]"
+ return np.abs(self) # type: ignore[arg-type]
@final
def describe(
@@ -10499,7 +10557,6 @@ def mad(self, axis=None, skipna=None, level=None):
name1=name1,
name2=name2,
axis_descr=axis_descr,
- notes="",
)
def sem(
self,
@@ -10521,7 +10578,6 @@ def sem(
name1=name1,
name2=name2,
axis_descr=axis_descr,
- notes="",
)
def var(
self,
@@ -10544,7 +10600,6 @@ def var(
name1=name1,
name2=name2,
axis_descr=axis_descr,
- notes=_std_notes,
)
def std(
self,
@@ -10838,12 +10893,11 @@ def ewm(
span: float | None = None,
halflife: float | TimedeltaConvertibleTypes | None = None,
alpha: float | None = None,
- min_periods: int | None = 0,
+ min_periods: int = 0,
adjust: bool_t = True,
ignore_na: bool_t = False,
axis: Axis = 0,
times: str | np.ndarray | FrameOrSeries | None = None,
- method: str = "single",
) -> ExponentialMovingWindow:
axis = self._get_axis_number(axis)
# error: Value of type variable "FrameOrSeries" of "ExponentialMovingWindow"
@@ -10859,7 +10913,6 @@ def ewm(
ignore_na=ignore_na,
axis=axis,
times=times,
- method=method,
)
# ----------------------------------------------------------------------
@@ -10983,7 +11036,7 @@ def last_valid_index(self) -> Hashable | None:
def _doc_params(cls):
"""Return a tuple of the doc params."""
axis_descr = (
- f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}"
+ f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}"
)
name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
name2 = cls.__name__
@@ -11037,16 +11090,12 @@ def _doc_params(cls):
Returns
-------
-{name1} or {name2} (if level specified) \
-{notes}
-"""
-
-_std_notes = """
+{name1} or {name2} (if level specified)
Notes
-----
To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
-default `ddof=1`)"""
+default `ddof=1`)\n"""
_bool_doc = """
{desc}
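(Aside.) The long ``sample`` hunk reinstates inline weight validation (alignment, normalisation, NaN-as-zero, no inf or negative values). The public behaviour it guards:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3, 4], "w": [0.0, 0.0, 0.5, 0.5]})
# A column name is accepted as weights on axis=0; zero-weight rows are never drawn.
picked = df.sample(n=2, weights="w", random_state=0)
assert set(picked["x"]) == {3, 4}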
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 88d1baae86467..69f992f840c7c 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -33,6 +33,7 @@
from pandas._typing import (
ArrayLike,
FrameOrSeries,
+ FrameOrSeriesUnion,
Manager2D,
)
from pandas.util._decorators import (
@@ -295,7 +296,7 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
arg = zip(columns, arg)
- results: dict[base.OutputKey, DataFrame | Series] = {}
+ results: dict[base.OutputKey, FrameOrSeriesUnion] = {}
for idx, (name, func) in enumerate(arg):
key = base.OutputKey(label=name, position=idx)
@@ -421,7 +422,7 @@ def _wrap_applied_output(
keys: Index,
values: list[Any] | None,
not_indexed_same: bool = False,
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.
@@ -1019,15 +1020,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
if isinstance(sobj, Series):
# GH#35246 test_groupby_as_index_select_column_sum_empty_df
- result.columns = self._obj_with_exclusions.columns.copy()
+ result.columns = [sobj.name]
else:
- # Retain our column names
- result.columns._set_names(
- sobj.columns.names, level=list(range(sobj.columns.nlevels))
- )
# select everything except for the last level, which is the one
# containing the name of the function(s), see GH#32040
- result.columns = result.columns.droplevel(-1)
+ result.columns = result.columns.rename(
+ [sobj.columns.name] * result.columns.nlevels
+ ).droplevel(-1)
if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
@@ -1192,7 +1191,7 @@ def _wrap_applied_output_series(
not_indexed_same: bool,
first_not_none,
key_index,
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
# this is to silence a DeprecationWarning
# TODO: Remove when default dtype of empty Series is object
kwargs = first_not_none._construct_axes_dict()
@@ -1308,15 +1307,10 @@ def _transform_general(self, func, *args, **kwargs):
gen = self.grouper.get_iterator(obj, axis=self.axis)
fast_path, slow_path = self._define_paths(func, *args, **kwargs)
- # Determine whether to use slow or fast path by evaluating on the first group.
- # Need to handle the case of an empty generator and process the result so that
- # it does not need to be computed again.
- try:
- name, group = next(gen)
- except StopIteration:
- pass
- else:
+ for name, group in gen:
object.__setattr__(group, "name", name)
+
+ # Try slow path and fast path.
try:
path, res = self._choose_path(fast_path, slow_path, group)
except TypeError:
@@ -1324,18 +1318,30 @@ def _transform_general(self, func, *args, **kwargs):
except ValueError as err:
msg = "transform must return a scalar value for each group"
raise ValueError(msg) from err
- if group.size > 0:
- res = _wrap_transform_general_frame(self.obj, group, res)
- applied.append(res)
- # Compute and process with the remaining groups
- for name, group in gen:
- if group.size == 0:
- continue
- object.__setattr__(group, "name", name)
- res = path(group)
- res = _wrap_transform_general_frame(self.obj, group, res)
- applied.append(res)
+ if isinstance(res, Series):
+
+ # we need to broadcast across the
+ # other dimension; this will preserve dtypes
+ # GH14457
+ if not np.prod(group.shape):
+ continue
+ elif res.index.is_(obj.index):
+ r = concat([res] * len(group.columns), axis=1)
+ r.columns = group.columns
+ r.index = group.index
+ else:
+ r = self.obj._constructor(
+ np.concatenate([res.values] * len(group.index)).reshape(
+ group.shape
+ ),
+ columns=group.columns,
+ index=group.index,
+ )
+
+ applied.append(r)
+ else:
+ applied.append(res)
concat_index = obj.columns if self.axis == 0 else obj.index
other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1
@@ -1659,7 +1665,7 @@ def _wrap_transformed_output(
result.columns = self.obj.columns
else:
columns = Index(key.label for key in output)
- columns._set_names(self.obj._get_axis(1 - self.axis).names)
+ columns.name = self.obj.columns.name
result.columns = columns
result.index = self.obj.index
@@ -1668,9 +1674,7 @@ def _wrap_transformed_output(
def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
if not self.as_index:
- # GH 41998 - empty mgr always gets index of length 0
- rows = mgr.shape[1] if mgr.shape[0] > 0 else 0
- index = Index(range(rows))
+ index = Index(range(mgr.shape[1]))
mgr.set_axis(1, index)
result = self.obj._constructor(mgr)
@@ -1796,6 +1800,7 @@ def nunique(self, dropna: bool = True) -> DataFrame:
results = self._apply_to_column_groupbys(
lambda sgb: sgb.nunique(dropna), obj=obj
)
+ results.columns.names = obj.columns.names # TODO: do at higher level?
if not self.as_index:
results.index = Index(range(len(results)))
@@ -1846,28 +1851,3 @@ def func(df):
return self._python_apply_general(func, self._obj_with_exclusions)
boxplot = boxplot_frame_groupby
-
-
-def _wrap_transform_general_frame(
- obj: DataFrame, group: DataFrame, res: DataFrame | Series
-) -> DataFrame:
- from pandas import concat
-
- if isinstance(res, Series):
- # we need to broadcast across the
- # other dimension; this will preserve dtypes
- # GH14457
- if res.index.is_(obj.index):
- res_frame = concat([res] * len(group.columns), axis=1)
- res_frame.columns = group.columns
- res_frame.index = group.index
- else:
- res_frame = obj._constructor(
- np.concatenate([res.values] * len(group.index)).reshape(group.shape),
- columns=group.columns,
- index=group.index,
- )
- assert isinstance(res_frame, DataFrame)
- return res_frame
- else:
- return res
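(Aside.) The ``_transform_general`` hunk folds the Series-broadcasting helper back into the loop body; the observable result of a groupwise transform is the same either way:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 3.0, 5.0], "y": [2.0, 4.0, 6.0]})
# Per-group scalars are broadcast back to the original shape, index and columns.
print(df.groupby("key").transform(lambda s: s.mean()))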
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index d3a86fa5950ed..f694dcce809ea 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -18,18 +18,17 @@ class providing the base-class of operations.
from textwrap import dedent
import types
from typing import (
+ TYPE_CHECKING,
Callable,
Hashable,
Iterable,
Iterator,
List,
- Literal,
Mapping,
Sequence,
TypeVar,
Union,
cast,
- final,
)
import warnings
@@ -46,10 +45,11 @@ class providing the base-class of operations.
ArrayLike,
F,
FrameOrSeries,
+ FrameOrSeriesUnion,
IndexLabel,
- RandomState,
Scalar,
T,
+ final,
)
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
@@ -101,7 +101,6 @@ class providing the base-class of operations.
MultiIndex,
)
from pandas.core.internals.blocks import ensure_block_shape
-import pandas.core.sample as sample
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
@@ -109,6 +108,9 @@ class providing the base-class of operations.
maybe_use_numba,
)
+if TYPE_CHECKING:
+ from typing import Literal
+
_common_see_also = """
See Also
--------
@@ -726,7 +728,7 @@ def pipe(
plot = property(GroupByPlot)
@final
- def get_group(self, name, obj=None) -> DataFrame | Series:
+ def get_group(self, name, obj=None) -> FrameOrSeriesUnion:
"""
Construct DataFrame from group with provided name.
@@ -1265,8 +1267,8 @@ def f(g):
@final
def _python_apply_general(
- self, f: F, data: DataFrame | Series
- ) -> DataFrame | Series:
+ self, f: F, data: FrameOrSeriesUnion
+ ) -> FrameOrSeriesUnion:
"""
Apply function f in python space
@@ -1517,11 +1519,7 @@ def _bool_agg(self, val_test, skipna):
def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
if is_object_dtype(vals):
- # GH#37501: don't raise on pd.NA when skipna=True
- if skipna:
- vals = np.array([bool(x) if not isna(x) else True for x in vals])
- else:
- vals = np.array([bool(x) for x in vals])
+ vals = np.array([bool(x) for x in vals])
elif isinstance(vals, BaseMaskedArray):
vals = vals._data.astype(bool, copy=False)
else:
@@ -1788,7 +1786,7 @@ def sem(self, ddof: int = 1):
@final
@Substitution(name="groupby")
@Appender(_common_see_also)
- def size(self) -> DataFrame | Series:
+ def size(self) -> FrameOrSeriesUnion:
"""
Compute group sizes.
@@ -2637,7 +2635,7 @@ def cumcount(self, ascending: bool = True):
@final
@Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
+ @Appender(_common_see_also)
def rank(
self,
method: str = "average",
@@ -2671,41 +2669,6 @@ def rank(
Returns
-------
DataFrame with ranking of values within each group
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
- ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
- ... }
- ... )
- >>> df
- group value
- 0 a 2
- 1 a 4
- 2 a 2
- 3 a 3
- 4 a 5
- 5 b 1
- 6 b 2
- 7 b 4
- 8 b 1
- 9 b 5
- >>> for method in ['average', 'min', 'max', 'dense', 'first']:
- ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
- >>> df
- group value average_rank min_rank max_rank dense_rank first_rank
- 0 a 2 1.5 1.0 2.0 1.0 1.0
- 1 a 4 4.0 4.0 4.0 3.0 4.0
- 2 a 2 1.5 1.0 2.0 1.0 2.0
- 3 a 3 3.0 3.0 3.0 2.0 3.0
- 4 a 5 5.0 5.0 5.0 4.0 5.0
- 5 b 1 1.5 1.0 2.0 1.0 1.0
- 6 b 2 3.0 3.0 3.0 2.0 3.0
- 7 b 4 4.0 4.0 4.0 3.0 4.0
- 8 b 1 1.5 1.0 2.0 1.0 2.0
- 9 b 5 5.0 5.0 5.0 4.0 5.0
"""
if na_option not in {"keep", "top", "bottom"}:
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
@@ -3217,7 +3180,7 @@ def sample(
frac: float | None = None,
replace: bool = False,
weights: Sequence | Series | None = None,
- random_state: RandomState | None = None,
+ random_state=None,
):
"""
Return a random sample of items from each group.
@@ -3243,14 +3206,10 @@ def sample(
sampling probabilities after normalization within each group.
Values must be non-negative with at least one positive element
within each group.
- random_state : int, array-like, BitGenerator, np.random.RandomState,
- np.random.Generator, optional. If int, array-like, or BitGenerator, seed for
- random number generator. If np.random.RandomState or np.random.Generator,
- use as given.
-
- .. versionchanged:: 1.4.0
-
- np.random.Generator objects now accepted
+ random_state : int, array-like, BitGenerator, np.random.RandomState, optional
+ If int, array-like, or BitGenerator (NumPy>=1.17), seed for
+ random number generator
+ If np.random.RandomState, use as numpy RandomState object.
Returns
-------
@@ -3307,37 +3266,26 @@ def sample(
2 blue 2
0 red 0
"""
- size = sample.process_sampling_size(n, frac, replace)
+ from pandas.core.reshape.concat import concat
+
if weights is not None:
- weights_arr = sample.preprocess_weights(
- self._selected_obj, weights, axis=self.axis
- )
+ weights = Series(weights, index=self._selected_obj.index)
+ ws = [weights.iloc[idx] for idx in self.indices.values()]
+ else:
+ ws = [None] * self.ngroups
- random_state = com.random_state(random_state)
+ if random_state is not None:
+ random_state = com.random_state(random_state)
group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
-
- sampled_indices = []
- for labels, obj in group_iterator:
- grp_indices = self.indices[labels]
- group_size = len(grp_indices)
- if size is not None:
- sample_size = size
- else:
- assert frac is not None
- sample_size = round(frac * group_size)
-
- grp_sample = sample.sample(
- group_size,
- size=sample_size,
- replace=replace,
- weights=None if weights is None else weights_arr[grp_indices],
- random_state=random_state,
+ samples = [
+ obj.sample(
+ n=n, frac=frac, replace=replace, weights=w, random_state=random_state
)
- sampled_indices.append(grp_indices[grp_sample])
+ for (_, obj), w in zip(group_iterator, ws)
+ ]
- sampled_indices = np.concatenate(sampled_indices)
- return self._selected_obj.take(sampled_indices, axis=self.axis)
+ return concat(samples, axis=self.axis)
@doc(GroupBy)
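(Aside.) ``GroupBy.sample`` goes back to delegating to each group's own ``sample``; in both versions the sampling is done group by group:

import pandas as pd

df = pd.DataFrame({"a": ["red", "red", "blue", "blue"], "b": [0, 1, 2, 3]})
# frac=0.5 draws half of each group, so one row per colour here.
print(df.groupby("a").sample(frac=0.5, random_state=1))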
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 3307558deec33..c5d5d5a301336 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -4,10 +4,7 @@
"""
from __future__ import annotations
-from typing import (
- Hashable,
- final,
-)
+from typing import Hashable
import warnings
import numpy as np
@@ -15,6 +12,7 @@
from pandas._typing import (
ArrayLike,
FrameOrSeries,
+ final,
)
from pandas.errors import InvalidIndexError
from pandas.util._decorators import cache_readonly
@@ -492,7 +490,7 @@ def __init__(
self.grouping_vector, # Index
self._codes,
self._group_index,
- ) = index._get_grouper_for_level(mapper, level=ilevel)
+ ) = index._get_grouper_for_level(mapper, ilevel)
# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get codes
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 36fbda5974ea0..b65f26c7174fc 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -14,7 +14,6 @@
Hashable,
Iterator,
Sequence,
- final,
overload,
)
@@ -32,7 +31,7 @@
F,
FrameOrSeries,
Shape,
- npt,
+ final,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
@@ -162,9 +161,7 @@ def _get_cython_function(
f = getattr(libgroupby, ftype)
if is_numeric:
return f
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Literal['object']")
- elif dtype == object: # type: ignore[comparison-overlap]
+ elif dtype == object:
if "object" not in f.__signatures__:
# raise NotImplementedError here rather than TypeError later
raise NotImplementedError(
@@ -342,54 +339,95 @@ def _ea_wrap_cython_operation(
comp_ids=comp_ids,
**kwargs,
)
+ orig_values = values
- if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)):
+ if isinstance(orig_values, (DatetimeArray, PeriodArray)):
# All of the functions implemented here are ordinal, so we can
# operate on the tz-naive equivalents
- npvalues = values._ndarray.view("M8[ns]")
+ npvalues = orig_values._ndarray.view("M8[ns]")
+ res_values = self._cython_op_ndim_compat(
+ npvalues,
+ min_count=min_count,
+ ngroups=ngroups,
+ comp_ids=comp_ids,
+ mask=None,
+ **kwargs,
+ )
+ if self.how in ["rank"]:
+ # i.e. how in WrappedCythonOp.cast_blocklist, since
+ # other cast_blocklist methods dont go through cython_operation
+ # preserve float64 dtype
+ return res_values
+
+ res_values = res_values.view("i8")
+ result = type(orig_values)(res_values, dtype=orig_values.dtype)
+ return result
+
+ elif isinstance(orig_values, TimedeltaArray):
+ # We have an ExtensionArray but not ExtensionDtype
+ res_values = self._cython_op_ndim_compat(
+ orig_values._ndarray,
+ min_count=min_count,
+ ngroups=ngroups,
+ comp_ids=comp_ids,
+ mask=None,
+ **kwargs,
+ )
+ if self.how in ["rank"]:
+ # i.e. how in WrappedCythonOp.cast_blocklist, since
+ # other cast_blocklist methods dont go through cython_operation
+ # preserve float64 dtype
+ return res_values
+
+ # otherwise res_values has the same dtype as original values
+ return type(orig_values)(res_values)
+
elif isinstance(values.dtype, (BooleanDtype, _IntegerDtype)):
# IntegerArray or BooleanArray
npvalues = values.to_numpy("float64", na_value=np.nan)
- elif isinstance(values.dtype, FloatingDtype):
- # FloatingArray
- npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
- else:
- raise NotImplementedError(
- f"function is not implemented for this dtype: {values.dtype}"
+ res_values = self._cython_op_ndim_compat(
+ npvalues,
+ min_count=min_count,
+ ngroups=ngroups,
+ comp_ids=comp_ids,
+ mask=None,
+ **kwargs,
)
+ if self.how in ["rank"]:
+ # i.e. how in WrappedCythonOp.cast_blocklist, since
+ # other cast_blocklist methods dont go through cython_operation
+ return res_values
- res_values = self._cython_op_ndim_compat(
- npvalues,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=None,
- **kwargs,
- )
-
- if self.how in ["rank"]:
- # i.e. how in WrappedCythonOp.cast_blocklist, since
- # other cast_blocklist methods dont go through cython_operation
- return res_values
-
- return self._reconstruct_ea_result(values, res_values)
+ dtype = self._get_result_dtype(orig_values.dtype)
+ cls = dtype.construct_array_type()
+ return cls._from_sequence(res_values, dtype=dtype)
- def _reconstruct_ea_result(self, values, res_values):
- """
- Construct an ExtensionArray result from an ndarray result.
- """
- # TODO: allow EAs to override this logic
+ elif isinstance(values.dtype, FloatingDtype):
+ # FloatingArray
+ npvalues = values.to_numpy(
+ values.dtype.numpy_dtype,
+ na_value=np.nan,
+ )
+ res_values = self._cython_op_ndim_compat(
+ npvalues,
+ min_count=min_count,
+ ngroups=ngroups,
+ comp_ids=comp_ids,
+ mask=None,
+ **kwargs,
+ )
+ if self.how in ["rank"]:
+ # i.e. how in WrappedCythonOp.cast_blocklist, since
+ # other cast_blocklist methods dont go through cython_operation
+ return res_values
- if isinstance(values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype)):
- dtype = self._get_result_dtype(values.dtype)
+ dtype = self._get_result_dtype(orig_values.dtype)
cls = dtype.construct_array_type()
return cls._from_sequence(res_values, dtype=dtype)
- elif needs_i8_conversion(values.dtype):
- i8values = res_values.view("i8")
- return type(values)(i8values, dtype=values.dtype)
-
- raise NotImplementedError
+ raise NotImplementedError(
+ f"function is not implemented for this dtype: {values.dtype}"
+ )
@final
def _masked_ea_wrap_cython_operation(
@@ -438,8 +476,6 @@ def _cython_op_ndim_compat(
if values.ndim == 1:
# expand to 2d, dispatch, then squeeze if appropriate
values2d = values[None, :]
- if mask is not None:
- mask = mask[None, :]
res = self._call_cython_op(
values2d,
min_count=min_count,
@@ -495,8 +531,9 @@ def _call_cython_op(
values = ensure_float64(values)
values = values.T
+
if mask is not None:
- mask = mask.T
+ mask = mask.reshape(values.shape, order="C")
out_shape = self._get_output_shape(ngroups, values)
func, values = self.get_cython_func_and_vals(values, is_numeric)
@@ -638,7 +675,7 @@ def __init__(
sort: bool = True,
group_keys: bool = True,
mutated: bool = False,
- indexer: npt.NDArray[np.intp] | None = None,
+ indexer: np.ndarray | None = None,
dropna: bool = True,
):
assert isinstance(axis, Index), axis
@@ -1229,13 +1266,7 @@ def _is_indexed_like(obj, axes, axis: int) -> bool:
class DataSplitter(Generic[FrameOrSeries]):
- def __init__(
- self,
- data: FrameOrSeries,
- labels: npt.NDArray[np.intp],
- ngroups: int,
- axis: int = 0,
- ):
+ def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0):
self.data = data
self.labels = ensure_platform_int(labels) # _should_ already be np.intp
self.ngroups = ngroups
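To make the _ea_wrap_cython_operation changes concrete, here is a minimal sketch of the behaviour the hunk's comments describe (the example itself is not part of the patch): nullable integer/boolean data is converted to float64 with NaN before the cython kernel runs, and "rank" results keep their float64 dtype because "rank" sits in the cast_blocklist.

import pandas as pd

df = pd.DataFrame(
    {"key": ["a", "a", "b"], "val": pd.array([1, None, 3], dtype="Int64")}
)

# The NA in group "a" is skipped by the kernel, so the sums are a=1, b=3.
print(df.groupby("key")["val"].sum())

# "rank" is in cast_blocklist, so its float64 result is returned as-is
# instead of being cast back to the nullable extension dtype.
print(df.groupby("key")["val"].rank())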
diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
index eacc7960a82aa..ed4b1a3fbb39c 100644
--- a/pandas/core/indexers.py
+++ b/pandas/core/indexers.py
@@ -3,19 +3,16 @@
"""
from __future__ import annotations
-from typing import (
- TYPE_CHECKING,
- Any,
-)
+from typing import TYPE_CHECKING
import warnings
import numpy as np
from pandas._typing import (
+ Any,
AnyArrayLike,
ArrayLike,
)
-from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
is_array_like,
@@ -378,7 +375,7 @@ def deprecate_ndim_indexing(result, stacklevel: int = 3):
"is deprecated and will be removed in a future "
"version. Convert to a numpy array before indexing instead.",
FutureWarning,
- stacklevel=find_stack_level(),
+ stacklevel=stacklevel,
)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 5866644860831..e4c21b3de2cac 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -9,11 +9,9 @@
Any,
Callable,
Hashable,
- Literal,
Sequence,
TypeVar,
cast,
- final,
overload,
)
import warnings
@@ -44,7 +42,8 @@
DtypeObj,
F,
Shape,
- npt,
+ T,
+ final,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
@@ -160,6 +159,7 @@
)
if TYPE_CHECKING:
+ from typing import Literal
from pandas import (
CategoricalIndex,
@@ -306,7 +306,8 @@ class Index(IndexOpsMixin, PandasObject):
# given the dtypes of the passed arguments
@final
- def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
+ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> np.ndarray:
+ # -> np.ndarray[np.intp]
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
@@ -315,7 +316,7 @@ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
@final
def _left_indexer(
self: _IndexT, other: _IndexT
- ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> tuple[ArrayLike, np.ndarray, np.ndarray]:
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
@@ -326,7 +327,7 @@ def _left_indexer(
@final
def _inner_indexer(
self: _IndexT, other: _IndexT
- ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> tuple[ArrayLike, np.ndarray, np.ndarray]:
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
@@ -337,7 +338,7 @@ def _inner_indexer(
@final
def _outer_indexer(
self: _IndexT, other: _IndexT
- ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> tuple[ArrayLike, np.ndarray, np.ndarray]:
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
@@ -558,9 +559,7 @@ def _dtype_to_subclass(cls, dtype: DtypeObj):
return Int64Index
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[object]")
- elif dtype == object: # type: ignore[comparison-overlap]
+ elif dtype == object:
# NB: assuming away MultiIndex
return Index
@@ -1499,7 +1498,7 @@ def _validate_names(
def _get_names(self) -> FrozenList:
return FrozenList((self.name,))
- def _set_names(self, values, *, level=None) -> None:
+ def _set_names(self, values, level=None) -> None:
"""
Set new names on index. Each name has to be a hashable type.
@@ -1886,21 +1885,14 @@ def _drop_level_numbers(self, levnums: list[int]):
new_names.pop(i)
if len(new_levels) == 1:
- lev = new_levels[0]
-
- if len(lev) == 0:
- # If lev is empty, lev.take will fail GH#42055
- res_values = algos.take(lev._values, new_codes[0], allow_fill=True)
- result = type(lev)._simple_new(res_values, name=new_names[0])
- else:
- # set nan if needed
- mask = new_codes[0] == -1
- result = new_levels[0].take(new_codes[0])
- if mask.any():
- result = result.putmask(mask, np.nan)
- result._name = new_names[0]
+ # set nan if needed
+ mask = new_codes[0] == -1
+ result = new_levels[0].take(new_codes[0])
+ if mask.any():
+ result = result.putmask(mask, np.nan)
+ result._name = new_names[0]
return result
else:
from pandas.core.indexes.multi import MultiIndex
@@ -1912,7 +1904,7 @@ def _drop_level_numbers(self, levnums: list[int]):
verify_integrity=False,
)
- def _get_grouper_for_level(self, mapper, *, level=None):
+ def _get_grouper_for_level(self, mapper, level=None):
"""
Get index grouper corresponding to an index level
@@ -1921,7 +1913,7 @@ def _get_grouper_for_level(self, mapper, *, level=None):
mapper: Group mapping function or None
Function mapping index values to groups
level : int or None
- Index level, positional
+ Index level
Returns
-------
@@ -2760,6 +2752,16 @@ def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndar
return np.zeros(len(self), dtype=bool)
return self._duplicated(keep=keep)
+ def _get_unique_index(self: _IndexT) -> _IndexT:
+ """
+ Returns an index containing unique values.
+
+ Returns
+ -------
+ Index
+ """
+ return self.unique()
+
# --------------------------------------------------------------------
# Arithmetic & Logical Methods
@@ -3076,30 +3078,6 @@ def intersection(self, other, sort=False):
return self.unique()._get_reconciled_name_object(other)
return self._get_reconciled_name_object(other)
- if len(self) == 0 or len(other) == 0:
- # fastpath; we need to be careful about having commutativity
-
- if self._is_multi or other._is_multi:
- # _convert_can_do_setop ensures that we have both or neither
- # We retain self.levels
- return self[:0].rename(result_name)
-
- dtype = self._find_common_type_compat(other)
- if is_dtype_equal(self.dtype, dtype):
- # Slicing allows us to retain DTI/TDI.freq, RangeIndex
-
- # Note: self[:0] vs other[:0] affects
- # 1) which index's `freq` we get in DTI/TDI cases
- # This may be a historical artifact, i.e. no documented
- # reason for this choice.
- # 2) The `step` we get in RangeIndex cases
- if len(self) == 0:
- return self[:0].rename(result_name)
- else:
- return other[:0].rename(result_name)
-
- return Index([], dtype=dtype, name=result_name)
-
elif not self._should_compare(other):
# We can infer that the intersection is empty.
if isinstance(self, ABCMultiIndex):
@@ -3107,25 +3085,21 @@ def intersection(self, other, sort=False):
return Index([], name=result_name)
elif not is_dtype_equal(self.dtype, other.dtype):
- dtype = self._find_common_type_compat(other)
+ dtype = find_common_type([self.dtype, other.dtype])
this = self.astype(dtype, copy=False)
other = other.astype(dtype, copy=False)
return this.intersection(other, sort=sort)
result = self._intersection(other, sort=sort)
- return self._wrap_intersection_result(other, result)
+ return self._wrap_setop_result(other, result)
def _intersection(self, other: Index, sort=False):
"""
intersection specialized to the case with matching dtypes.
"""
- if (
- self.is_monotonic
- and other.is_monotonic
- and not is_interval_dtype(self.dtype)
- ):
- # For IntervalIndex _inner_indexer is not more performant than get_indexer,
- # so don't take this fastpath
+ # TODO(EA): setops-refactor, clean all this up
+
+ if self.is_monotonic and other.is_monotonic:
try:
result = self._inner_indexer(other)[0]
except TypeError:
@@ -3139,11 +3113,6 @@ def _intersection(self, other: Index, sort=False):
res_values = _maybe_try_sort(res_values, sort)
return res_values
- def _wrap_intersection_result(self, other, result):
- # We will override for MultiIndex to handle empty results
- return self._wrap_setop_result(other, result)
-
- @final
def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike:
"""
Find the intersection of two Indexes using get_indexer.
@@ -3153,8 +3122,10 @@ def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike:
np.ndarray or ExtensionArray
The returned array will be unique.
"""
- left_unique = self.unique()
- right_unique = other.unique()
+ # Note: drop_duplicates vs unique matters for MultiIndex, though
+ # it should not, see GH#41823
+ left_unique = self.drop_duplicates()
+ right_unique = other.drop_duplicates()
# even though we are unique, we need get_indexer_for for IntervalIndex
indexer = left_unique.get_indexer_for(right_unique)
@@ -3219,12 +3190,11 @@ def difference(self, other, sort=None):
return self.rename(result_name)
result = self._difference(other, sort=sort)
- return self._wrap_difference_result(other, result)
+ return self._wrap_setop_result(other, result)
def _difference(self, other, sort):
- # overridden by RangeIndex
- this = self.unique()
+ this = self._get_unique_index()
indexer = this.get_indexer_for(other)
indexer = indexer.take((indexer != -1).nonzero()[0])
@@ -3235,10 +3205,6 @@ def _difference(self, other, sort):
return the_diff
- def _wrap_difference_result(self, other, result):
- # We will override for MultiIndex to handle empty results
- return self._wrap_setop_result(other, result)
-
def symmetric_difference(self, other, result_name=None, sort=None):
"""
Compute the symmetric difference of two Index objects.
@@ -3280,47 +3246,12 @@ def symmetric_difference(self, other, result_name=None, sort=None):
if result_name is None:
result_name = result_name_update
- if not self._should_compare(other):
- return self.union(other, sort=sort).rename(result_name)
-
- elif not is_dtype_equal(self.dtype, other.dtype):
- dtype = self._find_common_type_compat(other)
- this = self.astype(dtype, copy=False)
- that = other.astype(dtype, copy=False)
- return this.symmetric_difference(that, sort=sort).rename(result_name)
-
- this = self.unique()
- other = other.unique()
- indexer = this.get_indexer_for(other)
-
- # {this} minus {other}
- common_indexer = indexer.take((indexer != -1).nonzero()[0])
- left_indexer = np.setdiff1d(
- np.arange(this.size), common_indexer, assume_unique=True
- )
- left_diff = this._values.take(left_indexer)
-
- # {other} minus {this}
- right_indexer = (indexer == -1).nonzero()[0]
- right_diff = other._values.take(right_indexer)
-
- res_values = concat_compat([left_diff, right_diff])
- res_values = _maybe_try_sort(res_values, sort)
-
- result = Index(res_values, name=result_name)
-
- if self._is_multi:
- self = cast("MultiIndex", self)
- if len(result) == 0:
- # On equal symmetric_difference MultiIndexes the difference is empty.
- # Therefore, an empty MultiIndex is returned GH#13490
- return type(self)(
- levels=[[] for _ in range(self.nlevels)],
- codes=[[] for _ in range(self.nlevels)],
- names=result.name,
- )
- return type(self).from_tuples(result, names=result.name)
+ left = self.difference(other, sort=False)
+ right = other.difference(self, sort=False)
+ result = left.union(right, sort=sort)
+ if result_name is not None:
+ result = result.rename(result_name)
return result
@final
@@ -3388,15 +3319,6 @@ def get_loc(self, key, method=None, tolerance=None):
except KeyError as err:
raise KeyError(key) from err
- # GH#42269
- warnings.warn(
- f"Passing method to {type(self).__name__}.get_loc is deprecated "
- "and will raise in a future version. Use "
- "index.get_indexer([item], method=...) instead",
- FutureWarning,
- stacklevel=2,
- )
-
if is_scalar(key) and isna(key) and not self.hasnans:
raise KeyError(key)
@@ -3466,7 +3388,8 @@ def get_indexer(
method: str_t | None = None,
limit: int | None = None,
tolerance=None,
- ) -> npt.NDArray[np.intp]:
+ ) -> np.ndarray:
+ # returned ndarray is np.intp
method = missing.clean_reindex_fill_method(method)
target = self._maybe_cast_listlike_indexer(target)
@@ -3475,67 +3398,17 @@ def get_indexer(
if not self._index_as_unique:
raise InvalidIndexError(self._requires_unique_msg)
- if len(target) == 0:
- return np.array([], dtype=np.intp)
-
- if not self._should_compare(target) and not self._should_partial_index(target):
+ if not self._should_compare(target) and not is_interval_dtype(self.dtype):
# IntervalIndex get special treatment bc numeric scalars can be
# matched to Interval scalars
return self._get_indexer_non_comparable(target, method=method, unique=True)
- if is_categorical_dtype(self.dtype):
- # _maybe_cast_listlike_indexer ensures target has our dtype
- # (could improve perf by doing _should_compare check earlier?)
- assert is_dtype_equal(self.dtype, target.dtype)
-
- indexer = self._engine.get_indexer(target.codes)
- if self.hasnans and target.hasnans:
- loc = self.get_loc(np.nan)
- mask = target.isna()
- indexer[mask] = loc
- return indexer
-
- if is_categorical_dtype(target.dtype):
- # potential fastpath
- # get an indexer for unique categories then propagate to codes via take_nd
- # get_indexer instead of _get_indexer needed for MultiIndex cases
- # e.g. test_append_different_columns_types
- categories_indexer = self.get_indexer(target.categories)
-
- indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1)
-
- if (not self._is_multi and self.hasnans) and target.hasnans:
- # Exclude MultiIndex because hasnans raises NotImplementedError
- # we should only get here if we are unique, so loc is an integer
- # GH#41934
- loc = self.get_loc(np.nan)
- mask = target.isna()
- indexer[mask] = loc
-
- return ensure_platform_int(indexer)
-
pself, ptarget = self._maybe_promote(target)
if pself is not self or ptarget is not target:
return pself.get_indexer(
ptarget, method=method, limit=limit, tolerance=tolerance
)
- if is_dtype_equal(self.dtype, target.dtype) and self.equals(target):
- # Only call equals if we have same dtype to avoid inference/casting
- return np.arange(len(target), dtype=np.intp)
-
- if not is_dtype_equal(self.dtype, target.dtype) and not is_interval_dtype(
- self.dtype
- ):
- # IntervalIndex gets special treatment for partial-indexing
- dtype = self._find_common_type_compat(target)
-
- this = self.astype(dtype, copy=False)
- target = target.astype(dtype, copy=False)
- return this._get_indexer(
- target, method=method, limit=limit, tolerance=tolerance
- )
-
return self._get_indexer(target, method, limit, tolerance)
def _get_indexer(
@@ -3548,6 +3421,15 @@ def _get_indexer(
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance, target)
+ if not is_dtype_equal(self.dtype, target.dtype):
+ dtype = self._find_common_type_compat(target)
+
+ this = self.astype(dtype, copy=False)
+ target = target.astype(dtype, copy=False)
+ return this.get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
+
if method in ["pad", "backfill"]:
indexer = self._get_fill_indexer(target, method, limit, tolerance)
elif method == "nearest":
@@ -3557,16 +3439,6 @@ def _get_indexer(
return ensure_platform_int(indexer)
- @final
- def _should_partial_index(self, target: Index) -> bool:
- """
- Should we attempt partial-matching indexing?
- """
- if is_interval_dtype(self.dtype):
- # "Index" has no attribute "left"
- return self.left._should_compare(target) # type: ignore[attr-defined]
- return False
-
@final
def _check_indexing_method(
self,
@@ -3625,13 +3497,6 @@ def _get_fill_indexer(
self, target: Index, method: str_t, limit: int | None = None, tolerance=None
) -> np.ndarray:
- if self._is_multi:
- # TODO: get_indexer_with_fill docstring says values must be _sorted_
- # but that doesn't appear to be enforced
- return self._engine.get_indexer_with_fill(
- target=target._values, values=self._values, method=method, limit=limit
- )
-
target_values = target._get_engine_target()
if self.is_monotonic_increasing and target.is_monotonic_increasing:
@@ -3725,6 +3590,16 @@ def _filter_indexer_tolerance(
# --------------------------------------------------------------------
# Indexer Conversion Methods
+ def _get_partial_string_timestamp_match_key(self, key: T) -> T:
+ """
+ Translate any partial string timestamp matches in key, returning the
+ new key.
+
+ Only relevant for MultiIndex.
+ """
+ # GH#10331
+ return key
+
@final
def _validate_positional_slice(self, key: slice) -> None:
"""
@@ -3833,11 +3708,11 @@ def _validate_can_reindex(self, indexer: np.ndarray) -> None:
"""
# trying to reindex on an axis with duplicates
if not self._index_as_unique and len(indexer):
- raise ValueError("cannot reindex on an axis with duplicate labels")
+ raise ValueError("cannot reindex from a duplicate axis")
def reindex(
self, target, method=None, level=None, limit=None, tolerance=None
- ) -> tuple[Index, npt.NDArray[np.intp] | None]:
+ ) -> tuple[Index, np.ndarray | None]:
"""
Create index with target's values.
@@ -3860,25 +3735,14 @@ def reindex(
target = ensure_has_len(target) # target may be an iterator
if not isinstance(target, Index) and len(target) == 0:
- if level is not None and self._is_multi:
- # "Index" has no attribute "levels"; maybe "nlevels"?
- idx = self.levels[level] # type: ignore[attr-defined]
- else:
- idx = self
- target = idx[:0]
+ target = self[:0]
else:
target = ensure_index(target)
if level is not None:
if method is not None:
raise TypeError("Fill method not supported if level passed")
-
- # TODO: tests where passing `keep_order=not self._is_multi`
- # makes a difference for non-MultiIndex case
- target, indexer, _ = self._join_level(
- target, level, how="right", keep_order=not self._is_multi
- )
-
+ _, indexer, _ = self._join_level(target, level, how="right")
else:
if self.equals(target):
indexer = None
@@ -3887,8 +3751,6 @@ def reindex(
indexer = self.get_indexer(
target, method=method, limit=limit, tolerance=tolerance
)
- elif self._is_multi:
- raise ValueError("cannot handle a non-unique multi-index!")
else:
if method is not None or limit is not None:
raise ValueError(
@@ -3897,23 +3759,15 @@ def reindex(
)
indexer, _ = self.get_indexer_non_unique(target)
- target = self._wrap_reindex_result(target, indexer, preserve_names)
- return target, indexer
-
- def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
- target = self._maybe_preserve_names(target, preserve_names)
- return target
-
- def _maybe_preserve_names(self, target: Index, preserve_names: bool):
if preserve_names and target.nlevels == 1 and target.name != self.name:
- target = target.copy(deep=False)
+ target = target.copy()
target.name = self.name
- return target
- @final
+ return target, indexer
+
def _reindex_non_unique(
self, target: Index
- ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp] | None]:
+ ) -> tuple[Index, np.ndarray, np.ndarray | None]:
"""
Create a new index with target's values (move/add/delete values as
necessary) use with non-unique Index and a possibly non-unique target.
@@ -3942,15 +3796,14 @@ def _reindex_non_unique(
new_indexer = None
if len(missing):
- length = np.arange(len(indexer), dtype=np.intp)
+ length = np.arange(len(indexer))
missing = ensure_platform_int(missing)
missing_labels = target.take(missing)
- missing_indexer = length[~check]
+ missing_indexer = ensure_platform_int(length[~check])
cur_labels = self.take(indexer[check]).values
- cur_indexer = length[check]
+ cur_indexer = ensure_platform_int(length[check])
- # Index constructor below will do inference
new_labels = np.empty((len(indexer),), dtype=object)
new_labels[cur_indexer] = cur_labels
new_labels[missing_indexer] = missing_labels
@@ -3988,7 +3841,6 @@ def _reindex_non_unique(
# --------------------------------------------------------------------
# Join Methods
- @final
@_maybe_return_indexers
def join(
self,
@@ -4020,19 +3872,6 @@ def join(
self_is_mi = isinstance(self, ABCMultiIndex)
other_is_mi = isinstance(other, ABCMultiIndex)
- if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex):
- if (self.tz is None) ^ (other.tz is None):
- # Raise instead of casting to object below.
- raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
-
- if not self._is_multi and not other._is_multi:
- # We have specific handling for MultiIndex below
- pself, pother = self._maybe_promote(other)
- if pself is not self or pother is not other:
- return pself.join(
- pother, how=how, level=level, return_indexers=True, sort=sort
- )
-
lindexer: np.ndarray | None
rindexer: np.ndarray | None
@@ -4069,9 +3908,8 @@ def join(
return join_index, lidx, ridx
if not is_dtype_equal(self.dtype, other.dtype):
- dtype = self._find_common_type_compat(other)
- this = self.astype(dtype, copy=False)
- other = other.astype(dtype, copy=False)
+ this = self.astype("O")
+ other = other.astype("O")
return this.join(other, how=how, return_indexers=True)
_validate_join_method(how)
@@ -4201,7 +4039,8 @@ def _join_multi(self, other: Index, how: str_t):
@final
def _join_non_unique(
self, other: Index, how: str_t = "left"
- ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> tuple[Index, np.ndarray, np.ndarray]:
+ # returned ndarrays are np.intp
from pandas.core.reshape.merge import get_join_indexers
# We only get here if dtypes match
@@ -4229,7 +4068,8 @@ def _join_non_unique(
@final
def _join_level(
self, other: Index, level, how: str_t = "left", keep_order: bool = True
- ) -> tuple[MultiIndex, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
+ ) -> tuple[MultiIndex, np.ndarray | None, np.ndarray | None]:
+ # Any returned ndarrays are np.intp
"""
The join method *only* affects the level of the resulting
MultiIndex. Otherwise it just exactly aligns the Index data to the
@@ -4241,7 +4081,7 @@ def _join_level(
"""
from pandas.core.indexes.multi import MultiIndex
- def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
+ def _get_leaf_sorter(labels: list[np.ndarray]) -> np.ndarray:
"""
Returns sorter for the inner most level while preserving the
order of higher levels.
@@ -4618,12 +4458,6 @@ def is_type_compatible(self, kind: str_t) -> bool:
"""
Whether the index type is compatible with the provided type.
"""
- warnings.warn(
- "Index.is_type_compatible is deprecated and will be removed in a "
- "future version",
- FutureWarning,
- stacklevel=2,
- )
return kind == self.inferred_type
def __contains__(self, key: Any) -> bool:
@@ -4667,10 +4501,9 @@ def __contains__(self, key: Any) -> bool:
except (OverflowError, TypeError, ValueError):
return False
- # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
- # Incompatible types in assignment (expression has type "None", base class
- # "object" defined the type as "Callable[[object], int]")
- __hash__: None # type: ignore[assignment]
+ @final
+ def __hash__(self):
+ raise TypeError(f"unhashable type: {repr(type(self).__name__)}")
@final
def __setitem__(self, key, value):
@@ -4974,26 +4807,16 @@ def asof(self, label):
Traceback (most recent call last):
ValueError: index must be monotonic increasing or decreasing
"""
- self._searchsorted_monotonic(label) # validate sortedness
try:
- loc = self.get_loc(label)
- except (KeyError, TypeError):
- # KeyError -> No exact match, try for padded
- # TypeError -> passed e.g. non-hashable, fall through to get
- # the tested exception message
- indexer = self.get_indexer([label], method="pad")
- if indexer.ndim > 1 or indexer.size > 1:
- raise TypeError("asof requires scalar valued input")
- loc = indexer.item()
- if loc == -1:
- return self._na_value
+ loc = self.get_loc(label, method="pad")
+ except KeyError:
+ return self._na_value
else:
if isinstance(loc, slice):
loc = loc.indices(len(self))[-1]
+ return self[loc]
- return self[loc]
-
- def asof_locs(self, where: Index, mask: np.ndarray) -> npt.NDArray[np.intp]:
+ def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray:
"""
Return the locations (indices) of labels in the index.
@@ -5184,7 +5007,7 @@ def shift(self, periods=1, freq=None):
f"TimedeltaIndex; Got type {type(self).__name__}"
)
- def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
+ def argsort(self, *args, **kwargs) -> np.ndarray:
"""
Return the integer indices that would sort the index.
@@ -5251,7 +5074,7 @@ def get_value(self, series: Series, key):
# try that
loc = self.get_loc(key)
except KeyError:
- if not self._should_fallback_to_positional:
+ if not self._should_fallback_to_positional():
raise
elif is_integer(key):
# If the Index cannot hold integer, then this is unambiguously
@@ -5268,7 +5091,6 @@ def _check_indexing_error(self, key):
# would convert to numpy arrays and raise later any way) - GH29926
raise InvalidIndexError(key)
- @cache_readonly
def _should_fallback_to_positional(self) -> bool:
"""
Should an integer key be treated as positional?
@@ -5335,11 +5157,9 @@ def set_value(self, arr, key, value):
"""
@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
- def get_indexer_non_unique(
- self, target
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ def get_indexer_non_unique(self, target) -> tuple[np.ndarray, np.ndarray]:
+ # both returned ndarrays are np.intp
target = ensure_index(target)
- target = self._maybe_cast_listlike_indexer(target)
if not self._should_compare(target) and not is_interval_dtype(self.dtype):
# IntervalIndex get special treatment bc numeric scalars can be
@@ -5359,15 +5179,13 @@ def get_indexer_non_unique(
that = target.astype(dtype, copy=False)
return this.get_indexer_non_unique(that)
- # Note: _maybe_promote ensures we never get here with MultiIndex
- # self and non-Multi target
tgt_values = target._get_engine_target()
indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
return ensure_platform_int(indexer), ensure_platform_int(missing)
@final
- def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
+ def get_indexer_for(self, target, **kwargs) -> np.ndarray:
"""
Guaranteed return of an indexer even when non-unique.
@@ -5380,115 +5198,35 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
List of indices.
"""
if self._index_as_unique:
- return self.get_indexer(target)
+ return self.get_indexer(target, **kwargs)
indexer, _ = self.get_indexer_non_unique(target)
return indexer
- def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]:
- """
- Analogue to get_indexer that raises if any elements are missing.
- """
- keyarr = key
- if not isinstance(keyarr, Index):
- keyarr = com.asarray_tuplesafe(keyarr)
-
- if self._index_as_unique:
- indexer = self.get_indexer_for(keyarr)
- keyarr = self.reindex(keyarr)[0]
- else:
- keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-
- self._raise_if_missing(keyarr, indexer, axis_name)
-
- if (
- needs_i8_conversion(self.dtype)
- or is_categorical_dtype(self.dtype)
- or is_interval_dtype(self.dtype)
- ):
- # For CategoricalIndex take instead of reindex to preserve dtype.
- # For IntervalIndex this is to map integers to the Intervals they match to.
- keyarr = self.take(indexer)
- if keyarr.dtype.kind in ["m", "M"]:
- # DTI/TDI.take can infer a freq in some cases when we dont want one
- if isinstance(key, list) or (
- isinstance(key, type(self))
- # "Index" has no attribute "freq"
- and key.freq is None # type: ignore[attr-defined]
- ):
- keyarr = keyarr._with_freq(None)
-
- return keyarr, indexer
-
- def _raise_if_missing(self, key, indexer, axis_name: str_t):
- """
- Check that indexer can be used to return a result.
-
- e.g. at least one element was found,
- unless the list of keys was actually empty.
-
- Parameters
- ----------
- key : list-like
- Targeted labels (only used to show correct error message).
- indexer: array-like of booleans
- Indices corresponding to the key,
- (with -1 indicating not found).
- axis_name : str
-
- Raises
- ------
- KeyError
- If at least one key was requested but none was found.
- """
- if len(key) == 0:
- return
-
- # Count missing values
- missing_mask = indexer < 0
- nmissing = missing_mask.sum()
-
- if nmissing:
-
- # TODO: remove special-case; this is just to keep exception
- # message tests from raising while debugging
- use_interval_msg = is_interval_dtype(self.dtype) or (
- is_categorical_dtype(self.dtype)
- # "Index" has no attribute "categories" [attr-defined]
- and is_interval_dtype(
- self.categories.dtype # type: ignore[attr-defined]
- )
- )
-
- if nmissing == len(indexer):
- if use_interval_msg:
- key = list(key)
- raise KeyError(f"None of [{key}] are in the [{axis_name}]")
-
- not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
- raise KeyError(f"{not_found} not in index")
-
@overload
def _get_indexer_non_comparable(
self, target: Index, method, unique: Literal[True] = ...
- ) -> npt.NDArray[np.intp]:
+ ) -> np.ndarray:
+ # returned ndarray is np.intp
...
@overload
def _get_indexer_non_comparable(
self, target: Index, method, unique: Literal[False]
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> tuple[np.ndarray, np.ndarray]:
+ # both returned ndarrays are np.intp
...
@overload
def _get_indexer_non_comparable(
self, target: Index, method, unique: bool = True
- ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
+ # any returned ndarrays are np.intp
...
@final
def _get_indexer_non_comparable(
self, target: Index, method, unique: bool = True
- ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ ) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
"""
Called from get_indexer or get_indexer_non_unique when the target
is of a non-comparable dtype.
@@ -5514,16 +5252,6 @@ def _get_indexer_non_comparable(
"""
if method is not None:
other = unpack_nested_dtype(target)
- if self._is_multi ^ other._is_multi:
- kind = other.dtype.type if self._is_multi else self.dtype.type
- raise TypeError(
- f"'<' not supported between instances of {kind} and 'tuple'"
- )
- elif self._is_multi and other._is_multi:
- assert self.nlevels != other.nlevels
- # Python allows comparison between tuples of different lengths,
- # but for our purposes such a comparison is not meaningful.
- raise TypeError("'<' not supported between tuples of different lengths")
raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")
no_matches = -1 * np.ones(target.shape, dtype=np.intp)
@@ -5575,21 +5303,6 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]:
if not is_object_dtype(self.dtype):
return self.astype("object"), other.astype("object")
- elif self.dtype.kind == "u" and other.dtype.kind == "i":
- # GH#41873
- if other.min() >= 0:
- # lookup min as it may be cached
- # TODO: may need itemsize check if we have non-64-bit Indexes
- return self, other.astype(self.dtype)
-
- elif self._is_multi and not other._is_multi:
- try:
- # "Type[Index]" has no attribute "from_tuples"
- other = type(self).from_tuples(other) # type: ignore[attr-defined]
- except (TypeError, ValueError):
- # let's instead try with a straight Index
- self = Index(self._values)
-
if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype):
# Reverse op so we dont need to re-implement on the subclasses
other, self = other._maybe_promote(self)
@@ -5653,14 +5366,6 @@ def _should_compare(self, other: Index) -> bool:
other = unpack_nested_dtype(other)
dtype = other.dtype
- if other._is_multi:
- if not self._is_multi:
- # other contains only tuples so unless we are object-dtype,
- # there can never be any matches
- return self._is_comparable_dtype(dtype)
- return self.nlevels == other.nlevels
- # TODO: we can get more specific requiring levels are comparable?
-
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
@@ -5740,7 +5445,7 @@ def map(self, mapper, na_action=None):
# TODO: De-duplicate with map, xref GH#32349
@final
- def _transform_index(self, func, *, level=None) -> Index:
+ def _transform_index(self, func, level=None) -> Index:
"""
Apply function to all values found in index.
@@ -5749,7 +5454,6 @@ def _transform_index(self, func, *, level=None) -> Index:
"""
if isinstance(self, ABCMultiIndex):
if level is not None:
- # Caller is responsible for ensuring level is positional.
items = [
tuple(func(y) if i == level else y for i, y in enumerate(x))
for x in self
@@ -6192,7 +5896,8 @@ def insert(self, loc: int, item) -> Index:
try:
item = self._validate_fill_value(item)
except TypeError:
- dtype = self._find_common_type_compat(item)
+ inferred, _ = infer_dtype_from(item)
+ dtype = find_common_type([self.dtype, inferred])
return self.astype(dtype).insert(loc, item)
arr = np.asarray(self)
@@ -6208,7 +5913,7 @@ def drop(self, labels, errors: str_t = "raise") -> Index:
Parameters
----------
- labels : array-like or scalar
+ labels : array-like
errors : {'ignore', 'raise'}, default 'raise'
If 'ignore', suppress error and existing labels are dropped.
@@ -6307,9 +6012,6 @@ def __inv__(self):
# TODO: __inv__ vs __invert__?
return self._unary_method(lambda x: -x)
- # --------------------------------------------------------------------
- # Reductions
-
def any(self, *args, **kwargs):
"""
Return whether any element is Truthy.
@@ -6323,8 +6025,8 @@ def any(self, *args, **kwargs):
Returns
-------
- any : bool or array-like (if axis is specified)
- A single element array-like may be converted to bool.
+ any : bool or array_like (if axis is specified)
+ A single element array_like may be converted to bool.
See Also
--------
@@ -6367,8 +6069,8 @@ def all(self, *args, **kwargs):
Returns
-------
- all : bool or array-like (if axis is specified)
- A single element array-like may be converted to bool.
+ all : bool or array_like (if axis is specified)
+ A single element array_like may be converted to bool.
See Also
--------
@@ -6430,84 +6132,6 @@ def _maybe_disable_logical_methods(self, opname: str_t):
# This call will raise
make_invalid_op(opname)(self)
- @Appender(IndexOpsMixin.argmin.__doc__)
- def argmin(self, axis=None, skipna=True, *args, **kwargs):
- nv.validate_argmin(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return -1
- return super().argmin(skipna=skipna)
-
- @Appender(IndexOpsMixin.argmax.__doc__)
- def argmax(self, axis=None, skipna=True, *args, **kwargs):
- nv.validate_argmax(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return -1
- return super().argmax(skipna=skipna)
-
- @doc(IndexOpsMixin.min)
- def min(self, axis=None, skipna=True, *args, **kwargs):
- nv.validate_min(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not len(self):
- return self._na_value
-
- if len(self) and self.is_monotonic_increasing:
- # quick check
- first = self[0]
- if not isna(first):
- return first
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return self._na_value
-
- if not self._is_multi and not isinstance(self._values, np.ndarray):
- # "ExtensionArray" has no attribute "min"
- return self._values.min(skipna=skipna) # type: ignore[attr-defined]
-
- return super().min(skipna=skipna)
-
- @doc(IndexOpsMixin.max)
- def max(self, axis=None, skipna=True, *args, **kwargs):
- nv.validate_max(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not len(self):
- return self._na_value
-
- if len(self) and self.is_monotonic_increasing:
- # quick check
- last = self[-1]
- if not isna(last):
- return last
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return self._na_value
-
- if not self._is_multi and not isinstance(self._values, np.ndarray):
- # "ExtensionArray" has no attribute "max"
- return self._values.max(skipna=skipna) # type: ignore[attr-defined]
-
- return super().max(skipna=skipna)
-
- # --------------------------------------------------------------------
-
@final
@property
def shape(self) -> Shape:
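A small illustration (not part of the patch) of the symmetric_difference composition restored above: the result is (self minus other) unioned with (other minus self), renamed if a result_name was resolved.

import pandas as pd

left = pd.Index([1, 2, 3, 4])
right = pd.Index([3, 4, 5])

# Equivalent, per the hunk above, to
# left.difference(right, sort=False).union(right.difference(left, sort=False))
print(left.symmetric_difference(right))  # contains 1, 2 and 5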
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 7339c82cbcc77..228f58d47b8ed 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -12,13 +12,17 @@
from pandas._libs import index as libindex
from pandas._typing import (
+ ArrayLike,
Dtype,
DtypeObj,
- npt,
)
-from pandas.util._decorators import doc
+from pandas.util._decorators import (
+ Appender,
+ doc,
+)
from pandas.core.dtypes.common import (
+ ensure_platform_int,
is_categorical_dtype,
is_scalar,
)
@@ -28,6 +32,7 @@
notna,
)
+from pandas.core import accessor
from pandas.core.arrays.categorical import (
Categorical,
contains,
@@ -36,6 +41,7 @@
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
Index,
+ _index_shared_docs,
maybe_extract_name,
)
from pandas.core.indexes.extension import (
@@ -63,8 +69,9 @@
],
Categorical,
)
-@inherit_names(
- [
+@accessor.delegate_names(
+ delegate=Categorical,
+ accessors=[
"rename_categories",
"reorder_categories",
"add_categories",
@@ -74,10 +81,10 @@
"as_ordered",
"as_unordered",
],
- Categorical,
- wrap=True,
+ typ="method",
+ overwrite=True,
)
-class CategoricalIndex(NDArrayBackedExtensionIndex):
+class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate):
"""
Index based on an underlying :class:`Categorical`.
@@ -186,12 +193,17 @@ def _can_hold_strings(self):
def _engine_type(self):
# self.codes can have dtype int8, int16, int32 or int64, so we need
# to return the corresponding engine type (libindex.Int8Engine, etc.).
+
+ # error: Invalid index type "Type[generic]" for "Dict[Type[signedinteger[Any]],
+ # Any]"; expected type "Type[signedinteger[Any]]"
return {
np.int8: libindex.Int8Engine,
np.int16: libindex.Int16Engine,
np.int32: libindex.Int32Engine,
np.int64: libindex.Int64Engine,
- }[self.codes.dtype.type]
+ }[
+ self.codes.dtype.type # type: ignore[index]
+ ]
_attributes = ["name"]
@@ -369,7 +381,7 @@ def fillna(self, value, downcast=None):
def reindex(
self, target, method=None, level=None, limit=None, tolerance=None
- ) -> tuple[Index, npt.NDArray[np.intp] | None]:
+ ) -> tuple[Index, np.ndarray | None]:
"""
Create index with target's values (move/add/delete values as necessary)
@@ -400,9 +412,9 @@ def reindex(
indexer = None
missing = np.array([], dtype=np.intp)
else:
- indexer, missing = self.get_indexer_non_unique(target)
+ indexer, missing = self.get_indexer_non_unique(np.array(target))
- if len(self) and indexer is not None:
+ if len(self.codes) and indexer is not None:
new_target = self.take(indexer)
else:
new_target = target
@@ -411,8 +423,10 @@ def reindex(
if len(missing):
cats = self.categories.get_indexer(target)
- if not isinstance(target, CategoricalIndex) or (cats == -1).any():
- new_target, indexer, _ = super()._reindex_non_unique(target)
+ if not isinstance(cats, CategoricalIndex) or (cats == -1).any():
+ # coerce to a regular index here!
+ result = Index(np.array(self), name=self.name)
+ new_target, indexer, _ = result._reindex_non_unique(target)
else:
codes = new_target.codes.copy()
@@ -425,32 +439,84 @@ def reindex(
# coerce based on the actual values, only on the dtype)
# unless we had an initial Categorical to begin with
# in which case we are going to conform to the passed Categorical
+ new_target = np.asarray(new_target)
if is_categorical_dtype(target):
cat = Categorical(new_target, dtype=target.dtype)
new_target = type(self)._simple_new(cat, name=self.name)
else:
- # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
- new_target = np.asarray(new_target)
new_target = Index(new_target, name=self.name)
return new_target, indexer
+ # error: Return type "Tuple[Index, Optional[ndarray], Optional[ndarray]]"
+ # of "_reindex_non_unique" incompatible with return type
+ # "Tuple[Index, ndarray, Optional[ndarray]]" in supertype "Index"
+ def _reindex_non_unique( # type: ignore[override]
+ self, target: Index
+ ) -> tuple[Index, np.ndarray | None, np.ndarray | None]:
+ """
+ reindex from a non-unique index, which CategoricalIndexes almost
+ always are
+ """
+ # TODO: rule out `indexer is None` here to make the signature
+ # match the parent class's signature. This should be equivalent
+ # to ruling out `self.equals(target)`
+ new_target, indexer = self.reindex(target)
+ new_indexer = None
+
+ check = indexer == -1
+ # error: Item "bool" of "Union[Any, bool]" has no attribute "any"
+ if check.any(): # type: ignore[union-attr]
+ new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp)
+ new_indexer[check] = -1
+
+ cats = self.categories.get_indexer(target)
+ if not (cats == -1).any():
+ # .reindex returns normal Index. Revert to CategoricalIndex if
+ # all targets are included in my categories
+ cat = Categorical(new_target, dtype=self.dtype)
+ new_target = type(self)._simple_new(cat, name=self.name)
+
+ return new_target, indexer, new_indexer
+
# --------------------------------------------------------------------
# Indexing Methods
def _maybe_cast_indexer(self, key) -> int:
- # GH#41933: we have to do this instead of self._data._validate_scalar
- # because this will correctly get partial-indexing on Interval categories
- try:
- return self._data._unbox_scalar(key)
- except KeyError:
- if is_valid_na_for_dtype(key, self.categories.dtype):
- return -1
- raise
-
- def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
- if isinstance(values, CategoricalIndex):
- values = values._data
+ return self._data._unbox_scalar(key)
+
+ def _get_indexer(
+ self,
+ target: Index,
+ method: str | None = None,
+ limit: int | None = None,
+ tolerance=None,
+ ) -> np.ndarray:
+ # returned ndarray is np.intp
+
+ if self.equals(target):
+ return np.arange(len(self), dtype="intp")
+
+ return self._get_indexer_non_unique(target._values)[0]
+
+ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
+ def get_indexer_non_unique(self, target) -> tuple[np.ndarray, np.ndarray]:
+ # both returned ndarrays are np.intp
+ target = ibase.ensure_index(target)
+ return self._get_indexer_non_unique(target._values)
+
+ def _get_indexer_non_unique(
+ self, values: ArrayLike
+ ) -> tuple[np.ndarray, np.ndarray]:
+ # both returned ndarrays are np.intp
+ """
+ get_indexer_non_unique but after unwrapping the target Index object.
+ """
+ # Note: we use engine.get_indexer_non_unique for get_indexer in addition
+ # to get_indexer_non_unique because, even if `target` is unique, any
+ # non-category entries in it will be encoded as -1 so `codes` may
+ # not be unique.
+
if isinstance(values, Categorical):
# Indexing on codes is more efficient if categories are the same,
# so we can apply some optimizations based on the degree of
@@ -459,9 +525,9 @@ def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
codes = cat._codes
else:
codes = self.categories.get_indexer(values)
- codes = codes.astype(self.codes.dtype, copy=False)
- cat = self._data._from_backing_data(codes)
- return type(self)._simple_new(cat)
+
+ indexer, missing = self._engine.get_indexer_non_unique(codes)
+ return ensure_platform_int(indexer), ensure_platform_int(missing)
# --------------------------------------------------------------------
@@ -560,3 +626,13 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
else:
cat = self._data._from_backing_data(codes)
return type(self)._simple_new(cat, name=name)
+
+ def _delegate_method(self, name: str, *args, **kwargs):
+ """method delegation to the ._values"""
+ method = getattr(self._values, name)
+ if "inplace" in kwargs:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+ res = method(*args, **kwargs)
+ if is_scalar(res):
+ return res
+ return CategoricalIndex(res, name=self.name)
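As a usage sketch of the behaviour implied by the _delegate_method hunk above (the example is not from the patch): category-mutating calls on a CategoricalIndex are forwarded to the underlying Categorical and the result is re-wrapped as a CategoricalIndex, while scalar results are returned unchanged and inplace=True is rejected outright.

import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "a"], categories=["a", "b"])

# Delegated to Categorical.rename_categories, then re-wrapped with the
# original name, so the result is again a CategoricalIndex.
print(ci.rename_categories(["x", "y"]))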
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 07c6a84f75302..df7fae0763c42 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -7,19 +7,17 @@
from typing import (
TYPE_CHECKING,
Any,
- Callable,
Sequence,
TypeVar,
cast,
- final,
)
-import warnings
import numpy as np
from pandas._libs import (
NaT,
Timedelta,
+ iNaT,
lib,
)
from pandas._libs.tslibs import (
@@ -27,8 +25,8 @@
NaTType,
Resolution,
Tick,
- parsing,
)
+from pandas._typing import Callable
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
Appender,
@@ -47,7 +45,6 @@
from pandas.core.arrays import (
DatetimeArray,
- ExtensionArray,
PeriodArray,
TimedeltaArray,
)
@@ -61,6 +58,7 @@
from pandas.core.indexes.extension import (
NDArrayBackedExtensionIndex,
inherit_names,
+ make_wrapped_arith_op,
)
from pandas.core.tools.timedeltas import to_timedelta
@@ -96,6 +94,7 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex):
hasnans = cache_readonly(
DatetimeLikeArrayMixin._hasnans.fget # type: ignore[attr-defined]
)
+ _hasnans = hasnans # for index / array -agnostic code
@property
def _is_all_dates(self) -> bool:
@@ -195,6 +194,120 @@ def tolist(self) -> list:
"""
return list(self.astype(object))
+ def min(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Return the minimum value of the Index or minimum along
+ an axis.
+
+ See Also
+ --------
+ numpy.ndarray.min
+ Series.min : Return the minimum value in a Series.
+ """
+ nv.validate_min(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ if not len(self):
+ return self._na_value
+
+ i8 = self.asi8
+
+ if len(i8) and self.is_monotonic_increasing:
+ # quick check
+ if i8[0] != iNaT:
+ return self._data._box_func(i8[0])
+
+ if self.hasnans:
+ if not skipna:
+ return self._na_value
+ i8 = i8[~self._isnan]
+
+ if not len(i8):
+ return self._na_value
+
+ min_stamp = i8.min()
+ return self._data._box_func(min_stamp)
+
+ def argmin(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Returns the indices of the minimum values along an axis.
+
+ See `numpy.ndarray.argmin` for more information on the
+ `axis` parameter.
+
+ See Also
+ --------
+ numpy.ndarray.argmin
+ """
+ nv.validate_argmin(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ i8 = self.asi8
+ if self.hasnans:
+ mask = self._isnan
+ if mask.all() or not skipna:
+ return -1
+ i8 = i8.copy()
+ i8[mask] = np.iinfo("int64").max
+ return i8.argmin()
+
+ def max(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Return the maximum value of the Index or maximum along
+ an axis.
+
+ See Also
+ --------
+ numpy.ndarray.max
+ Series.max : Return the maximum value in a Series.
+ """
+ nv.validate_max(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ if not len(self):
+ return self._na_value
+
+ i8 = self.asi8
+
+ if len(i8) and self.is_monotonic:
+ # quick check
+ if i8[-1] != iNaT:
+ return self._data._box_func(i8[-1])
+
+ if self.hasnans:
+ if not skipna:
+ return self._na_value
+ i8 = i8[~self._isnan]
+
+ if not len(i8):
+ return self._na_value
+
+ max_stamp = i8.max()
+ return self._data._box_func(max_stamp)
+
+ def argmax(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Returns the indices of the maximum values along an axis.
+
+ See `numpy.ndarray.argmax` for more information on the
+ `axis` parameter.
+
+ See Also
+ --------
+ numpy.ndarray.argmax
+ """
+ nv.validate_argmax(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ i8 = self.asi8
+ if self.hasnans:
+ mask = self._isnan
+ if mask.all() or not skipna:
+ return -1
+ i8 = i8.copy()
+ i8[mask] = 0
+ return i8.argmax()
+
# --------------------------------------------------------------------
# Rendering Methods
@@ -280,26 +393,12 @@ def _summary(self, name=None) -> str:
# --------------------------------------------------------------------
# Indexing Methods
- def _can_partial_date_slice(self, reso: Resolution) -> bool:
+ def _validate_partial_date_slice(self, reso: Resolution):
raise NotImplementedError
- def _parsed_string_to_bounds(self, reso: Resolution, parsed):
+ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
raise NotImplementedError
- def _parse_with_reso(self, label: str):
- # overridden by TimedeltaIndex
- parsed, reso_str = parsing.parse_time_string(label, self.freq)
- reso = Resolution.from_attrname(reso_str)
- return parsed, reso
-
- def _get_string_slice(self, key: str):
- parsed, reso = self._parse_with_reso(key)
- try:
- return self._partial_date_slice(reso, parsed)
- except KeyError as err:
- raise KeyError(key) from err
-
- @final
def _partial_date_slice(
self,
reso: Resolution,
@@ -315,8 +414,7 @@ def _partial_date_slice(
-------
slice or ndarray[intp]
"""
- if not self._can_partial_date_slice(reso):
- raise ValueError
+ self._validate_partial_date_slice(reso)
t1, t2 = self._parsed_string_to_bounds(reso, parsed)
vals = self._data._ndarray
@@ -347,6 +445,23 @@ def _partial_date_slice(
# --------------------------------------------------------------------
# Arithmetic Methods
+ __add__ = make_wrapped_arith_op("__add__")
+ __sub__ = make_wrapped_arith_op("__sub__")
+ __radd__ = make_wrapped_arith_op("__radd__")
+ __rsub__ = make_wrapped_arith_op("__rsub__")
+ __pow__ = make_wrapped_arith_op("__pow__")
+ __rpow__ = make_wrapped_arith_op("__rpow__")
+ __mul__ = make_wrapped_arith_op("__mul__")
+ __rmul__ = make_wrapped_arith_op("__rmul__")
+ __floordiv__ = make_wrapped_arith_op("__floordiv__")
+ __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__")
+ __mod__ = make_wrapped_arith_op("__mod__")
+ __rmod__ = make_wrapped_arith_op("__rmod__")
+ __divmod__ = make_wrapped_arith_op("__divmod__")
+ __rdivmod__ = make_wrapped_arith_op("__rdivmod__")
+ __truediv__ = make_wrapped_arith_op("__truediv__")
+ __rtruediv__ = make_wrapped_arith_op("__rtruediv__")
+
def shift(self: _T, periods: int = 1, freq=None) -> _T:
"""
Shift index by desired number of time frequency increments.
@@ -480,12 +595,7 @@ def _maybe_cast_listlike_indexer(self, keyarr):
try:
res = self._data._validate_listlike(keyarr, allow_object=True)
except (ValueError, TypeError):
- if not isinstance(keyarr, ExtensionArray):
- # e.g. we don't want to cast DTA to ndarray[object]
- res = com.asarray_tuplesafe(keyarr)
- # TODO: com.asarray_tuplesafe shouldn't cast e.g. DatetimeArray
- else:
- res = keyarr
+ res = com.asarray_tuplesafe(keyarr)
return Index(res, dtype=res.dtype)
@@ -504,8 +614,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin):
_is_monotonic_decreasing = Index.is_monotonic_decreasing
_is_unique = Index.is_unique
- _join_precedence = 10
-
def _with_freq(self, freq):
arr = self._data._with_freq(freq)
return type(self)._simple_new(arr, name=self._name)
@@ -516,12 +624,6 @@ def _has_complex_internals(self) -> bool:
return False
def is_type_compatible(self, kind: str) -> bool:
- warnings.warn(
- f"{type(self).__name__}.is_type_compatible is deprecated and will be "
- "removed in a future version",
- FutureWarning,
- stacklevel=2,
- )
return kind in self._data._infer_matches
# --------------------------------------------------------------------
@@ -529,11 +631,15 @@ def is_type_compatible(self, kind: str) -> bool:
def _intersection(self, other: Index, sort=False) -> Index:
"""
- intersection specialized to the case with matching dtypes and both non-empty.
+ intersection specialized to the case with matching dtypes.
"""
other = cast("DatetimeTimedeltaMixin", other)
+ if len(self) == 0:
+ return self.copy()._get_reconciled_name_object(other)
+ if len(other) == 0:
+ return other.copy()._get_reconciled_name_object(self)
- if not self._can_fast_intersect(other):
+ elif not self._can_fast_intersect(other):
result = Index._intersection(self, other, sort=sort)
# We need to invalidate the freq because Index._intersection
# uses _shallow_copy on a view of self._data, which will preserve
@@ -543,11 +649,6 @@ def _intersection(self, other: Index, sort=False) -> Index:
result = self._wrap_setop_result(other, result)
return result._with_freq(None)._with_freq("infer")
- else:
- return self._fast_intersect(other, sort)
-
- def _fast_intersect(self, other, sort):
-
# to make our life easier, "sort" the two ranges
if self[0] <= other[0]:
left, right = self, other
@@ -582,8 +683,7 @@ def _can_fast_intersect(self: _T, other: _T) -> bool:
elif self.freq.is_anchored():
# this along with matching freqs ensure that we "line up",
# so intersection will preserve freq
- # GH#42104
- return self.freq.n == 1
+ return True
elif isinstance(self.freq, Tick):
# We "line up" if and only if the difference between two of our points
@@ -592,8 +692,7 @@ def _can_fast_intersect(self: _T, other: _T) -> bool:
remainder = diff % self.freq.delta
return remainder == Timedelta(0)
- # GH#42104
- return self.freq.n == 1
+ return True
def _can_fast_union(self: _T, other: _T) -> bool:
# Assumes that type(self) == type(other), as per the annotation
@@ -625,7 +724,11 @@ def _can_fast_union(self: _T, other: _T) -> bool:
return (right_start == left_end + freq) or right_start in left
def _fast_union(self: _T, other: _T, sort=None) -> _T:
- # Caller is responsible for ensuring self and other are non-empty
+ if len(other) == 0:
+ return self.view(type(self))
+
+ if len(self) == 0:
+ return other.view(type(self))
# to make our life easier, "sort" the two ranges
if self[0] <= other[0]:
@@ -675,4 +778,39 @@ def _union(self, other, sort):
# that result.freq == self.freq
return result
else:
- return super()._union(other, sort)._with_freq("infer")
+ return super()._union(other, sort=sort)._with_freq("infer")
+
+ # --------------------------------------------------------------------
+ # Join Methods
+ _join_precedence = 10
+
+ def join(
+ self,
+ other,
+ how: str = "left",
+ level=None,
+ return_indexers: bool = False,
+ sort: bool = False,
+ ):
+ """
+ See Index.join
+ """
+ pself, pother = self._maybe_promote(other)
+ if pself is not self or pother is not other:
+ return pself.join(
+ pother, how=how, level=level, return_indexers=return_indexers, sort=sort
+ )
+
+ self._maybe_utc_convert(other)  # raises if we don't have tz-awareness compat
+ return Index.join(
+ self,
+ other,
+ how=how,
+ level=level,
+ return_indexers=return_indexers,
+ sort=sort,
+ )
+
+ def _maybe_utc_convert(self: _T, other: Index) -> tuple[_T, Index]:
+ # Overridden by DatetimeIndex
+ return self, other
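A quick illustration, using only public pandas API, of the "line up" condition the fast-intersection path above relies on: two Tick-frequency indexes can only keep their freq through intersection when their starting points differ by a whole multiple of that freq. Example data is arbitrary.

import pandas as pd

left = pd.date_range("2021-01-01 00:00", periods=6, freq="2H")
right = pd.date_range("2021-01-01 02:00", periods=6, freq="2H")

# The offset between the two starting points is an exact multiple of the
# shared 2-hour Tick, so the indexes "line up" and the fast path can keep freq.
assert (right[0] - left[0]) % pd.Timedelta(hours=2) == pd.Timedelta(0)
print(left.intersection(right).freq)  # the 2H freq is preserved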
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 9712a5d95a234..fbfee9a1f524c 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -33,8 +33,8 @@
from pandas._typing import (
Dtype,
DtypeObj,
- npt,
)
+from pandas.errors import InvalidIndexError
from pandas.util._decorators import (
cache_readonly,
doc,
@@ -117,10 +117,16 @@ def _new_DatetimeIndex(cls, d):
@inherit_names(["is_normalized", "_resolution_obj"], DatetimeArray, cache=True)
@inherit_names(
[
+ "_bool_ops",
+ "_object_ops",
+ "_field_ops",
+ "_datetimelike_ops",
+ "_datetimelike_methods",
"tz",
"tzinfo",
"dtype",
"to_pydatetime",
+ "_has_same_tz",
"_format_native_types",
"date",
"time",
@@ -406,8 +412,7 @@ def union_many(self, others):
this, other = this._maybe_utc_convert(other)
- if len(self) and len(other) and this._can_fast_union(other):
- # union already has fastpath handling for empty cases
+ if this._can_fast_union(other):
this = this._fast_union(other)
else:
this = Index.union(this, other)
@@ -569,6 +574,21 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
-------
lower, upper: pd.Timestamp
"""
+ assert isinstance(reso, Resolution), (type(reso), reso)
+ valid_resos = {
+ "year",
+ "month",
+ "quarter",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "millisecond",
+ "microsecond",
+ }
+ if reso.attrname not in valid_resos:
+ raise KeyError
+
grp = reso.freq_group
per = Period(parsed, freq=grp.value)
start, end = per.start_time, per.end_time
@@ -577,22 +597,36 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
# If an incoming date string contained a UTC offset, need to localize
# the parsed date to this offset first before aligning with the index's
# timezone
- start = start.tz_localize(parsed.tzinfo)
- end = end.tz_localize(parsed.tzinfo)
-
if parsed.tzinfo is not None:
if self.tz is None:
raise ValueError(
"The index must be timezone aware when indexing "
"with a date string with a UTC offset"
)
- start = self._maybe_cast_for_get_loc(start)
- end = self._maybe_cast_for_get_loc(end)
+ start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz)
+ end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz)
+ elif self.tz is not None:
+ start = start.tz_localize(self.tz)
+ end = end.tz_localize(self.tz)
return start, end
- def _can_partial_date_slice(self, reso: Resolution) -> bool:
- # History of conversation GH#3452, GH#3931, GH#2369, GH#14826
- return reso > self._resolution_obj
+ def _validate_partial_date_slice(self, reso: Resolution):
+ assert isinstance(reso, Resolution), (type(reso), reso)
+ if (
+ self.is_monotonic
+ and reso.attrname in ["day", "hour", "minute", "second"]
+ and self._resolution_obj >= reso
+ ):
+ # These resolution/monotonicity validations came from GH3931,
+ # GH3452 and GH2369.
+
+ # See also GH14826
+ raise KeyError
+
+ if reso.attrname == "microsecond":
+ # _partial_date_slice doesn't allow microsecond resolution, but
+ # _parsed_string_to_bounds allows it.
+ raise KeyError
def _deprecate_mismatched_indexing(self, key) -> None:
# GH#36148
@@ -624,7 +658,8 @@ def get_loc(self, key, method=None, tolerance=None):
-------
loc : int
"""
- self._check_indexing_error(key)
+ if not is_scalar(key):
+ raise InvalidIndexError(key)
orig_key = key
if is_valid_na_for_dtype(key, self.dtype):
@@ -636,22 +671,14 @@ def get_loc(self, key, method=None, tolerance=None):
key = self._maybe_cast_for_get_loc(key)
elif isinstance(key, str):
-
try:
- parsed, reso = self._parse_with_reso(key)
- except ValueError as err:
- raise KeyError(key) from err
+ return self._get_string_slice(key)
+ except (TypeError, KeyError, ValueError, OverflowError):
+ pass
- if self._can_partial_date_slice(reso):
- try:
- return self._partial_date_slice(reso, parsed)
- except KeyError as err:
- if method is None:
- raise KeyError(key) from err
try:
key = self._maybe_cast_for_get_loc(key)
except ValueError as err:
- # FIXME: we get here because parse_with_reso doesn't raise on "t2m"
raise KeyError(key) from err
elif isinstance(key, timedelta):
@@ -707,11 +734,13 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound")
if isinstance(label, str):
+ freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
try:
- parsed, reso = self._parse_with_reso(label)
+ parsed, reso_str = parsing.parse_time_string(label, freq)
except parsing.DateParseError as err:
raise self._invalid_indexer("slice", label) from err
+ reso = Resolution.from_attrname(reso_str)
lower, upper = self._parsed_string_to_bounds(reso, parsed)
# lower, upper form the half-open interval:
# [parsed, parsed + 1 freq)
@@ -729,6 +758,12 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
return self._maybe_cast_for_get_loc(label)
+ def _get_string_slice(self, key: str):
+ freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
+ parsed, reso_str = parsing.parse_time_string(key, freq)
+ reso = Resolution.from_attrname(reso_str)
+ return self._partial_date_slice(reso, parsed)
+
def slice_indexer(self, start=None, end=None, step=None, kind=None):
"""
Return indexer for specified label slice.
@@ -808,7 +843,7 @@ def inferred_type(self) -> str:
# sure we can't have ambiguous indexing
return "datetime64"
- def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]:
+ def indexer_at_time(self, time, asof: bool = False) -> np.ndarray:
"""
Return index locations of values at particular time of day
(e.g. 9:30AM).
@@ -849,7 +884,7 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]:
def indexer_between_time(
self, start_time, end_time, include_start: bool = True, include_end: bool = True
- ) -> npt.NDArray[np.intp]:
+ ) -> np.ndarray:
"""
Return index locations of values between particular times of day
(e.g., 9:00-9:30AM).
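For reference, a small public-API sketch of the bound computation described in _parsed_string_to_bounds above (the example dates are arbitrary): a partially specified date string maps to the start and end of the corresponding Period, and the bounds are tz-localized when the index carries a timezone.

import pandas as pd

per = pd.Period("2016-01", freq="M")
start, end = per.start_time, per.end_time
print(start)  # 2016-01-01 00:00:00
print(end)    # 2016-01-31 23:59:59.999999999

# With a tz-aware index the bounds would additionally be localized, e.g.:
start = start.tz_localize("UTC")
end = end.tz_localize("UTC")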
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index 1458ff1cdaa51..6ff20f7d009bc 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -102,8 +102,6 @@ def fset(self, value):
else:
def method(self, *args, **kwargs):
- if "inplace" in kwargs:
- raise ValueError(f"cannot use inplace with {type(self).__name__}")
result = attr(self._data, *args, **kwargs)
if wrap:
if isinstance(result, type(self._data)):
@@ -161,7 +159,7 @@ def wrapper(self, other):
return wrapper
-def _make_wrapped_arith_op(opname: str):
+def make_wrapped_arith_op(opname: str):
def method(self, other):
if (
isinstance(other, Index)
@@ -172,16 +170,7 @@ def method(self, other):
# a chance to implement ops before we unwrap them.
# See https://github.com/pandas-dev/pandas/issues/31109
return NotImplemented
-
- try:
- meth = getattr(self._data, opname)
- except AttributeError as err:
- # e.g. Categorical, IntervalArray
- cls = type(self).__name__
- raise TypeError(
- f"cannot perform {opname} with this index type: {cls}"
- ) from err
-
+ meth = getattr(self._data, opname)
result = meth(_maybe_unwrap_index(other))
return _wrap_arithmetic_op(self, other, result)
@@ -278,23 +267,6 @@ def _simple_new(
__le__ = _make_wrapped_comparison_op("__le__")
__ge__ = _make_wrapped_comparison_op("__ge__")
- __add__ = _make_wrapped_arith_op("__add__")
- __sub__ = _make_wrapped_arith_op("__sub__")
- __radd__ = _make_wrapped_arith_op("__radd__")
- __rsub__ = _make_wrapped_arith_op("__rsub__")
- __pow__ = _make_wrapped_arith_op("__pow__")
- __rpow__ = _make_wrapped_arith_op("__rpow__")
- __mul__ = _make_wrapped_arith_op("__mul__")
- __rmul__ = _make_wrapped_arith_op("__rmul__")
- __floordiv__ = _make_wrapped_arith_op("__floordiv__")
- __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__")
- __mod__ = _make_wrapped_arith_op("__mod__")
- __rmod__ = _make_wrapped_arith_op("__rmod__")
- __divmod__ = _make_wrapped_arith_op("__divmod__")
- __rdivmod__ = _make_wrapped_arith_op("__rdivmod__")
- __truediv__ = _make_wrapped_arith_op("__truediv__")
- __rtruediv__ = _make_wrapped_arith_op("__rtruediv__")
-
@property
def _has_complex_internals(self) -> bool:
# used to avoid libreduction code paths, which raise or require conversion
@@ -392,6 +364,13 @@ def _validate_fill_value(self, value):
"""
return self._data._validate_setitem_value(value)
+ def _get_unique_index(self):
+ if self.is_unique:
+ return self
+
+ result = self._data.unique()
+ return type(self)._simple_new(result, name=self.name)
+
@doc(Index.map)
def map(self, mapper, na_action=None):
# Try to run function on index first, and then on elements of index
@@ -418,13 +397,11 @@ def astype(self, dtype, copy: bool = True) -> Index:
return self
return self.copy()
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Literal['M8[ns]']")
if (
isinstance(self.dtype, np.dtype)
and isinstance(dtype, np.dtype)
and dtype.kind == "M"
- and dtype != "M8[ns]" # type: ignore[comparison-overlap]
+ and dtype != "M8[ns]"
):
# For now Datetime supports this by unwrapping ndarray, but DTI doesn't
raise TypeError(f"Cannot cast {type(self).__name__} to dtype")
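A minimal, hypothetical sketch (names are not from the patch) of the deferral pattern that make_wrapped_arith_op implements: the Index-level dunder looks up the same dunder on the backing array and calls it; the real helper additionally unwraps the other operand and re-wraps the result.

def make_deferring_op(opname: str):
    # Build an Index dunder that simply defers to the backing array's dunder.
    def method(self, other):
        meth = getattr(self._data, opname)  # e.g. DatetimeArray.__add__
        return meth(other)                  # the real code re-wraps this result
    method.__name__ = opname
    return method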
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index c401ad0c1e0d5..072ab7dff8e5b 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -7,8 +7,10 @@
)
import textwrap
from typing import (
+ TYPE_CHECKING,
Any,
Hashable,
+ cast,
)
import numpy as np
@@ -28,7 +30,6 @@
from pandas._typing import (
Dtype,
DtypeObj,
- npt,
)
from pandas.errors import InvalidIndexError
from pandas.util._decorators import (
@@ -38,7 +39,6 @@
from pandas.util._exceptions import rewrite_exception
from pandas.core.dtypes.cast import (
- construct_1d_object_array_from_listlike,
find_common_type,
infer_dtype_from_scalar,
maybe_box_datetimelike,
@@ -46,6 +46,7 @@
)
from pandas.core.dtypes.common import (
ensure_platform_int,
+ is_categorical_dtype,
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_dtype_equal,
@@ -62,7 +63,7 @@
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas.core.dtypes.missing import is_valid_na_for_dtype
-from pandas.core.algorithms import unique
+from pandas.core.algorithms import take_nd
from pandas.core.arrays.interval import (
IntervalArray,
_interval_shared_docs,
@@ -90,6 +91,9 @@
timedelta_range,
)
+if TYPE_CHECKING:
+ from pandas import CategoricalIndex
+
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(
@@ -609,7 +613,9 @@ def get_loc(
0
"""
self._check_indexing_method(method)
- self._check_indexing_error(key)
+
+ if not is_scalar(key):
+ raise InvalidIndexError(key)
if isinstance(key, Interval):
if self.closed != key.closed:
@@ -645,40 +651,48 @@ def _get_indexer(
method: str | None = None,
limit: int | None = None,
tolerance: Any | None = None,
- ) -> npt.NDArray[np.intp]:
+ ) -> np.ndarray:
+ # returned ndarray is np.intp
if isinstance(target, IntervalIndex):
+ # equal indexes -> 1:1 positional match
+ if self.equals(target):
+ return np.arange(len(self), dtype="intp")
+
+ if not self._should_compare(target):
+ return self._get_indexer_non_comparable(target, method, unique=True)
+
# non-overlapping -> at most one match per interval in target
# want exact matches -> need both left/right to match, so defer to
# left/right get_indexer, compare elementwise, equality -> match
left_indexer = self.left.get_indexer(target.left)
right_indexer = self.right.get_indexer(target.right)
indexer = np.where(left_indexer == right_indexer, left_indexer, -1)
-
- elif not is_object_dtype(target.dtype):
+ elif is_categorical_dtype(target.dtype):
+ target = cast("CategoricalIndex", target)
+ # get an indexer for unique categories then propagate to codes via take_nd
+ categories_indexer = self.get_indexer(target.categories)
+ indexer = take_nd(categories_indexer, target.codes, fill_value=-1)
+ elif not is_object_dtype(target):
# homogeneous scalar index: use IntervalTree
- # we should always have self._should_partial_index(target) here
target = self._maybe_convert_i8(target)
indexer = self._engine.get_indexer(target.values)
else:
# heterogeneous scalar index: defer elementwise to get_loc
- # we should always have self._should_partial_index(target) here
return self._get_indexer_pointwise(target)[0]
return ensure_platform_int(indexer)
@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
- def get_indexer_non_unique(
- self, target: Index
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ def get_indexer_non_unique(self, target: Index) -> tuple[np.ndarray, np.ndarray]:
+ # both returned ndarrays are np.intp
target = ensure_index(target)
- if not self._should_compare(target) and not self._should_partial_index(target):
- # e.g. IntervalIndex with different closed or incompatible subtype
- # -> no matches
+ if isinstance(target, IntervalIndex) and not self._should_compare(target):
+ # different closed or incompatible subtype -> no matches
return self._get_indexer_non_comparable(target, None, unique=False)
- elif is_object_dtype(target.dtype) or not self._should_partial_index(target):
+ elif is_object_dtype(target.dtype) or isinstance(target, IntervalIndex):
# target might contain intervals: defer elementwise to get_loc
return self._get_indexer_pointwise(target)
@@ -690,9 +704,8 @@ def get_indexer_non_unique(
return ensure_platform_int(indexer), ensure_platform_int(missing)
- def _get_indexer_pointwise(
- self, target: Index
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray]:
+ # both returned ndarrays are np.intp
"""
pointwise implementation for get_indexer and get_indexer_non_unique.
"""
@@ -740,7 +753,6 @@ def _convert_slice_indexer(self, key: slice, kind: str):
return super()._convert_slice_indexer(key, kind)
- @cache_readonly
def _should_fallback_to_positional(self) -> bool:
# integer lookups in Series.__getitem__ are unambiguously
# positional in this case
@@ -790,80 +802,6 @@ def _format_data(self, name=None) -> str:
# name argument is unused here; just for compat with base / categorical
return self._data._format_data() + "," + self._format_space()
- # --------------------------------------------------------------------
- # Set Operations
-
- def _intersection(self, other, sort):
- """
- intersection specialized to the case with matching dtypes.
- """
- # For IntervalIndex we also know other.closed == self.closed
- if self.left.is_unique and self.right.is_unique:
- taken = self._intersection_unique(other)
- elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1:
- # Swap other/self if other is unique and self does not have
- # multiple NaNs
- taken = other._intersection_unique(self)
- else:
- # duplicates
- taken = self._intersection_non_unique(other)
-
- if sort is None:
- taken = taken.sort_values()
-
- return taken
-
- def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex:
- """
- Used when the IntervalIndex does not have any common endpoint,
- no matter left or right.
- Return the intersection with another IntervalIndex.
- Parameters
- ----------
- other : IntervalIndex
- Returns
- -------
- IntervalIndex
- """
- # Note: this is much more performant than super()._intersection(other)
- lindexer = self.left.get_indexer(other.left)
- rindexer = self.right.get_indexer(other.right)
-
- match = (lindexer == rindexer) & (lindexer != -1)
- indexer = lindexer.take(match.nonzero()[0])
- indexer = unique(indexer)
-
- return self.take(indexer)
-
- def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex:
- """
- Used when the IntervalIndex does have some common endpoints,
- on either sides.
- Return the intersection with another IntervalIndex.
-
- Parameters
- ----------
- other : IntervalIndex
-
- Returns
- -------
- IntervalIndex
- """
- # Note: this is about 3.25x faster than super()._intersection(other)
- # in IntervalIndexMethod.time_intersection_both_duplicate(1000)
- mask = np.zeros(len(self), dtype=bool)
-
- if self.hasnans and other.hasnans:
- first_nan_loc = np.arange(len(self))[self.isna()][0]
- mask[first_nan_loc] = True
-
- other_tups = set(zip(other.left, other.right))
- for i, tup in enumerate(zip(self.left, self.right)):
- if tup in other_tups:
- mask[i] = True
-
- return self[mask]
-
# --------------------------------------------------------------------
@property
@@ -874,19 +812,6 @@ def _is_all_dates(self) -> bool:
"""
return False
- def _get_join_target(self) -> np.ndarray:
- # constructing tuples is much faster than constructing Intervals
- tups = list(zip(self.left, self.right))
- target = construct_1d_object_array_from_listlike(tups)
- return target
-
- def _from_join_target(self, result):
- left, right = list(zip(*result))
- arr = type(self._data).from_arrays(
- left, right, dtype=self.dtype, closed=self.closed
- )
- return type(self)._simple_new(arr, name=self.name)
-
# TODO: arithmetic operations
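A rough illustration, with made-up data, of the categorical fast path added to IntervalIndex._get_indexer above: the indexer is computed once for the unique categories and then fanned out to every row through the categorical codes, filling -1 where a code is missing.

import numpy as np
import pandas as pd

ii = pd.IntervalIndex.from_breaks([0, 1, 2, 3])
target = pd.CategoricalIndex([0.5, 2.5, 0.5, 2.5])

categories_indexer = ii.get_indexer(target.categories)  # one lookup per category
codes = np.asarray(target.codes)
indexer = np.where(codes == -1, -1, categories_indexer[codes])
print(indexer)  # [0 2 0 2]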
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 9b56e4cf89498..821d696200175 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -73,6 +73,7 @@
from pandas.core.arrays import Categorical
from pandas.core.arrays.categorical import factorize_from_iterables
import pandas.core.common as com
+from pandas.core.indexers import is_empty_indexer
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
Index,
@@ -760,7 +761,6 @@ def levels(self) -> FrozenList:
def _set_levels(
self,
levels,
- *,
level=None,
copy: bool = False,
validate: bool = True,
@@ -954,7 +954,6 @@ def codes(self):
def _set_codes(
self,
codes,
- *,
level=None,
copy: bool = False,
validate: bool = True,
@@ -1393,7 +1392,7 @@ def format(
def _get_names(self) -> FrozenList:
return FrozenList(self._names)
- def _set_names(self, names, *, level=None, validate: bool = True):
+ def _set_names(self, names, level=None, validate: bool = True):
"""
Set new names on index. Each name has to be a hashable type.
@@ -1474,7 +1473,7 @@ def _set_names(self, names, *, level=None, validate: bool = True):
# --------------------------------------------------------------------
@doc(Index._get_grouper_for_level)
- def _get_grouper_for_level(self, mapper, *, level):
+ def _get_grouper_for_level(self, mapper, level):
indexer = self.codes[level]
level_index = self.levels[level]
@@ -2477,7 +2476,53 @@ def sortlevel(
return new_index, indexer
- def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
+ def reindex(
+ self, target, method=None, level=None, limit=None, tolerance=None
+ ) -> tuple[MultiIndex, np.ndarray | None]:
+ """
+ Create index with target's values (move/add/delete values as necessary)
+
+ Returns
+ -------
+ new_index : pd.MultiIndex
+ Resulting index
+ indexer : np.ndarray[np.intp] or None
+ Indices of output values in original index.
+
+ """
+ # GH6552: preserve names when reindexing to non-named target
+ # (i.e. neither Index nor Series).
+ preserve_names = not hasattr(target, "names")
+
+ if level is not None:
+ if method is not None:
+ raise TypeError("Fill method not supported if level passed")
+
+ # GH7774: preserve dtype/tz if target is empty and not an Index.
+ # target may be an iterator
+ target = ibase.ensure_has_len(target)
+ if len(target) == 0 and not isinstance(target, Index):
+ idx = self.levels[level]
+ attrs = idx._get_attributes_dict()
+ attrs.pop("freq", None) # don't preserve freq
+ target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs)
+ else:
+ target = ensure_index(target)
+ target, indexer, _ = self._join_level(
+ target, level, how="right", keep_order=False
+ )
+ else:
+ target = ensure_index(target)
+ if self.equals(target):
+ indexer = None
+ else:
+ if self.is_unique:
+ indexer = self.get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
+ else:
+ raise ValueError("cannot handle a non-unique multi-index!")
+
if not isinstance(target, MultiIndex):
if indexer is None:
target = self
@@ -2488,12 +2533,7 @@ def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
target = MultiIndex.from_tuples(target)
except TypeError:
# not all tuples, see test_constructor_dict_multiindex_reindex_flat
- return target
-
- target = self._maybe_preserve_names(target, preserve_names)
- return target
-
- def _maybe_preserve_names(self, target: Index, preserve_names: bool):
+ return target, indexer
if (
preserve_names
and target.nlevels == self.nlevels
@@ -2501,7 +2541,8 @@ def _maybe_preserve_names(self, target: Index, preserve_names: bool):
):
target = target.copy(deep=False)
target.names = self.names
- return target
+
+ return target, indexer
# --------------------------------------------------------------------
# Indexing Methods
@@ -2513,13 +2554,12 @@ def _check_indexing_error(self, key):
# We have to explicitly exclude generators, as these are hashable.
raise InvalidIndexError(key)
- @cache_readonly
def _should_fallback_to_positional(self) -> bool:
"""
Should integer key(s) be treated as positional?
"""
# GH#33355
- return self.levels[0]._should_fallback_to_positional
+ return self.levels[0]._should_fallback_to_positional()
def _get_values_for_loc(self, series: Series, loc, key):
"""
@@ -2541,49 +2581,109 @@ def _get_values_for_loc(self, series: Series, loc, key):
new_ser = series._constructor(new_values, index=new_index, name=series.name)
return new_ser.__finalize__(series)
- def _get_indexer_strict(self, key, axis_name: str) -> tuple[Index, np.ndarray]:
-
- keyarr = key
- if not isinstance(keyarr, Index):
- keyarr = com.asarray_tuplesafe(keyarr)
-
- if len(keyarr) and not isinstance(keyarr[0], tuple):
- indexer = self._get_indexer_level_0(keyarr)
-
- self._raise_if_missing(key, indexer, axis_name)
- return self[indexer], indexer
+ def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None:
+ """
+ Analogous to get_indexer when we are partial-indexing on our first level.
- return super()._get_indexer_strict(key, axis_name)
+ Parameters
+ ----------
+ keyarr : Index, np.ndarray, or ExtensionArray
+ Indexer to convert.
- def _raise_if_missing(self, key, indexer, axis_name: str):
- keyarr = key
- if not isinstance(key, Index):
- keyarr = com.asarray_tuplesafe(key)
+ Returns
+ -------
+ np.ndarray[intp] or None
+ """
+ indexer = None
+ # are we indexing a specific level
if len(keyarr) and not isinstance(keyarr[0], tuple):
- # i.e. same condition for special case in MultiIndex._get_indexer_strict
+ _, indexer = self.reindex(keyarr, level=0)
- mask = indexer == -1
+ # take all
+ if indexer is None:
+ indexer = np.arange(len(self), dtype=np.intp)
+ return indexer
+
+ check = self.levels[0].get_indexer(keyarr)
+ mask = check == -1
if mask.any():
- check = self.levels[0].get_indexer(keyarr)
- cmask = check == -1
- if cmask.any():
- raise KeyError(f"{keyarr[cmask]} not in index")
+ raise KeyError(f"{keyarr[mask]} not in index")
+ elif is_empty_indexer(indexer, keyarr):
# We get here when levels still contain values which are not
# actually in Index anymore
raise KeyError(f"{keyarr} not in index")
- else:
- return super()._raise_if_missing(key, indexer, axis_name)
- def _get_indexer_level_0(self, target) -> np.ndarray:
+ return indexer
+
+ def _get_partial_string_timestamp_match_key(self, key):
"""
- Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`.
+ Translate any partial string timestamp matches in key, returning the
+ new key.
+
+ Only relevant for MultiIndex.
"""
- lev = self.levels[0]
- codes = self._codes[0]
- cat = Categorical.from_codes(codes=codes, categories=lev)
- ci = Index(cat)
- return ci.get_indexer_for(target)
+ # GH#10331
+ if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing:
+ # Convert key '2016-01-01' to
+ # ('2016-01-01'[, slice(None, None, None)]+)
+ key = (key,) + (slice(None),) * (len(self.levels) - 1)
+
+ if isinstance(key, tuple):
+ # Convert (..., '2016-01-01', ...) in tuple to
+ # (..., slice('2016-01-01', '2016-01-01', None), ...)
+ new_key = []
+ for i, component in enumerate(key):
+ if (
+ isinstance(component, str)
+ and self.levels[i]._supports_partial_string_indexing
+ ):
+ new_key.append(slice(component, component, None))
+ else:
+ new_key.append(component)
+ key = tuple(new_key)
+
+ return key
+
+ def _get_indexer(
+ self,
+ target: Index,
+ method: str | None = None,
+ limit: int | None = None,
+ tolerance=None,
+ ) -> np.ndarray:
+ # returned ndarray is np.intp
+
+ # empty indexer
+ if not len(target):
+ return ensure_platform_int(np.array([]))
+
+ if not isinstance(target, MultiIndex):
+ try:
+ target = MultiIndex.from_tuples(target)
+ except (TypeError, ValueError):
+
+ # let's instead try with a straight Index
+ if method is None:
+ return Index(self._values).get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
+
+ # TODO: explicitly raise here? we only have one test that
+ # gets here, and it is checking that we raise with method="nearest"
+
+ if method == "pad" or method == "backfill":
+ # TODO: get_indexer_with_fill docstring says values must be _sorted_
+ # but that doesn't appear to be enforced
+ indexer = self._engine.get_indexer_with_fill(
+ target=target._values, values=self._values, method=method, limit=limit
+ )
+ else:
+ indexer = self._engine.get_indexer(target._values)
+
+ # Note: we only get here (in extant tests at least) with
+ # target.nlevels == self.nlevels
+ return ensure_platform_int(indexer)
def get_slice_bound(
self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None
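A plain-Python sketch of the key translation performed by _get_partial_string_timestamp_match_key above; the date string and level count are arbitrary. A bare partial date string becomes a per-level tuple, and string components on datetime-like levels become single-label slices.

key = "2016-01-01"
nlevels = 2

# Bare string -> tuple padded with null slices for the remaining levels.
key = (key,) + (slice(None),) * (nlevels - 1)

# String components -> slice(component, component) on datetime-like levels.
key = tuple(
    slice(comp, comp, None) if isinstance(comp, str) else comp for comp in key
)
print(key)  # (slice('2016-01-01', '2016-01-01', None), slice(None, None, None))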
@@ -2702,19 +2802,15 @@ def _partial_tup_index(self, tup: tuple, side="left"):
n = len(tup)
start, end = 0, len(self)
zipped = zip(tup, self.levels, self.codes)
- for k, (lab, lev, level_codes) in enumerate(zipped):
- section = level_codes[start:end]
+ for k, (lab, lev, labs) in enumerate(zipped):
+ section = labs[start:end]
if lab not in lev and not isna(lab):
- # short circuit
- try:
- loc = lev.searchsorted(lab, side=side)
- except TypeError as err:
- # non-comparable e.g. test_slice_locs_with_type_mismatch
- raise TypeError(f"Level type mismatch: {lab}") from err
- if not is_integer(loc):
- # non-comparable level, e.g. test_groupby_example
+ if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)):
raise TypeError(f"Level type mismatch: {lab}")
+
+ # short circuit
+ loc = lev.searchsorted(lab, side=side)
if side == "right" and loc >= 0:
loc -= 1
return start + section.searchsorted(loc, side=side)
@@ -2833,12 +2929,7 @@ def _maybe_to_slice(loc):
)
if keylen == self.nlevels and self.is_unique:
- try:
- return self._engine.get_loc(key)
- except TypeError:
- # e.g. partial string slicing
- loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
- return loc
+ return self._engine.get_loc(key)
# -- partial selection or non-unique index
# break the key into 2 parts based on the lexsort_depth of the index;
@@ -2917,27 +3008,27 @@ def get_loc_level(self, key, level=0, drop_level: bool = True):
level = self._get_level_number(level)
else:
level = [self._get_level_number(lev) for lev in level]
+ return self._get_loc_level(key, level=level, drop_level=drop_level)
- loc, mi = self._get_loc_level(key, level=level)
- if not drop_level:
- if lib.is_integer(loc):
- mi = self[loc : loc + 1]
- else:
- mi = self[loc]
- return loc, mi
-
- def _get_loc_level(self, key, level: int | list[int] = 0):
+ def _get_loc_level(self, key, level: int | list[int] = 0, drop_level: bool = True):
"""
get_loc_level but with `level` known to be positional, not name-based.
"""
# different name to distinguish from maybe_droplevels
- def maybe_mi_droplevels(indexer, levels):
- new_index = self[indexer]
+ def maybe_mi_droplevels(indexer, levels, drop_level: bool):
+ if not drop_level:
+ return self[indexer]
+ # kludge around
+ orig_index = new_index = self[indexer]
for i in sorted(levels, reverse=True):
- new_index = new_index._drop_level_numbers([i])
+ try:
+ new_index = new_index._drop_level_numbers([i])
+ except ValueError:
+ # no dropping here
+ return orig_index
return new_index
if isinstance(level, (tuple, list)):
@@ -2952,18 +3043,10 @@ def maybe_mi_droplevels(indexer, levels):
mask = np.zeros(len(self), dtype=bool)
mask[loc] = True
loc = mask
- result = loc if result is None else result & loc
- try:
- # FIXME: we should be only dropping levels on which we are
- # scalar-indexing
- mi = maybe_mi_droplevels(result, level)
- except ValueError:
- # droplevel failed because we tried to drop all levels,
- # i.e. len(level) == self.nlevels
- mi = self[result]
+ result = loc if result is None else result & loc
- return result, mi
+ return result, maybe_mi_droplevels(result, level, drop_level)
# kludge for #1796
if isinstance(key, list):
@@ -2972,105 +3055,64 @@ def maybe_mi_droplevels(indexer, levels):
if isinstance(key, tuple) and level == 0:
try:
- # Check if this tuple is a single key in our first level
if key in self.levels[0]:
indexer = self._get_level_indexer(key, level=level)
- new_index = maybe_mi_droplevels(indexer, [0])
+ new_index = maybe_mi_droplevels(indexer, [0], drop_level)
return indexer, new_index
except (TypeError, InvalidIndexError):
pass
if not any(isinstance(k, slice) for k in key):
- if len(key) == self.nlevels and self.is_unique:
- # Complete key in unique index -> standard get_loc
- try:
- return (self._engine.get_loc(key), None)
- except KeyError as err:
- raise KeyError(key) from err
- except TypeError:
- # e.g. partial string indexing
- # test_partial_string_timestamp_multiindex
- pass
-
# partial selection
- indexer = self.get_loc(key)
- ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
- if len(ilevels) == self.nlevels:
- if is_integer(indexer):
- # we are dropping all levels
- return indexer, None
-
- # TODO: in some cases we still need to drop some levels,
- # e.g. test_multiindex_perf_warn
- # test_partial_string_timestamp_multiindex
+ # optionally get indexer to avoid re-calculation
+ def partial_selection(key, indexer=None):
+ if indexer is None:
+ indexer = self.get_loc(key)
ilevels = [
- i
- for i in range(len(key))
- if (
- not isinstance(key[i], str)
- or not self.levels[i]._supports_partial_string_indexing
- )
- and key[i] != slice(None, None)
+ i for i in range(len(key)) if key[i] != slice(None, None)
]
- if len(ilevels) == self.nlevels:
- # TODO: why?
- ilevels = []
- return indexer, maybe_mi_droplevels(indexer, ilevels)
+ return indexer, maybe_mi_droplevels(indexer, ilevels, drop_level)
+ if len(key) == self.nlevels and self.is_unique:
+ # Complete key in unique index -> standard get_loc
+ try:
+ return (self._engine.get_loc(key), None)
+ except KeyError as e:
+ raise KeyError(key) from e
+ else:
+ return partial_selection(key)
else:
indexer = None
for i, k in enumerate(key):
if not isinstance(k, slice):
- loc_level = self._get_level_indexer(k, level=i)
- if isinstance(loc_level, slice):
- if com.is_null_slice(loc_level) or com.is_full_slice(
- loc_level, len(self)
- ):
- # everything
- continue
- else:
- # e.g. test_xs_IndexSlice_argument_not_implemented
- k_index = np.zeros(len(self), dtype=bool)
- k_index[loc_level] = True
-
+ k = self._get_level_indexer(k, level=i)
+ if isinstance(k, slice):
+ # everything
+ if k.start == 0 and k.stop == len(self):
+ k = slice(None, None)
else:
- k_index = loc_level
-
- elif com.is_null_slice(k):
- # taking everything, does not affect `indexer` below
- continue
+ k_index = k
- else:
- # FIXME: this message can be inaccurate, e.g.
- # test_series_varied_multiindex_alignment
- raise TypeError(f"Expected label or tuple of labels, got {key}")
+ if isinstance(k, slice):
+ if k == slice(None, None):
+ continue
+ else:
+ raise TypeError(key)
if indexer is None:
indexer = k_index
- else:
+ else: # pragma: no cover
indexer &= k_index
if indexer is None:
indexer = slice(None, None)
ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
- return indexer, maybe_mi_droplevels(indexer, ilevels)
+ return indexer, maybe_mi_droplevels(indexer, ilevels, drop_level)
else:
indexer = self._get_level_indexer(key, level=level)
- if (
- isinstance(key, str)
- and self.levels[level]._supports_partial_string_indexing
- ):
- # check to see if we did an exact lookup vs sliced
- check = self.levels[level].get_loc(key)
- if not is_integer(check):
- # e.g. test_partial_string_timestamp_multiindex
- return indexer, self[indexer]
-
- return indexer, maybe_mi_droplevels(indexer, [level])
+ return indexer, maybe_mi_droplevels(indexer, [level], drop_level)
- def _get_level_indexer(
- self, key, level: int = 0, indexer: Int64Index | None = None
- ):
+ def _get_level_indexer(self, key, level: int = 0, indexer=None):
# `level` kwarg is _always_ positional, never name
# return an indexer, boolean array or a slice showing where the key is
# in the totality of values
@@ -3163,23 +3205,15 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
if level > 0 or self._lexsort_depth == 0:
# Desired level is not sorted
- if isinstance(idx, slice):
- # test_get_loc_partial_timestamp_multiindex
- locs = (level_codes >= idx.start) & (level_codes < idx.stop)
- return locs
-
locs = np.array(level_codes == idx, dtype=bool, copy=False)
-
if not locs.any():
# The label is present in self.levels[level] but unused:
raise KeyError(key)
return locs
if isinstance(idx, slice):
- # e.g. test_partial_string_timestamp_multiindex
- start = level_codes.searchsorted(idx.start, side="left")
- # NB: "left" here bc of slice semantics
- end = level_codes.searchsorted(idx.stop, side="left")
+ start = idx.start
+ end = idx.stop
else:
start = level_codes.searchsorted(idx, side="left")
end = level_codes.searchsorted(idx, side="right")
@@ -3231,12 +3265,10 @@ def get_locs(self, seq):
"MultiIndex slicing requires the index to be lexsorted: slicing "
f"on levels {true_slices}, lexsort depth {self._lexsort_depth}"
)
-
+ # indexer
+ # this is the list of all values that we want to select
n = len(self)
- # indexer is the list of all positions that we want to take; we
- # start with it being everything and narrow it down as we look at each
- # entry in `seq`
- indexer = Index(np.arange(n))
+ indexer = None
def _convert_to_indexer(r) -> Int64Index:
# return an indexer
@@ -3254,10 +3286,14 @@ def _convert_to_indexer(r) -> Int64Index:
r = r.nonzero()[0]
return Int64Index(r)
- def _update_indexer(idxr: Index, indexer: Index) -> Index:
+ def _update_indexer(idxr: Index | None, indexer: Index | None, key) -> Index:
+ if indexer is None:
+ indexer = Index(np.arange(n))
+ if idxr is None:
+ return indexer
indexer_intersection = indexer.intersection(idxr)
if indexer_intersection.empty and not idxr.empty and not indexer.empty:
- raise KeyError(seq)
+ raise KeyError(key)
return indexer_intersection
for i, k in enumerate(seq):
@@ -3265,85 +3301,65 @@ def _update_indexer(idxr: Index, indexer: Index) -> Index:
if com.is_bool_indexer(k):
# a boolean indexer, must be the same length!
k = np.asarray(k)
- lvl_indexer = _convert_to_indexer(k)
- indexer = _update_indexer(lvl_indexer, indexer=indexer)
+ indexer = _update_indexer(
+ _convert_to_indexer(k), indexer=indexer, key=seq
+ )
elif is_list_like(k):
# a collection of labels to include from this level (these
# are or'd)
-
indexers: Int64Index | None = None
for x in k:
try:
- # Argument "indexer" to "_get_level_indexer" of "MultiIndex"
- # has incompatible type "Index"; expected "Optional[Int64Index]"
- item_lvl_indexer = self._get_level_indexer(
- x, level=i, indexer=indexer # type: ignore[arg-type]
+ idxrs = _convert_to_indexer(
+ self._get_level_indexer(x, level=i, indexer=indexer)
)
- except KeyError:
- # ignore not founds; see discussion in GH#39424
- warnings.warn(
- "The behavior of indexing on a MultiIndex with a nested "
- "sequence of labels is deprecated and will change in a "
- "future version. `series.loc[label, sequence]` will "
- "raise if any members of 'sequence' or not present in "
- "the index's second level. To retain the old behavior, "
- "use `series.index.isin(sequence, level=1)`",
- # TODO: how to opt in to the future behavior?
- # TODO: how to handle IntervalIndex level? (no test cases)
- FutureWarning,
- stacklevel=7,
+ indexers = (idxrs if indexers is None else indexers).union(
+ idxrs, sort=False
)
- continue
- else:
- idxrs = _convert_to_indexer(item_lvl_indexer)
+ except KeyError:
- if indexers is None:
- indexers = idxrs
- else:
- indexers = indexers.union(idxrs, sort=False)
+ # ignore keys that are not found
+ continue
if indexers is not None:
- indexer = _update_indexer(indexers, indexer=indexer)
+ indexer = _update_indexer(indexers, indexer=indexer, key=seq)
else:
# no matches we are done
- # test_loc_getitem_duplicates_multiindex_empty_indexer
- return np.array([], dtype=np.intp)
+ return np.array([], dtype=np.int64)
elif com.is_null_slice(k):
# empty slice
- pass
+ indexer = _update_indexer(None, indexer=indexer, key=seq)
elif isinstance(k, slice):
# a slice, include BOTH of the labels
- # Argument "indexer" to "_get_level_indexer" of "MultiIndex" has
- # incompatible type "Index"; expected "Optional[Int64Index]"
- lvl_indexer = self._get_level_indexer(
- k,
- level=i,
- indexer=indexer, # type: ignore[arg-type]
- )
indexer = _update_indexer(
- _convert_to_indexer(lvl_indexer),
+ _convert_to_indexer(
+ self._get_level_indexer(k, level=i, indexer=indexer)
+ ),
indexer=indexer,
+ key=seq,
)
else:
# a single label
- lvl_indexer = self._get_loc_level(k, level=i)[0]
indexer = _update_indexer(
- _convert_to_indexer(lvl_indexer),
+ _convert_to_indexer(
+ self.get_loc_level(k, level=i, drop_level=False)[0]
+ ),
indexer=indexer,
+ key=seq,
)
# empty indexer
if indexer is None:
- return np.array([], dtype=np.intp)
+ return np.array([], dtype=np.int64)
assert isinstance(indexer, Int64Index), type(indexer)
indexer = self._reorder_indexer(seq, indexer)
- return indexer._values.astype(np.intp, copy=False)
+ return indexer._values
# --------------------------------------------------------------------
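A public-API example, with a made-up index, of what get_locs computes via the per-level indexer intersection shown above: each level's selection is converted to positions and intersected with the running indexer.

import pandas as pd

mi = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]])
print(mi.get_locs(("b", [1, 3])))  # positions of ('b', 1) and ('b', 3) -> [3 5]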
@@ -3572,10 +3588,27 @@ def _maybe_match_names(self, other):
names.append(None)
return names
- def _wrap_intersection_result(self, other, result):
- _, result_names = self._convert_can_do_setop(other)
+ def _intersection(self, other, sort=False) -> MultiIndex:
+ other, result_names = self._convert_can_do_setop(other)
+ other = other.astype(object, copy=False)
+
+ uniq_tuples = None # flag whether _inner_indexer was successful
+ if self.is_monotonic and other.is_monotonic:
+ try:
+ inner_tuples = self._inner_indexer(other)[0]
+ sort = False # inner_tuples is already sorted
+ except TypeError:
+ pass
+ else:
+ uniq_tuples = algos.unique(inner_tuples)
+
+ if uniq_tuples is None:
+ uniq_tuples = self._intersection_via_get_indexer(other, sort)
+
+ if sort is None:
+ uniq_tuples = sorted(uniq_tuples)
- if len(result) == 0:
+ if len(uniq_tuples) == 0:
return MultiIndex(
levels=self.levels,
codes=[[]] * self.nlevels,
@@ -3583,12 +3616,24 @@ def _wrap_intersection_result(self, other, result):
verify_integrity=False,
)
else:
- return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names)
+ return MultiIndex.from_arrays(
+ zip(*uniq_tuples), sortorder=0, names=result_names
+ )
+
+ def _difference(self, other, sort) -> MultiIndex:
+ other, result_names = self._convert_can_do_setop(other)
+
+ this = self._get_unique_index()
- def _wrap_difference_result(self, other, result):
- _, result_names = self._convert_can_do_setop(other)
+ indexer = this.get_indexer(other)
+ indexer = indexer.take((indexer != -1).nonzero()[0])
- if len(result) == 0:
+ label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
+ difference = this._values.take(label_diff)
+ if sort is None:
+ difference = sorted(difference)
+
+ if len(difference) == 0:
return MultiIndex(
levels=[[]] * self.nlevels,
codes=[[]] * self.nlevels,
@@ -3596,7 +3641,7 @@ def _wrap_difference_result(self, other, result):
verify_integrity=False,
)
else:
- return MultiIndex.from_tuples(result, sortorder=0, names=result_names)
+ return MultiIndex.from_tuples(difference, sortorder=0, names=result_names)
def _convert_can_do_setop(self, other):
result_names = self.names
@@ -3618,6 +3663,18 @@ def _convert_can_do_setop(self, other):
return other, result_names
+ def symmetric_difference(self, other, result_name=None, sort=None):
+ # On equal symmetric_difference MultiIndexes the difference is empty.
+ # Therefore, an empty MultiIndex is returned GH13490
+ tups = Index.symmetric_difference(self, other, result_name, sort)
+ if len(tups) == 0:
+ return type(self)(
+ levels=[[] for _ in range(self.nlevels)],
+ codes=[[] for _ in range(self.nlevels)],
+ names=tups.names,
+ )
+ return tups
+
# --------------------------------------------------------------------
@doc(Index.astype)
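A public-API example of the GH13490 case handled by the symmetric_difference override above: equal MultiIndexes yield an empty MultiIndex rather than a flattened empty Index.

import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
result = mi.symmetric_difference(mi)
print(type(result).__name__, len(result))  # MultiIndex 0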
@@ -3820,7 +3877,7 @@ def maybe_droplevels(index: Index, key) -> Index:
def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray:
"""
- Coerce the array-like indexer to the smallest integer dtype that can encode all
+ Coerce the array_like indexer to the smallest integer dtype that can encode all
of the given categories.
Parameters
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 24f3df684ab10..ea2d5d9eec6ac 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -153,12 +153,7 @@ def _ensure_array(cls, data, dtype, copy: bool):
if not isinstance(data, (ABCSeries, list, tuple)):
data = list(data)
- orig = data
data = np.asarray(data, dtype=dtype)
- if dtype is None and data.dtype.kind == "f":
- if cls is UInt64Index and (data >= 0).all():
- # https://github.com/numpy/numpy/issues/19146
- data = np.asarray(orig, dtype=np.uint64)
if issubclass(data.dtype.type, str):
cls._string_data_error(data)
@@ -233,7 +228,6 @@ def astype(self, dtype, copy=True):
# ----------------------------------------------------------------
# Indexing Methods
- @cache_readonly
@doc(Index._should_fallback_to_positional)
def _should_fallback_to_positional(self) -> bool:
return False
@@ -376,16 +370,6 @@ class UInt64Index(IntegerIndex):
_default_dtype = np.dtype(np.uint64)
_dtype_validation_metadata = (is_unsigned_integer_dtype, "unsigned integer")
- def _validate_fill_value(self, value):
- # e.g. np.array([1]) we want np.array([1], dtype=np.uint64)
- # see test_where_uin64
- super()._validate_fill_value(value)
- if hasattr(value, "dtype") and is_signed_integer_dtype(value.dtype):
- if (value >= 0).all():
- return value.astype(self.dtype)
- raise TypeError
- return value
-
class Float64Index(NumericIndex):
_index_descr_args = {
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index df3862553a70c..c1104b80a0a7a 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -15,24 +15,29 @@
)
from pandas._libs.tslibs import (
BaseOffset,
- NaT,
Period,
Resolution,
Tick,
)
+from pandas._libs.tslibs.parsing import (
+ DateParseError,
+ parse_time_string,
+)
from pandas._typing import (
Dtype,
DtypeObj,
)
+from pandas.errors import InvalidIndexError
from pandas.util._decorators import doc
from pandas.core.dtypes.common import (
is_datetime64_any_dtype,
+ is_float,
is_integer,
+ is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype
-from pandas.core.dtypes.missing import is_valid_na_for_dtype
from pandas.core.arrays.period import (
PeriodArray,
@@ -406,59 +411,55 @@ def get_loc(self, key, method=None, tolerance=None):
"""
orig_key = key
- self._check_indexing_error(key)
+ if not is_scalar(key):
+ raise InvalidIndexError(key)
- if is_valid_na_for_dtype(key, self.dtype):
- key = NaT
+ if isinstance(key, str):
- elif isinstance(key, str):
+ try:
+ loc = self._get_string_slice(key)
+ return loc
+ except (TypeError, ValueError):
+ pass
try:
- parsed, reso = self._parse_with_reso(key)
- except ValueError as err:
+ asdt, reso_str = parse_time_string(key, self.freq)
+ except (ValueError, DateParseError) as err:
# A string with invalid format
raise KeyError(f"Cannot interpret '{key}' as period") from err
- if self._can_partial_date_slice(reso):
- try:
- return self._partial_date_slice(reso, parsed)
- except KeyError as err:
- # TODO: pass if method is not None, like DTI does?
- raise KeyError(key) from err
+ reso = Resolution.from_attrname(reso_str)
+ grp = reso.freq_group.value
+ freqn = self.dtype.freq_group_code
+
+ # _get_string_slice will handle cases where grp < freqn
+ assert grp >= freqn
- if reso == self.dtype.resolution:
- # the reso < self.dtype.resolution case goes through _get_string_slice
- key = Period(parsed, freq=self.freq)
+ # BusinessDay is a bit strange. It has a *lower* code, but we never parse
+ # a string as "BusinessDay" resolution, just Day.
+ if grp == freqn or (
+ reso == Resolution.RESO_DAY and self.dtype.freq.name == "B"
+ ):
+ key = Period(asdt, freq=self.freq)
loc = self.get_loc(key, method=method, tolerance=tolerance)
- # Recursing instead of falling through matters for the exception
- # message in test_get_loc3 (though not clear if that really matters)
return loc
elif method is None:
raise KeyError(key)
else:
- key = Period(parsed, freq=self.freq)
-
- elif isinstance(key, Period):
- sfreq = self.freq
- kfreq = key.freq
- if not (
- sfreq.n == kfreq.n
- and sfreq._period_dtype_code == kfreq._period_dtype_code
- ):
- # GH#42247 For the subset of DateOffsets that can be Period freqs,
- # checking these two attributes is sufficient to check equality,
- # and much more performant than `self.freq == key.freq`
- raise KeyError(key)
- elif isinstance(key, datetime):
- try:
- key = Period(key, freq=self.freq)
- except ValueError as err:
- # we cannot construct the Period
- raise KeyError(orig_key) from err
- else:
- # in particular integer, which Period constructor would cast to string
+ key = asdt
+
+ elif is_integer(key):
+ # Period constructor will cast to string, which we don't want
+ raise KeyError(key)
+ elif isinstance(key, Period) and key.freq != self.freq:
raise KeyError(key)
+ try:
+ key = Period(key, freq=self.freq)
+ except ValueError as err:
+ # we cannot construct the Period
+ raise KeyError(orig_key) from err
+
try:
return Index.get_loc(self, key, method, tolerance)
except KeyError as err:
@@ -491,14 +492,14 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
return Period(label, freq=self.freq)
elif isinstance(label, str):
try:
- parsed, reso = self._parse_with_reso(label)
+ parsed, reso_str = parse_time_string(label, self.freq)
+ reso = Resolution.from_attrname(reso_str)
+ bounds = self._parsed_string_to_bounds(reso, parsed)
+ return bounds[0 if side == "left" else 1]
except ValueError as err:
# string cannot be parsed as datetime-like
raise self._invalid_indexer("slice", label) from err
-
- lower, upper = self._parsed_string_to_bounds(reso, parsed)
- return lower if side == "left" else upper
- elif not isinstance(label, self._data._recognized_scalars):
+ elif is_integer(label) or is_float(label):
raise self._invalid_indexer("slice", label)
return label
@@ -508,10 +509,24 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
iv = Period(parsed, freq=grp.value)
return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end"))
- def _can_partial_date_slice(self, reso: Resolution) -> bool:
+ def _validate_partial_date_slice(self, reso: Resolution):
assert isinstance(reso, Resolution), (type(reso), reso)
- # e.g. test_getitem_setitem_periodindex
- return reso > self.dtype.resolution
+ grp = reso.freq_group
+ freqn = self.dtype.freq_group_code
+
+ if not grp.value < freqn:
+ # TODO: we used to also check for
+ # reso in ["day", "hour", "minute", "second"]
+ # why is that check not needed?
+ raise ValueError
+
+ def _get_string_slice(self, key: str):
+ parsed, reso_str = parse_time_string(key, self.freq)
+ reso = Resolution.from_attrname(reso_str)
+ try:
+ return self._partial_date_slice(reso, parsed)
+ except KeyError as err:
+ raise KeyError(key) from err
def period_range(
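For reference, a small example (values are arbitrary) of how PeriodIndex._parsed_string_to_bounds maps a coarser-resolution parse to bounds at the index frequency via asfreq.

import pandas as pd

iv = pd.Period("2017", freq="A")
start = iv.asfreq("M", how="start")  # Period('2017-01', 'M')
end = iv.asfreq("M", how="end")      # Period('2017-12', 'M')
print(start, end)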
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 0ce99df44a5f9..746246172b967 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -17,10 +17,7 @@
from pandas._libs import index as libindex
from pandas._libs.lib import no_default
-from pandas._typing import (
- Dtype,
- npt,
-)
+from pandas._typing import Dtype
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
cache_readonly,
@@ -388,7 +385,6 @@ def get_loc(self, key, method=None, tolerance=None):
return self._range.index(new_key)
except ValueError as err:
raise KeyError(key) from err
- self._check_indexing_error(key)
raise KeyError(key)
return super().get_loc(key, method=method, tolerance=tolerance)
@@ -398,7 +394,8 @@ def _get_indexer(
method: str | None = None,
limit: int | None = None,
tolerance=None,
- ) -> npt.NDArray[np.intp]:
+ ) -> np.ndarray:
+ # -> np.ndarray[np.intp]
if com.any_not_none(method, tolerance, limit):
return super()._get_indexer(
target, method=method, tolerance=tolerance, limit=limit
@@ -411,6 +408,10 @@ def _get_indexer(
reverse = self._range[::-1]
start, stop, step = reverse.start, reverse.stop, reverse.step
+ if not is_signed_integer_dtype(target):
+ # checks/conversions/roundings are delegated to general method
+ return super()._get_indexer(target, method=method, tolerance=tolerance)
+
target_array = np.asarray(target)
locs = target_array - start
valid = (locs % step == 0) & (locs >= 0) & (target_array < stop)
@@ -504,7 +505,7 @@ def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
nv.validate_max(args, kwargs)
return self._minmax("max")
- def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
+ def argsort(self, *args, **kwargs) -> np.ndarray:
"""
Returns the indices that would sort the index and its
underlying data.
@@ -531,7 +532,7 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
def factorize(
self, sort: bool = False, na_sentinel: int | None = -1
- ) -> tuple[npt.NDArray[np.intp], RangeIndex]:
+ ) -> tuple[np.ndarray, RangeIndex]:
codes = np.arange(len(self), dtype=np.intp)
uniques = self
if sort and self.step < 0:
@@ -551,12 +552,14 @@ def equals(self, other: object) -> bool:
# Set Operations
def _intersection(self, other: Index, sort=False):
- # caller is responsible for checking self and other are both non-empty
if not isinstance(other, RangeIndex):
# Int64Index
return super()._intersection(other, sort=sort)
+ if not len(self) or not len(other):
+ return self._simple_new(_empty_range)
+
first = self._range[::-1] if self.step < 0 else self._range
second = other._range[::-1] if other.step < 0 else other._range
@@ -727,18 +730,6 @@ def _difference(self, other, sort=None):
new_index = new_index[::-1]
return new_index
- def symmetric_difference(self, other, result_name: Hashable = None, sort=None):
- if not isinstance(other, RangeIndex) or sort is not None:
- return super().symmetric_difference(other, result_name, sort)
-
- left = self.difference(other)
- right = other.difference(self)
- result = left.union(right)
-
- if result_name is not None:
- result = result.rename(result_name)
- return result
-
# --------------------------------------------------------------------
def _concat(self, indexes: list[Index], name: Hashable) -> Index:
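A NumPy sketch of the arithmetic behind the RangeIndex fast indexer above, with a made-up range and target: a value has a position only when its offset from start is a non-negative multiple of step and it lies before stop, in which case the position is that offset divided by step.

import numpy as np

start, stop, step = 2, 20, 3          # i.e. RangeIndex(2, 20, 3)
target = np.array([2, 5, 7, 17, 20])

locs = target - start
valid = (locs % step == 0) & (locs >= 0) & (target < stop)
indexer = np.where(valid, locs // step, -1)
print(indexer)  # [ 0  1 -1  5 -1]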
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index 023cb651c9632..c60ab06dd08f3 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -9,7 +9,11 @@
Timedelta,
to_offset,
)
-from pandas._typing import DtypeObj
+from pandas._typing import (
+ DtypeObj,
+ Optional,
+)
+from pandas.errors import InvalidIndexError
from pandas.core.dtypes.common import (
TD64NS_DTYPE,
@@ -105,9 +109,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin):
_data: TimedeltaArray
- # Use base class method instead of DatetimeTimedeltaMixin._get_string_slice
- _get_string_slice = Index._get_string_slice
-
# -------------------------------------------------------------------
# Constructors
@@ -169,7 +170,8 @@ def get_loc(self, key, method=None, tolerance=None):
-------
loc : int, slice, or ndarray[int]
"""
- self._check_indexing_error(key)
+ if not is_scalar(key):
+ raise InvalidIndexError(key)
try:
key = self._data._validate_scalar(key, unbox=False)
@@ -196,30 +198,17 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound")
if isinstance(label, str):
- try:
- parsed, reso = self._parse_with_reso(label)
- except ValueError as err:
- # e.g. 'unit abbreviation w/o a number'
- raise self._invalid_indexer("slice", label) from err
-
- lower, upper = self._parsed_string_to_bounds(reso, parsed)
- return lower if side == "left" else upper
+ parsed = Timedelta(label)
+ lbound = parsed.round(parsed.resolution_string)
+ if side == "left":
+ return lbound
+ else:
+ return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns")
elif not isinstance(label, self._data._recognized_scalars):
raise self._invalid_indexer("slice", label)
return label
- def _parse_with_reso(self, label: str):
- # the "with_reso" is a no-op for TimedeltaIndex
- parsed = Timedelta(label)
- return parsed, None
-
- def _parsed_string_to_bounds(self, reso, parsed: Timedelta):
- # reso is unused, included to match signature of DTI/PI
- lbound = parsed.round(parsed.resolution_string)
- rbound = lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns")
- return lbound, rbound
-
# -------------------------------------------------------------------
@property
@@ -230,7 +219,7 @@ def inferred_type(self) -> str:
def timedelta_range(
start=None,
end=None,
- periods: int | None = None,
+ periods: Optional[int] = None,
freq=None,
name=None,
closed=None,
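An illustration, using only public API, of the string slice-bound logic above for TimedeltaIndex: the label "1 day" covers the half-open window [1 days, 2 days), so the right bound sits one nanosecond short of 2 days.

import pandas as pd
from pandas.tseries.frequencies import to_offset

parsed = pd.Timedelta("1 day")
lbound = parsed.round(parsed.resolution_string)                       # 1 days
rbound = lbound + to_offset(parsed.resolution_string) - pd.Timedelta(1, "ns")
print(lbound, rbound)  # 1 days 00:00:00  1 days 23:59:59.999999999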
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 7d92f7ff11ed3..3707e141bc447 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -30,6 +30,7 @@
is_object_dtype,
is_scalar,
is_sequence,
+ needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import (
@@ -41,12 +42,8 @@
isna,
)
-from pandas.core import algorithms as algos
import pandas.core.common as com
-from pandas.core.construction import (
- array as pd_array,
- extract_array,
-)
+from pandas.core.construction import array as pd_array
from pandas.core.indexers import (
check_array_indexer,
is_empty_indexer,
@@ -55,8 +52,11 @@
length_of_indexer,
)
from pandas.core.indexes.api import (
+ CategoricalIndex,
Index,
+ IntervalIndex,
MultiIndex,
+ ensure_index,
)
if TYPE_CHECKING:
@@ -821,15 +821,7 @@ def _getitem_lowerdim(self, tup: tuple):
ax0 = self.obj._get_axis(0)
# ...but iloc should handle the tuple as simple integer-location
# instead of checking it as multiindex representation (GH 13797)
- if (
- isinstance(ax0, MultiIndex)
- and self.name != "iloc"
- and not any(isinstance(x, slice) for x in tup)
- ):
- # Note: in all extant test cases, replacing the slice condition with
- # `all(is_hashable(x) or com.is_null_slice(x) for x in tup)`
- # is equivalent.
- # (see the other place where we call _handle_lowerdim_multi_index_axis0)
+ if isinstance(ax0, MultiIndex) and self.name != "iloc":
with suppress(IndexingError):
return self._handle_lowerdim_multi_index_axis0(tup)
@@ -878,21 +870,17 @@ def _getitem_nested_tuple(self, tup: tuple):
if self.name != "loc":
# This should never be reached, but lets be explicit about it
raise ValueError("Too many indices")
- if all(is_hashable(x) or com.is_null_slice(x) for x in tup):
+ if isinstance(self.obj, ABCSeries) and any(
+ isinstance(k, tuple) for k in tup
+ ):
+ # GH#35349 Raise if tuple in tuple for series
+ raise ValueError("Too many indices")
+ if self.ndim == 1 or not any(isinstance(x, slice) for x in tup):
# GH#10521 Series should reduce MultiIndex dimensions instead of
# DataFrame, IndexingError is not raised when slice(None,None,None)
# with one row.
with suppress(IndexingError):
return self._handle_lowerdim_multi_index_axis0(tup)
- elif isinstance(self.obj, ABCSeries) and any(
- isinstance(k, tuple) for k in tup
- ):
- # GH#35349 Raise if tuple in tuple for series
- # Do this after the all-hashable-or-null-slice check so that
- # we are only getting non-hashable tuples, in particular ones
- # that themselves contain a slice entry
- # See test_loc_series_getitem_too_many_dimensions
- raise ValueError("Too many indices")
# this is a series with a multi-index specified a tuple of
# selectors
@@ -928,7 +916,9 @@ def __getitem__(self, key):
key = tuple(list(x) if is_iterator(x) else x for x in key)
key = tuple(com.apply_if_callable(x, self.obj) for x in key)
if self._is_scalar_access(key):
- return self.obj._get_value(*key, takeable=self._takeable)
+ with suppress(KeyError, IndexError, AttributeError):
+ # AttributeError for IntervalTree get_value
+ return self.obj._get_value(*key, takeable=self._takeable)
return self._getitem_tuple(key)
else:
# we by definition only have the 0th axis
@@ -1014,7 +1004,7 @@ def _is_scalar_access(self, key: tuple) -> bool:
# should not be considered scalar
return False
- if not ax._index_as_unique:
+ if not ax.is_unique:
return False
return True
@@ -1125,13 +1115,16 @@ def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
try:
# fast path for series or for tup devoid of slices
return self._get_label(tup, axis=axis)
-
+ except (TypeError, InvalidIndexError):
+ # slices are unhashable
+ pass
except KeyError as ek:
# raise KeyError if number of indexers match
# else IndexingError will be raised
if self.ndim < len(tup) <= self.obj.index.nlevels:
raise ek
- raise IndexingError("No label returned") from ek
+
+ raise IndexingError("No label returned")
def _getitem_axis(self, key, axis: int):
key = item_from_zerodim(key)
@@ -1139,6 +1132,7 @@ def _getitem_axis(self, key, axis: int):
key = list(key)
labels = self.obj._get_axis(axis)
+ key = labels._get_partial_string_timestamp_match_key(key)
if isinstance(key, slice):
self._validate_key(key, axis)
@@ -1240,7 +1234,9 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
return {"key": key}
if is_nested_tuple(key, labels):
- if self.ndim == 1 and any(isinstance(k, tuple) for k in key):
+ if isinstance(self.obj, ABCSeries) and any(
+ isinstance(k, tuple) for k in key
+ ):
# GH#35349 Raise if tuple in tuple for series
raise ValueError("Too many indices")
return labels.get_locs(key)
@@ -1289,12 +1285,94 @@ def _get_listlike_indexer(self, key, axis: int):
Indexer for the return object, -1 denotes keys not found.
"""
ax = self.obj._get_axis(axis)
- axis_name = self.obj._get_axis_name(axis)
- keyarr, indexer = ax._get_indexer_strict(key, axis_name)
+ keyarr = key
+ if not isinstance(keyarr, Index):
+ keyarr = com.asarray_tuplesafe(keyarr)
+
+ if isinstance(ax, MultiIndex):
+ # get_indexer expects a MultiIndex or sequence of tuples, but
+ # we may be doing partial-indexing, so need an extra check
+
+ # Have the index compute an indexer or return None
+ # if it cannot handle:
+ indexer = ax._convert_listlike_indexer(keyarr)
+ # We only act on all found values:
+ if indexer is not None and (indexer != -1).all():
+ # _validate_read_indexer is a no-op if no -1s, so skip
+ return ax[indexer], indexer
+
+ if ax._index_as_unique:
+ indexer = ax.get_indexer_for(keyarr)
+ keyarr = ax.reindex(keyarr)[0]
+ else:
+ keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
+
+ self._validate_read_indexer(keyarr, indexer, axis)
+
+ if needs_i8_conversion(ax.dtype) or isinstance(
+ ax, (IntervalIndex, CategoricalIndex)
+ ):
+ # For CategoricalIndex take instead of reindex to preserve dtype.
+ # For IntervalIndex this is to map integers to the Intervals they match to.
+ keyarr = ax.take(indexer)
+ if keyarr.dtype.kind in ["m", "M"]:
+ # DTI/TDI.take can infer a freq in some cases when we don't want one
+ if isinstance(key, list) or (
+ isinstance(key, type(ax)) and key.freq is None
+ ):
+ keyarr = keyarr._with_freq(None)
return keyarr, indexer
+ def _validate_read_indexer(self, key, indexer, axis: int):
+ """
+ Check that indexer can be used to return a result.
+
+ e.g. at least one element was found,
+ unless the list of keys was actually empty.
+
+ Parameters
+ ----------
+ key : list-like
+ Targeted labels (only used to show correct error message).
+ indexer: array-like of ints
+ Indices corresponding to the key,
+ (with -1 indicating not found).
+ axis : int
+ Dimension on which the indexing is being made.
+
+ Raises
+ ------
+ KeyError
+ If at least one key was requested but none was found.
+ """
+ if len(key) == 0:
+ return
+
+ # Count missing values:
+ missing_mask = indexer < 0
+ missing = (missing_mask).sum()
+
+ if missing:
+ ax = self.obj._get_axis(axis)
+
+ # TODO: remove special-case; this is just to keep exception
+ # message tests from raising while debugging
+ use_interval_msg = isinstance(ax, IntervalIndex) or (
+ isinstance(ax, CategoricalIndex)
+ and isinstance(ax.categories, IntervalIndex)
+ )
+
+ if missing == len(indexer):
+ axis_name = self.obj._get_axis_name(axis)
+ if use_interval_msg:
+ key = list(key)
+ raise KeyError(f"None of [{key}] are in the [{axis_name}]")
+
+ not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
+ raise KeyError(f"{not_found} not in index")
+
@doc(IndexingMixin.iloc)
class _iLocIndexer(_LocationIndexer):
@@ -1584,21 +1662,6 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
if com.is_null_slice(indexer[0]):
# We are setting an entire column
self.obj[key] = value
- return
- elif is_array_like(value):
- # GH#42099
- arr = extract_array(value, extract_numpy=True)
- taker = -1 * np.ones(len(self.obj), dtype=np.intp)
- empty_value = algos.take_nd(arr, taker)
- if not isinstance(value, ABCSeries):
- # if not Series (in which case we need to align),
- # we can short-circuit
- empty_value[indexer[0]] = arr
- self.obj[key] = empty_value
- return
-
- self.obj[key] = empty_value
-
else:
self.obj[key] = infer_fill_value(value)
@@ -2125,7 +2188,7 @@ class _ScalarAccessIndexer(NDFrameIndexerBase):
Access scalars quickly.
"""
- def _convert_key(self, key):
+ def _convert_key(self, key, is_setter: bool = False):
raise AbstractMethodError(self)
def __getitem__(self, key):
@@ -2149,7 +2212,7 @@ def __setitem__(self, key, value):
if not isinstance(key, tuple):
key = _tuplify(self.ndim, key)
- key = list(self._convert_key(key))
+ key = list(self._convert_key(key, is_setter=True))
if len(key) != self.ndim:
raise ValueError("Not enough indexers for scalar access (setting)!")
@@ -2160,7 +2223,7 @@ def __setitem__(self, key, value):
class _AtIndexer(_ScalarAccessIndexer):
_takeable = False
- def _convert_key(self, key):
+ def _convert_key(self, key, is_setter: bool = False):
"""
Require the keys to be the same type as the index. (so we don't
fallback)
@@ -2171,6 +2234,10 @@ def _convert_key(self, key):
if self.ndim == 1 and len(key) > 1:
key = (key,)
+ # allow arbitrary setting
+ if is_setter:
+ return list(key)
+
return key
@property
@@ -2205,7 +2272,7 @@ def __setitem__(self, key, value):
class _iAtIndexer(_ScalarAccessIndexer):
_takeable = True
- def _convert_key(self, key):
+ def _convert_key(self, key, is_setter: bool = False):
"""
Require integer args. (and convert to label arguments)
"""
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index 51ea45ac18ce0..76967cdc9b52e 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -7,7 +7,6 @@
TYPE_CHECKING,
Any,
Callable,
- Hashable,
TypeVar,
)
@@ -20,6 +19,7 @@
from pandas._typing import (
ArrayLike,
DtypeObj,
+ Hashable,
)
from pandas.util._validators import validate_bool_kwarg
@@ -820,7 +820,9 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
assert isinstance(value, (np.ndarray, ExtensionArray))
assert value.ndim == 1
assert len(value) == len(self._axes[0])
- self.arrays[loc] = value
+ # error: Invalid index type "Union[int, slice, ndarray]" for
+ # "List[Union[ndarray, ExtensionArray]]"; expected type "int"
+ self.arrays[loc] = value # type: ignore[index]
return
# multiple columns -> convert slice or array to integer indices
diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py
index 0ee22200ed495..2bb14efad1ce7 100644
--- a/pandas/core/internals/base.py
+++ b/pandas/core/internals/base.py
@@ -4,14 +4,12 @@
"""
from __future__ import annotations
-from typing import (
- TypeVar,
- final,
-)
+from typing import TypeVar
from pandas._typing import (
DtypeObj,
Shape,
+ final,
)
from pandas.errors import AbstractMethodError
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 314ab5243b6c8..2e7e6c7f7a100 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -9,7 +9,6 @@
Iterable,
Sequence,
cast,
- final,
)
import warnings
@@ -29,6 +28,7 @@
DtypeObj,
F,
Shape,
+ final,
)
from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg
@@ -281,7 +281,7 @@ def __repr__(self) -> str:
result = f"{name}: {len(self)} dtype: {self.dtype}"
else:
- shape = " x ".join([str(s) for s in self.shape])
+ shape = " x ".join(str(s) for s in self.shape)
result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}"
return result
@@ -312,6 +312,17 @@ def getitem_block(self, slicer) -> Block:
return type(self)(new_values, new_mgr_locs, self.ndim)
+ def getitem_block_index(self, slicer: slice) -> Block:
+ """
+ Perform __getitem__-like specialized to slicing along index.
+
+ Assumes self.ndim == 2
+ """
+ # error: Invalid index type "Tuple[ellipsis, slice]" for
+ # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]"
+ new_values = self.values[..., slicer] # type: ignore[index]
+ return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
+
@final
def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
"""
@@ -886,7 +897,7 @@ def setitem(self, indexer, value):
Parameters
----------
- indexer : tuple, list-like, array-like, slice, int
+ indexer : tuple, list-like, array-like, slice
The subset of self.values to set
value : object
The value being set
@@ -1156,17 +1167,13 @@ def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Blo
# convert integer to float if necessary. need to do a lot more than
# that, handle boolean etc also
- # error: Value of type variable "NumpyArrayT" of "maybe_upcast" cannot be
- # "Union[ndarray[Any, Any], ExtensionArray]"
+ # error: Argument 1 to "maybe_upcast" has incompatible type "Union[ndarray,
+ # ExtensionArray]"; expected "ndarray"
new_values, fill_value = maybe_upcast(
- self.values, fill_value # type: ignore[type-var]
+ self.values, fill_value # type: ignore[arg-type]
)
- # error: Argument 1 to "shift" has incompatible type "Union[ndarray[Any, Any],
- # ExtensionArray]"; expected "ndarray[Any, Any]"
- new_values = shift(
- new_values, periods, axis, fill_value # type: ignore[arg-type]
- )
+ new_values = shift(new_values, periods, axis, fill_value)
return [self.make_block(new_values)]
@@ -1269,7 +1276,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
-------
blocks : list of Block
New blocks of unstacked values.
- mask : array-like of bool
+ mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
new_values, mask = unstacker.get_new_values(
@@ -1450,7 +1457,7 @@ def setitem(self, indexer, value):
Parameters
----------
- indexer : tuple, list-like, array-like, slice, int
+ indexer : tuple, list-like, array-like, slice
The subset of self.values to set
value : object
The value being set
@@ -1546,18 +1553,6 @@ def _slice(self, slicer):
return self.values[slicer]
- @final
- def getitem_block_index(self, slicer: slice) -> ExtensionBlock:
- """
- Perform __getitem__-like specialized to slicing along index.
-
- Assumes self.ndim == 2
- """
- # error: Invalid index type "Tuple[ellipsis, slice]" for
- # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]"
- new_values = self.values[..., slicer] # type: ignore[index]
- return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
-
def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:
@@ -1628,10 +1623,6 @@ def where(self, other, cond, errors="raise") -> list[Block]:
# NotImplementedError for class not implementing `__setitem__`
# TypeError for SparseArray, which implements just to raise
# a TypeError
- if isinstance(result, Categorical):
- # TODO: don't special-case
- raise
-
result = type(self.values)._from_sequence(
np.where(cond, self.values, other), dtype=dtype
)
@@ -1666,6 +1657,8 @@ def _unstack(self, unstacker, fill_value, new_placement):
class NumpyBlock(libinternals.NumpyBlock, Block):
values: np.ndarray
+ getitem_block_index = libinternals.NumpyBlock.getitem_block_index
+
class NumericBlock(NumpyBlock):
__slots__ = ()
@@ -1678,6 +1671,7 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock
"""
values: NDArrayBackedExtensionArray
+ getitem_block_index = libinternals.NDArrayBackedBlock.getitem_block_index
@property
def is_view(self) -> bool:
@@ -1900,7 +1894,9 @@ def get_block_type(values, dtype: Dtype | None = None):
cls = ExtensionBlock
elif isinstance(dtype, CategoricalDtype):
cls = CategoricalBlock
- elif vtype is Timestamp:
+ # error: Non-overlapping identity check (left operand type: "Type[generic]",
+ # right operand type: "Type[Timestamp]")
+ elif vtype is Timestamp: # type: ignore[comparison-overlap]
cls = DatetimeTZBlock
elif isinstance(dtype, ExtensionDtype):
# Note: need to be sure PandasArray is unwrapped before we get here
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 203e48ae48b58..9642b30ab91ca 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -592,9 +592,6 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
# e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
all(
is_dtype_equal(ju.block.dtype, join_units[0].block.dtype)
- # GH#42092 we only want the dtype_equal check for non-numeric blocks
- # (for now, may change but that would need a deprecation)
- or ju.block.dtype.kind in ["b", "i", "u"]
for ju in join_units
)
and
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 22cce5c614d5a..81bf3ca4ba07a 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -10,7 +10,6 @@
Any,
Hashable,
Sequence,
- cast,
)
import warnings
@@ -26,7 +25,6 @@
from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
- dict_compat,
maybe_cast_to_datetime,
maybe_convert_platform,
maybe_infer_to_datetimelike,
@@ -60,6 +58,7 @@
TimedeltaArray,
)
from pandas.core.construction import (
+ create_series_with_explicit_dtype,
ensure_wrapped_if_datetimelike,
extract_array,
range_to_ndarray,
@@ -67,9 +66,7 @@
)
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
- DatetimeIndex,
Index,
- TimedeltaIndex,
ensure_index,
get_objs_combined_axis,
union_indexes,
@@ -168,9 +165,6 @@ def rec_array_to_mgr(
# fill if needed
if isinstance(data, np.ma.MaskedArray):
- # GH#42200 we only get here with MaskedRecords, but check for the
- # parent class MaskedArray to avoid the need to import MaskedRecords
- data = cast("MaskedRecords", data)
new_arrays = fill_masked_arrays(data, arr_columns)
else:
# error: Incompatible types in assignment (expression has type
@@ -348,17 +342,22 @@ def ndarray_to_mgr(
# on the entire block; this is to convert if we have datetimelike's
# embedded in an object type
if dtype is None and is_object_dtype(values.dtype):
- obj_columns = list(values)
- maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
- # don't convert (and copy) the objects if no type inference occurs
- if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
- dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
+
+ if values.ndim == 2 and values.shape[0] != 1:
+ # transpose and separate blocks
+
+ dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values]
+ dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals]
+
+ # TODO: What about re-joining object columns?
block_values = [
new_block(dvals_list[n], placement=n, ndim=2)
for n in range(len(dvals_list))
]
+
else:
- nb = new_block(values, placement=slice(len(columns)), ndim=2)
+ datelike_vals = maybe_infer_to_datetimelike(values)
+ nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2)
block_values = [nb]
else:
nb = new_block(values, placement=slice(len(columns)), ndim=2)
@@ -553,7 +552,6 @@ def convert(v):
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
- oindex = None
homogenized = []
for val in data:
@@ -568,18 +566,9 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
val = val._values
else:
if isinstance(val, dict):
- # GH#41785 this _should_ be equivalent to (but faster than)
- # val = create_series_with_explicit_dtype(val, index=index)._values
- if oindex is None:
- oindex = index.astype("O")
-
- if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
- # see test_constructor_dict_datetime64_index
- val = dict_compat(val)
- else:
- # see test_constructor_subclass_dict
- val = dict(val)
- val = lib.fast_multiget(val, oindex._values, default=np.nan)
+ # see test_constructor_subclass_dict
+ # test_constructor_dict_datetime64_index
+ val = create_series_with_explicit_dtype(val, index=index)._values
val = sanitize_array(
val, index, dtype=dtype, copy=False, raise_cast_failure=False
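
The `_homogenize` change above routes dict values back through `create_series_with_explicit_dtype`; user-visibly, dict entries in a DataFrame constructor are aligned to the supplied index. A quick sketch of that behaviour:

    import pandas as pd

    idx = pd.to_datetime(["2021-01-01", "2021-01-02"])
    df = pd.DataFrame({"a": {idx[0]: 1.0}}, index=idx)
    print(df)
    #               a
    # 2021-01-01  1.0
    # 2021-01-02  NaN
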
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index dca6ddf703446..48f0b7f7f964b 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -22,9 +22,9 @@
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
ArrayLike,
+ Dtype,
DtypeObj,
Shape,
- npt,
type_t,
)
from pandas.errors import PerformanceWarning
@@ -381,25 +381,6 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T:
if fill_value is lib.no_default:
fill_value = None
- if axis == 0 and self.ndim == 2 and self.nblocks > 1:
- # GH#35488 we need to watch out for multi-block cases
- # We only get here with fill_value not-lib.no_default
- ncols = self.shape[0]
- if periods > 0:
- indexer = [-1] * periods + list(range(ncols - periods))
- else:
- nper = abs(periods)
- indexer = list(range(nper, ncols)) + [-1] * nper
- result = self.reindex_indexer(
- self.items,
- indexer,
- axis=0,
- fill_value=fill_value,
- allow_dups=True,
- consolidate=False,
- )
- return result
-
return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)
def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
@@ -498,10 +479,6 @@ def is_view(self) -> bool:
return False
- def _get_data_subset(self: T, predicate: Callable) -> T:
- blocks = [blk for blk in self.blocks if predicate(blk.values)]
- return self._combine(blocks, copy=False)
-
def get_bool_data(self: T, copy: bool = False) -> T:
"""
Select blocks that are bool-dtype and columns from object-dtype blocks
@@ -1408,7 +1385,7 @@ def to_dict(self, copy: bool = True):
def as_array(
self,
transpose: bool = False,
- dtype: npt.DTypeLike | None = None,
+ dtype: Dtype | None = None,
copy: bool = False,
na_value=lib.no_default,
) -> np.ndarray:
@@ -1448,21 +1425,17 @@ def as_array(
# error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
# attribute "to_numpy"
arr = blk.values.to_numpy( # type: ignore[union-attr]
- # pandas/core/internals/managers.py:1428: error: Argument "dtype" to
- # "to_numpy" of "ExtensionArray" has incompatible type
- # "Optional[Union[dtype[Any], None, type, _SupportsDType, str,
- # Union[Tuple[Any, int], Tuple[Any, Union[SupportsIndex,
- # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any,
- # Any]]]]"; expected "Optional[Union[ExtensionDtype, Union[str,
- # dtype[Any]], Type[str], Type[float], Type[int], Type[complex],
- # Type[bool], Type[object]]]"
- dtype=dtype, # type: ignore[arg-type]
- na_value=na_value,
+ dtype=dtype, na_value=na_value
).reshape(blk.shape)
else:
arr = np.asarray(blk.get_values())
if dtype:
- arr = arr.astype(dtype, copy=False)
+ # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has
+ # incompatible type "Union[ExtensionDtype, str, dtype[Any],
+ # Type[object]]"; expected "Union[dtype[Any], None, type,
+ # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
+ # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
+ arr = arr.astype(dtype, copy=False) # type: ignore[arg-type]
else:
arr = self._interleave(dtype=dtype, na_value=na_value)
# The underlying data was copied within _interleave
@@ -1477,9 +1450,7 @@ def as_array(
return arr.transpose() if transpose else arr
def _interleave(
- self,
- dtype: npt.DTypeLike | ExtensionDtype | None = None,
- na_value=lib.no_default,
+ self, dtype: Dtype | None = None, na_value=lib.no_default
) -> np.ndarray:
"""
Return ndarray from blocks with specified item order
@@ -1514,16 +1485,7 @@ def _interleave(
# error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
# attribute "to_numpy"
arr = blk.values.to_numpy( # type: ignore[union-attr]
- # pandas/core/internals/managers.py:1485: error: Argument "dtype" to
- # "to_numpy" of "ExtensionArray" has incompatible type
- # "Union[dtype[Any], None, type, _SupportsDType, str, Tuple[Any,
- # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any],
- # _DTypeDict, Tuple[Any, Any], ExtensionDtype]"; expected
- # "Optional[Union[ExtensionDtype, Union[str, dtype[Any]], Type[str],
- # Type[float], Type[int], Type[complex], Type[bool], Type[object]]]"
- # [arg-type]
- dtype=dtype, # type: ignore[arg-type]
- na_value=na_value,
+ dtype=dtype, na_value=na_value
)
else:
# error: Argument 1 to "get_values" of "Block" has incompatible type
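
The `as_array`/`_interleave` dtype plumbing edited above is what `DataFrame.to_numpy(dtype=...)` ultimately calls; for orientation only:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.5, 4.5]})
    arr = df.to_numpy(dtype="float64")
    print(arr.dtype, arr.shape)  # float64 (2, 2)
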
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index f144821220e4b..8849eb0670faa 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -398,7 +398,7 @@ def interpolate_1d(
# preserve NaNs on the inside
preserve_nans |= mid_nans
- # sort preserve_nans and convert to list
+ # sort preserve_nans and convert to list
preserve_nans = sorted(preserve_nans)
result = yvalues.copy()
@@ -524,11 +524,11 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
Parameters
----------
- xi : array-like
+ xi : array_like
sorted 1D array of x-coordinates
- yi : array-like or list of array-likes
+ yi : array_like or list of array-likes
yi[i][j] is the j-th derivative known at xi[i]
- order: None or int or array-like of ints. Default: None.
+ order: None or int or array_like of ints. Default: None.
Specifies the degree of local polynomials. If not None, some
derivatives are ignored.
der : int or list
@@ -546,7 +546,7 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
Returns
-------
- y : scalar or array-like
+ y : scalar or array_like
The result, of length R or length M or M by R.
"""
from scipy import interpolate
@@ -568,13 +568,13 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):
Parameters
----------
- xi : array-like
+ xi : array_like
A sorted list of x-coordinates, of length N.
- yi : array-like
+ yi : array_like
A 1-D array of real values. `yi`'s length along the interpolation
axis must be equal to the length of `xi`. If N-D array, use axis
parameter to select correct axis.
- x : scalar or array-like
+ x : scalar or array_like
Of length M.
der : int, optional
How many derivatives to extract; None for all potentially
@@ -590,7 +590,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):
Returns
-------
- y : scalar or array-like
+ y : scalar or array_like
The result, of length R or length M or M by R,
"""
@@ -609,14 +609,14 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat
Parameters
----------
- xi : array-like, shape (n,)
+ xi : array_like, shape (n,)
1-d array containing values of the independent variable.
Values must be real, finite and in strictly increasing order.
- yi : array-like
+ yi : array_like
Array containing values of the dependent variable. It can have
arbitrary number of dimensions, but the length along ``axis``
(see below) must match the length of ``x``. Values must be finite.
- x : scalar or array-like, shape (m,)
+ x : scalar or array_like, shape (m,)
axis : int, optional
Axis along which `y` is assumed to be varying. Meaning that for
``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``.
@@ -644,7 +644,7 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat
tuple `(order, deriv_values)` allowing to specify arbitrary
derivatives at curve ends:
* `order`: the derivative order, 1 or 2.
- * `deriv_value`: array-like containing derivative values, shape must
+ * `deriv_value`: array_like containing derivative values, shape must
be the same as `y`, excluding ``axis`` dimension. For example, if
`y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with
the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D
@@ -661,7 +661,7 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat
Returns
-------
- y : scalar or array-like
+ y : scalar or array_like
The result, of shape (m,)
References
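
The docstring edits above belong to the scipy-backed interpolation helpers; their public entry point is `Series.interpolate`. A small example of the linear path (which does not require scipy):

    import numpy as np
    import pandas as pd

    s = pd.Series([0.0, np.nan, np.nan, 3.0])
    print(s.interpolate(method="linear").tolist())  # [0.0, 1.0, 2.0, 3.0]
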
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index db7289f7c3547..ecdf2624c8ec1 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -177,8 +177,10 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
def _has_infs(result) -> bool:
if isinstance(result, np.ndarray):
- if result.dtype == "f8" or result.dtype == "f4":
- return lib.has_infs(result.ravel("K"))
+ if result.dtype == "f8":
+ return lib.has_infs_f8(result.ravel("K"))
+ elif result.dtype == "f4":
+ return lib.has_infs_f4(result.ravel("K"))
try:
return np.isinf(result).any()
except (TypeError, NotImplementedError):
@@ -203,7 +205,7 @@ def _get_fill_value(
else:
if fill_value_typ == "+inf":
# need the max int here
- return lib.i8max
+ return np.iinfo(np.int64).max
else:
return iNaT
@@ -374,7 +376,7 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None):
result = np.nan
# raise if we have a timedelta64[ns] which is too large
- if np.fabs(result) > lib.i8max:
+ if np.fabs(result) > np.iinfo(np.int64).max:
raise ValueError("overflow in timedelta operation")
result = Timedelta(result, unit="ns")
@@ -581,7 +583,9 @@ def nansum(
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
- dtype_sum = np.dtype(np.float64)
+ # error: Incompatible types in assignment (expression has type
+ # "Type[float64]", variable has type "dtype")
+ dtype_sum = np.float64 # type: ignore[assignment]
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
@@ -599,9 +603,7 @@ def _mask_datetimelike_result(
# we need to apply the mask
result = result.astype("i8").view(orig_values.dtype)
axis_mask = mask.any(axis=axis)
- # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any],
- # datetime64, timedelta64]")
- result[axis_mask] = iNaT # type: ignore[index]
+ result[axis_mask] = iNaT
else:
if mask.any():
return NaT
@@ -753,10 +755,7 @@ def get_median(x):
def get_empty_reduction_result(
- shape: tuple[int, ...],
- axis: int,
- dtype: np.dtype | type[np.floating],
- fill_value: Any,
+ shape: tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any
) -> np.ndarray:
"""
The result from a reduction on an empty ndarray.
@@ -785,7 +784,7 @@ def _get_counts_nanvar(
axis: int | None,
ddof: int,
dtype: Dtype = float,
-) -> tuple[int | float | np.ndarray, int | float | np.ndarray]:
+) -> tuple[int | np.ndarray, int | np.ndarray]:
"""
Get the count of non-null values along an axis, accounting
for degrees of freedom.
@@ -805,12 +804,14 @@ def _get_counts_nanvar(
Returns
-------
- count : int, np.nan or np.ndarray
- d : int, np.nan or np.ndarray
+ count : scalar or array
+ d : scalar or array
"""
dtype = get_dtype(dtype)
count = _get_counts(values_shape, mask, axis, dtype=dtype)
- d = count - dtype.type(ddof)
+ # error: Unsupported operand types for - ("int" and "generic")
+ # error: Unsupported operand types for - ("float" and "generic")
+ d = count - dtype.type(ddof) # type: ignore[operator]
# always return NaN, never inf
if is_scalar(count):
@@ -818,13 +819,16 @@ def _get_counts_nanvar(
count = np.nan
d = np.nan
else:
- # count is not narrowed by is_scalar check
- count = cast(np.ndarray, count)
- mask = count <= ddof
- if mask.any():
- np.putmask(d, mask, np.nan)
- np.putmask(count, mask, np.nan)
- return count, d
+ # error: Incompatible types in assignment (expression has type
+ # "Union[bool, Any]", variable has type "ndarray")
+ mask2: np.ndarray = count <= ddof # type: ignore[assignment]
+ if mask2.any():
+ np.putmask(d, mask2, np.nan)
+ np.putmask(count, mask2, np.nan)
+ # error: Incompatible return value type (got "Tuple[Union[int, float,
+ # ndarray], Any]", expected "Tuple[Union[int, ndarray], Union[int,
+ # ndarray]]")
+ return count, d # type: ignore[return-value]
@bottleneck_switch(ddof=1)
@@ -1396,7 +1400,9 @@ def _get_counts(
n = mask.size - mask.sum()
else:
n = np.prod(values_shape)
- return dtype.type(n)
+ # error: Incompatible return value type (got "Union[Any, generic]",
+ # expected "Union[int, float, ndarray]")
+ return dtype.type(n) # type: ignore[return-value]
if mask is not None:
count = mask.shape[axis] - mask.sum(axis)
@@ -1404,7 +1410,9 @@ def _get_counts(
count = values_shape[axis]
if is_scalar(count):
- return dtype.type(count)
+ # error: Incompatible return value type (got "Union[Any, generic]",
+ # expected "Union[int, float, ndarray]")
+ return dtype.type(count) # type: ignore[return-value]
try:
return count.astype(dtype)
except AttributeError:
@@ -1750,7 +1758,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
if accum_func == np.minimum.accumulate:
# Note: the accum_func comparison fails as an "is" comparison
y = values.view("i8")
- y[mask] = lib.i8max
+ y[mask] = np.iinfo(np.int64).max
changed = True
else:
y = values
@@ -1777,9 +1785,8 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
# TODO: have this case go through a DTA method?
# For DatetimeTZDtype, view result as M8[ns]
npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]"
- # Item "type" of "Union[Type[ExtensionArray], Type[ndarray[Any, Any]]]"
- # has no attribute "_simple_new"
- result = type(values)._simple_new( # type: ignore[union-attr]
+ # error: "Type[ExtensionArray]" has no attribute "_simple_new"
+ result = type(values)._simple_new( # type: ignore[attr-defined]
result.view(npdtype), dtype=orig_dtype
)
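
For context, the nanops helpers touched above back the skipna reductions; `nansum` and `_get_counts_nanvar` sit behind `Series.sum` and `Series.var` respectively:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 3.0])
    print(s.sum())        # 4.0  (NaN skipped)
    print(s.var(ddof=1))  # 2.0  (count adjusted for the missing value)
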
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index b9a75a6917140..76e23f1bf77e0 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -4,10 +4,9 @@
from datetime import timedelta
from textwrap import dedent
from typing import (
+ TYPE_CHECKING,
Callable,
Hashable,
- Literal,
- final,
no_type_check,
)
@@ -28,7 +27,7 @@
T,
TimedeltaConvertibleTypes,
TimestampConvertibleTypes,
- npt,
+ final,
)
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
@@ -89,6 +88,9 @@
Tick,
)
+if TYPE_CHECKING:
+ from typing import Literal
+
_shared_docs_kwargs: dict[str, str] = {}
@@ -1769,8 +1771,9 @@ def _get_period_bins(self, ax: PeriodIndex):
def _take_new_index(
- obj: FrameOrSeries, indexer: npt.NDArray[np.intp], new_index: Index, axis: int = 0
+ obj: FrameOrSeries, indexer: np.ndarray, new_index: Index, axis: int = 0
) -> FrameOrSeries:
+ # indexer: np.ndarray[np.intp]
if isinstance(obj, ABCSeries):
new_values = algos.take_nd(obj._values, indexer)
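
`_take_new_index` above is used when reindexing onto the resampled index, e.g. when upsampling with `asfreq`. A minimal illustration:

    import pandas as pd

    idx = pd.date_range("2021-01-01", periods=2, freq="D")
    s = pd.Series([1, 2], index=idx)
    print(s.resample("12H").asfreq())
    # 2021-01-01 00:00:00    1.0
    # 2021-01-01 12:00:00    NaN
    # 2021-01-02 00:00:00    2.0
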
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index d908638c4706b..ea34bc75b4e31 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -15,6 +15,7 @@
import numpy as np
+from pandas._typing import FrameOrSeriesUnion
from pandas.util._decorators import (
cache_readonly,
deprecate_nonkeyword_arguments,
@@ -82,7 +83,7 @@ def concat(
verify_integrity: bool = False,
sort: bool = False,
copy: bool = True,
-) -> DataFrame | Series:
+) -> FrameOrSeriesUnion:
...
@@ -98,7 +99,7 @@ def concat(
verify_integrity: bool = False,
sort: bool = False,
copy: bool = True,
-) -> DataFrame | Series:
+) -> FrameOrSeriesUnion:
"""
Concatenate pandas objects along a particular axis with optional set logic
along the other axes.
@@ -361,13 +362,8 @@ def __init__(
clean_keys.append(k)
clean_objs.append(v)
objs = clean_objs
-
- if isinstance(keys, MultiIndex):
- # TODO: retain levels?
- keys = type(keys).from_tuples(clean_keys, names=keys.names)
- else:
- name = getattr(keys, "name", None)
- keys = Index(clean_keys, name=name)
+ name = getattr(keys, "name", None)
+ keys = Index(clean_keys, name=name)
if len(objs) == 0:
raise ValueError("All objects passed were None")
@@ -458,7 +454,7 @@ def __init__(
if self._is_frame and axis == 1:
name = 0
# mypy needs to know sample is not an NDFrame
- sample = cast("DataFrame | Series", sample)
+ sample = cast("FrameOrSeriesUnion", sample)
obj = sample._constructor({name: obj})
self.objs.append(obj)
@@ -478,8 +474,8 @@ def __init__(
self.new_axes = self._get_new_axes()
def get_result(self):
- cons: type[DataFrame | Series]
- sample: DataFrame | Series
+ cons: type[FrameOrSeriesUnion]
+ sample: FrameOrSeriesUnion
# series only
if self._is_series:
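
The `keys` handling changed above determines the outer level of the concatenated axis. For reference:

    import pandas as pd

    s1 = pd.Series([1], index=["a"])
    s2 = pd.Series([2], index=["a"])
    print(pd.concat([s1, s2], keys=["x", "y"]))
    # x  a    1
    # y  a    2
    # dtype: int64
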
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index acd6e540aaae3..6a0fad9ee729b 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -21,7 +21,6 @@
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.missing import notna
-import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical
import pandas.core.common as com
from pandas.core.indexes.api import (
@@ -107,7 +106,7 @@ def melt(
id_vars + value_vars
)
else:
- idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
+ idx = frame.columns.get_indexer(id_vars + value_vars)
frame = frame.iloc[:, idx]
else:
frame = frame.copy()
@@ -227,7 +226,7 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr
else:
keys, values = zip(*groups)
- all_cols = list(set.union(*(set(x) for x in values)))
+ all_cols = list(set.union(*[set(x) for x in values]))
id_cols = list(data.columns.difference(all_cols))
K = len(values[0])
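
The column-selection tweak above affects `melt` when both `id_vars` and `value_vars` are passed. A small example of that call:

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "a": [3, 4], "b": [5, 6]})
    print(df.melt(id_vars=["id"], value_vars=["a", "b"]))
    #    id variable  value
    # 0   1        a      3
    # 1   2        a      4
    # 2   1        b      5
    # 3   2        b      6
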
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index f82fcfcf172a9..143999a4677b3 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -29,7 +29,6 @@
FrameOrSeries,
IndexLabel,
Suffixes,
- npt,
)
from pandas.errors import MergeError
from pandas.util._decorators import (
@@ -1004,7 +1003,7 @@ def _create_join_index(
self,
index: Index,
other_index: Index,
- indexer: npt.NDArray[np.intp],
+ indexer: np.ndarray,
how: str = "left",
) -> Index:
"""
@@ -1449,7 +1448,7 @@ def _validate(self, validate: str) -> None:
def get_join_indexers(
left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+) -> tuple[np.ndarray, np.ndarray]:
"""
Parameters
@@ -1508,9 +1507,9 @@ def restore_dropped_levels_multijoin(
right: MultiIndex,
dropped_level_names,
join_index: Index,
- lindexer: npt.NDArray[np.intp],
- rindexer: npt.NDArray[np.intp],
-) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
+ lindexer: np.ndarray,
+ rindexer: np.ndarray,
+) -> tuple[list[Index], np.ndarray, list[Hashable]]:
"""
*this is an internal non-public method*
@@ -1540,7 +1539,7 @@ def restore_dropped_levels_multijoin(
-------
levels : list of Index
levels of combined multiindexes
- labels : np.ndarray[np.intp]
+ labels : intp array
labels of combined multiindexes
names : List[Hashable]
names of combined multiindex levels
@@ -2056,7 +2055,7 @@ def _left_join_on_index(
def _factorize_keys(
lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+) -> tuple[np.ndarray, np.ndarray, int]:
"""
Encode left and right keys as enumerated types.
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index fcf00276aa8af..51556fda6da04 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -14,6 +14,7 @@
AggFuncType,
AggFuncTypeBase,
AggFuncTypeDict,
+ FrameOrSeriesUnion,
IndexLabel,
)
from pandas.util._decorators import (
@@ -253,7 +254,7 @@ def __internal_pivot_table(
def _add_margins(
- table: DataFrame | Series,
+ table: FrameOrSeriesUnion,
data,
values,
rows,
@@ -481,7 +482,7 @@ def pivot(
if columns is None:
raise TypeError("pivot() missing 1 required argument: 'columns'")
- columns_listlike = com.convert_to_list_like(columns)
+ columns = com.convert_to_list_like(columns)
if values is None:
if index is not None:
@@ -493,27 +494,28 @@ def pivot(
# error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
# error: Unsupported left operand type for + ("ExtensionArray")
indexed = data.set_index(
- cols + columns_listlike, append=append # type: ignore[operator]
+ cols + columns, append=append # type: ignore[operator]
)
else:
if index is None:
- index_list = [Series(data.index, name=data.index.name)]
+ index = [Series(data.index, name=data.index.name)]
else:
- index_list = [data[idx] for idx in com.convert_to_list_like(index)]
+ index = com.convert_to_list_like(index)
+ index = [data[idx] for idx in index]
- data_columns = [data[col] for col in columns_listlike]
- index_list.extend(data_columns)
- multiindex = MultiIndex.from_arrays(index_list)
+ data_columns = [data[col] for col in columns]
+ index.extend(data_columns)
+ index = MultiIndex.from_arrays(index)
if is_list_like(values) and not isinstance(values, tuple):
# Exclude tuple because it is seen as a single column name
values = cast(Sequence[Hashable], values)
indexed = data._constructor(
- data[values]._values, index=multiindex, columns=values
+ data[values]._values, index=index, columns=values
)
else:
- indexed = data._constructor_sliced(data[values]._values, index=multiindex)
- return indexed.unstack(columns_listlike)
+ indexed = data._constructor_sliced(data[values]._values, index=index)
+ return indexed.unstack(columns)
def crosstab(
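
`pivot` above rebuilds the index from `index`/`columns` before unstacking; a quick usage sketch:

    import pandas as pd

    df = pd.DataFrame({"idx": [1, 1, 2], "col": ["a", "b", "a"], "val": [10, 20, 30]})
    print(df.pivot(index="idx", columns="col", values="val"))
    # col      a     b
    # idx
    # 1     10.0  20.0
    # 2     30.0   NaN
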
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 12ab08c4e30a1..93859eb11dd44 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -10,10 +10,7 @@
import pandas._libs.reshape as libreshape
from pandas._libs.sparse import IntIndex
-from pandas._typing import (
- Dtype,
- npt,
-)
+from pandas._typing import Dtype
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.cast import maybe_promote
@@ -28,13 +25,11 @@
is_object_dtype,
needs_i8_conversion,
)
-from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna
import pandas.core.algorithms as algos
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
-from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
Index,
@@ -139,7 +134,7 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None):
def _indexer_and_to_sort(
self,
) -> tuple[
- npt.NDArray[np.intp],
+ np.ndarray, # np.ndarray[np.intp]
list[np.ndarray], # each has _some_ signed integer dtype
]:
v = self.level
@@ -238,22 +233,15 @@ def get_new_values(self, values, fill_value=None):
if mask_all:
dtype = values.dtype
new_values = np.empty(result_shape, dtype=dtype)
- name = np.dtype(dtype).name
else:
dtype, fill_value = maybe_promote(values.dtype, fill_value)
- if isinstance(dtype, ExtensionDtype):
- # GH#41875
- cls = dtype.construct_array_type()
- new_values = cls._empty(result_shape, dtype=dtype)
- new_values[:] = fill_value
- name = dtype.name
- else:
- new_values = np.empty(result_shape, dtype=dtype)
- new_values.fill(fill_value)
- name = np.dtype(dtype).name
+ new_values = np.empty(result_shape, dtype=dtype)
+ new_values.fill(fill_value)
new_mask = np.zeros(result_shape, dtype=bool)
+ name = np.dtype(dtype).name
+
# we need to convert to a basic dtype
# and possibly coerce an input to our output dtype
# e.g. ints -> floats
@@ -279,10 +267,6 @@ def get_new_values(self, values, fill_value=None):
# reconstruct dtype if needed
if needs_i8_conversion(values.dtype):
- # view as datetime64 so we can wrap in DatetimeArray and use
- # DTA's view method
- new_values = new_values.view("M8[ns]")
- new_values = ensure_wrapped_if_datetimelike(new_values)
new_values = new_values.view(values.dtype)
return new_values, new_mask
@@ -1020,9 +1004,7 @@ def get_empty_frame(data) -> DataFrame:
fill_value: bool | float | int
if is_integer_dtype(dtype):
fill_value = 0
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[bool]")
- elif dtype == bool: # type: ignore[comparison-overlap]
+ elif dtype == bool:
fill_value = False
else:
fill_value = 0.0
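
`get_new_values` above is the core of `unstack`; an orientation example of the operation it implements:

    import pandas as pd

    midx = pd.MultiIndex.from_product([["a", "b"], ["x", "y"]])
    s = pd.Series([1, 2, 3, 4], index=midx)
    print(s.unstack())
    #    x  y
    # a  1  2
    # b  3  4
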
diff --git a/pandas/core/sample.py b/pandas/core/sample.py
deleted file mode 100644
index e4bad22e8e43c..0000000000000
--- a/pandas/core/sample.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""
-Module containing utilities for NDFrame.sample() and .GroupBy.sample()
-"""
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import FrameOrSeries
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-
-def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
- """
- Process and validate the `weights` argument to `NDFrame.sample` and
- `.GroupBy.sample`.
-
- Returns `weights` as an ndarray[np.float64], validated except for normalizing
- weights (because that must be done groupwise in groupby sampling).
- """
- # If a series, align with frame
- if isinstance(weights, ABCSeries):
- weights = weights.reindex(obj.axes[axis])
-
- # Strings acceptable if a dataframe and axis = 0
- if isinstance(weights, str):
- if isinstance(obj, ABCDataFrame):
- if axis == 0:
- try:
- weights = obj[weights]
- except KeyError as err:
- raise KeyError(
- "String passed to weights not a valid column"
- ) from err
- else:
- raise ValueError(
- "Strings can only be passed to "
- "weights when sampling from rows on "
- "a DataFrame"
- )
- else:
- raise ValueError(
- "Strings cannot be passed as weights when sampling from a Series."
- )
-
- if isinstance(obj, ABCSeries):
- func = obj._constructor
- else:
- func = obj._constructor_sliced
-
- weights = func(weights, dtype="float64")._values
-
- if len(weights) != obj.shape[axis]:
- raise ValueError("Weights and axis to be sampled must be of same length")
-
- if lib.has_infs(weights):
- raise ValueError("weight vector may not include `inf` values")
-
- if (weights < 0).any():
- raise ValueError("weight vector many not include negative values")
-
- weights[np.isnan(weights)] = 0
- return weights
-
-
-def process_sampling_size(
- n: int | None, frac: float | None, replace: bool
-) -> int | None:
- """
- Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
- `.GroupBy.sample`.
-
- Returns None if `frac` should be used (variable sampling sizes), otherwise returns
- the constant sampling size.
- """
- # If no frac or n, default to n=1.
- if n is None and frac is None:
- n = 1
- elif n is not None and frac is not None:
- raise ValueError("Please enter a value for `frac` OR `n`, not both")
- elif n is not None:
- if n < 0:
- raise ValueError(
- "A negative number of rows requested. Please provide `n` >= 0."
- )
- if n % 1 != 0:
- raise ValueError("Only integers accepted as `n` values")
- else:
- assert frac is not None # for mypy
- if frac > 1 and not replace:
- raise ValueError(
- "Replace has to be set to `True` when "
- "upsampling the population `frac` > 1."
- )
- if frac < 0:
- raise ValueError(
- "A negative number of rows requested. Please provide `frac` >= 0."
- )
-
- return n
-
-
-def sample(
- obj_len: int,
- size: int,
- replace: bool,
- weights: np.ndarray | None,
- random_state: np.random.RandomState | np.random.Generator,
-) -> np.ndarray:
- """
- Randomly sample `size` indices in `np.arange(obj_len)`
-
- Parameters
- ----------
- obj_len : int
- The length of the indices being considered
- size : int
- The number of values to choose
- replace : bool
- Allow or disallow sampling of the same row more than once.
- weights : np.ndarray[np.float64] or None
- If None, equal probability weighting, otherwise weights according
- to the vector normalized
- random_state: np.random.RandomState or np.random.Generator
- State used for the random sampling
-
- Returns
- -------
- np.ndarray[np.intp]
- """
- if weights is not None:
- weight_sum = weights.sum()
- if weight_sum != 0:
- weights = weights / weight_sum
- else:
- raise ValueError("Invalid weights: weights sum to zero")
-
- return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(
- np.intp, copy=False
- )
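
With pandas/core/sample.py removed above, the weighted-sampling validation lives back inside `NDFrame.sample` itself; the user-facing call is unchanged:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3]})
    print(df.sample(n=2, weights=[0.1, 0.1, 0.8], random_state=0))
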
diff --git a/pandas/core/series.py b/pandas/core/series.py
index e61ce8e74629b..59ea6710ea6cd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -13,7 +13,6 @@
Callable,
Hashable,
Iterable,
- Literal,
Sequence,
Union,
cast,
@@ -40,13 +39,12 @@
Dtype,
DtypeObj,
FillnaOptions,
+ FrameOrSeriesUnion,
IndexKeyFunc,
+ NpDtype,
SingleManager,
StorageOptions,
- TimedeltaConvertibleTypes,
- TimestampConvertibleTypes,
ValueKeyFunc,
- npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import InvalidIndexError
@@ -143,6 +141,12 @@
import pandas.plotting
if TYPE_CHECKING:
+ from typing import Literal
+
+ from pandas._typing import (
+ TimedeltaConvertibleTypes,
+ TimestampConvertibleTypes,
+ )
from pandas.core.frame import DataFrame
from pandas.core.groupby.generic import SeriesGroupBy
@@ -198,7 +202,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN).
- Operations between Series (+, -, /, \\*, \\*\\*) align values based on their
+ Operations between Series (+, -, /, *, **) align values based on their
associated index values-- they need not be the same length. The result
index will be the sorted union of the two indexes.
@@ -301,6 +305,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
hasnans = property( # type: ignore[assignment]
base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__
)
+ __hash__ = generic.NDFrame.__hash__
_mgr: SingleManager
div: Callable[[Series, Any], Series]
rdiv: Callable[[Series, Any], Series]
@@ -383,7 +388,9 @@ def __init__(
copy = False
elif isinstance(data, np.ndarray):
- if len(data.dtype):
+ # error: Argument 1 to "len" has incompatible type "dtype"; expected
+ # "Sized"
+ if len(data.dtype): # type: ignore[arg-type]
# GH#13296 we are dealing with a compound dtype, which
# should be treated as 2D
raise ValueError(
@@ -803,7 +810,7 @@ def view(self, dtype: Dtype | None = None) -> Series:
# NDArray Compat
_HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
- def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
+ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
"""
Return the values as a NumPy array.
@@ -931,7 +938,7 @@ def __getitem__(self, key):
if isinstance(key, (list, tuple)):
key = unpack_1tuple(key)
- if is_integer(key) and self.index._should_fallback_to_positional:
+ if is_integer(key) and self.index._should_fallback_to_positional():
return self._values[key]
elif key_is_scalar:
@@ -993,7 +1000,7 @@ def _get_with(self, key):
if key_type == "integer":
# We need to decide whether to treat this as a positional indexer
# (i.e. self.iloc) or label-based (i.e. self.loc)
- if not self.index._should_fallback_to_positional:
+ if not self.index._should_fallback_to_positional():
return self.loc[key]
else:
return self.iloc[key]
@@ -1054,35 +1061,19 @@ def __setitem__(self, key, value) -> None:
if key is Ellipsis:
key = slice(None)
- if isinstance(key, slice):
- indexer = self.index._convert_slice_indexer(key, kind="getitem")
- return self._set_values(indexer, value)
-
try:
self._set_with_engine(key, value)
except (KeyError, ValueError):
values = self._values
if is_integer(key) and self.index.inferred_type != "integer":
# positional setter
- if not self.index._should_fallback_to_positional:
- # GH#33469
- warnings.warn(
- "Treating integers as positional in Series.__setitem__ "
- "with a Float64Index is deprecated. In a future version, "
- "`series[an_int] = val` will insert a new key into the "
- "Series. Use `series.iloc[an_int] = val` to treat the "
- "key as positional.",
- FutureWarning,
- stacklevel=2,
- )
values[key] = value
else:
# GH#12862 adding a new key to the Series
self.loc[key] = value
- except (InvalidIndexError, TypeError) as err:
+ except TypeError as err:
if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
- # cases with MultiIndex don't get here bc they raise KeyError
raise KeyError(
"key of type tuple not found and not a MultiIndex"
) from err
@@ -1103,7 +1094,8 @@ def __setitem__(self, key, value) -> None:
self._maybe_update_cacher()
def _set_with_engine(self, key, value) -> None:
- loc = self.index.get_loc(key)
+ # fails with AttributeError for IntervalIndex
+ loc = self.index._engine.get_loc(key)
# error: Argument 1 to "validate_numeric_casting" has incompatible type
# "Union[dtype, ExtensionDtype]"; expected "dtype"
validate_numeric_casting(self.dtype, value) # type: ignore[arg-type]
@@ -1111,25 +1103,31 @@ def _set_with_engine(self, key, value) -> None:
def _set_with(self, key, value):
# other: fancy integer or otherwise
- assert not isinstance(key, tuple)
+ if isinstance(key, slice):
+ indexer = self.index._convert_slice_indexer(key, kind="getitem")
+ return self._set_values(indexer, value)
- if is_scalar(key):
- key = [key]
- elif is_iterator(key):
- # Without this, the call to infer_dtype will consume the generator
- key = list(key)
+ else:
+ assert not isinstance(key, tuple)
- key_type = lib.infer_dtype(key, skipna=False)
+ if is_scalar(key):
+ key = [key]
- # Note: key_type == "boolean" should not occur because that
- # should be caught by the is_bool_indexer check in __setitem__
- if key_type == "integer":
- if not self.index._should_fallback_to_positional:
- self._set_labels(key, value)
+ if isinstance(key, Index):
+ key_type = key.inferred_type
+ key = key._values
else:
- self._set_values(key, value)
- else:
- self.loc[key] = value
+ key_type = lib.infer_dtype(key, skipna=False)
+
+ # Note: key_type == "boolean" should not occur because that
+ # should be caught by the is_bool_indexer check in __setitem__
+ if key_type == "integer":
+ if not self.index._should_fallback_to_positional():
+ self._set_labels(key, value)
+ else:
+ self._set_values(key, value)
+ else:
+ self.loc[key] = value
def _set_labels(self, key, value) -> None:
key = com.asarray_tuplesafe(key)
@@ -1140,7 +1138,7 @@ def _set_labels(self, key, value) -> None:
self._set_values(indexer, value)
def _set_values(self, key, value) -> None:
- if isinstance(key, (Index, Series)):
+ if isinstance(key, Series):
key = key._values
self._mgr = self._mgr.setitem(indexer=key, value=value)
@@ -1557,7 +1555,8 @@ def to_string(
klass=_shared_doc_kwargs["klass"],
storage_options=generic._shared_docs["storage_options"],
examples=dedent(
- """Examples
+ """
+ Examples
--------
>>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal")
>>> print(s.to_markdown())
@@ -1567,21 +1566,7 @@ def to_string(
| 1 | pig |
| 2 | dog |
| 3 | quetzal |
-
- Output markdown with a tabulate option.
-
- >>> print(s.to_markdown(tablefmt="grid"))
- +----+----------+
- | | animal |
- +====+==========+
- | 0 | elk |
- +----+----------+
- | 1 | pig |
- +----+----------+
- | 2 | dog |
- +----+----------+
- | 3 | quetzal |
- +----+----------+"""
+ """
),
)
def to_markdown(
@@ -1624,7 +1609,31 @@ def to_markdown(
-----
Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
- {examples}
+ Examples
+ --------
+ >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal")
+ >>> print(s.to_markdown())
+ | | animal |
+ |---:|:---------|
+ | 0 | elk |
+ | 1 | pig |
+ | 2 | dog |
+ | 3 | quetzal |
+
+ Output markdown with a tabulate option.
+
+ >>> print(s.to_markdown(tablefmt="grid"))
+ +----+----------+
+ | | animal |
+ +====+==========+
+ | 0 | elk |
+ +----+----------+
+ | 1 | pig |
+ +----+----------+
+ | 2 | dog |
+ +----+----------+
+ | 3 | quetzal |
+ +----+----------+
"""
return self.to_frame().to_markdown(
buf, mode, index, storage_options=storage_options, **kwargs
@@ -3013,7 +3022,7 @@ def compare(
align_axis: Axis = 1,
keep_shape: bool = False,
keep_equal: bool = False,
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
return super().compare(
other=other,
align_axis=align_axis,
@@ -3856,65 +3865,6 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
"""
return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest()
- @doc(
- klass=_shared_doc_kwargs["klass"],
- extra_params=dedent(
- """copy : bool, default True
- Whether to copy underlying data."""
- ),
- examples=dedent(
- """Examples
- --------
- >>> s = pd.Series(
- ... ["A", "B", "A", "C"],
- ... index=[
- ... ["Final exam", "Final exam", "Coursework", "Coursework"],
- ... ["History", "Geography", "History", "Geography"],
- ... ["January", "February", "March", "April"],
- ... ],
- ... )
- >>> s
- Final exam History January A
- Geography February B
- Coursework History March A
- Geography April C
- dtype: object
-
- In the following example, we will swap the levels of the indices.
- Here, we will swap the levels column-wise, but levels can be swapped row-wise
- in a similar manner. Note that column-wise is the default behaviour.
- By not supplying any arguments for i and j, we swap the last and second to
- last indices.
-
- >>> s.swaplevel()
- Final exam January History A
- February Geography B
- Coursework March History A
- April Geography C
- dtype: object
-
- By supplying one argument, we can choose which index to swap the last
- index with. We can for example swap the first index with the last one as
- follows.
-
- >>> s.swaplevel(0)
- January History Final exam A
- February Geography Final exam B
- March History Coursework A
- April Geography Coursework C
- dtype: object
-
- We can also define explicitly which indices we want to swap by supplying values
- for both i and j. Here, we for example swap the first and second indices.
-
- >>> s.swaplevel(0, 1)
- History Final exam January A
- Geography Final exam February B
- History Coursework March A
- Geography Coursework April C
- dtype: object"""
- ),
- )
def swaplevel(self, i=-2, j=-1, copy=True) -> Series:
"""
Swap levels i and j in a :class:`MultiIndex`.
@@ -3923,16 +3873,15 @@ def swaplevel(self, i=-2, j=-1, copy=True) -> Series:
Parameters
----------
- i, j : int or str
- Levels of the indices to be swapped. Can pass level name as string.
- {extra_params}
+ i, j : int, str
+ Level of the indices to be swapped. Can pass level name as string.
+ copy : bool, default True
+ Whether to copy underlying data.
Returns
-------
- {klass}
- {klass} with levels swapped in MultiIndex.
-
- {examples}
+ Series
+ Series with levels swapped in MultiIndex.
"""
assert isinstance(self.index, MultiIndex)
new_index = self.index.swaplevel(i, j)
@@ -4229,7 +4178,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs):
)
def transform(
self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
# Validate axis argument
self._get_axis_number(axis)
result = SeriesApply(
@@ -4243,7 +4192,7 @@ def apply(
convert_dtype: bool = True,
args: tuple[Any, ...] = (),
**kwargs,
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
"""
Invoke function on values of Series.
@@ -4257,7 +4206,7 @@ def apply(
convert_dtype : bool, default True
Try to find better dtype for elementwise function results. If
False, leave as dtype=object. Note that the dtype is always
- preserved for some extension array dtypes, such as Categorical.
+ preserved for extension array dtypes, such as Categorical.
args : tuple
Positional arguments passed to func after the series value.
**kwargs
@@ -5020,7 +4969,7 @@ def isin(self, values) -> Series:
self, method="isin"
)
- def between(self, left, right, inclusive="both") -> Series:
+ def between(self, left, right, inclusive=True) -> Series:
"""
Return boolean Series equivalent to left <= series <= right.
@@ -5034,10 +4983,8 @@ def between(self, left, right, inclusive="both") -> Series:
Left boundary.
right : scalar or list-like
Right boundary.
- inclusive : {"both", "neither", "left", "right"}
- Include boundaries. Whether to set each bound as closed or open.
-
- .. versionchanged:: 1.3.0
+ inclusive : bool, default True
+ Include boundaries.
Returns
-------
@@ -5068,9 +5015,9 @@ def between(self, left, right, inclusive="both") -> Series:
4 False
dtype: bool
- With `inclusive` set to ``"neither"`` boundary values are excluded:
+ With `inclusive` set to ``False`` boundary values are excluded:
- >>> s.between(1, 4, inclusive="neither")
+ >>> s.between(1, 4, inclusive=False)
0 True
1 False
2 False
@@ -5088,34 +5035,12 @@ def between(self, left, right, inclusive="both") -> Series:
3 False
dtype: bool
"""
- if inclusive is True or inclusive is False:
- warnings.warn(
- "Boolean inputs to the `inclusive` argument are deprecated in"
- "favour of `both` or `neither`.",
- FutureWarning,
- stacklevel=2,
- )
- if inclusive:
- inclusive = "both"
- else:
- inclusive = "neither"
- if inclusive == "both":
+ if inclusive:
lmask = self >= left
rmask = self <= right
- elif inclusive == "left":
- lmask = self >= left
- rmask = self < right
- elif inclusive == "right":
- lmask = self > left
- rmask = self <= right
- elif inclusive == "neither":
+ else:
lmask = self > left
rmask = self < right
- else:
- raise ValueError(
- "Inclusive has to be either string of 'both',"
- "'left', 'right', or 'neither'."
- )
return lmask & rmask
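
A minimal sketch of the boolean ``inclusive`` behaviour restored above, on a toy Series:

    import pandas as pd

    s = pd.Series([2, 0, 4, 8, 5])
    # inclusive=True builds closed bounds: left <= s <= right
    s.between(1, 4, inclusive=True)   # True, False, True, False, False
    # inclusive=False builds open bounds: left < s < right
    s.between(1, 4, inclusive=False)  # True, False, False, False, False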
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index befa67350e182..8531f93fba321 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -21,7 +21,6 @@
from pandas._typing import (
IndexKeyFunc,
Shape,
- npt,
)
from pandas.core.dtypes.common import (
@@ -41,6 +40,8 @@
from pandas import MultiIndex
from pandas.core.indexes.base import Index
+_INT64_MAX = np.iinfo(np.int64).max
+
def get_indexer_indexer(
target: Index,
@@ -132,7 +133,7 @@ def _int64_cut_off(shape) -> int:
acc = 1
for i, mul in enumerate(shape):
acc *= int(mul)
- if not acc < lib.i8max:
+ if not acc < _INT64_MAX:
return i
return len(shape)
@@ -152,7 +153,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
labels = list(labels)
# Iteratively process all the labels in chunks sized so less
- # than lib.i8max unique int ids will be required for each chunk
+ # than _INT64_MAX unique int ids will be required for each chunk
while True:
# how many levels can be done without overflow:
nlev = _int64_cut_off(lshape)
@@ -187,9 +188,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
return out
-def get_compressed_ids(
- labels, sizes: Shape
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
+def get_compressed_ids(labels, sizes: Shape) -> tuple[np.ndarray, np.ndarray]:
"""
Group_index is offsets into cartesian product of all possible labels. This
space can be huge, so this function compresses it, by computing offsets
@@ -216,7 +215,7 @@ def is_int64_overflow_possible(shape) -> bool:
for x in shape:
the_prod *= int(x)
- return the_prod >= lib.i8max
+ return the_prod >= _INT64_MAX
def decons_group_index(comp_labels, shape):
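
A small sketch of the overflow guard that the new ``_INT64_MAX`` constant feeds, mirroring the check just above (the helper name and sizes are illustrative only):

    import numpy as np

    _INT64_MAX = np.iinfo(np.int64).max

    def product_overflows_int64(shape) -> bool:
        # Same arithmetic as is_int64_overflow_possible: the cartesian product
        # of all level sizes must stay below the int64 maximum, otherwise the
        # group ids are built and compressed chunk by chunk.
        acc = 1
        for size in shape:
            acc *= int(size)
        return acc >= _INT64_MAX

    product_overflows_int64((1_000, 1_000))          # False
    product_overflows_int64((10**7, 10**7, 10**6))   # True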
@@ -239,9 +238,7 @@ def decons_group_index(comp_labels, shape):
return label_list[::-1]
-def decons_obs_group_ids(
- comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool
-):
+def decons_obs_group_ids(comp_ids: np.ndarray, obs_ids, shape, labels, xnull: bool):
"""
Reconstruct labels from observed group ids.
@@ -265,9 +262,8 @@ def decons_obs_group_ids(
return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
-def indexer_from_factorized(
- labels, shape: Shape, compress: bool = True
-) -> npt.NDArray[np.intp]:
+def indexer_from_factorized(labels, shape: Shape, compress: bool = True) -> np.ndarray:
+ # returned ndarray is np.intp
ids = get_group_index(labels, shape, sort=True, xnull=False)
if not compress:
@@ -281,7 +277,7 @@ def indexer_from_factorized(
def lexsort_indexer(
keys, orders=None, na_position: str = "last", key: Callable | None = None
-) -> npt.NDArray[np.intp]:
+) -> np.ndarray:
"""
Performs lexical sorting on a set of keys
@@ -353,7 +349,7 @@ def nargsort(
na_position: str = "last",
key: Callable | None = None,
mask: np.ndarray | None = None,
-) -> npt.NDArray[np.intp]:
+):
"""
Intended to be a drop-in replacement for np.argsort which handles NaNs.
@@ -558,7 +554,7 @@ def ensure_key_mapped(values, key: Callable | None, levels=None):
def get_flattened_list(
- comp_ids: npt.NDArray[np.intp],
+ comp_ids: np.ndarray, # np.ndarray[np.intp]
ngroups: int,
levels: Iterable[Index],
labels: Iterable[np.ndarray],
@@ -608,8 +604,8 @@ def get_indexer_dict(
def get_group_index_sorter(
- group_index: npt.NDArray[np.intp], ngroups: int | None = None
-) -> npt.NDArray[np.intp]:
+ group_index: np.ndarray, ngroups: int | None = None
+) -> np.ndarray:
"""
algos.groupsort_indexer implements `counting sort` and it is at least
O(ngroups), where
@@ -652,7 +648,7 @@ def get_group_index_sorter(
def compress_group_index(
group_index: np.ndarray, sort: bool = True
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
+) -> tuple[np.ndarray, np.ndarray]:
"""
Group_index is offsets into cartesian product of all possible labels. This
space can be huge, so this function compresses it, by computing offsets
@@ -673,8 +669,8 @@ def compress_group_index(
def _reorder_by_uniques(
- uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
+ uniques: np.ndarray, labels: np.ndarray
+) -> tuple[np.ndarray, np.ndarray]:
"""
Parameters
----------
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 717287360df8f..323cb6bd9fedd 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -13,7 +13,10 @@
import numpy as np
import pandas._libs.lib as lib
-from pandas._typing import DtypeObj
+from pandas._typing import (
+ DtypeObj,
+ FrameOrSeriesUnion,
+)
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import (
@@ -36,11 +39,7 @@
from pandas.core.base import NoNewAttributesMixin
if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
+ from pandas import Index
_shared_docs: dict[str, str] = {}
_cpython_optimized_encoders = (
@@ -2315,7 +2314,7 @@ def findall(self, pat, flags=0):
@forbid_nonstring_types(["bytes"])
def extract(
self, pat: str, flags: int = 0, expand: bool = True
- ) -> DataFrame | Series | Index:
+ ) -> FrameOrSeriesUnion | Index:
r"""
Extract capture groups in the regex `pat` as columns in a DataFrame.
@@ -3006,7 +3005,7 @@ def casefold(self):
"isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
)
isspace = _map_and_wrap(
- "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
+ "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
)
islower = _map_and_wrap(
"islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 26349a3b2c6c1..014a702618bda 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -194,9 +194,9 @@ def _maybe_cache(
if len(unique_dates) < len(arg):
cache_dates = convert_listlike(unique_dates, format)
cache_array = Series(cache_dates, index=unique_dates)
- # GH#39882 and GH#35888 in case of None and NaT we get duplicates
- if not cache_array.index.is_unique:
- cache_array = cache_array[~cache_array.index.duplicated()]
+ if not cache_array.is_unique:
+ # GH#39882 in case of None and NaT we get duplicates
+ cache_array = cache_array.drop_duplicates()
return cache_array
@@ -762,9 +762,7 @@ def to_datetime(
If parsing succeeded.
Return type depends on input:
- - list-like:
- - DatetimeIndex, if timezone naive or aware with the same timezone
- - Index of object dtype, if timezone aware with mixed time offsets
+ - list-like: DatetimeIndex
- Series: Series of datetime64 dtype
- scalar: Timestamp
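
The simplified return-type list above, illustrated with timezone-naive inputs:

    import pandas as pd

    pd.to_datetime(["2021-01-01", "2021-01-02"])     # DatetimeIndex
    pd.to_datetime(pd.Series(["2021-01-01", None]))  # Series of datetime64[ns]
    pd.to_datetime("2021-01-01")                     # Timestamp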
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 7d2bb75934c33..6dfd67f5dc5ec 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -190,7 +190,7 @@ def to_numeric(arg, errors="raise", downcast=None):
# attempt downcast only if the data has been successfully converted
# to a numerical dtype and if a downcast method has been specified
if downcast is not None and is_numeric_dtype(values.dtype):
- typecodes: str | None = None
+ typecodes = None
if downcast in ("integer", "signed"):
typecodes = np.typecodes["Integer"]
@@ -208,8 +208,8 @@ def to_numeric(arg, errors="raise", downcast=None):
if typecodes is not None:
# from smallest to largest
- for typecode in typecodes:
- dtype = np.dtype(typecode)
+ for dtype in typecodes:
+ dtype = np.dtype(dtype)
if dtype.itemsize <= values.dtype.itemsize:
values = maybe_downcast_numeric(values, dtype)
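
The downcast loop above walks the numpy typecodes from smallest to largest; the effect seen through the public API:

    import pandas as pd

    s = pd.Series(["1", "2", "3"])
    pd.to_numeric(s)                      # int64
    pd.to_numeric(s, downcast="integer")  # int8, the smallest fitting integer
    pd.to_numeric(s, downcast="float")    # float32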
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index aa8ec157265ce..fb5002648b6a5 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -14,9 +14,11 @@
import numpy as np
-from pandas._libs import lib
from pandas._libs.hashing import hash_object_array
-from pandas._typing import ArrayLike
+from pandas._typing import (
+ ArrayLike,
+ FrameOrSeriesUnion,
+)
from pandas.core.dtypes.common import (
is_categorical_dtype,
@@ -32,7 +34,6 @@
if TYPE_CHECKING:
from pandas import (
Categorical,
- DataFrame,
Index,
MultiIndex,
Series,
@@ -76,7 +77,7 @@ def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndar
def hash_pandas_object(
- obj: Index | DataFrame | Series,
+ obj: Index | FrameOrSeriesUnion,
index: bool = True,
encoding: str = "utf8",
hash_key: str | None = _default_hash_key,
@@ -137,10 +138,7 @@ def hash_pandas_object(
ser = Series(h, index=obj.index, dtype="uint64", copy=False)
elif isinstance(obj, ABCDataFrame):
- hashes = (
- hash_array(series._values, encoding, hash_key, categorize)
- for _, series in obj.items()
- )
+ hashes = (hash_array(series._values) for _, series in obj.items())
num_items = len(obj.columns)
if index:
index_hash_generator = (
@@ -246,7 +244,7 @@ def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndar
result = np.zeros(len(mask), dtype="uint64")
if mask.any():
- result[mask] = lib.u8max
+ result[mask] = np.iinfo(np.uint64).max
return result
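
A short sketch of the public entry point whose internals change above; the uint64 maximum in the last hunk is the sentinel that masked categorical codes hash to:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    # One uint64 hash per row, combined column-wise (and with the index hash
    # when index=True).
    pd.util.hash_pandas_object(df, index=True)

    np.iinfo(np.uint64).max  # 18446744073709551615, the masked-value sentinel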
diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py
index b80a73a930818..df69553a74683 100644
--- a/pandas/core/window/doc.py
+++ b/pandas/core/window/doc.py
@@ -94,8 +94,8 @@ def create_section_header(header: str) -> str:
).replace("\n", "", 1)
numba_notes = (
- "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for "
- "extended documentation and performance considerations for the Numba engine.\n\n"
+ "See :ref:`window.numba_engine` for extended documentation "
+ "and performance considerations for the Numba engine.\n\n"
)
window_agg_numba_parameters = dedent(
diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py
index ee99692b85432..4187c56079060 100644
--- a/pandas/core/window/ewm.py
+++ b/pandas/core/window/ewm.py
@@ -3,7 +3,6 @@
import datetime
from functools import partial
from textwrap import dedent
-from typing import TYPE_CHECKING
import warnings
import numpy as np
@@ -13,12 +12,9 @@
from pandas._typing import (
Axis,
FrameOrSeries,
+ FrameOrSeriesUnion,
TimedeltaConvertibleTypes,
)
-
-if TYPE_CHECKING:
- from pandas import DataFrame, Series
-
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc
@@ -44,14 +40,7 @@
ExponentialMovingWindowIndexer,
GroupbyIndexer,
)
-from pandas.core.window.numba_ import (
- generate_ewma_numba_table_func,
- generate_numba_ewma_func,
-)
-from pandas.core.window.online import (
- EWMMeanState,
- generate_online_numba_ewma_func,
-)
+from pandas.core.window.numba_ import generate_numba_ewma_func
from pandas.core.window.rolling import (
BaseWindow,
BaseWindowGroupby,
@@ -207,16 +196,6 @@ class ExponentialMovingWindow(BaseWindow):
If 1-D array like, a sequence with the same shape as the observations.
Only applicable to ``mean()``.
- method : str {'single', 'table'}, default 'single'
- Execute the rolling operation per single column or row (``'single'``)
- or over the entire object (``'table'``).
-
- This argument is only implemented when specifying ``engine='numba'``
- in the method call.
-
- Only applicable to ``mean()``
-
- .. versionadded:: 1.4.0
Returns
-------
@@ -275,7 +254,6 @@ class ExponentialMovingWindow(BaseWindow):
"ignore_na",
"axis",
"times",
- "method",
]
def __init__(
@@ -285,22 +263,21 @@ def __init__(
span: float | None = None,
halflife: float | TimedeltaConvertibleTypes | None = None,
alpha: float | None = None,
- min_periods: int | None = 0,
+ min_periods: int = 0,
adjust: bool = True,
ignore_na: bool = False,
axis: Axis = 0,
times: str | np.ndarray | FrameOrSeries | None = None,
- method: str = "single",
*,
selection=None,
):
super().__init__(
obj=obj,
- min_periods=1 if min_periods is None else max(int(min_periods), 1),
+ min_periods=max(int(min_periods), 1),
on=None,
center=False,
closed=None,
- method=method,
+ method="single",
axis=axis,
selection=selection,
)
@@ -361,48 +338,6 @@ def _get_window_indexer(self) -> BaseIndexer:
"""
return ExponentialMovingWindowIndexer()
- def online(self, engine="numba", engine_kwargs=None):
- """
- Return an ``OnlineExponentialMovingWindow`` object to calculate
- exponentially moving window aggregations in an online method.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- engine: str, default ``'numba'``
- Execution engine to calculate online aggregations.
- Applies to all supported aggregation methods.
-
- engine_kwargs : dict, default None
- Applies to all supported aggregation methods.
-
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
- applied to the function
-
- Returns
- -------
- OnlineExponentialMovingWindow
- """
- return OnlineExponentialMovingWindow(
- obj=self.obj,
- com=self.com,
- span=self.span,
- halflife=self.halflife,
- alpha=self.alpha,
- min_periods=self.min_periods,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- axis=self.axis,
- times=self.times,
- engine=engine,
- engine_kwargs=engine_kwargs,
- selection=self._selection,
- )
-
@doc(
_shared_docs["aggregate"],
see_also=dedent(
@@ -456,19 +391,12 @@ def aggregate(self, func, *args, **kwargs):
)
def mean(self, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
- if self.method == "single":
- ewma_func = generate_numba_ewma_func(
- engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
- )
- numba_cache_key = (lambda x: x, "ewma")
- else:
- ewma_func = generate_ewma_numba_table_func(
- engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
- )
- numba_cache_key = (lambda x: x, "ewma_table")
+ ewma_func = generate_numba_ewma_func(
+ engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
+ )
return self._apply(
ewma_func,
- numba_cache_key=numba_cache_key,
+ numba_cache_key=(lambda x: x, "ewma"),
)
elif engine in ("cython", None):
if engine_kwargs is not None:
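
Basic usage of ``mean`` with the single numba kernel kept above; the numba path is an optional dependency and is assumed installed here:

    import pandas as pd

    s = pd.Series(range(10), dtype="float64")
    s.ewm(com=0.5).mean()                 # default cython path
    s.ewm(com=0.5).mean(engine="numba")   # jitted single-column kernel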
@@ -584,7 +512,7 @@ def var_func(values, begin, end, min_periods):
)
def cov(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
bias: bool = False,
**kwargs,
@@ -651,7 +579,7 @@ def cov_func(x, y):
)
def corr(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
**kwargs,
):
@@ -727,167 +655,3 @@ def _get_window_indexer(self) -> GroupbyIndexer:
window_indexer=ExponentialMovingWindowIndexer,
)
return window_indexer
-
-
-class OnlineExponentialMovingWindow(ExponentialMovingWindow):
- def __init__(
- self,
- obj: FrameOrSeries,
- com: float | None = None,
- span: float | None = None,
- halflife: float | TimedeltaConvertibleTypes | None = None,
- alpha: float | None = None,
- min_periods: int | None = 0,
- adjust: bool = True,
- ignore_na: bool = False,
- axis: Axis = 0,
- times: str | np.ndarray | FrameOrSeries | None = None,
- engine: str = "numba",
- engine_kwargs: dict[str, bool] | None = None,
- *,
- selection=None,
- ):
- if times is not None:
- raise NotImplementedError(
- "times is not implemented with online operations."
- )
- super().__init__(
- obj=obj,
- com=com,
- span=span,
- halflife=halflife,
- alpha=alpha,
- min_periods=min_periods,
- adjust=adjust,
- ignore_na=ignore_na,
- axis=axis,
- times=times,
- selection=selection,
- )
- self._mean = EWMMeanState(
- self._com, self.adjust, self.ignore_na, self.axis, obj.shape
- )
- if maybe_use_numba(engine):
- self.engine = engine
- self.engine_kwargs = engine_kwargs
- else:
- raise ValueError("'numba' is the only supported engine")
-
- def reset(self):
- """
- Reset the state captured by `update` calls.
- """
- self._mean.reset()
-
- def aggregate(self, func, *args, **kwargs):
- return NotImplementedError
-
- def std(self, bias: bool = False, *args, **kwargs):
- return NotImplementedError
-
- def corr(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- **kwargs,
- ):
- return NotImplementedError
-
- def cov(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- bias: bool = False,
- **kwargs,
- ):
- return NotImplementedError
-
- def var(self, bias: bool = False, *args, **kwargs):
- return NotImplementedError
-
- def mean(self, *args, update=None, update_times=None, **kwargs):
- """
- Calculate an online exponentially weighted mean.
-
- Parameters
- ----------
- update: DataFrame or Series, default None
- New values to continue calculating the
- exponentially weighted mean from the last values and weights.
- Values should be float64 dtype.
-
- ``update`` needs to be ``None`` the first time the
- exponentially weighted mean is calculated.
-
- update_times: Series or 1-D np.ndarray, default None
- New times to continue calculating the
- exponentially weighted mean from the last values and weights.
- If ``None``, values are assumed to be evenly spaced
- in time.
- This feature is currently unsupported.
-
- Returns
- -------
- DataFrame or Series
-
- Examples
- --------
- >>> df = pd.DataFrame({"a": range(5), "b": range(5, 10)})
- >>> online_ewm = df.head(2).ewm(0.5).online()
- >>> online_ewm.mean()
- a b
- 0 0.00 5.00
- 1 0.75 5.75
- >>> online_ewm.mean(update=df.tail(3))
- a b
- 2 1.615385 6.615385
- 3 2.550000 7.550000
- 4 3.520661 8.520661
- >>> online_ewm.reset()
- >>> online_ewm.mean()
- a b
- 0 0.00 5.00
- 1 0.75 5.75
- """
- result_kwargs = {}
- is_frame = True if self._selected_obj.ndim == 2 else False
- if update_times is not None:
- raise NotImplementedError("update_times is not implemented.")
- else:
- update_deltas = np.ones(
- max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64
- )
- if update is not None:
- if self._mean.last_ewm is None:
- raise ValueError(
- "Must call mean with update=None first before passing update"
- )
- result_from = 1
- result_kwargs["index"] = update.index
- if is_frame:
- last_value = self._mean.last_ewm[np.newaxis, :]
- result_kwargs["columns"] = update.columns
- else:
- last_value = self._mean.last_ewm
- result_kwargs["name"] = update.name
- np_array = np.concatenate((last_value, update.to_numpy()))
- else:
- result_from = 0
- result_kwargs["index"] = self._selected_obj.index
- if is_frame:
- result_kwargs["columns"] = self._selected_obj.columns
- else:
- result_kwargs["name"] = self._selected_obj.name
- np_array = self._selected_obj.astype(np.float64).to_numpy()
- ewma_func = generate_online_numba_ewma_func(self.engine_kwargs)
- result = self._mean.run_ewm(
- np_array if is_frame else np_array[:, np.newaxis],
- update_deltas,
- self.min_periods,
- ewma_func,
- )
- if not is_frame:
- result = result.squeeze()
- result = result[result_from:]
- result = self._selected_obj._constructor(result, **result_kwargs)
- return result
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index eedb6930bad66..02cf31cad7b8d 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -2,7 +2,6 @@
from textwrap import dedent
from typing import (
- TYPE_CHECKING,
Any,
Callable,
)
@@ -10,11 +9,8 @@
from pandas._typing import (
Axis,
FrameOrSeries,
+ FrameOrSeriesUnion,
)
-
-if TYPE_CHECKING:
- from pandas import DataFrame, Series
-
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc
@@ -595,7 +591,7 @@ def quantile(
)
def cov(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
ddof: int = 1,
**kwargs,
@@ -660,7 +656,7 @@ def cov(
)
def corr(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
ddof: int = 1,
**kwargs,
diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py
index ab1eb9d3a2688..d00be0ea840a8 100644
--- a/pandas/core/window/numba_.py
+++ b/pandas/core/window/numba_.py
@@ -19,6 +19,7 @@
def generate_numba_apply_func(
+ args: tuple,
kwargs: dict[str, Any],
func: Callable[..., Scalar],
engine_kwargs: dict[str, bool] | None,
@@ -35,6 +36,8 @@ def generate_numba_apply_func(
Parameters
----------
+ args : tuple
+ *args to be passed into the function
kwargs : dict
**kwargs to be passed into the function
func : function
@@ -59,11 +62,7 @@ def generate_numba_apply_func(
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def roll_apply(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- *args: Any,
+ values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int
) -> np.ndarray:
result = np.empty(len(begin))
for i in numba.prange(len(result)):
@@ -170,6 +169,7 @@ def ewma(
def generate_numba_table_func(
+ args: tuple,
kwargs: dict[str, Any],
func: Callable[..., np.ndarray],
engine_kwargs: dict[str, bool] | None,
@@ -187,6 +187,8 @@ def generate_numba_table_func(
Parameters
----------
+ args : tuple
+ *args to be passed into the function
kwargs : dict
**kwargs to be passed into the function
func : function
@@ -211,11 +213,7 @@ def generate_numba_table_func(
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def roll_table(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- *args: Any,
+ values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int
):
result = np.empty(values.shape)
min_periods_mask = np.empty(values.shape)
@@ -250,82 +248,3 @@ def nan_agg_with_axis(table):
return result
return nan_agg_with_axis
-
-
-def generate_ewma_numba_table_func(
- engine_kwargs: dict[str, bool] | None,
- com: float,
- adjust: bool,
- ignore_na: bool,
- deltas: np.ndarray,
-):
- """
- Generate a numba jitted ewma function applied table wise specified
- by values from engine_kwargs.
-
- Parameters
- ----------
- engine_kwargs : dict
- dictionary of arguments to be passed into numba.jit
- com : float
- adjust : bool
- ignore_na : bool
- deltas : numpy.ndarray
-
- Returns
- -------
- Numba function
- """
- nopython, nogil, parallel = get_jit_arguments(engine_kwargs)
-
- cache_key = (lambda x: x, "ewma_table")
- if cache_key in NUMBA_FUNC_CACHE:
- return NUMBA_FUNC_CACHE[cache_key]
-
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def ewma_table(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- ) -> np.ndarray:
- alpha = 1.0 / (1.0 + com)
- old_wt_factor = 1.0 - alpha
- new_wt = 1.0 if adjust else alpha
- old_wt = np.ones(values.shape[1])
-
- result = np.empty(values.shape)
- weighted_avg = values[0].copy()
- nobs = (~np.isnan(weighted_avg)).astype(np.int64)
- result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
- for i in range(1, len(values)):
- cur = values[i]
- is_observations = ~np.isnan(cur)
- nobs += is_observations.astype(np.int64)
- for j in numba.prange(len(cur)):
- if not np.isnan(weighted_avg[j]):
- if is_observations[j] or not ignore_na:
-
- # note that len(deltas) = len(vals) - 1 and deltas[i] is to be
- # used in conjunction with vals[i+1]
- old_wt[j] *= old_wt_factor ** deltas[i - 1]
- if is_observations[j]:
- # avoid numerical errors on constant series
- if weighted_avg[j] != cur[j]:
- weighted_avg[j] = (
- (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j])
- ) / (old_wt[j] + new_wt)
- if adjust:
- old_wt[j] += new_wt
- else:
- old_wt[j] = 1.0
- elif is_observations[j]:
- weighted_avg[j] = cur[j]
-
- result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
-
- return result
-
- return ewma_table
diff --git a/pandas/core/window/online.py b/pandas/core/window/online.py
deleted file mode 100644
index 5a9e8d65255ae..0000000000000
--- a/pandas/core/window/online.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from typing import (
- Dict,
- Optional,
-)
-
-import numpy as np
-
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.util.numba_ import (
- NUMBA_FUNC_CACHE,
- get_jit_arguments,
-)
-
-
-def generate_online_numba_ewma_func(engine_kwargs: Optional[Dict[str, bool]]):
- """
- Generate a numba jitted groupby ewma function specified by values
- from engine_kwargs.
- Parameters
- ----------
- engine_kwargs : dict
- dictionary of arguments to be passed into numba.jit
- Returns
- -------
- Numba function
- """
- nopython, nogil, parallel = get_jit_arguments(engine_kwargs)
-
- cache_key = (lambda x: x, "online_ewma")
- if cache_key in NUMBA_FUNC_CACHE:
- return NUMBA_FUNC_CACHE[cache_key]
-
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def online_ewma(
- values: np.ndarray,
- deltas: np.ndarray,
- minimum_periods: int,
- old_wt_factor: float,
- new_wt: float,
- old_wt: np.ndarray,
- adjust: bool,
- ignore_na: bool,
- ):
- """
- Compute online exponentially weighted mean per column over 2D values.
-
- Takes the first observation as is, then computes the subsequent
- exponentially weighted mean accounting minimum periods.
- """
- result = np.empty(values.shape)
- weighted_avg = values[0]
- nobs = (~np.isnan(weighted_avg)).astype(np.int64)
- result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
-
- for i in range(1, len(values)):
- cur = values[i]
- is_observations = ~np.isnan(cur)
- nobs += is_observations.astype(np.int64)
- for j in numba.prange(len(cur)):
- if not np.isnan(weighted_avg[j]):
- if is_observations[j] or not ignore_na:
-
- # note that len(deltas) = len(vals) - 1 and deltas[i] is to be
- # used in conjunction with vals[i+1]
- old_wt[j] *= old_wt_factor ** deltas[j - 1]
- if is_observations[j]:
- # avoid numerical errors on constant series
- if weighted_avg[j] != cur[j]:
- weighted_avg[j] = (
- (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j])
- ) / (old_wt[j] + new_wt)
- if adjust:
- old_wt[j] += new_wt
- else:
- old_wt[j] = 1.0
- elif is_observations[j]:
- weighted_avg[j] = cur[j]
-
- result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
-
- return result, old_wt
-
- return online_ewma
-
-
-class EWMMeanState:
- def __init__(self, com, adjust, ignore_na, axis, shape):
- alpha = 1.0 / (1.0 + com)
- self.axis = axis
- self.shape = shape
- self.adjust = adjust
- self.ignore_na = ignore_na
- self.new_wt = 1.0 if adjust else alpha
- self.old_wt_factor = 1.0 - alpha
- self.old_wt = np.ones(self.shape[self.axis - 1])
- self.last_ewm = None
-
- def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func):
- result, old_wt = ewm_func(
- weighted_avg,
- deltas,
- min_periods,
- self.old_wt_factor,
- self.new_wt,
- self.old_wt,
- self.adjust,
- self.ignore_na,
- )
- self.old_wt = old_wt
- self.last_ewm = result[-1]
- return result
-
- def reset(self):
- self.old_wt = np.ones(self.shape[self.axis - 1])
- self.last_ewm = None
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 8a253726ab0b6..2d5f148a6437a 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -28,6 +28,7 @@
ArrayLike,
Axis,
FrameOrSeries,
+ FrameOrSeriesUnion,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
@@ -407,7 +408,7 @@ def _apply_series(
def _apply_blockwise(
self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
"""
Apply the given function to the DataFrame broken down into homogeneous
sub-frames.
@@ -442,7 +443,7 @@ def hfunc2d(values: ArrayLike) -> ArrayLike:
def _apply_tablewise(
self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None
- ) -> DataFrame | Series:
+ ) -> FrameOrSeriesUnion:
"""
Apply the given function to the DataFrame across the entire object
"""
@@ -459,11 +460,11 @@ def _apply_tablewise(
def _apply_pairwise(
self,
- target: DataFrame | Series,
- other: DataFrame | Series | None,
+ target: FrameOrSeriesUnion,
+ other: FrameOrSeriesUnion | None,
pairwise: bool | None,
- func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series],
- ) -> DataFrame | Series:
+ func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion],
+ ) -> FrameOrSeriesUnion:
"""
Apply the given pairwise function given 2 pandas objects (DataFrame/Series)
"""
@@ -481,7 +482,6 @@ def _apply(
func: Callable[..., Any],
name: str | None = None,
numba_cache_key: tuple[Callable, str] | None = None,
- numba_args: tuple[Any, ...] = (),
**kwargs,
):
"""
@@ -495,8 +495,6 @@ def _apply(
name : str,
numba_cache_key : tuple
caching key to be used to store a compiled numba func
- numba_args : tuple
- args to be passed when func is a numba func
**kwargs
additional arguments for rolling function and window function
@@ -524,7 +522,7 @@ def calc(x):
center=self.center,
closed=self.closed,
)
- return func(x, start, end, min_periods, *numba_args)
+ return func(x, start, end, min_periods)
with np.errstate(all="ignore"):
if values.ndim > 1 and self.method == "single":
@@ -585,14 +583,12 @@ def _apply(
func: Callable[..., Any],
name: str | None = None,
numba_cache_key: tuple[Callable, str] | None = None,
- numba_args: tuple[Any, ...] = (),
**kwargs,
) -> FrameOrSeries:
result = super()._apply(
func,
name,
numba_cache_key,
- numba_args,
**kwargs,
)
# Reconstruct the resulting MultiIndex
@@ -643,11 +639,11 @@ def _apply(
def _apply_pairwise(
self,
- target: DataFrame | Series,
- other: DataFrame | Series | None,
+ target: FrameOrSeriesUnion,
+ other: FrameOrSeriesUnion | None,
pairwise: bool | None,
- func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series],
- ) -> DataFrame | Series:
+ func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion],
+ ) -> FrameOrSeriesUnion:
"""
Apply the given pairwise function given 2 pandas objects (DataFrame/Series)
"""
@@ -973,7 +969,6 @@ def _apply(
func: Callable[[np.ndarray, int, int], np.ndarray],
name: str | None = None,
numba_cache_key: tuple[Callable, str] | None = None,
- numba_args: tuple[Any, ...] = (),
**kwargs,
):
"""
@@ -987,8 +982,6 @@ def _apply(
name : str,
use_numba_cache : tuple
unused
- numba_args : tuple
- unused
**kwargs
additional arguments for scipy windows if necessary
@@ -1166,20 +1159,18 @@ def apply(
raise ValueError("raw parameter must be `True` or `False`")
numba_cache_key = None
- numba_args: tuple[Any, ...] = ()
if maybe_use_numba(engine):
if raw is False:
raise ValueError("raw must be `True` when using the numba engine")
caller_name = type(self).__name__
- numba_args = args
if self.method == "single":
apply_func = generate_numba_apply_func(
- kwargs, func, engine_kwargs, caller_name
+ args, kwargs, func, engine_kwargs, caller_name
)
numba_cache_key = (func, f"{caller_name}_apply_single")
else:
apply_func = generate_numba_table_func(
- kwargs, func, engine_kwargs, f"{caller_name}_apply"
+ args, kwargs, func, engine_kwargs, f"{caller_name}_apply"
)
numba_cache_key = (func, f"{caller_name}_apply_table")
elif engine in ("cython", None):
@@ -1192,7 +1183,6 @@ def apply(
return self._apply(
apply_func,
numba_cache_key=numba_cache_key,
- numba_args=numba_args,
)
def _generate_cython_apply_func(
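
The branch above enforces ``raw=True`` for the numba engine; a minimal sketch (numba assumed installed):

    import numpy as np
    import pandas as pd

    def window_mean(x):
        return np.mean(x)

    s = pd.Series(range(6), dtype="float64")
    # numba engine: windows arrive as ndarrays, so raw must be True.
    s.rolling(3).apply(window_mean, raw=True, engine="numba")
    # cython engine: raw=False is also allowed and windows arrive as Series.
    s.rolling(3).apply(window_mean, raw=False, engine="cython")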
@@ -1389,7 +1379,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
def cov(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
ddof: int = 1,
**kwargs,
@@ -1427,7 +1417,7 @@ def cov_func(x, y):
def corr(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
ddof: int = 1,
**kwargs,
@@ -2169,7 +2159,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
)
def cov(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
ddof: int = 1,
**kwargs,
@@ -2294,7 +2284,7 @@ def cov(
)
def corr(
self,
- other: DataFrame | Series | None = None,
+ other: FrameOrSeriesUnion | None = None,
pairwise: bool | None = None,
ddof: int = 1,
**kwargs,
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 4d6a766ad6cfa..719a4472fb9e3 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -82,9 +82,8 @@
or ``StringIO``.
sheet_name : str, int, list, or None, default 0
Strings are used for sheet names. Integers are used in zero-indexed
- sheet positions (chart sheets do not count as a sheet position).
- Lists of strings/integers are used to request multiple sheets.
- Specify None to get all worksheets.
+ sheet positions. Lists of strings/integers are used to request
+ multiple sheets. Specify None to get all sheets.
Available cases:
@@ -93,7 +92,7 @@
* ``"Sheet1"``: Load sheet with name "Sheet1"
* ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
as a dict of `DataFrame`
- * None: All worksheets.
+ * None: All sheets.
header : int, list of int, default 0
Row (0-indexed) to use for the column labels of the parsed
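
The ``sheet_name`` cases listed above, illustrated against a placeholder path ("workbook.xlsx" is not a real file here):

    import pandas as pd

    pd.read_excel("workbook.xlsx", sheet_name=0)              # first sheet
    pd.read_excel("workbook.xlsx", sheet_name="Sheet1")       # by name
    pd.read_excel("workbook.xlsx", sheet_name=[0, "Sheet5"])  # dict of DataFrames
    pd.read_excel("workbook.xlsx", sheet_name=None)           # every sheet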
diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py
index fa2779b01d681..efef86329314b 100644
--- a/pandas/io/excel/_odswriter.py
+++ b/pandas/io/excel/_odswriter.py
@@ -29,7 +29,6 @@ def __init__(
storage_options: StorageOptions = None,
if_sheet_exists: str | None = None,
engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
):
from odf.opendocument import OpenDocumentSpreadsheet
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index d499f1a5ea89f..bc067e216760c 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -19,10 +19,7 @@
BaseExcelReader,
ExcelWriter,
)
-from pandas.io.excel._util import (
- combine_kwargs,
- validate_freeze_panes,
-)
+from pandas.io.excel._util import validate_freeze_panes
if TYPE_CHECKING:
from openpyxl.descriptors.serialisable import Serialisable
@@ -42,13 +39,10 @@ def __init__(
storage_options: StorageOptions = None,
if_sheet_exists: str | None = None,
engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
):
# Use the openpyxl module as the Excel writer.
from openpyxl.workbook import Workbook
- engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
-
super().__init__(
path,
mode=mode,
@@ -536,7 +530,7 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
@property
def sheet_names(self) -> list[str]:
- return [sheet.title for sheet in self.book.worksheets]
+ return self.book.sheetnames
def get_sheet_by_name(self, name: str):
self.raise_if_bad_sheet_by_name(name)
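
The two openpyxl properties swapped above agree for ordinary workbooks and differ only when chart sheets are present; a small sketch:

    from openpyxl import Workbook

    wb = Workbook()
    wb.create_sheet("Data")
    wb.sheetnames                        # ['Sheet', 'Data'], every sheet title
    [ws.title for ws in wb.worksheets]   # worksheet objects only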
diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py
index 66a66fbbcd78a..7d8028de23257 100644
--- a/pandas/io/excel/_util.py
+++ b/pandas/io/excel/_util.py
@@ -1,9 +1,6 @@
from __future__ import annotations
-from typing import (
- Any,
- MutableMapping,
-)
+from typing import MutableMapping
from pandas.compat._optional import import_optional_dependency
@@ -249,30 +246,3 @@ def pop_header_name(row, index_col):
header_name = None if header_name == "" else header_name
return header_name, row[:i] + [""] + row[i + 1 :]
-
-
-def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict:
- """
- Used to combine two sources of kwargs for the backend engine.
-
- Use of kwargs is deprecated, this function is solely for use in 1.3 and should
- be removed in 1.4/2.0. Also _base.ExcelWriter.__new__ ensures either engine_kwargs
- or kwargs must be None or empty respectively.
-
- Parameters
- ----------
- engine_kwargs: dict
- kwargs to be passed through to the engine.
- kwargs: dict
- kwargs to be psased through to the engine (deprecated)
-
- Returns
- -------
- engine_kwargs combined with kwargs
- """
- if engine_kwargs is None:
- result = {}
- else:
- result = engine_kwargs.copy()
- result.update(kwargs)
- return result
diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py
index 06c73f2c6199e..7500a33b1f097 100644
--- a/pandas/io/excel/_xlsxwriter.py
+++ b/pandas/io/excel/_xlsxwriter.py
@@ -6,10 +6,7 @@
from pandas._typing import StorageOptions
from pandas.io.excel._base import ExcelWriter
-from pandas.io.excel._util import (
- combine_kwargs,
- validate_freeze_panes,
-)
+from pandas.io.excel._util import validate_freeze_panes
class _XlsxStyler:
@@ -178,12 +175,11 @@ def __init__(
storage_options: StorageOptions = None,
if_sheet_exists: str | None = None,
engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
):
# Use the xlsxwriter module as the Excel writer.
from xlsxwriter import Workbook
- engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
+ engine_kwargs = engine_kwargs or {}
if mode == "a":
raise ValueError("Append mode is not supported with xlsxwriter!")
diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py
index 4dadf64b44515..8a7605b80f6b4 100644
--- a/pandas/io/excel/_xlwt.py
+++ b/pandas/io/excel/_xlwt.py
@@ -9,10 +9,7 @@
from pandas._typing import StorageOptions
from pandas.io.excel._base import ExcelWriter
-from pandas.io.excel._util import (
- combine_kwargs,
- validate_freeze_panes,
-)
+from pandas.io.excel._util import validate_freeze_panes
if TYPE_CHECKING:
from xlwt import XFStyle
@@ -33,13 +30,10 @@ def __init__(
storage_options: StorageOptions = None,
if_sheet_exists: str | None = None,
engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
):
# Use the xlwt module as the Excel writer.
import xlwt
- engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
-
if mode == "a":
raise ValueError("Append mode is not supported with xlwt!")
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index 0c625e8a68db0..b285fa5f315ed 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -769,7 +769,7 @@ def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]:
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
if styles is not None:
- css = ";".join([a + ":" + str(v) for (a, v) in styles[i, colidx]])
+ css = ";".join(a + ":" + str(v) for (a, v) in styles[i, colidx])
xlstyle = self.style_converter(css)
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 83e0086958b9a..d1c19f348f901 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -861,7 +861,7 @@ def space_format(x, y):
return y
str_columns = list(
- zip(*([space_format(x, y) for y in x] for x in fmt_columns))
+ zip(*[[space_format(x, y) for y in x] for x in fmt_columns])
)
if self.sparsify and len(str_columns):
str_columns = sparsify_labels(str_columns)
@@ -1635,10 +1635,24 @@ def format_percentiles(
percentiles = 100 * percentiles
- int_idx = np.isclose(percentiles.astype(int), percentiles)
+ # error: Item "List[Union[int, float]]" of "Union[ndarray, List[Union[int, float]],
+ # List[float], List[Union[str, float]]]" has no attribute "astype"
+ # error: Item "List[float]" of "Union[ndarray, List[Union[int, float]], List[float],
+ # List[Union[str, float]]]" has no attribute "astype"
+ # error: Item "List[Union[str, float]]" of "Union[ndarray, List[Union[int, float]],
+ # List[float], List[Union[str, float]]]" has no attribute "astype"
+ int_idx = np.isclose(
+ percentiles.astype(int), percentiles # type: ignore[union-attr]
+ )
if np.all(int_idx):
- out = percentiles.astype(int).astype(str)
+ # error: Item "List[Union[int, float]]" of "Union[ndarray, List[Union[int,
+ # float]], List[float], List[Union[str, float]]]" has no attribute "astype"
+ # error: Item "List[float]" of "Union[ndarray, List[Union[int, float]],
+ # List[float], List[Union[str, float]]]" has no attribute "astype"
+ # error: Item "List[Union[str, float]]" of "Union[ndarray, List[Union[int,
+ # float]], List[float], List[Union[str, float]]]" has no attribute "astype"
+ out = percentiles.astype(int).astype(str) # type: ignore[union-attr]
return [i + "%" for i in out]
unique_pcts = np.unique(percentiles)
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 64a59778a54f3..e014d7d63a35f 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -16,7 +16,10 @@
from pandas._config import get_option
-from pandas._typing import Dtype
+from pandas._typing import (
+ Dtype,
+ FrameOrSeriesUnion,
+)
from pandas.core.indexes.api import Index
@@ -24,10 +27,7 @@
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
- from pandas.core.frame import (
- DataFrame,
- Series,
- )
+ from pandas.core.frame import DataFrame
def _put_str(s: str | Dtype, space: int) -> str:
@@ -110,7 +110,7 @@ class BaseInfo(ABC):
values.
"""
- data: DataFrame | Series
+ data: FrameOrSeriesUnion
memory_usage: bool | str
@property
@@ -413,7 +413,7 @@ def get_lines(self) -> list[str]:
"""Product in a form of list of lines (strings)."""
@property
- def data(self) -> DataFrame | Series:
+ def data(self) -> FrameOrSeriesUnion:
return self.info.data
@property
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
index 93069a1e2955d..e9e2b830e32cb 100644
--- a/pandas/io/formats/latex.py
+++ b/pandas/io/formats/latex.py
@@ -358,7 +358,7 @@ def get_result(self) -> str:
self.bottom_separator,
self.env_end,
]
- result = "\n".join([item for item in elements if item])
+ result = "\n".join(item for item in elements if item)
trailing_newline = "\n"
result += trailing_newline
return result
@@ -527,13 +527,13 @@ def env_begin(self) -> str:
f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}"
)
elements = [first_row, f"{self._caption_and_label()}"]
- return "\n".join([item for item in elements if item])
+ return "\n".join(item for item in elements if item)
def _caption_and_label(self) -> str:
if self.caption or self.label:
double_backslash = "\\\\"
elements = [f"{self._caption_macro}", f"{self._label_macro}"]
- caption_and_label = "\n".join([item for item in elements if item])
+ caption_and_label = "\n".join(item for item in elements if item)
caption_and_label += double_backslash
return caption_and_label
else:
@@ -611,7 +611,7 @@ def env_begin(self) -> str:
f"{self._label_macro}",
f"\\begin{{tabular}}{{{self.column_format}}}",
]
- return "\n".join([item for item in elements if item])
+ return "\n".join(item for item in elements if item)
@property
def bottom_separator(self) -> str:
diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py
index 90a4800c805b6..2610b7777207f 100644
--- a/pandas/io/formats/string.py
+++ b/pandas/io/formats/string.py
@@ -119,7 +119,13 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
if self.fmt.index:
idx = strcols.pop(0)
- lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
+ # error: Argument 1 to "__call__" of "_NumberOp" has incompatible type
+ # "None"; expected "Union[int, float, complex, number, bool_]"
+ # error: Incompatible types in assignment (expression has type "number",
+ # variable has type "Optional[int]")
+ lwidth -= ( # type: ignore[assignment,arg-type]
+ np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
+ )
col_widths = [
np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
@@ -127,7 +133,9 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
]
assert lwidth is not None
- col_bins = _binify(col_widths, lwidth)
+ # error: Argument 1 to "_binify" has incompatible type "List[object]"; expected
+ # "List[int]"
+ col_bins = _binify(col_widths, lwidth) # type: ignore[arg-type]
nbins = len(col_bins)
if self.fmt.is_truncated_vertically:
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index cb56ea33acad8..93c3843b36846 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -23,6 +23,7 @@
Axis,
FilePathOrBuffer,
FrameOrSeries,
+ FrameOrSeriesUnion,
IndexLabel,
Scalar,
)
@@ -30,10 +31,7 @@
from pandas.util._decorators import doc
import pandas as pd
-from pandas import (
- IndexSlice,
- RangeIndex,
-)
+from pandas import RangeIndex
from pandas.api.types import is_list_like
from pandas.core import generic
import pandas.core.common as com
@@ -173,7 +171,7 @@ class Styler(StylerRenderer):
def __init__(
self,
- data: DataFrame | Series,
+ data: FrameOrSeriesUnion,
precision: int | None = None,
table_styles: CSSStyles | None = None,
uuid: str | None = None,
@@ -428,7 +426,6 @@ def to_latex(
multicol_align: str = "r",
siunitx: bool = False,
encoding: str | None = None,
- convert_css: bool = False,
):
r"""
Write Styler to a file, buffer or string in LaTeX format.
@@ -485,10 +482,6 @@ def to_latex(
Set to ``True`` to structure LaTeX compatible with the {siunitx} package.
encoding : str, default "utf-8"
Character encoding setting.
- convert_css : bool, default False
- Convert simple cell-styles from CSS to LaTeX format. Any CSS not found in
- conversion table is dropped. A style can be forced by adding option
- `--latex`. See notes.
Returns
-------
@@ -668,48 +661,7 @@ def to_latex(
& ix2 & \$3 & 4.400 & CATS \\
L1 & ix3 & \$2 & 6.600 & COWS \\
\end{tabular}
-
- **CSS Conversion**
-
- This method can convert a Styler constructed with HTML-CSS to LaTeX using
- the following limited conversions.
-
- ================== ==================== ============= ==========================
- CSS Attribute CSS value LaTeX Command LaTeX Options
- ================== ==================== ============= ==========================
- font-weight | bold | bfseries
- | bolder | bfseries
- font-style | italic | itshape
- | oblique | slshape
- background-color | red cellcolor | {red}--lwrap
- | #fe01ea | [HTML]{FE01EA}--lwrap
- | #f0e | [HTML]{FF00EE}--lwrap
- | rgb(128,255,0) | [rgb]{0.5,1,0}--lwrap
- | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}--lwrap
- | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}--lwrap
- color | red color | {red}
- | #fe01ea | [HTML]{FE01EA}
- | #f0e | [HTML]{FF00EE}
- | rgb(128,255,0) | [rgb]{0.5,1,0}
- | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}
- | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}
- ================== ==================== ============= ==========================
-
- It is also possible to add user-defined LaTeX only styles to a HTML-CSS Styler
- using the ``--latex`` flag, and to add LaTeX parsing options that the
- converter will detect within a CSS-comment.
-
- >>> df = pd.DataFrame([[1]])
- >>> df.style.set_properties(
- ... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"}
- ... ).to_latex(convert_css=True)
- \begin{tabular}{lr}
- {} & {0} \\
- 0 & {\bfseries}{\Huge{1}} \\
- \end{tabular}
"""
- obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self
-
table_selectors = (
[style["selector"] for style in self.table_styles]
if self.table_styles is not None
@@ -718,7 +670,7 @@ def to_latex(
if column_format is not None:
# add more recent setting to table_styles
- obj.set_table_styles(
+ self.set_table_styles(
[{"selector": "column_format", "props": f":{column_format}"}],
overwrite=False,
)
@@ -730,19 +682,19 @@ def to_latex(
self.data.columns = RangeIndex(stop=len(self.data.columns))
numeric_cols = self.data._get_numeric_data().columns.to_list()
self.data.columns = _original_columns
- column_format = "" if self.hide_index_ else "l" * self.data.index.nlevels
+ column_format = "" if self.hidden_index else "l" * self.data.index.nlevels
for ci, _ in enumerate(self.data.columns):
if ci not in self.hidden_columns:
column_format += (
("r" if not siunitx else "S") if ci in numeric_cols else "l"
)
- obj.set_table_styles(
+ self.set_table_styles(
[{"selector": "column_format", "props": f":{column_format}"}],
overwrite=False,
)
if position:
- obj.set_table_styles(
+ self.set_table_styles(
[{"selector": "position", "props": f":{position}"}],
overwrite=False,
)
@@ -754,13 +706,13 @@ def to_latex(
f"'raggedright', 'raggedleft', 'centering', "
f"got: '{position_float}'"
)
- obj.set_table_styles(
+ self.set_table_styles(
[{"selector": "position_float", "props": f":{position_float}"}],
overwrite=False,
)
if hrules:
- obj.set_table_styles(
+ self.set_table_styles(
[
{"selector": "toprule", "props": ":toprule"},
{"selector": "midrule", "props": ":midrule"},
@@ -770,25 +722,24 @@ def to_latex(
)
if label:
- obj.set_table_styles(
+ self.set_table_styles(
[{"selector": "label", "props": f":{{{label.replace(':', '§')}}}"}],
overwrite=False,
)
if caption:
- obj.set_caption(caption)
+ self.set_caption(caption)
if sparse_index is None:
sparse_index = get_option("styler.sparse.index")
if sparse_columns is None:
sparse_columns = get_option("styler.sparse.columns")
- latex = obj._render_latex(
+ latex = self._render_latex(
sparse_index=sparse_index,
sparse_columns=sparse_columns,
multirow_align=multirow_align,
multicol_align=multicol_align,
- convert_css=convert_css,
)
return save_to_buffer(latex, buf=buf, encoding=encoding)
@@ -965,60 +916,39 @@ def _update_ctx(self, attrs: DataFrame) -> None:
self.ctx[(i, j)].extend(css_list)
def _copy(self, deepcopy: bool = False) -> Styler:
- """
- Copies a Styler, allowing for deepcopy or shallow copy
-
- Copying a Styler aims to recreate a new Styler object which contains the same
- data and styles as the original.
-
- Data dependent attributes [copied and NOT exported]:
- - formatting (._display_funcs)
- - hidden index values or column values (.hidden_rows, .hidden_columns)
- - tooltips
- - cell_context (cell css classes)
- - ctx (cell css styles)
- - caption
-
- Non-data dependent attributes [copied and exported]:
- - hidden index state and hidden columns state (.hide_index_, .hide_columns_)
- - table_attributes
- - table_styles
- - applied styles (_todo)
-
- """
- # GH 40675
styler = Styler(
- self.data, # populates attributes 'data', 'columns', 'index' as shallow
- uuid_len=self.uuid_len,
+ self.data,
+ precision=self.precision,
+ caption=self.caption,
+ table_attributes=self.table_attributes,
+ cell_ids=self.cell_ids,
+ na_rep=self.na_rep,
)
- shallow = [ # simple string or boolean immutables
- "hide_index_",
- "hide_columns_",
- "table_attributes",
- "cell_ids",
- "caption",
- ]
- deep = [ # nested lists or dicts
- "_display_funcs",
- "hidden_rows",
- "hidden_columns",
- "ctx",
- "cell_context",
- "_todo",
- "table_styles",
- "tooltips",
- ]
-
- for attr in shallow:
- setattr(styler, attr, getattr(self, attr))
-
- for attr in deep:
- val = getattr(self, attr)
- setattr(styler, attr, copy.deepcopy(val) if deepcopy else val)
+
+ styler.uuid = self.uuid
+ styler.hidden_index = self.hidden_index
+
+ if deepcopy:
+ styler.ctx = copy.deepcopy(self.ctx)
+ styler._todo = copy.deepcopy(self._todo)
+ styler.table_styles = copy.deepcopy(self.table_styles)
+ styler.hidden_columns = copy.copy(self.hidden_columns)
+ styler.cell_context = copy.deepcopy(self.cell_context)
+ styler.tooltips = copy.deepcopy(self.tooltips)
+ else:
+ styler.ctx = self.ctx
+ styler._todo = self._todo
+ styler.table_styles = self.table_styles
+ styler.hidden_columns = self.hidden_columns
+ styler.cell_context = self.cell_context
+ styler.tooltips = self.tooltips
return styler
def __copy__(self) -> Styler:
+ """
+ Deep copy by default.
+ """
return self._copy(deepcopy=False)
def __deepcopy__(self, memo) -> Styler:
@@ -1030,14 +960,15 @@ def clear(self) -> None:
Returns None.
"""
- # create default GH 40675
- clean_copy = Styler(self.data, uuid=self.uuid)
- clean_attrs = [a for a in clean_copy.__dict__ if not callable(a)]
- self_attrs = [a for a in self.__dict__ if not callable(a)] # maybe more attrs
- for attr in clean_attrs:
- setattr(self, attr, getattr(clean_copy, attr))
- for attr in set(self_attrs).difference(clean_attrs):
- delattr(self, attr)
+ self.ctx.clear()
+ self.tooltips = None
+ self.cell_context.clear()
+ self._todo.clear()
+
+ self.hidden_index = False
+ self.hidden_columns = []
+ # self.format and self.table_styles may be dependent on user
+ # input in self.__init__()
def _apply(
self,
@@ -1165,7 +1096,7 @@ def _applymap(
) -> Styler:
func = partial(func, **kwargs) # applymap doesn't take kwargs?
if subset is None:
- subset = IndexSlice[:]
+ subset = pd.IndexSlice[:]
subset = non_reducing_slice(subset)
result = self.data.loc[subset].applymap(func)
self._update_ctx(result)
@@ -1275,14 +1206,12 @@ def where(
recommend using instead.
The example:
-
>>> df = pd.DataFrame([[1, 2], [3, 4]])
>>> def cond(v, limit=4):
... return v > 1 and v != limit
>>> df.style.where(cond, value='color:green;', other='color:red;')
should be refactored to:
-
>>> def style_func(v, value, other, limit=4):
... cond = v > 1 and v != limit
... return value if cond else other
@@ -1435,71 +1364,6 @@ def set_caption(self, caption: str | tuple) -> Styler:
self.caption = caption
return self
- def set_sticky(
- self,
- axis: Axis = 0,
- pixel_size: int | None = None,
- levels: list[int] | None = None,
- ) -> Styler:
- """
- Add CSS to permanently display the index or column headers in a scrolling frame.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Whether to make the index or column headers sticky.
- pixel_size : int, optional
- Required to configure the width of index cells or the height of column
- header cells when sticking a MultiIndex. Defaults to 75 and 25 respectively.
- levels : list of int
- If ``axis`` is a MultiIndex the specific levels to stick. If ``None`` will
- stick all levels.
-
- Returns
- -------
- self : Styler
- """
- if axis in [0, "index"]:
- axis, obj, tag, pos = 0, self.data.index, "tbody", "left"
- pixel_size = 75 if not pixel_size else pixel_size
- elif axis in [1, "columns"]:
- axis, obj, tag, pos = 1, self.data.columns, "thead", "top"
- pixel_size = 25 if not pixel_size else pixel_size
- else:
- raise ValueError("`axis` must be one of {0, 1, 'index', 'columns'}")
-
- if not isinstance(obj, pd.MultiIndex):
- return self.set_table_styles(
- [
- {
- "selector": f"{tag} th",
- "props": f"position:sticky; {pos}:0px; background-color:white;",
- }
- ],
- overwrite=False,
- )
- else:
- range_idx = list(range(obj.nlevels))
-
- levels = sorted(levels) if levels else range_idx
- for i, level in enumerate(levels):
- self.set_table_styles(
- [
- {
- "selector": f"{tag} th.level{level}",
- "props": f"position: sticky; "
- f"{pos}: {i * pixel_size}px; "
- f"{f'height: {pixel_size}px; ' if axis == 1 else ''}"
- f"{f'min-width: {pixel_size}px; ' if axis == 0 else ''}"
- f"{f'max-width: {pixel_size}px; ' if axis == 0 else ''}"
- f"background-color: white;",
- }
- ],
- overwrite=False,
- )
-
- return self
-
def set_table_styles(
self,
table_styles: dict[Any, CSSStyles] | CSSStyles,
@@ -1645,169 +1509,37 @@ def set_na_rep(self, na_rep: str) -> StylerRenderer:
self.na_rep = na_rep
return self.format(na_rep=na_rep, precision=self.precision)
- def hide_index(self, subset: Subset | None = None) -> Styler:
+ def hide_index(self) -> Styler:
"""
- Hide the entire index, or specific keys in the index from rendering.
-
- This method has dual functionality:
-
- - if ``subset`` is ``None`` then the entire index will be hidden whilst
- displaying all data-rows.
- - if a ``subset`` is given then those specific rows will be hidden whilst the
- index itself remains visible.
-
- .. versionchanged:: 1.3.0
-
- Parameters
- ----------
- subset : label, array-like, IndexSlice, optional
- A valid 1d input or single key along the index axis within
- `DataFrame.loc[, :]`, to limit ``data`` to *before* applying
- the function.
+ Hide the index from rendering.
Returns
-------
self : Styler
-
- See Also
- --------
- Styler.hide_columns: Hide the entire column headers row, or specific columns.
-
- Examples
- --------
- Simple application hiding specific rows:
-
- >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"])
- >>> df.style.hide_index(["a", "b"])
- 0 1
- c 5 6
-
- Hide the index and retain the data values:
-
- >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]])
- >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx)
- >>> df.style.format("{:.1f}").hide_index()
- x y
- a b c a b c
- 0.1 0.0 0.4 1.3 0.6 -1.4
- 0.7 1.0 1.3 1.5 -0.0 -0.2
- 1.4 -0.8 1.6 -0.2 -0.4 -0.3
- 0.4 1.0 -0.2 -0.8 -1.2 1.1
- -0.6 1.2 1.8 1.9 0.3 0.3
- 0.8 0.5 -0.3 1.2 2.2 -0.8
-
- Hide specific rows but retain the index:
-
- >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"]))
- x y
- a b c a b c
- x b 0.7 1.0 1.3 1.5 -0.0 -0.2
- y b -0.6 1.2 1.8 1.9 0.3 0.3
-
- Hide specific rows and the index:
-
- >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"]))
- ... .hide_index()
- x y
- a b c a b c
- 0.7 1.0 1.3 1.5 -0.0 -0.2
- -0.6 1.2 1.8 1.9 0.3 0.3
"""
- if subset is None:
- self.hide_index_ = True
- else:
- subset_ = IndexSlice[subset, :] # new var so mypy reads not Optional
- subset = non_reducing_slice(subset_)
- hide = self.data.loc[subset]
- hrows = self.index.get_indexer_for(hide.index)
- # error: Incompatible types in assignment (expression has type
- # "ndarray", variable has type "Sequence[int]")
- self.hidden_rows = hrows # type: ignore[assignment]
+ self.hidden_index = True
return self
- def hide_columns(self, subset: Subset | None = None) -> Styler:
+ def hide_columns(self, subset: Subset) -> Styler:
"""
- Hide the column headers or specific keys in the columns from rendering.
-
- This method has dual functionality:
-
- - if ``subset`` is ``None`` then the entire column headers row will be hidden
- whilst the data-values remain visible.
- - if a ``subset`` is given then those specific columns, including the
- data-values will be hidden, whilst the column headers row remains visible.
-
- .. versionchanged:: 1.3.0
+ Hide columns from rendering.
Parameters
----------
- subset : label, array-like, IndexSlice, optional
- A valid 1d input or single key along the columns axis within
- `DataFrame.loc[:, ]`, to limit ``data`` to *before* applying
- the function.
+ subset : label, array-like, IndexSlice
+ A valid 1d input or single key along the appropriate axis within
+ `DataFrame.loc[]`, to limit ``data`` to *before* applying the function.
Returns
-------
self : Styler
-
- See Also
- --------
- Styler.hide_index: Hide the entire index, or specific keys in the index.
-
- Examples
- --------
- Simple application hiding specific columns:
-
- >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
- >>> df.style.hide_columns(["a", "b"])
- c
- 0 3
- 1 6
-
- Hide column headers and retain the data values:
-
- >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]])
- >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx)
- >>> df.style.format("{:.1f}").hide_columns()
- x d 0.1 0.0 0.4 1.3 0.6 -1.4
- e 0.7 1.0 1.3 1.5 -0.0 -0.2
- f 1.4 -0.8 1.6 -0.2 -0.4 -0.3
- y d 0.4 1.0 -0.2 -0.8 -1.2 1.1
- e -0.6 1.2 1.8 1.9 0.3 0.3
- f 0.8 0.5 -0.3 1.2 2.2 -0.8
-
- Hide specific columns but retain the column headers:
-
- >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"]))
- x y
- b b
- x a 0.0 0.6
- b 1.0 -0.0
- c -0.8 -0.4
- y a 1.0 -1.2
- b 1.2 0.3
- c 0.5 2.2
-
- Hide specific columns and the column headers:
-
- >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"]))
- ... .hide_columns()
- x a 0.0 0.6
- b 1.0 -0.0
- c -0.8 -0.4
- y a 1.0 -1.2
- b 1.2 0.3
- c 0.5 2.2
"""
- if subset is None:
- self.hide_columns_ = True
- else:
- subset_ = IndexSlice[:, subset] # new var so mypy reads not Optional
- subset = non_reducing_slice(subset_)
- hide = self.data.loc[subset]
- hcols = self.columns.get_indexer_for(hide.columns)
- # error: Incompatible types in assignment (expression has type
- # "ndarray", variable has type "Sequence[int]")
- self.hidden_columns = hcols # type: ignore[assignment]
+ subset = non_reducing_slice(subset)
+ hidden_df = self.data.loc[subset]
+ hcols = self.columns.get_indexer_for(hidden_df.columns)
+ # error: Incompatible types in assignment (expression has type
+ # "ndarray", variable has type "Sequence[int]")
+ self.hidden_columns = hcols # type: ignore[assignment]
return self
# -----------------------------------------------------------------------
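Note: after this change ``hide_index`` takes no arguments and ``hide_columns`` requires a ``subset``. A minimal usage sketch:

>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
>>> df.style.hide_index()  # hide the whole index, keep all data rows
>>> df.style.hide_columns(["a", "b"])  # hide columns "a" and "b" and their data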
@@ -2039,27 +1771,82 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler:
>>> df.style.set_properties(color="white", align="right")
>>> df.style.set_properties(**{'background-color': 'yellow'})
"""
- values = "".join([f"{p}: {v};" for p, v in kwargs.items()])
+ values = "".join(f"{p}: {v};" for p, v in kwargs.items())
return self.applymap(lambda x: values, subset=subset)
+ @staticmethod
+ def _bar(
+ s,
+ align: str,
+ colors: list[str],
+ width: float = 100,
+ vmin: float | None = None,
+ vmax: float | None = None,
+ ):
+ """
+ Draw bar chart in dataframe cells.
+ """
+ # Get input value range.
+ smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
+ smax = np.nanmax(s.to_numpy()) if vmax is None else vmax
+ if align == "mid":
+ smin = min(0, smin)
+ smax = max(0, smax)
+ elif align == "zero":
+ # For "zero" mode, we want the range to be symmetrical around zero.
+ smax = max(abs(smin), abs(smax))
+ smin = -smax
+ # Transform to percent-range of linear-gradient
+ normed = width * (s.to_numpy(dtype=float) - smin) / (smax - smin + 1e-12)
+ zero = -width * smin / (smax - smin + 1e-12)
+
+ def css_bar(start: float, end: float, color: str) -> str:
+ """
+ Generate CSS code to draw a bar from start to end.
+ """
+ css = "width: 10em; height: 80%;"
+ if end > start:
+ css += "background: linear-gradient(90deg,"
+ if start > 0:
+ css += f" transparent {start:.1f}%, {color} {start:.1f}%, "
+ e = min(end, width)
+ css += f"{color} {e:.1f}%, transparent {e:.1f}%)"
+ return css
+
+ def css(x):
+ if pd.isna(x):
+ return ""
+
+ # avoid deprecated indexing `colors[x > zero]`
+ color = colors[1] if x > zero else colors[0]
+
+ if align == "left":
+ return css_bar(0, x, color)
+ else:
+ return css_bar(min(x, zero), max(x, zero), color)
+
+ if s.ndim == 1:
+ return [css(x) for x in normed]
+ else:
+ return DataFrame(
+ [[css(x) for x in row] for row in normed],
+ index=s.index,
+ columns=s.columns,
+ )
+
def bar(
self,
subset: Subset | None = None,
axis: Axis | None = 0,
- *,
color="#d65f5f",
width: float = 100,
- height: float = 100,
- align: str | float | int | Callable = "mid",
+ align: str = "left",
vmin: float | None = None,
vmax: float | None = None,
- props: str = "width: 10em;",
) -> Styler:
"""
Draw bar chart in the cell backgrounds.
- .. versionchanged:: 1.4.0
-
Parameters
----------
subset : label, array-like, IndexSlice, optional
@@ -2076,30 +1863,16 @@ def bar(
first element is the color_negative and the second is the
color_positive (eg: ['#d65f5f', '#5fba7d']).
width : float, default 100
- The percentage of the cell, measured from the left, in which to draw the
- bars, in [0, 100].
- height : float, default 100
- The percentage height of the bar in the cell, centrally aligned, in [0,100].
-
- .. versionadded:: 1.4.0
- align : str, int, float, callable, default 'mid'
- How to align the bars within the cells relative to a width adjusted center.
- If string must be one of:
-
- - 'left' : bars are drawn rightwards from the minimum data value.
- - 'right' : bars are drawn leftwards from the maximum data value.
- - 'zero' : a value of zero is located at the center of the cell.
- - 'mid' : a value of (max-min)/2 is located at the center of the cell,
- or if all values are negative (positive) the zero is
- aligned at the right (left) of the cell.
- - 'mean' : the mean value of the data is located at the center of the cell.
-
- If a float or integer is given this will indicate the center of the cell.
-
- If a callable should take a 1d or 2d array and return a scalar.
-
- .. versionchanged:: 1.4.0
+ A number between 0 and 100. The largest value will cover `width`
+ percent of the cell's width.
+ align : {'left', 'zero', 'mid'}, default 'left'
+ How to align the bars with the cells.
+ - 'left' : the min value starts at the left of the cell.
+ - 'zero' : a value of zero is located at the center of the cell.
+ - 'mid' : the center of the cell is at (max-min)/2, or
+ if values are all negative (positive) the zero is aligned
+ at the right (left) of the cell.
vmin : float, optional
Minimum bar value, defining the left hand limit
of the bar drawing range, lower values are clipped to `vmin`.
@@ -2108,16 +1881,14 @@ def bar(
Maximum bar value, defining the right hand limit
of the bar drawing range, higher values are clipped to `vmax`.
When None (default): the maximum value of the data will be used.
- props : str, optional
- The base CSS of the cell that is extended to add the bar chart. Defaults to
- `"width: 10em;"`
-
- .. versionadded:: 1.4.0
Returns
-------
self : Styler
"""
+ if align not in ("left", "zero", "mid"):
+ raise ValueError("`align` must be one of {'left', 'zero', 'mid'}")
+
if not (is_list_like(color)):
color = [color, color]
elif len(color) == 1:
@@ -2129,25 +1900,18 @@ def bar(
"(eg: color=['#d65f5f', '#5fba7d'])"
)
- if not (0 <= width <= 100):
- raise ValueError(f"`width` must be a value in [0, 100], got {width}")
- elif not (0 <= height <= 100):
- raise ValueError(f"`height` must be a value in [0, 100], got {height}")
-
if subset is None:
subset = self.data.select_dtypes(include=np.number).columns
self.apply(
- _bar,
+ self._bar,
subset=subset,
axis=axis,
align=align,
colors=color,
- width=width / 100,
- height=height / 100,
+ width=width,
vmin=vmin,
vmax=vmax,
- base_css=props,
)
return self
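Note: the reverted ``bar`` supports the three alignment modes documented above and an optional two-colour list. A minimal sketch with illustrative values:

>>> df = pd.DataFrame({"A": [-2, -1, 0, 1, 2]})
>>> df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"])
>>> df.style.bar(subset=["A"], align="mid", vmin=-3, vmax=3, width=90)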
@@ -2518,35 +2282,23 @@ def highlight_quantile(
)
@classmethod
- def from_custom_template(
- cls, searchpath, html_table: str | None = None, html_style: str | None = None
- ):
+ def from_custom_template(cls, searchpath, name):
"""
Factory function for creating a subclass of ``Styler``.
- Uses custom templates and Jinja environment.
-
- .. versionchanged:: 1.3.0
+ Uses a custom template and Jinja environment.
Parameters
----------
searchpath : str or list
Path or paths of directories containing the templates.
- html_table : str
- Name of your custom template to replace the html_table template.
-
- .. versionadded:: 1.3.0
-
- html_style : str
- Name of your custom template to replace the html_style template.
-
- .. versionadded:: 1.3.0
+ name : str
+ Name of your custom template to use for rendering.
Returns
-------
MyStyler : subclass of Styler
- Has the correct ``env``,``template_html``, ``template_html_table`` and
- ``template_html_style`` class attributes set.
+ Has the correct ``env`` and ``template_html`` class attributes set.
"""
loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader])
@@ -2555,10 +2307,7 @@ def from_custom_template(
# error: Invalid base class "cls"
class MyStyler(cls): # type:ignore[valid-type,misc]
env = jinja2.Environment(loader=loader)
- if html_table:
- template_html_table = env.get_template(html_table)
- if html_style:
- template_html_style = env.get_template(html_style)
+ template_html = env.get_template(name)
return MyStyler
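Note: ``from_custom_template`` now takes a single template ``name``. A minimal sketch; the search path and template file name are hypothetical:

>>> from pandas.io.formats.style import Styler
>>> MyStyler = Styler.from_custom_template("templates", "myhtml.tpl")
>>> MyStyler(pd.DataFrame([[1, 2]]))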
@@ -2798,176 +2547,3 @@ def _highlight_between(
else np.full(data.shape, True, dtype=bool)
)
return np.where(g_left & l_right, props, "")
-
-
-def _bar(
- data: FrameOrSeries,
- align: str | float | int | Callable,
- colors: list[str],
- width: float,
- height: float,
- vmin: float | None,
- vmax: float | None,
- base_css: str,
-):
- """
- Draw bar chart in data cells using HTML CSS linear gradient.
-
- Parameters
- ----------
- data : Series or DataFrame
- Underling subset of Styler data on which operations are performed.
- align : str in {"left", "right", "mid", "zero", "mean"}, int, float, callable
- Method for how bars are structured or scalar value of centre point.
- colors : list-like of str
- Two listed colors as string in valid CSS.
- width : float in [0,1]
- The percentage of the cell, measured from left, where drawn bars will reside.
- height : float in [0,1]
- The percentage of the cell's height where drawn bars will reside, centrally
- aligned.
- vmin : float, optional
- Overwrite the minimum value of the window.
- vmax : float, optional
- Overwrite the maximum value of the window.
- base_css : str
- Additional CSS that is included in the cell before bars are drawn.
- """
-
- def css_bar(start: float, end: float, color: str) -> str:
- """
- Generate CSS code to draw a bar from start to end in a table cell.
-
- Uses linear-gradient.
-
- Parameters
- ----------
- start : float
- Relative positional start of bar coloring in [0,1]
- end : float
- Relative positional end of the bar coloring in [0,1]
- color : str
- CSS valid color to apply.
-
- Returns
- -------
- str : The CSS applicable to the cell.
-
- Notes
- -----
- Uses ``base_css`` from outer scope.
- """
- cell_css = base_css
- if end > start:
- cell_css += "background: linear-gradient(90deg,"
- if start > 0:
- cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%,"
- cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)"
- return cell_css
-
- def css_calc(x, left: float, right: float, align: str):
- """
- Return the correct CSS for bar placement based on calculated values.
-
- Parameters
- ----------
- x : float
- Value which determines the bar placement.
- left : float
- Value marking the left side of calculation, usually minimum of data.
- right : float
- Value marking the right side of the calculation, usually maximum of data
- (left < right).
- align : {"left", "right", "zero", "mid"}
- How the bars will be positioned.
- "left", "right", "zero" can be used with any values for ``left``, ``right``.
- "mid" can only be used where ``left <= 0`` and ``right >= 0``.
- "zero" is used to specify a center when all values ``x``, ``left``,
- ``right`` are translated, e.g. by say a mean or median.
-
- Returns
- -------
- str : Resultant CSS with linear gradient.
-
- Notes
- -----
- Uses ``colors``, ``width`` and ``height`` from outer scope.
- """
- if pd.isna(x):
- return base_css
-
- color = colors[0] if x < 0 else colors[1]
- x = left if x < left else x
- x = right if x > right else x # trim data if outside of the window
-
- start: float = 0
- end: float = 1
-
- if align == "left":
- # all proportions are measured from the left side between left and right
- end = (x - left) / (right - left)
-
- elif align == "right":
- # all proportions are measured from the right side between left and right
- start = (x - left) / (right - left)
-
- else:
- z_frac: float = 0.5 # location of zero based on the left-right range
- if align == "zero":
- # all proportions are measured from the center at zero
- limit: float = max(abs(left), abs(right))
- left, right = -limit, limit
- elif align == "mid":
- # bars drawn from zero either leftwards or rightwards with center at mid
- mid: float = (left + right) / 2
- z_frac = (
- -mid / (right - left) + 0.5 if mid < 0 else -left / (right - left)
- )
-
- if x < 0:
- start, end = (x - left) / (right - left), z_frac
- else:
- start, end = z_frac, (x - left) / (right - left)
-
- ret = css_bar(start * width, end * width, color)
- if height < 1 and "background: linear-gradient(" in ret:
- return (
- ret + f" no-repeat center; background-size: 100% {height * 100:.1f}%;"
- )
- else:
- return ret
-
- values = data.to_numpy()
- left = np.nanmin(values) if vmin is None else vmin
- right = np.nanmax(values) if vmax is None else vmax
- z: float = 0 # adjustment to translate data
-
- if align == "mid":
- if left >= 0: # "mid" is documented to act as "left" if all values positive
- align, left = "left", 0 if vmin is None else vmin
- elif right <= 0: # "mid" is documented to act as "right" if all values negative
- align, right = "right", 0 if vmax is None else vmax
- elif align == "mean":
- z, align = np.nanmean(values), "zero"
- elif callable(align):
- z, align = align(values), "zero"
- elif isinstance(align, (float, int)):
- z, align = float(align), "zero"
- elif not (align == "left" or align == "right" or align == "zero"):
- raise ValueError(
- "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or be a "
- "value defining the center line or a callable that returns a float"
- )
-
- assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero]
- if data.ndim == 1:
- return [css_calc(x - z, left - z, right - z, align) for x in values]
- else:
- return DataFrame(
- [
- [css_calc(x - z, left - z, right - z, align) for x in row]
- for row in values
- ],
- index=data.index,
- columns=data.columns,
- )
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index e240c04f97ed1..7686d8a340c37 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -2,7 +2,6 @@
from collections import defaultdict
from functools import partial
-import re
from typing import (
Any,
Callable,
@@ -21,7 +20,10 @@
from pandas._config import get_option
from pandas._libs import lib
-from pandas._typing import TypedDict
+from pandas._typing import (
+ FrameOrSeriesUnion,
+ TypedDict,
+)
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.generic import ABCSeries
@@ -64,13 +66,11 @@ class StylerRenderer:
loader = jinja2.PackageLoader("pandas", "io/formats/templates")
env = jinja2.Environment(loader=loader, trim_blocks=True)
template_html = env.get_template("html.tpl")
- template_html_table = env.get_template("html_table.tpl")
- template_html_style = env.get_template("html_style.tpl")
template_latex = env.get_template("latex.tpl")
def __init__(
self,
- data: DataFrame | Series,
+ data: FrameOrSeriesUnion,
uuid: str | None = None,
uuid_len: int = 5,
table_styles: CSSStyles | None = None,
@@ -97,9 +97,7 @@ def __init__(
self.cell_ids = cell_ids
# add rendering variables
- self.hide_index_: bool = False # bools for hiding col/row headers
- self.hide_columns_: bool = False
- self.hidden_rows: Sequence[int] = [] # sequence for specific hidden rows/cols
+ self.hidden_index: bool = False
self.hidden_columns: Sequence[int] = []
self.ctx: DefaultDict[tuple[int, int], CSSList] = defaultdict(list)
self.cell_context: DefaultDict[tuple[int, int], str] = defaultdict(str)
@@ -119,11 +117,7 @@ def _render_html(self, sparse_index: bool, sparse_columns: bool, **kwargs) -> st
# TODO: namespace all the pandas keys
d = self._translate(sparse_index, sparse_columns)
d.update(kwargs)
- return self.template_html.render(
- **d,
- html_table_tpl=self.template_html_table,
- html_style_tpl=self.template_html_style,
- )
+ return self.template_html.render(**d)
def _render_latex(self, sparse_index: bool, sparse_columns: bool, **kwargs) -> str:
"""
@@ -303,56 +297,55 @@ def _translate_header(
head = []
# 1) column headers
- if not self.hide_columns_:
- for r in range(self.data.columns.nlevels):
- index_blanks = [
- _element("th", blank_class, blank_value, not self.hide_index_)
- ] * (self.data.index.nlevels - 1)
-
- name = self.data.columns.names[r]
- column_name = [
+ for r in range(self.data.columns.nlevels):
+ index_blanks = [
+ _element("th", blank_class, blank_value, not self.hidden_index)
+ ] * (self.data.index.nlevels - 1)
+
+ name = self.data.columns.names[r]
+ column_name = [
+ _element(
+ "th",
+ f"{blank_class if name is None else index_name_class} level{r}",
+ name if name is not None else blank_value,
+ not self.hidden_index,
+ )
+ ]
+
+ if clabels:
+ column_headers = [
_element(
"th",
- f"{blank_class if name is None else index_name_class} level{r}",
- name if name is not None else blank_value,
- not self.hide_index_,
+ f"{col_heading_class} level{r} col{c}",
+ value,
+ _is_visible(c, r, col_lengths),
+ attributes=(
+ f'colspan="{col_lengths.get((r, c), 0)}"'
+ if col_lengths.get((r, c), 0) > 1
+ else ""
+ ),
)
+ for c, value in enumerate(clabels[r])
]
- if clabels:
- column_headers = [
+ if len(self.data.columns) > max_cols:
+ # add an extra column with `...` value to indicate trimming
+ column_headers.append(
_element(
"th",
- f"{col_heading_class} level{r} col{c}",
- value,
- _is_visible(c, r, col_lengths),
- attributes=(
- f'colspan="{col_lengths.get((r, c), 0)}"'
- if col_lengths.get((r, c), 0) > 1
- else ""
- ),
- )
- for c, value in enumerate(clabels[r])
- ]
-
- if len(self.data.columns) > max_cols:
- # add an extra column with `...` value to indicate trimming
- column_headers.append(
- _element(
- "th",
- f"{col_heading_class} level{r} {trimmed_col_class}",
- "...",
- True,
- attributes="",
- )
+ f"{col_heading_class} level{r} {trimmed_col_class}",
+ "...",
+ True,
+ attributes="",
)
- head.append(index_blanks + column_name + column_headers)
+ )
+ head.append(index_blanks + column_name + column_headers)
# 2) index names
if (
self.data.index.names
and com.any_not_none(*self.data.index.names)
- and not self.hide_index_
+ and not self.hidden_index
):
index_names = [
_element(
@@ -418,9 +411,7 @@ def _translate_body(
The associated HTML elements needed for template rendering.
"""
# for sparsifying a MultiIndex
- idx_lengths = _get_level_lengths(
- self.index, sparsify_index, max_rows, self.hidden_rows
- )
+ idx_lengths = _get_level_lengths(self.index, sparsify_index, max_rows)
rlabels = self.data.index.tolist()[:max_rows] # slice to allow trimming
if self.data.index.nlevels == 1:
@@ -434,7 +425,7 @@ def _translate_body(
"th",
f"{row_heading_class} level{c} {trimmed_row_class}",
"...",
- not self.hide_index_,
+ not self.hidden_index,
attributes="",
)
for c in range(self.data.index.nlevels)
@@ -471,7 +462,7 @@ def _translate_body(
"th",
f"{row_heading_class} level{c} row{r}",
value,
- (_is_visible(r, c, idx_lengths) and not self.hide_index_),
+ (_is_visible(r, c, idx_lengths) and not self.hidden_index),
id=f"level{c}_row{r}",
attributes=(
f'rowspan="{idx_lengths.get((c, r), 0)}"'
@@ -505,7 +496,7 @@ def _translate_body(
"td",
f"{data_class} row{r} col{c}{cls}",
value,
- (c not in self.hidden_columns and r not in self.hidden_rows),
+ (c not in self.hidden_columns),
attributes="",
display_value=self._display_funcs[(r, c)](value),
)
@@ -536,7 +527,7 @@ def _translate_latex(self, d: dict) -> None:
d["head"] = [[col for col in row if col["is_visible"]] for row in d["head"]]
body = []
for r, row in enumerate(d["body"]):
- if self.hide_index_:
+ if self.hidden_index:
row_body_headers = []
else:
row_body_headers = [
@@ -851,13 +842,7 @@ def _get_level_lengths(
last_label = j
lengths[(i, last_label)] = 0
elif j not in hidden_elements:
- if lengths[(i, last_label)] == 0:
- # if the previous iteration was first-of-kind but hidden then offset
- last_label = j
- lengths[(i, last_label)] = 1
- else:
- # else add to previous iteration
- lengths[(i, last_label)] += 1
+ lengths[(i, last_label)] += 1
non_zero_lengths = {
element: length for element, length in lengths.items() if length >= 1
@@ -1174,7 +1159,7 @@ def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str):
},
]
- def _translate(self, styler_data: DataFrame | Series, uuid: str, d: dict):
+ def _translate(self, styler_data: FrameOrSeriesUnion, uuid: str, d: dict):
"""
Mutate the render dictionary to allow for tooltips:
@@ -1268,9 +1253,7 @@ def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | N
return None
-def _parse_latex_cell_styles(
- latex_styles: CSSList, display_value: str, convert_css: bool = False
-) -> str:
+def _parse_latex_cell_styles(latex_styles: CSSList, display_value: str) -> str:
r"""
Mutate the ``display_value`` string including LaTeX commands from ``latex_styles``.
@@ -1296,8 +1279,6 @@ def _parse_latex_cell_styles(
For example for styles:
`[('c1', 'o1--wrap'), ('c2', 'o2')]` this returns: `{\c1o1 \c2o2{display_value}}
"""
- if convert_css:
- latex_styles = _parse_latex_css_conversion(latex_styles)
for (command, options) in latex_styles[::-1]: # in reverse for most recent style
formatter = {
"--wrap": f"{{\\{command}--to_parse {display_value}}}",
@@ -1370,82 +1351,6 @@ def _parse_latex_options_strip(value: str | int | float, arg: str) -> str:
return str(value).replace(arg, "").replace("/*", "").replace("*/", "").strip()
-def _parse_latex_css_conversion(styles: CSSList) -> CSSList:
- """
- Convert CSS (attribute,value) pairs to equivalent LaTeX (command,options) pairs.
-
- Ignore conversion if tagged with `--latex` option, skipped if no conversion found.
- """
-
- def font_weight(value, arg):
- if value == "bold" or value == "bolder":
- return "bfseries", f"{arg}"
- return None
-
- def font_style(value, arg):
- if value == "italic":
- return "itshape", f"{arg}"
- elif value == "oblique":
- return "slshape", f"{arg}"
- return None
-
- def color(value, user_arg, command, comm_arg):
- """
- CSS colors have 5 formats to process:
-
- - 6 digit hex code: "#ff23ee" --> [HTML]{FF23EE}
- - 3 digit hex code: "#f0e" --> [HTML]{FF00EE}
- - rgba: rgba(128, 255, 0, 0.5) --> [rgb]{0.502, 1.000, 0.000}
- - rgb: rgb(128, 255, 0,) --> [rbg]{0.502, 1.000, 0.000}
- - string: red --> {red}
-
- Additionally rgb or rgba can be expressed in % which is also parsed.
- """
- arg = user_arg if user_arg != "" else comm_arg
-
- if value[0] == "#" and len(value) == 7: # color is hex code
- return command, f"[HTML]{{{value[1:].upper()}}}{arg}"
- if value[0] == "#" and len(value) == 4: # color is short hex code
- val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}"
- return command, f"[HTML]{{{val}}}{arg}"
- elif value[:3] == "rgb": # color is rgb or rgba
- r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip()
- r = float(r[:-1]) / 100 if "%" in r else int(r) / 255
- g = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[0].strip()
- g = float(g[:-1]) / 100 if "%" in g else int(g) / 255
- if value[3] == "a": # color is rgba
- b = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[1].strip()
- else: # color is rgb
- b = re.findall("(?<=,)[0-9\\s%]+(?=\\))", value)[0].strip()
- b = float(b[:-1]) / 100 if "%" in b else int(b) / 255
- return command, f"[rgb]{{{r:.3f}, {g:.3f}, {b:.3f}}}{arg}"
- else:
- return command, f"{{{value}}}{arg}" # color is likely string-named
-
- CONVERTED_ATTRIBUTES: dict[str, Callable] = {
- "font-weight": font_weight,
- "background-color": partial(color, command="cellcolor", comm_arg="--lwrap"),
- "color": partial(color, command="color", comm_arg=""),
- "font-style": font_style,
- }
-
- latex_styles: CSSList = []
- for (attribute, value) in styles:
- if isinstance(value, str) and "--latex" in value:
- # return the style without conversion but drop '--latex'
- latex_styles.append((attribute, value.replace("--latex", "")))
- if attribute in CONVERTED_ATTRIBUTES.keys():
- arg = ""
- for x in ["--wrap", "--nowrap", "--lwrap", "--dwrap", "--rwrap"]:
- if x in str(value):
- arg, value = x, _parse_latex_options_strip(value, x)
- break
- latex_style = CONVERTED_ATTRIBUTES[attribute](value, arg)
- if latex_style is not None:
- latex_styles.extend([latex_style])
- return latex_styles
-
-
def _escape_latex(s):
r"""
Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``,
diff --git a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl
index 8c63be3ad788a..880c78c8d6b05 100644
--- a/pandas/io/formats/templates/html.tpl
+++ b/pandas/io/formats/templates/html.tpl
@@ -1,16 +1,16 @@
-{# Update the html_style/table_structure.html documentation too #}
+{# Update the template_structure.html documentation too #}
{% if doctype_html %}
-{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
+{% if not exclude_styles %}{% include "html_style.tpl" %}{% endif %}
-{% include html_table_tpl %}
+{% include "html_table.tpl" %}
{% elif not doctype_html %}
-{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
-{% include html_table_tpl %}
+{% if not exclude_styles %}{% include "html_style.tpl" %}{% endif %}
+{% include "html_table.tpl" %}
{% endif %}
diff --git a/pandas/io/formats/templates/latex.tpl b/pandas/io/formats/templates/latex.tpl
index fe081676d87af..66fe99642850f 100644
--- a/pandas/io/formats/templates/latex.tpl
+++ b/pandas/io/formats/templates/latex.tpl
@@ -39,7 +39,7 @@
{% endif %}
{% for row in body %}
{% for c in row %}{% if not loop.first %} & {% endif %}
- {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
+ {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value)}}{% endif %}
{%- endfor %} \\
{% endfor %}
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
index f5ba8c6b53335..5be6ae0382d87 100644
--- a/pandas/io/formats/xml.py
+++ b/pandas/io/formats/xml.py
@@ -195,18 +195,14 @@ def handle_indexes(self) -> None:
This method will add indexes into attr_cols or elem_cols.
"""
- if not self.index:
- return
-
- first_key = next(iter(self.frame_dicts))
indexes: list[str] = [
- x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
+ x for x in self.frame_dicts[0].keys() if x not in self.orig_cols
]
- if self.attr_cols:
+ if self.attr_cols and self.index:
self.attr_cols = indexes + self.attr_cols
- if self.elem_cols:
+ if self.elem_cols and self.index:
self.elem_cols = indexes + self.elem_cols
def get_prefix_uri(self) -> str:
@@ -311,7 +307,7 @@ def build_tree(self) -> bytes:
self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
- self.elem_cols = list(self.d.keys())
+ self.elem_cols = list(self.frame_dicts[0].keys())
self.build_elems()
else:
@@ -361,9 +357,9 @@ def build_attribs(self) -> None:
flat_col = col
if isinstance(col, tuple):
flat_col = (
- "".join([str(c) for c in col]).strip()
+ "".join(str(c) for c in col).strip()
if "" in col
- else "_".join([str(c) for c in col]).strip()
+ else "_".join(str(c) for c in col).strip()
)
attr_name = f"{self.prefix_uri}{flat_col}"
@@ -388,9 +384,9 @@ def build_elems(self) -> None:
flat_col = col
if isinstance(col, tuple):
flat_col = (
- "".join([str(c) for c in col]).strip()
+ "".join(str(c) for c in col).strip()
if "" in col
- else "_".join([str(c) for c in col]).strip()
+ else "_".join(str(c) for c in col).strip()
)
elem_name = f"{self.prefix_uri}{flat_col}"
@@ -481,7 +477,7 @@ def build_tree(self) -> bytes:
self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
- self.elem_cols = list(self.d.keys())
+ self.elem_cols = list(self.frame_dicts[0].keys())
self.build_elems()
else:
@@ -533,9 +529,9 @@ def build_attribs(self) -> None:
flat_col = col
if isinstance(col, tuple):
flat_col = (
- "".join([str(c) for c in col]).strip()
+ "".join(str(c) for c in col).strip()
if "" in col
- else "_".join([str(c) for c in col]).strip()
+ else "_".join(str(c) for c in col).strip()
)
attr_name = f"{self.prefix_uri}{flat_col}"
@@ -560,9 +556,9 @@ def build_elems(self) -> None:
flat_col = col
if isinstance(col, tuple):
flat_col = (
- "".join([str(c) for c in col]).strip()
+ "".join(str(c) for c in col).strip()
if "" in col
- else "_".join([str(c) for c in col]).strip()
+ else "_".join(str(c) for c in col).strip()
)
elem_name = f"{self.prefix_uri}{flat_col}"
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 2947b22f85d61..0a91d065379cb 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -627,7 +627,7 @@ def _build_xpath_expr(attrs) -> str:
if "class_" in attrs:
attrs["class"] = attrs.pop("class_")
- s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
+ s = " and ".join(f"@{k}={repr(v)}" for k, v in attrs.items())
return f"[{s}]"
@@ -861,7 +861,7 @@ def _parser_dispatch(flavor):
def _print_as_set(s) -> str:
- arg = ", ".join([pprint_thing(el) for el in s])
+ arg = ", ".join(pprint_thing(el) for el in s)
return f"{{{arg}}}"
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index fdeda868fdb5e..77582c46977c1 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -21,6 +21,7 @@
from pandas._typing import (
CompressionOptions,
DtypeArg,
+ FrameOrSeriesUnion,
IndexLabel,
JSONSerializable,
StorageOptions,
@@ -862,7 +863,7 @@ def __init__(
self.convert_dates = convert_dates
self.date_unit = date_unit
self.keep_default_dates = keep_default_dates
- self.obj: DataFrame | Series | None = None
+ self.obj: FrameOrSeriesUnion | None = None
def check_keys_split(self, decoded):
"""
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
index 729d60ca78944..5927d6482d3b0 100644
--- a/pandas/io/json/_normalize.py
+++ b/pandas/io/json/_normalize.py
@@ -380,31 +380,14 @@ def _json_normalize(
Returns normalized data with columns prefixed with the given string.
"""
- def _pull_field(
- js: dict[str, Any], spec: list | str, extract_record: bool = False
- ) -> Scalar | Iterable:
+ def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable:
"""Internal function to pull field"""
result = js
- try:
- if isinstance(spec, list):
- for field in spec:
- result = result[field]
- else:
- result = result[spec]
- except KeyError as e:
- if extract_record:
- raise KeyError(
- f"Key {e} not found. If specifying a record_path, all elements of "
- f"data should have the path."
- ) from e
- elif errors == "ignore":
- return np.nan
- else:
- raise KeyError(
- f"Key {e} not found. To replace missing values of {e} with "
- f"np.nan, pass in errors='ignore'"
- ) from e
-
+ if isinstance(spec, list):
+ for field in spec:
+ result = result[field]
+ else:
+ result = result[spec]
return result
def _pull_records(js: dict[str, Any], spec: list | str) -> list:
@@ -413,7 +396,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
_pull_field, but require to return list. And will raise error
if has non iterable value.
"""
- result = _pull_field(js, spec, extract_record=True)
+ result = _pull_field(js, spec)
# GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
# null, otherwise return an empty list
@@ -505,7 +488,16 @@ def _recursive_extract(data, path, seen_meta, level=0):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
- meta_val = _pull_field(obj, val[level:])
+ try:
+ meta_val = _pull_field(obj, val[level:])
+ except KeyError as e:
+ if errors == "ignore":
+ meta_val = np.nan
+ else:
+ raise KeyError(
+ "Try running with errors='ignore' as key "
+ f"{e} is not always present"
+ ) from e
meta_vals[key].append(meta_val)
records.extend(recs)
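Note: the restored ``try``/``except`` above means a missing ``meta`` key raises unless ``errors='ignore'`` is passed. A minimal sketch with illustrative data:

>>> data = [
...     {"id": 1, "name": "A", "kids": [{"k": 1}, {"k": 2}]},
...     {"id": 2, "kids": [{"k": 3}]},  # "name" is missing here
... ]
>>> pd.json_normalize(data, record_path="kids", meta=["id", "name"])  # raises KeyError
>>> pd.json_normalize(data, record_path="kids", meta=["id", "name"], errors="ignore")  # missing "name" -> NaN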
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 5714bbab016c8..2a86ff13a2edc 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -12,7 +12,6 @@
Iterable,
Sequence,
cast,
- final,
)
import warnings
@@ -24,9 +23,9 @@
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import (
- ArrayLike,
DtypeArg,
FilePathOrBuffer,
+ final,
)
from pandas.errors import (
ParserError,
@@ -351,7 +350,7 @@ def extract(r):
# level, then our header was too long.
for n in range(len(columns[0])):
if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
- header = ",".join([str(x) for x in self.header])
+ header = ",".join(str(x) for x in self.header)
raise ParserError(
f"Passed header=[{header}] are too many rows "
"for this multi_index of columns"
@@ -804,29 +803,6 @@ def _do_date_conversions(self, names, data):
return names, data
- def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
- """Checks if length of data is equal to length of column names.
-
- One set of trailing commas is allowed. self.index_col not False
- results in a ParserError previously when lengths do not match.
-
- Parameters
- ----------
- columns: list of column names
- data: list of array-likes containing the data column-wise.
- """
- if not self.index_col and len(columns) != len(data) and columns:
- if len(columns) == len(data) - 1 and np.all(
- (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1])
- ):
- return
- warnings.warn(
- "Length of header or names does not match length of data. This leads "
- "to a loss of data with index_col=False.",
- ParserWarning,
- stacklevel=6,
- )
-
def _evaluate_usecols(self, usecols, names):
"""
Check whether or not the 'usecols' parameter
@@ -1138,7 +1114,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
else:
colnames.append(c)
- new_name = "_".join([str(x) for x in colnames])
+ new_name = "_".join(str(x) for x in colnames)
to_parse = [data_dict[c] for c in colnames if c in data_dict]
new_col = parser(*to_parse)
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index ae62cc3b45578..5c1f8f94a72da 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -300,8 +300,6 @@ def read(self, nrows=None):
# columns as list
alldata = [x[1] for x in data_tups]
- if self.usecols is None:
- self._check_data_length(names, alldata)
data = {k: v for k, (i, v) in zip(names, data_tups)}
@@ -365,9 +363,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
numpy_dtypes, # type: ignore[arg-type]
[],
)
- # error: Non-overlapping equality check (left operand type: "dtype[Any]",
- # right operand type: "Type[object]")
- if common_type == object: # type: ignore[comparison-overlap]
+ if common_type == object:
warning_columns.append(str(name))
dtype = dtypes.pop()
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 7c9fcde08bf24..670868c6f4261 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -25,7 +25,6 @@
)
from pandas.core.dtypes.common import is_integer
-from pandas.core.dtypes.inference import is_dict_like
from pandas.io.parsers.base_parser import (
ParserBase,
@@ -293,8 +292,6 @@ def _exclude_implicit_index(self, alldata):
offset = len(self.index_col) # type: ignore[has-type]
len_alldata = len(alldata)
- self._check_data_length(names, alldata)
-
return {
name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
}, names
@@ -427,7 +424,6 @@ def _infer_columns(self):
cur_count = counts[col]
if (
self.dtype is not None
- and is_dict_like(self.dtype)
and self.dtype.get(old_col) is not None
and self.dtype.get(col) is None
):
@@ -1159,7 +1155,7 @@ def get_rows(self, infer_nrows, skiprows=None):
def detect_colspecs(self, infer_nrows=100, skiprows=None):
# Regex escape the delimiters
- delimiters = "".join([fr"\{x}" for x in self.delimiter])
+ delimiters = "".join(fr"\{x}" for x in self.delimiter)
pattern = re.compile(f"([^{delimiters}]+)")
rows = self.get_rows(infer_nrows, skiprows)
if not rows:
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index f1e97ab90793e..208b8a008ffe6 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -40,6 +40,7 @@
ArrayLike,
DtypeArg,
FrameOrSeries,
+ FrameOrSeriesUnion,
Shape,
)
from pandas.compat._optional import import_optional_dependency
@@ -2592,7 +2593,7 @@ class Fixed:
pandas_kind: str
format_type: str = "fixed" # GH#30962 needed by dask
- obj_type: type[DataFrame | Series]
+ obj_type: type[FrameOrSeriesUnion]
ndim: int
encoding: str
parent: HDFStore
@@ -2641,7 +2642,7 @@ def __repr__(self) -> str:
s = self.shape
if s is not None:
if isinstance(s, (list, tuple)):
- jshape = ",".join([pprint_thing(x) for x in s])
+ jshape = ",".join(pprint_thing(x) for x in s)
s = f"[{jshape}]"
return f"{self.pandas_type:12.12} (shape->{s})"
return self.pandas_type
@@ -3308,10 +3309,10 @@ def __repr__(self) -> str:
ver = ""
if self.is_old_version:
- jver = ".".join([str(x) for x in self.version])
+ jver = ".".join(str(x) for x in self.version)
ver = f"[{jver}]"
- jindex_axes = ",".join([a.name for a in self.index_axes])
+ jindex_axes = ",".join(a.name for a in self.index_axes)
return (
f"{self.pandas_type:12.12}{ver} "
f"(typ->{self.table_type_short},nrows->{self.nrows},"
@@ -3362,7 +3363,7 @@ def is_multi_index(self) -> bool:
return isinstance(self.levels, list)
def validate_multiindex(
- self, obj: DataFrame | Series
+ self, obj: FrameOrSeriesUnion
) -> tuple[DataFrame, list[Hashable]]:
"""
validate that we can store the multi-index; reset and return the
@@ -3518,7 +3519,7 @@ def validate_version(self, where=None):
"""are we trying to operate on an old version?"""
if where is not None:
if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1:
- ws = incompatibility_doc % ".".join([str(x) for x in self.version])
+ ws = incompatibility_doc % ".".join(str(x) for x in self.version)
warnings.warn(ws, IncompatibilityWarning)
def validate_min_itemsize(self, min_itemsize):
@@ -4065,7 +4066,7 @@ def get_blk_items(mgr):
new_blocks.append(b)
new_blk_items.append(b_items)
except (IndexError, KeyError) as err:
- jitems = ",".join([pprint_thing(item) for item in items])
+ jitems = ",".join(pprint_thing(item) for item in items)
raise ValueError(
f"cannot match existing table structure for [{jitems}] "
"on appending data"
@@ -4499,7 +4500,7 @@ class AppendableFrameTable(AppendableTable):
pandas_kind = "frame_table"
table_type = "appendable_frame"
ndim = 2
- obj_type: type[DataFrame | Series] = DataFrame
+ obj_type: type[FrameOrSeriesUnion] = DataFrame
@property
def is_transposed(self) -> bool:
@@ -4999,7 +5000,7 @@ def _maybe_convert_for_string_atom(
# check for column in the values conflicts
if existing_col is not None:
eci = existing_col.validate_col(itemsize)
- if eci is not None and eci > itemsize:
+ if eci > itemsize:
itemsize = eci
data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index df9c7e28bff69..b9d5b18b85e02 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -182,7 +182,7 @@ def _wrap_result(
return frame
-def execute(sql, con, params=None):
+def execute(sql, con, cur=None, params=None):
"""
Execute the given SQL query using the provided connection object.
@@ -194,6 +194,7 @@ def execute(sql, con, params=None):
Using SQLAlchemy makes it possible to use any DB supported by the
library.
If a DBAPI2 object, only sqlite3 is supported.
+ cur : deprecated, the cursor is obtained from the connection, default: None
params : list or tuple, optional, default: None
List of parameters to pass to execute method.
@@ -201,7 +202,10 @@ def execute(sql, con, params=None):
-------
Results Iterable
"""
- pandas_sql = pandasSQL_builder(con)
+ if cur is None:
+ pandas_sql = pandasSQL_builder(con)
+ else:
+ pandas_sql = pandasSQL_builder(cur, is_cursor=True)
args = _convert_params(sql, params)
return pandas_sql.execute(*args)
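Note: the restored ``cur`` argument routes execution through an existing cursor. A minimal in-memory sqlite sketch:

>>> import sqlite3
>>> from pandas.io import sql
>>> con = sqlite3.connect(":memory:")
>>> _ = con.execute("CREATE TABLE t (x INTEGER)")
>>> sql.execute("INSERT INTO t VALUES (1)", con)
>>> sql.execute("SELECT * FROM t", con, cur=con.cursor())  # deprecated cursor path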
@@ -770,18 +774,22 @@ def _engine_builder(con):
return con
-def pandasSQL_builder(con, schema: str | None = None, meta=None):
+def pandasSQL_builder(
+ con, schema: str | None = None, meta=None, is_cursor: bool = False
+):
"""
Convenience function to return the correct PandasSQL subclass based on the
provided parameters.
"""
+ # When support for DBAPI connections is removed,
+ # is_cursor should not be necessary.
con = _engine_builder(con)
if _is_sqlalchemy_connectable(con):
return SQLDatabase(con, schema=schema, meta=meta)
elif isinstance(con, str):
raise ImportError("Using URI string without sqlalchemy installed.")
else:
- return SQLiteDatabase(con)
+ return SQLiteDatabase(con, is_cursor=is_cursor)
class SQLTable(PandasObject):
@@ -955,7 +963,7 @@ def insert(self, chunksize: int | None = None, method: str | None = None):
if start_i >= end_i:
break
- chunk_iter = zip(*(arr[start_i:end_i] for arr in data_list))
+ chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
exec_insert(conn, keys, chunk_iter)
def _query_iterator(
@@ -1905,7 +1913,7 @@ def insert_statement(self, *, num_rows: int):
col_names = ",".join(bracketed_names)
row_wildcards = ",".join([wld] * len(names))
- wildcards = ",".join([f"({row_wildcards})" for _ in range(num_rows)])
+ wildcards = ",".join(f"({row_wildcards})" for _ in range(num_rows))
insert_statement = (
f"INSERT INTO {escape(self.name)} ({col_names}) VALUES {wildcards}"
)
@@ -1944,7 +1952,7 @@ def _create_table_setup(self):
keys = [self.keys]
else:
keys = self.keys
- cnames_br = ", ".join([escape(c) for c in keys])
+ cnames_br = ", ".join(escape(c) for c in keys)
create_tbl_stmts.append(
f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})"
)
@@ -1964,7 +1972,7 @@ def _create_table_setup(self):
ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
if len(ix_cols):
cnames = "_".join(ix_cols)
- cnames_br = ",".join([escape(c) for c in ix_cols])
+ cnames_br = ",".join(escape(c) for c in ix_cols)
create_stmts.append(
"CREATE INDEX "
+ escape("ix_" + self.name + "_" + cnames)
@@ -2023,7 +2031,8 @@ class SQLiteDatabase(PandasSQL):
"""
- def __init__(self, con):
+ def __init__(self, con, is_cursor: bool = False):
+ self.is_cursor = is_cursor
self.con = con
@contextmanager
@@ -2039,7 +2048,10 @@ def run_transaction(self):
cur.close()
def execute(self, *args, **kwargs):
- cur = self.con.cursor()
+ if self.is_cursor:
+ cur = self.con
+ else:
+ cur = self.con.cursor()
try:
cur.execute(*args, **kwargs)
return cur
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 7f37f0293e417..ffaebb3c10ae2 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -853,25 +853,15 @@ def __eq__(self, other: Any) -> bool:
@classmethod
def get_base_missing_value(cls, dtype: np.dtype) -> int | float:
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- if dtype == np.int8: # type: ignore[comparison-overlap]
+ if dtype == np.int8:
value = cls.BASE_MISSING_VALUES["int8"]
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int16: # type: ignore[comparison-overlap]
+ elif dtype == np.int16:
value = cls.BASE_MISSING_VALUES["int16"]
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int32: # type: ignore[comparison-overlap]
+ elif dtype == np.int32:
value = cls.BASE_MISSING_VALUES["int32"]
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[floating[Any]]")
- elif dtype == np.float32: # type: ignore[comparison-overlap]
+ elif dtype == np.float32:
value = cls.BASE_MISSING_VALUES["float32"]
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[floating[Any]]")
- elif dtype == np.float64: # type: ignore[comparison-overlap]
+ elif dtype == np.float64:
value = cls.BASE_MISSING_VALUES["float64"]
else:
raise ValueError("Unsupported dtype")
@@ -1367,12 +1357,12 @@ def _read_old_header(self, first_char: bytes) -> None:
try:
self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
except ValueError as err:
- invalid_types = ",".join([str(x) for x in typlist])
+ invalid_types = ",".join(str(x) for x in typlist)
raise ValueError(f"cannot convert stata types [{invalid_types}]") from err
try:
self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
except ValueError as err:
- invalid_dtypes = ",".join([str(x) for x in typlist])
+ invalid_dtypes = ",".join(str(x) for x in typlist)
raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err
if self.format_version > 108:
@@ -2043,25 +2033,15 @@ def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int:
# do?
itemsize = max_len_string_array(ensure_object(column._values))
return max(itemsize, 1)
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[floating[Any]]")
- elif dtype == np.float64: # type: ignore[comparison-overlap]
+ elif dtype == np.float64:
return 255
- # Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[floating[Any]]")
- elif dtype == np.float32: # type: ignore[comparison-overlap]
+ elif dtype == np.float32:
return 254
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int32: # type: ignore[comparison-overlap]
+ elif dtype == np.int32:
return 253
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int16: # type: ignore[comparison-overlap]
+ elif dtype == np.int16:
return 252
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int8: # type: ignore[comparison-overlap]
+ elif dtype == np.int8:
return 251
else: # pragma : no cover
raise NotImplementedError(f"Data type {dtype} not supported.")
@@ -2781,25 +2761,15 @@ def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool)
if itemsize <= 2045:
return itemsize
return 32768
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[floating[Any]]")
- elif dtype == np.float64: # type: ignore[comparison-overlap]
+ elif dtype == np.float64:
return 65526
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[floating[Any]]")
- elif dtype == np.float32: # type: ignore[comparison-overlap]
+ elif dtype == np.float32:
return 65527
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]") [comparison-overlap]
- elif dtype == np.int32: # type: ignore[comparison-overlap]
+ elif dtype == np.int32:
return 65528
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int16: # type: ignore[comparison-overlap]
+ elif dtype == np.int16:
return 65529
- # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
- # operand type: "Type[signedinteger[Any]]")
- elif dtype == np.int8: # type: ignore[comparison-overlap]
+ elif dtype == np.int8:
return 65530
else: # pragma : no cover
raise NotImplementedError(f"Data type {dtype} not supported.")
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 990ccbc2a015b..5d3db13610845 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -7,6 +7,8 @@
Sequence,
)
+import pkg_resources
+
from pandas._config import get_option
from pandas._typing import IndexLabel
@@ -428,7 +430,7 @@ def hist_frame(
y : label or position, optional
Allows plotting of one column versus another. If not specified,
all numerical columns are used.
- color : str, array-like, or dict, optional
+ color : str, array_like, or dict, optional
The color for each of the DataFrame's columns. Possible values are:
- A single color string referred to by name, RGB or RGBA code,
@@ -866,7 +868,7 @@ def _get_call_args(backend_name, data, args, kwargs):
if args and isinstance(data, ABCSeries):
positional_args = str(args)[1:-1]
keyword_args = ", ".join(
- [f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args)]
+ f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args)
)
msg = (
"`Series.plot()` should not be called with positional "
@@ -1237,11 +1239,6 @@ def box(self, by=None, **kwargs):
----------
by : str or sequence
Column in the DataFrame to group by.
-
- .. versionchanged:: 1.4.0
-
- Previously, `by` is silently ignore and makes no groupings
-
**kwargs
Additional keywords are documented in
:meth:`DataFrame.plot`.
@@ -1283,11 +1280,6 @@ def hist(self, by=None, bins=10, **kwargs):
----------
by : str or sequence, optional
Column in the DataFrame to group by.
-
- .. versionchanged:: 1.4.0
-
- Previously, `by` is silently ignore and makes no groupings
-
bins : int, default 10
Number of histogram bins to be used.
**kwargs
@@ -1319,16 +1311,6 @@ def hist(self, by=None, bins=10, **kwargs):
... columns = ['one'])
>>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
>>> ax = df.plot.hist(bins=12, alpha=0.5)
-
- A grouped histogram can be generated by providing the parameter `by` (which
- can be a column name, or a list of column names):
-
- .. plot::
- :context: close-figs
-
- >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
- >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
- >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
"""
return self(kind="hist", by=by, bins=bins, **kwargs)
@@ -1589,7 +1571,7 @@ def scatter(self, x, y, s=None, c=None, **kwargs):
y : int or str
The column name or column position to be used as vertical
coordinates for each point.
- s : str, scalar or array-like, optional
+ s : str, scalar or array_like, optional
The size of each point. Possible values are:
- A string with the name of the column to be used for marker's size.
@@ -1602,7 +1584,7 @@ def scatter(self, x, y, s=None, c=None, **kwargs):
.. versionchanged:: 1.1.0
- c : str, int or array-like, optional
+ c : str, int or array_like, optional
The color of each point. Possible values are:
- A single color string referred to by name, RGB or RGBA code,
@@ -1763,8 +1745,6 @@ def _load_backend(backend: str) -> types.ModuleType:
types.ModuleType
The imported backend.
"""
- from importlib.metadata import entry_points
-
if backend == "matplotlib":
# Because matplotlib is an optional dependency and first-party backend,
# we need to attempt an import here to raise an ImportError if needed.
@@ -1779,13 +1759,11 @@ def _load_backend(backend: str) -> types.ModuleType:
found_backend = False
- eps = entry_points()
- if "pandas_plotting_backends" in eps:
- for entry_point in eps["pandas_plotting_backends"]:
- found_backend = entry_point.name == backend
- if found_backend:
- module = entry_point.load()
- break
+ for entry_point in pkg_resources.iter_entry_points("pandas_plotting_backends"):
+ found_backend = entry_point.name == backend
+ if found_backend:
+ module = entry_point.load()
+ break
if not found_backend:
# Fall back to unregistered, module name approach.
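Note: the ``pkg_resources`` loop above discovers third-party plotting backends via the ``pandas_plotting_backends`` entry-point group. A hedged sketch of how a backend package (hypothetical name and module) would register itself and be selected:

# setup.py of a hypothetical backend package
from setuptools import setup
setup(
    name="my-pandas-backend",
    entry_points={"pandas_plotting_backends": ["my_backend = my_pandas_backend"]},
)

>>> pd.set_option("plotting.backend", "my_backend")
>>> pd.Series([1, 2, 3]).plot()  # dispatched to my_pandas_backend.plot(...)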
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
index 8b4cf158ac827..21f30c1311e17 100644
--- a/pandas/plotting/_matplotlib/boxplot.py
+++ b/pandas/plotting/_matplotlib/boxplot.py
@@ -18,7 +18,6 @@
LinePlot,
MPLPlot,
)
-from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
from pandas.plotting._matplotlib.style import get_standard_colors
from pandas.plotting._matplotlib.tools import (
create_subplots,
@@ -136,37 +135,18 @@ def _make_plot(self):
if self.subplots:
self._return_obj = pd.Series(dtype=object)
- # Re-create iterated data if `by` is assigned by users
- data = (
- create_iter_data_given_by(self.data, self._kind)
- if self.by is not None
- else self.data
- )
-
- for i, (label, y) in enumerate(self._iter_data(data=data)):
+ for i, (label, y) in enumerate(self._iter_data()):
ax = self._get_ax(i)
kwds = self.kwds.copy()
- # When by is applied, show title for subplots to know which group it is
- # just like df.boxplot, and need to apply T on y to provide right input
- if self.by is not None:
- y = y.T
- ax.set_title(pprint_thing(label))
-
- # When `by` is assigned, the ticklabels will become unique grouped
- # values, instead of label which is used as subtitle in this case.
- ticklabels = [
- pprint_thing(col) for col in self.data.columns.levels[0]
- ]
- else:
- ticklabels = [pprint_thing(label)]
-
ret, bp = self._plot(
ax, y, column_num=i, return_type=self.return_type, **kwds
)
self.maybe_color_bp(bp)
self._return_obj[label] = ret
- self._set_ticklabels(ax, ticklabels)
+
+ label = [pprint_thing(label)]
+ self._set_ticklabels(ax, label)
else:
y = self.data.values.T
ax = self._get_ax(0)
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index ff76bd771d1c0..7ddab91a24ec0 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -9,7 +9,6 @@
from matplotlib.artist import Artist
import numpy as np
-from pandas._typing import IndexLabel
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
@@ -39,12 +38,10 @@
)
import pandas.core.common as com
-from pandas.core.frame import DataFrame
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0
from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters
-from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by
from pandas.plotting._matplotlib.style import get_standard_colors
from pandas.plotting._matplotlib.timeseries import (
decorate_axes,
@@ -102,7 +99,7 @@ def __init__(
self,
data,
kind=None,
- by: IndexLabel | None = None,
+ by=None,
subplots=False,
sharex=None,
sharey=False,
@@ -127,42 +124,13 @@ def __init__(
table=False,
layout=None,
include_bool=False,
- column: IndexLabel | None = None,
**kwds,
):
import matplotlib.pyplot as plt
self.data = data
-
- # if users assign an empty list or tuple, raise `ValueError`
- # similar to current `df.box` and `df.hist` APIs.
- if by in ([], ()):
- raise ValueError("No group keys passed!")
- self.by = com.maybe_make_list(by)
-
- # Assign the rest of columns into self.columns if by is explicitly defined
- # while column is not, only need `columns` in hist/box plot when it's DF
- # TODO: Might deprecate `column` argument in future PR (#28373)
- if isinstance(data, DataFrame):
- if column:
- self.columns = com.maybe_make_list(column)
- else:
- if self.by is None:
- self.columns = [
- col for col in data.columns if is_numeric_dtype(data[col])
- ]
- else:
- self.columns = [
- col
- for col in data.columns
- if col not in self.by and is_numeric_dtype(data[col])
- ]
-
- # For `hist` plot, need to get grouped original data before `self.data` is
- # updated later
- if self.by is not None and self._kind == "hist":
- self._grouped = data.groupby(self.by)
+ self.by = by
self.kind = kind
@@ -171,9 +139,7 @@ def __init__(
self.subplots = subplots
if sharex is None:
-
- # if by is defined, subplots are used and sharex should be False
- if ax is None and by is None:
+ if ax is None:
self.sharex = True
else:
# if we get an axis, the users should do the visibility
@@ -307,15 +273,8 @@ def _iter_data(self, data=None, keep_index=False, fillna=None):
@property
def nseries(self) -> int:
-
- # When `by` is explicitly assigned, grouped data size will be defined, and
- # this will determine number of subplots to have, aka `self.nseries`
if self.data.ndim == 1:
return 1
- elif self.by is not None and self._kind == "hist":
- return len(self._grouped)
- elif self.by is not None and self._kind == "box":
- return len(self.columns)
else:
return self.data.shape[1]
@@ -461,14 +420,6 @@ def _compute_plot_data(self):
if label is None and data.name is None:
label = "None"
data = data.to_frame(name=label)
- elif self._kind in ("hist", "box"):
- cols = self.columns if self.by is None else self.columns + self.by
- data = data.loc[:, cols]
-
- # GH15079 reconstruct data if by is defined
- if self.by is not None:
- self.subplots = True
- data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns)
# GH16953, _convert is needed as fallback, for ``Series``
# with ``dtype == object``
@@ -763,7 +714,7 @@ def _get_index_name(self) -> str | None:
if isinstance(self.data.index, ABCMultiIndex):
name = self.data.index.names
if com.any_not_none(*name):
- name = ",".join([pprint_thing(x) for x in name])
+ name = ",".join(pprint_thing(x) for x in name)
else:
name = None
else:
diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py
deleted file mode 100644
index 37cc3186fe097..0000000000000
--- a/pandas/plotting/_matplotlib/groupby.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._typing import (
- Dict,
- IndexLabel,
-)
-
-from pandas.core.dtypes.missing import remove_na_arraylike
-
-from pandas import (
- DataFrame,
- MultiIndex,
- Series,
- concat,
-)
-
-
-def create_iter_data_given_by(
- data: DataFrame, kind: str = "hist"
-) -> Dict[str, DataFrame | Series]:
- """
- Create data for iteration given `by` is assigned or not, and it is only
- used in both hist and boxplot.
-
- If `by` is assigned, return a dictionary of DataFrames in which the key of
- dictionary is the values in groups.
- If `by` is not assigned, return input as is, and this preserves current
- status of iter_data.
-
- Parameters
- ----------
- data : reformatted grouped data from `_compute_plot_data` method.
- kind : str, plot kind. This function is only used for `hist` and `box` plots.
-
- Returns
- -------
- iter_data : DataFrame or Dictionary of DataFrames
-
- Examples
- --------
- If `by` is assigned:
-
- >>> import numpy as np
- >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')]
- >>> mi = MultiIndex.from_tuples(tuples)
- >>> value = [[1, 3, np.nan, np.nan],
- ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]]
- >>> data = DataFrame(value, columns=mi)
- >>> create_iter_data_given_by(data)
- {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}),
- 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})}
- """
-
- # For `hist` plot, before transformation, the values in level 0 are values
- # in groups and subplot titles, and later used for column subselection and
- # iteration; For `box` plot, values in level 1 are column names to show,
- # and are used for iteration and as subplots titles.
- if kind == "hist":
- level = 0
- else:
- level = 1
-
- # Select sub-columns based on the value of level of MI, and if `by` is
- # assigned, data must be a MI DataFrame
- assert isinstance(data.columns, MultiIndex)
- return {
- col: data.loc[:, data.columns.get_level_values(level) == col]
- for col in data.columns.levels[level]
- }
-
-
-def reconstruct_data_with_by(
- data: DataFrame, by: IndexLabel, cols: IndexLabel
-) -> DataFrame:
- """
- Internal function to group data, and reassign multiindex column names onto the
- result in order to let grouped data be used in _compute_plot_data method.
-
- Parameters
- ----------
- data : Original DataFrame to plot
- by : grouped `by` parameter selected by users
- cols : columns of data set (excluding columns used in `by`)
-
- Returns
- -------
- Output is the reconstructed DataFrame with MultiIndex columns. The first level
- of MI is unique values of groups, and second level of MI is the columns
- selected by users.
-
- Examples
- --------
- >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]}
- >>> df = DataFrame(d)
- >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b'])
- h1 h2
- a b a b
- 0 1 3 NaN NaN
- 1 3 4 NaN NaN
- 2 NaN NaN 5 6
- """
- grouped = data.groupby(by)
-
- data_list = []
- for key, group in grouped:
- columns = MultiIndex.from_product([[key], cols])
- sub_group = group[cols]
- sub_group.columns = columns
- data_list.append(sub_group)
-
- data = concat(data_list, axis=1)
- return data
-
-
-def reformat_hist_y_given_by(
- y: Series | np.ndarray, by: IndexLabel | None
-) -> Series | np.ndarray:
- """Internal function to reformat y given `by` is applied or not for hist plot.
-
- If by is None, input y is 1-d with NaN removed; and if by is not None, groupby
- will take place and input y is multi-dimensional array.
- """
- if by is not None and len(y.shape) > 1:
- return np.array([remove_na_arraylike(col) for col in y.T]).T
- return remove_na_arraylike(y)
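Note: the deleted ``reconstruct_data_with_by`` boils down to a groupby followed by a column-wise concat under a two-level column index. A self-contained sketch of that reshaping using only public pandas API, mirroring the docstring example above:

    from pandas import DataFrame, MultiIndex, concat

    df = DataFrame({"h": ["h1", "h1", "h2"], "a": [1, 3, 5], "b": [3, 4, 6]})
    pieces = []
    for key, group in df.groupby("h"):
        sub = group[["a", "b"]]
        # First level of the column MultiIndex is the group key, second level
        # the user-selected columns.
        sub.columns = MultiIndex.from_product([[key], ["a", "b"]])
        pieces.append(sub)
    wide = concat(pieces, axis=1)
    # wide now has columns (h1, a), (h1, b), (h2, a), (h2, b), with NaN in the
    # rows that do not belong to a given group.
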
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
index 08cffbf475db0..a02d9a2b9dc8d 100644
--- a/pandas/plotting/_matplotlib/hist.py
+++ b/pandas/plotting/_matplotlib/hist.py
@@ -17,17 +17,11 @@
remove_na_arraylike,
)
-from pandas.core.frame import DataFrame
-
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.core import (
LinePlot,
MPLPlot,
)
-from pandas.plotting._matplotlib.groupby import (
- create_iter_data_given_by,
- reformat_hist_y_given_by,
-)
from pandas.plotting._matplotlib.tools import (
create_subplots,
flatten_axes,
@@ -49,30 +43,19 @@ def __init__(self, data, bins=10, bottom=0, **kwargs):
MPLPlot.__init__(self, data, **kwargs)
def _args_adjust(self):
-
- # calculate bin number separately in different subplots
- # where subplots are created based on by argument
if is_integer(self.bins):
- if self.by is not None:
- grouped = self.data.groupby(self.by)[self.columns]
- self.bins = [self._calculate_bins(group) for key, group in grouped]
- else:
- self.bins = self._calculate_bins(self.data)
+ # create common bin edge
+ values = self.data._convert(datetime=True)._get_numeric_data()
+ values = np.ravel(values)
+ values = values[~isna(values)]
+
+ _, self.bins = np.histogram(
+ values, bins=self.bins, range=self.kwds.get("range", None)
+ )
if is_list_like(self.bottom):
self.bottom = np.array(self.bottom)
- def _calculate_bins(self, data: DataFrame) -> np.ndarray:
- """Calculate bins given data"""
- values = data._convert(datetime=True)._get_numeric_data()
- values = np.ravel(values)
- values = values[~isna(values)]
-
- hist, bins = np.histogram(
- values, bins=self.bins, range=self.kwds.get("range", None)
- )
- return bins
-
@classmethod
def _plot(
cls,
@@ -87,6 +70,7 @@ def _plot(
):
if column_num == 0:
cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
+ y = y[~isna(y)]
base = np.zeros(len(bins) - 1)
bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])
@@ -99,14 +83,7 @@ def _make_plot(self):
colors = self._get_colors()
stacking_id = self._get_stacking_id()
- # Re-create iterated data if `by` is assigned by users
- data = (
- create_iter_data_given_by(self.data, self._kind)
- if self.by is not None
- else self.data
- )
-
- for i, (label, y) in enumerate(self._iter_data(data=data)):
+ for i, (label, y) in enumerate(self._iter_data()):
ax = self._get_ax(i)
kwds = self.kwds.copy()
@@ -121,15 +98,6 @@ def _make_plot(self):
kwds = self._make_plot_keywords(kwds, y)
- # the bins is multi-dimension array now and each plot need only 1-d and
- # when by is applied, label should be columns that are grouped
- if self.by is not None:
- kwds["bins"] = kwds["bins"][i]
- kwds["label"] = self.columns
- kwds.pop("color")
-
- y = reformat_hist_y_given_by(y, self.by)
-
# We allow weights to be a multi-dimensional array, e.g. a (10, 2) array,
# and each sub-array (10,) will be called in each iteration. If users only
# provide 1D array, we assume the same weights is used for all iterations
@@ -138,11 +106,6 @@ def _make_plot(self):
kwds["weights"] = weights[:, i]
artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
-
- # when by is applied, show title for subplots to know which group it is
- if self.by is not None:
- ax.set_title(pprint_thing(label))
-
self._append_legend_handles_labels(artists[0], label)
def _make_plot_keywords(self, kwds, y):
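Note: the restored ``_args_adjust`` computes one set of bin edges for the whole frame so every column is drawn against the same x grid. A minimal sketch of that computation with public API (the internal ``_convert``/``_get_numeric_data`` steps are simplified to a plain ``to_numpy`` here):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, np.nan], "b": [3.0, 4.0, 5.0]})
    values = np.ravel(df.to_numpy())
    values = values[~np.isnan(values)]
    # np.histogram returns (counts, edges); only the shared edges are kept.
    _, bins = np.histogram(values, bins=10)
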
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index 3cd312b06020d..3b9c5eae70b42 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -16,6 +16,7 @@
to_offset,
)
from pandas._libs.tslibs.dtypes import FreqGroup
+from pandas._typing import FrameOrSeriesUnion
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
@@ -39,7 +40,6 @@
from matplotlib.axes import Axes
from pandas import (
- DataFrame,
DatetimeIndex,
Index,
Series,
@@ -210,7 +210,7 @@ def _get_freq(ax: Axes, series: Series):
return freq, ax_freq
-def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
+def use_dynamic_x(ax: Axes, data: FrameOrSeriesUnion) -> bool:
freq = _get_index_freq(data.index)
ax_freq = _get_ax_freq(ax)
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py
index 9d509d02c2e4f..9bfa24b6371ab 100644
--- a/pandas/plotting/_matplotlib/tools.py
+++ b/pandas/plotting/_matplotlib/tools.py
@@ -13,6 +13,8 @@
import matplotlib.ticker as ticker
import numpy as np
+from pandas._typing import FrameOrSeriesUnion
+
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.generic import (
ABCDataFrame,
@@ -29,11 +31,6 @@
from matplotlib.lines import Line2D
from matplotlib.table import Table
- from pandas import (
- DataFrame,
- Series,
- )
-
def do_adjust_figure(fig: Figure):
"""Whether fig has constrained_layout enabled."""
@@ -58,7 +55,7 @@ def format_date_labels(ax: Axes, rot):
def table(
- ax, data: DataFrame | Series, rowLabels=None, colLabels=None, **kwargs
+ ax, data: FrameOrSeriesUnion, rowLabels=None, colLabels=None, **kwargs
) -> Table:
if isinstance(data, ABCSeries):
data = data.to_frame()
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 95dc1d82cb286..38984238ecf65 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -193,6 +193,7 @@ class TestPDApi(Base):
"_hashtable",
"_lib",
"_libs",
+ "_np_version_under1p18",
"_is_numpy_dev",
"_testing",
"_tslib",
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 801cbdf3d0a87..2511f6fc2563c 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1,4 +1,5 @@
from datetime import datetime
+from itertools import chain
import warnings
import numpy as np
@@ -52,17 +53,6 @@ def test_apply_axis1_with_ea():
tm.assert_frame_equal(result, expected)
-@pytest.mark.parametrize(
- "data, dtype",
- [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)],
-)
-def test_agg_axis1_duplicate_index(data, dtype):
- # GH 42380
- expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype)
- result = expected.agg(lambda x: x, axis=1)
- tm.assert_frame_equal(result, expected)
-
-
def test_apply_mixed_datetimelike():
# mixed datetimelike
# GH 7778
@@ -158,6 +148,32 @@ def test_apply_standard_nonunique():
tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"])
+@pytest.mark.parametrize(
+ "args,kwds",
+ [
+ pytest.param([], {}, id="no_args_or_kwds"),
+ pytest.param([1], {}, id="axis_from_args"),
+ pytest.param([], {"axis": 1}, id="axis_from_kwds"),
+ pytest.param([], {"numeric_only": True}, id="optional_kwds"),
+ pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"),
+ ],
+)
+@pytest.mark.parametrize("how", ["agg", "apply"])
+def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how):
+ if len(args) > 1 and how == "agg":
+ request.node.add_marker(
+ pytest.mark.xfail(
+ raises=TypeError,
+ reason="agg/apply signature mismatch - agg passes 2nd "
+ "argument to func",
+ )
+ )
+ result = getattr(float_frame, how)(func, *args, **kwds)
+ expected = getattr(float_frame, func)(*args, **kwds)
+ tm.assert_series_equal(result, expected)
+
+
def test_apply_broadcast(float_frame, int_frame_const_col):
# scalars
@@ -1263,9 +1279,9 @@ def test_size_as_str(how, axis):
# on the columns
result = getattr(df, how)("size", axis=axis)
if axis == 0 or axis == "index":
- expected = Series(df.shape[0], index=df.columns)
+ expected = Series(df.shape[0], index=df.columns, name="size")
else:
- expected = Series(df.shape[1], index=df.index)
+ expected = Series(df.shape[1], index=df.index, name="size")
tm.assert_series_equal(result, expected)
@@ -1285,6 +1301,76 @@ def func(group_col):
tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+ "df, func, expected",
+ chain(
+ tm.get_cython_table_params(
+ DataFrame(),
+ [
+ ("sum", Series(dtype="float64")),
+ ("max", Series(dtype="float64")),
+ ("min", Series(dtype="float64")),
+ ("all", Series(dtype=bool)),
+ ("any", Series(dtype=bool)),
+ ("mean", Series(dtype="float64")),
+ ("prod", Series(dtype="float64")),
+ ("std", Series(dtype="float64")),
+ ("var", Series(dtype="float64")),
+ ("median", Series(dtype="float64")),
+ ],
+ ),
+ tm.get_cython_table_params(
+ DataFrame([[np.nan, 1], [1, 2]]),
+ [
+ ("sum", Series([1.0, 3])),
+ ("max", Series([1.0, 2])),
+ ("min", Series([1.0, 1])),
+ ("all", Series([True, True])),
+ ("any", Series([True, True])),
+ ("mean", Series([1, 1.5])),
+ ("prod", Series([1.0, 2])),
+ ("std", Series([np.nan, 0.707107])),
+ ("var", Series([np.nan, 0.5])),
+ ("median", Series([1, 1.5])),
+ ],
+ ),
+ ),
+)
+def test_agg_cython_table(df, func, expected, axis):
+ # GH 21224
+ # test reducing functions in
+ # pandas.core.base.SelectionMixin._cython_table
+ result = df.agg(func, axis=axis)
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "df, func, expected",
+ chain(
+ tm.get_cython_table_params(
+ DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
+ ),
+ tm.get_cython_table_params(
+ DataFrame([[np.nan, 1], [1, 2]]),
+ [
+ ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
+ ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
+ ],
+ ),
+ ),
+)
+def test_agg_cython_table_transform(df, func, expected, axis):
+ # GH 21224
+ # test transforming functions in
+ # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+ if axis == "columns" or axis == 1:
+ # operating blockwise doesn't let us preserve dtypes
+ expected = expected.astype("float64")
+
+ result = df.agg(func, axis=axis)
+ tm.assert_frame_equal(result, expected)
+
+
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"args, kwargs",
@@ -1413,6 +1499,31 @@ def test_apply_raw_returns_string():
tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+ "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"]
+)
+@pytest.mark.parametrize("how", ["transform", "apply"])
+def test_apply_np_transformer(float_frame, op, how):
+ # GH 39116
+ result = getattr(float_frame, how)(op)
+ expected = getattr(np, op)(float_frame)
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", ["mean", "median", "std", "var"])
+@pytest.mark.parametrize("how", ["agg", "apply"])
+def test_apply_np_reducer(float_frame, op, how):
+ # GH 39116
+ float_frame = DataFrame({"a": [1, 2], "b": [3, 4]})
+ result = getattr(float_frame, how)(op)
+ # pandas ddof defaults to 1, numpy to 0
+ kwargs = {"ddof": 1} if op in ("std", "var") else {}
+ expected = Series(
+ getattr(np, op)(float_frame, axis=0, **kwargs), index=float_frame.columns
+ )
+ tm.assert_series_equal(result, expected)
+
+
def test_aggregation_func_column_order():
# GH40420: the result of .agg should have an index that is sorted
# according to the arguments provided to agg.
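Note: ``test_apply_with_string_funcs`` above hinges on a simple equivalence: passing a function name to ``agg``/``apply`` dispatches to the DataFrame method of the same name. A quick check of that behaviour (the example frame is arbitrary):

    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    assert df.agg("sum").equals(df.sum())
    assert df.apply("mean", axis=1).equals(df.mean(axis=1))
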
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
index 47173d14c543d..9050fab702881 100644
--- a/pandas/tests/apply/test_frame_transform.py
+++ b/pandas/tests/apply/test_frame_transform.py
@@ -1,3 +1,5 @@
+import operator
+
import numpy as np
import pytest
@@ -36,6 +38,33 @@ def test_transform_ufunc(axis, float_frame, frame_or_series):
tm.assert_equal(result, expected)
+@pytest.mark.parametrize("op", frame_transform_kernels)
+def test_transform_groupby_kernel(axis, float_frame, op, request):
+ # GH 35964
+
+ args = [0.0] if op == "fillna" else []
+ if axis == 0 or axis == "index":
+ ones = np.ones(float_frame.shape[0])
+ else:
+ ones = np.ones(float_frame.shape[1])
+ expected = float_frame.groupby(ones, axis=axis).transform(op, *args)
+ result = float_frame.transform(op, axis, *args)
+ tm.assert_frame_equal(result, expected)
+
+ # same thing, but ensuring we have multiple blocks
+ assert "E" not in float_frame.columns
+ float_frame["E"] = float_frame["A"].copy()
+ assert len(float_frame._mgr.arrays) > 1
+
+ if axis == 0 or axis == "index":
+ ones = np.ones(float_frame.shape[0])
+ else:
+ ones = np.ones(float_frame.shape[1])
+ expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args)
+ result2 = float_frame.transform(op, axis, *args)
+ tm.assert_frame_equal(result2, expected2)
+
+
@pytest.mark.parametrize(
"ops, names",
[
@@ -126,6 +155,15 @@ def func(x):
tm.assert_equal(result, expected)
+@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"])
+def test_transform_method_name(method):
+ # GH 19760
+ df = DataFrame({"A": [-1, 2]})
+ result = df.transform(method)
+ expected = operator.methodcaller(method)(df)
+ tm.assert_frame_equal(result, expected)
+
+
wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"]
frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail]
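Note: ``test_transform_groupby_kernel`` relies on the fact that grouping every row into a single group makes a groupby transform degenerate to the plain frame transform. A small sketch of that identity, assuming a numeric frame:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
    ones = np.ones(len(df))  # a single group containing every row
    assert df.transform("cumsum").equals(df.groupby(ones).transform("cumsum"))
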
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
index 2af340f0c1bb9..34d00e653b52d 100644
--- a/pandas/tests/apply/test_series_apply.py
+++ b/pandas/tests/apply/test_series_apply.py
@@ -2,10 +2,13 @@
Counter,
defaultdict,
)
+from itertools import chain
import numpy as np
import pytest
+from pandas.core.dtypes.common import is_number
+
import pandas as pd
from pandas import (
DataFrame,
@@ -84,6 +87,14 @@ def f(x):
assert result.dtype == object
+def test_with_string_args(datetime_series):
+
+ for arg in ["sum", "mean", "min", "max", "std"]:
+ result = datetime_series.apply(arg)
+ expected = getattr(datetime_series, arg)()
+ assert result == expected
+
+
def test_apply_args():
s = Series(["foo,bar"])
@@ -407,6 +418,92 @@ def test_non_callable_aggregates(how):
tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+ "series, func, expected",
+ chain(
+ tm.get_cython_table_params(
+ Series(dtype=np.float64),
+ [
+ ("sum", 0),
+ ("max", np.nan),
+ ("min", np.nan),
+ ("all", True),
+ ("any", False),
+ ("mean", np.nan),
+ ("prod", 1),
+ ("std", np.nan),
+ ("var", np.nan),
+ ("median", np.nan),
+ ],
+ ),
+ tm.get_cython_table_params(
+ Series([np.nan, 1, 2, 3]),
+ [
+ ("sum", 6),
+ ("max", 3),
+ ("min", 1),
+ ("all", True),
+ ("any", True),
+ ("mean", 2),
+ ("prod", 6),
+ ("std", 1),
+ ("var", 1),
+ ("median", 2),
+ ],
+ ),
+ tm.get_cython_table_params(
+ Series("a b c".split()),
+ [
+ ("sum", "abc"),
+ ("max", "c"),
+ ("min", "a"),
+ ("all", True),
+ ("any", True),
+ ],
+ ),
+ ),
+)
+def test_agg_cython_table(series, func, expected):
+ # GH21224
+ # test reducing functions in
+ # pandas.core.base.SelectionMixin._cython_table
+ result = series.agg(func)
+ if is_number(expected):
+ assert np.isclose(result, expected, equal_nan=True)
+ else:
+ assert result == expected
+
+
+@pytest.mark.parametrize(
+ "series, func, expected",
+ chain(
+ tm.get_cython_table_params(
+ Series(dtype=np.float64),
+ [
+ ("cumprod", Series([], Index([]), dtype=np.float64)),
+ ("cumsum", Series([], Index([]), dtype=np.float64)),
+ ],
+ ),
+ tm.get_cython_table_params(
+ Series([np.nan, 1, 2, 3]),
+ [
+ ("cumprod", Series([np.nan, 1, 2, 6])),
+ ("cumsum", Series([np.nan, 1, 3, 6])),
+ ],
+ ),
+ tm.get_cython_table_params(
+ Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))]
+ ),
+ ),
+)
+def test_agg_cython_table_transform(series, func, expected):
+ # GH21224
+ # test transforming functions in
+ # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+ result = series.agg(func)
+ tm.assert_series_equal(result, expected)
+
+
def test_series_apply_no_suffix_index():
# GH36189
s = Series([4] * 3)
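Note: the two parametrised blocks above split the cython-table names along one axis: reducing functions return a scalar from ``Series.agg``, while transforming functions (``cumsum``, ``cumprod``) return a Series of the same length. For example:

    import pandas as pd

    ser = pd.Series([1.0, 2.0, 3.0])
    assert ser.agg("sum") == 6.0                           # reducer -> scalar
    assert ser.agg("cumsum").tolist() == [1.0, 3.0, 6.0]   # transformer -> Series
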
diff --git a/pandas/tests/apply/test_series_transform.py b/pandas/tests/apply/test_series_transform.py
index b10af13eae20c..90065d20e1a59 100644
--- a/pandas/tests/apply/test_series_transform.py
+++ b/pandas/tests/apply/test_series_transform.py
@@ -8,6 +8,24 @@
concat,
)
import pandas._testing as tm
+from pandas.core.groupby.base import transformation_kernels
+
+# tshift only works on time index and is deprecated
+# There is no Series.cumcount
+series_kernels = [
+ x for x in sorted(transformation_kernels) if x not in ["tshift", "cumcount"]
+]
+
+
+@pytest.mark.parametrize("op", series_kernels)
+def test_transform_groupby_kernel(string_series, op):
+ # GH 35964
+
+ args = [0.0] if op == "fillna" else []
+ ones = np.ones(string_series.shape[0])
+ expected = string_series.groupby(ones).transform(op, *args)
+ result = string_series.transform(op, 0, *args)
+ tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
deleted file mode 100644
index 67e8dd520dc3b..0000000000000
--- a/pandas/tests/apply/test_str.py
+++ /dev/null
@@ -1,289 +0,0 @@
-from itertools import chain
-import operator
-
-import numpy as np
-import pytest
-
-from pandas.core.dtypes.common import is_number
-
-from pandas import (
- DataFrame,
- Index,
- Series,
-)
-import pandas._testing as tm
-from pandas.tests.apply.common import (
- frame_transform_kernels,
- series_transform_kernels,
-)
-
-
-@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"])
-@pytest.mark.parametrize(
- "args,kwds",
- [
- pytest.param([], {}, id="no_args_or_kwds"),
- pytest.param([1], {}, id="axis_from_args"),
- pytest.param([], {"axis": 1}, id="axis_from_kwds"),
- pytest.param([], {"numeric_only": True}, id="optional_kwds"),
- pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"),
- ],
-)
-@pytest.mark.parametrize("how", ["agg", "apply"])
-def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how):
- if len(args) > 1 and how == "agg":
- request.node.add_marker(
- pytest.mark.xfail(
- raises=TypeError,
- reason="agg/apply signature mismatch - agg passes 2nd "
- "argument to func",
- )
- )
- result = getattr(float_frame, how)(func, *args, **kwds)
- expected = getattr(float_frame, func)(*args, **kwds)
- tm.assert_series_equal(result, expected)
-
-
-def test_with_string_args(datetime_series):
-
- for arg in ["sum", "mean", "min", "max", "std"]:
- result = datetime_series.apply(arg)
- expected = getattr(datetime_series, arg)()
- assert result == expected
-
-
-@pytest.mark.parametrize("op", ["mean", "median", "std", "var"])
-@pytest.mark.parametrize("how", ["agg", "apply"])
-def test_apply_np_reducer(float_frame, op, how):
- # GH 39116
- float_frame = DataFrame({"a": [1, 2], "b": [3, 4]})
- result = getattr(float_frame, how)(op)
- # pandas ddof defaults to 1, numpy to 0
- kwargs = {"ddof": 1} if op in ("std", "var") else {}
- expected = Series(
- getattr(np, op)(float_frame, axis=0, **kwargs), index=float_frame.columns
- )
- tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"]
-)
-@pytest.mark.parametrize("how", ["transform", "apply"])
-def test_apply_np_transformer(float_frame, op, how):
- # GH 39116
- result = getattr(float_frame, how)(op)
- expected = getattr(np, op)(float_frame)
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "series, func, expected",
- chain(
- tm.get_cython_table_params(
- Series(dtype=np.float64),
- [
- ("sum", 0),
- ("max", np.nan),
- ("min", np.nan),
- ("all", True),
- ("any", False),
- ("mean", np.nan),
- ("prod", 1),
- ("std", np.nan),
- ("var", np.nan),
- ("median", np.nan),
- ],
- ),
- tm.get_cython_table_params(
- Series([np.nan, 1, 2, 3]),
- [
- ("sum", 6),
- ("max", 3),
- ("min", 1),
- ("all", True),
- ("any", True),
- ("mean", 2),
- ("prod", 6),
- ("std", 1),
- ("var", 1),
- ("median", 2),
- ],
- ),
- tm.get_cython_table_params(
- Series("a b c".split()),
- [
- ("sum", "abc"),
- ("max", "c"),
- ("min", "a"),
- ("all", True),
- ("any", True),
- ],
- ),
- ),
-)
-def test_agg_cython_table_series(series, func, expected):
- # GH21224
- # test reducing functions in
- # pandas.core.base.SelectionMixin._cython_table
- result = series.agg(func)
- if is_number(expected):
- assert np.isclose(result, expected, equal_nan=True)
- else:
- assert result == expected
-
-
-@pytest.mark.parametrize(
- "series, func, expected",
- chain(
- tm.get_cython_table_params(
- Series(dtype=np.float64),
- [
- ("cumprod", Series([], Index([]), dtype=np.float64)),
- ("cumsum", Series([], Index([]), dtype=np.float64)),
- ],
- ),
- tm.get_cython_table_params(
- Series([np.nan, 1, 2, 3]),
- [
- ("cumprod", Series([np.nan, 1, 2, 6])),
- ("cumsum", Series([np.nan, 1, 3, 6])),
- ],
- ),
- tm.get_cython_table_params(
- Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))]
- ),
- ),
-)
-def test_agg_cython_table_transform_series(series, func, expected):
- # GH21224
- # test transforming functions in
- # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
- result = series.agg(func)
- tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "df, func, expected",
- chain(
- tm.get_cython_table_params(
- DataFrame(),
- [
- ("sum", Series(dtype="float64")),
- ("max", Series(dtype="float64")),
- ("min", Series(dtype="float64")),
- ("all", Series(dtype=bool)),
- ("any", Series(dtype=bool)),
- ("mean", Series(dtype="float64")),
- ("prod", Series(dtype="float64")),
- ("std", Series(dtype="float64")),
- ("var", Series(dtype="float64")),
- ("median", Series(dtype="float64")),
- ],
- ),
- tm.get_cython_table_params(
- DataFrame([[np.nan, 1], [1, 2]]),
- [
- ("sum", Series([1.0, 3])),
- ("max", Series([1.0, 2])),
- ("min", Series([1.0, 1])),
- ("all", Series([True, True])),
- ("any", Series([True, True])),
- ("mean", Series([1, 1.5])),
- ("prod", Series([1.0, 2])),
- ("std", Series([np.nan, 0.707107])),
- ("var", Series([np.nan, 0.5])),
- ("median", Series([1, 1.5])),
- ],
- ),
- ),
-)
-def test_agg_cython_table_frame(df, func, expected, axis):
- # GH 21224
- # test reducing functions in
- # pandas.core.base.SelectionMixin._cython_table
- result = df.agg(func, axis=axis)
- tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "df, func, expected",
- chain(
- tm.get_cython_table_params(
- DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
- ),
- tm.get_cython_table_params(
- DataFrame([[np.nan, 1], [1, 2]]),
- [
- ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
- ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
- ],
- ),
- ),
-)
-def test_agg_cython_table_transform_frame(df, func, expected, axis):
- # GH 21224
- # test transforming functions in
- # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
- if axis == "columns" or axis == 1:
- # operating blockwise doesn't let us preserve dtypes
- expected = expected.astype("float64")
-
- result = df.agg(func, axis=axis)
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("op", series_transform_kernels)
-def test_transform_groupby_kernel_series(string_series, op):
- # GH 35964
-
- args = [0.0] if op == "fillna" else []
- ones = np.ones(string_series.shape[0])
- expected = string_series.groupby(ones).transform(op, *args)
- result = string_series.transform(op, 0, *args)
- tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize("op", frame_transform_kernels)
-def test_transform_groupby_kernel_frame(
- axis, float_frame, op, using_array_manager, request
-):
- # GH 35964
- if using_array_manager and op == "pct_change" and axis in (1, "columns"):
- # TODO(ArrayManager) shift with axis=1
- request.node.add_marker(
- pytest.mark.xfail(
- reason="shift axis=1 not yet implemented for ArrayManager"
- )
- )
-
- args = [0.0] if op == "fillna" else []
- if axis == 0 or axis == "index":
- ones = np.ones(float_frame.shape[0])
- else:
- ones = np.ones(float_frame.shape[1])
- expected = float_frame.groupby(ones, axis=axis).transform(op, *args)
- result = float_frame.transform(op, axis, *args)
- tm.assert_frame_equal(result, expected)
-
- # same thing, but ensuring we have multiple blocks
- assert "E" not in float_frame.columns
- float_frame["E"] = float_frame["A"].copy()
- assert len(float_frame._mgr.arrays) > 1
-
- if axis == 0 or axis == "index":
- ones = np.ones(float_frame.shape[0])
- else:
- ones = np.ones(float_frame.shape[1])
- expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args)
- result2 = float_frame.transform(op, axis, *args)
- tm.assert_frame_equal(result2, expected2)
-
-
-@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"])
-def test_transform_method_name(method):
- # GH 19760
- df = DataFrame({"A": [-1, 2]})
- result = df.transform(method)
- expected = operator.methodcaller(method)(df)
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index 89f2241fc6993..c0287df1694e9 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -186,19 +186,15 @@ def test_searchsorted(self, ordered):
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for a single value that is not from the Categorical
- with pytest.raises(TypeError, match="cucumber"):
+ with pytest.raises(KeyError, match="cucumber"):
cat.searchsorted("cucumber")
- with pytest.raises(TypeError, match="cucumber"):
+ with pytest.raises(KeyError, match="cucumber"):
ser.searchsorted("cucumber")
# Searching for multiple values one of each is not from the Categorical
- msg = (
- "Cannot setitem on a Categorical with a new category, "
- "set the categories first"
- )
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(KeyError, match="cucumber"):
cat.searchsorted(["bread", "cucumber"])
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(KeyError, match="cucumber"):
ser.searchsorted(["bread", "cucumber"])
def test_unique(self, ordered):
diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py
index 807a046cfbf13..5b31776301f7b 100644
--- a/pandas/tests/arrays/categorical/test_indexing.py
+++ b/pandas/tests/arrays/categorical/test_indexing.py
@@ -73,7 +73,7 @@ def test_setitem_different_unordered_raises(self, other):
target = Categorical(["a", "b"], categories=["a", "b"])
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
target[mask] = other[mask]
@pytest.mark.parametrize(
@@ -89,7 +89,7 @@ def test_setitem_same_ordered_raises(self, other):
target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
target[mask] = other[mask]
def test_setitem_tuple(self):
@@ -260,7 +260,7 @@ def test_where_other_categorical(self):
def test_where_new_category_raises(self):
ser = Series(Categorical(["a", "b", "c"]))
msg = "Cannot setitem on a Categorical with a new category"
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ser.where([True, False, True], "d")
def test_where_ordered_differs_rasies(self):
@@ -270,7 +270,7 @@ def test_where_ordered_differs_rasies(self):
other = Categorical(
["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
)
- with pytest.raises(TypeError, match="without identical categories"):
+ with pytest.raises(ValueError, match="without identical categories"):
ser.where([True, False, True], other)
diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py
index f419aa6f181f2..930d890ee91d4 100644
--- a/pandas/tests/arrays/categorical/test_missing.py
+++ b/pandas/tests/arrays/categorical/test_missing.py
@@ -84,12 +84,7 @@ def test_fillna_raises(self, fillna_kwargs, msg):
# https://github.com/pandas-dev/pandas/issues/13628
cat = Categorical([1, 2, 3, None, None])
- if len(fillna_kwargs) == 1 and "value" in fillna_kwargs:
- err = TypeError
- else:
- err = ValueError
-
- with pytest.raises(err, match=msg):
+ with pytest.raises(ValueError, match=msg):
cat.fillna(**fillna_kwargs)
@pytest.mark.parametrize("named", [True, False])
@@ -109,7 +104,7 @@ def test_fillna_iterable_category(self, named):
# not NotImplementedError GH#41914
cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
msg = "Cannot setitem on a Categorical with a new category"
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
cat.fillna(Point(0, 0))
def test_fillna_array(self):
diff --git a/pandas/tests/arrays/categorical/test_take.py b/pandas/tests/arrays/categorical/test_take.py
index fbdbea1dae3b2..6cb54908724c9 100644
--- a/pandas/tests/arrays/categorical/test_take.py
+++ b/pandas/tests/arrays/categorical/test_take.py
@@ -81,7 +81,7 @@ def test_take_fill_value(self):
def test_take_fill_value_new_raises(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
- xpr = r"Cannot setitem on a Categorical with a new category \(d\)"
+ xpr = r"'fill_value=d' is not present in this Categorical's categories"
with pytest.raises(TypeError, match=xpr):
cat.take([0, 1, -1], fill_value="d", allow_fill=True)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index c6240600d3a05..5731f02430a9d 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -2,6 +2,9 @@
This module tests the functionality of StringArray and ArrowStringArray.
Tests for the str accessors are in pandas/tests/strings/test_string_array.py
"""
+
+import re
+
import numpy as np
import pytest
@@ -311,7 +314,7 @@ def test_astype_int(dtype):
tm.assert_numpy_array_equal(result, expected)
arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
- msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number"
+ msg = re.escape("int() argument must be a string, a bytes-like object or a number")
with pytest.raises(TypeError, match=msg):
arr.astype("int64")
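Note: the switch to ``re.escape`` above is about how ``pytest.raises`` interprets ``match``: it is a regular expression searched against the rendered exception, so literal parentheses need escaping. A tiny illustration with a made-up message:

    import re
    import pytest

    msg = re.escape("bad value (expected int)")
    with pytest.raises(ValueError, match=msg):
        raise ValueError("bad value (expected int)")
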
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index 958ccec930f0e..61d56df485ab1 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -309,14 +309,6 @@ def test_scalar_raises():
pd.array(1)
-def test_bounds_check():
- # GH21796
- with pytest.raises(
- TypeError, match=r"cannot safely cast non-equivalent int(32|64) to uint16"
- ):
- pd.array([-1, 2, 3], dtype="UInt16")
-
-
# ---------------------------------------------------------------------------
# A couple dummy classes to ensure that Series and Indexes are unboxed before
# getting to the EA classes.
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 1e150f1b431c7..3f3f3a5ee8d18 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -10,6 +10,7 @@
OutOfBoundsDatetime,
Timestamp,
)
+from pandas.compat import np_version_under1p18
import pandas.util._test_decorators as td
import pandas as pd
@@ -287,7 +288,12 @@ def test_searchsorted(self):
# GH#29884 match numpy convention on whether NaT goes
# at the end or the beginning
result = arr.searchsorted(NaT)
- assert result == 10
+ if np_version_under1p18:
+ # Following numpy convention, NaT goes at the beginning
+ # (unlike NaN which goes at the end)
+ assert result == 0
+ else:
+ assert result == 10
@pytest.mark.parametrize("box", [None, "index", "series"])
def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage):
@@ -1238,11 +1244,17 @@ def test_invalid_nat_setitem_array(arr, non_casting_nats):
],
)
def test_to_numpy_extra(arr):
+ if np_version_under1p18:
+ # np.isnan(NaT) raises, so use pandas'
+ isnan = pd.isna
+ else:
+ isnan = np.isnan
+
arr[0] = NaT
original = arr.copy()
result = arr.to_numpy()
- assert np.isnan(result[0])
+ assert isnan(result[0])
result = arr.to_numpy(dtype="int64")
assert result[0] == -9223372036854775808
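Note: the ``np_version_under1p18`` fallback in ``test_to_numpy_extra`` exists because older NumPy releases raise on ``np.isnan(NaT)``. A sketch of the fallback, assuming the pre-1.18 behaviour of raising ``TypeError``:

    import numpy as np
    import pandas as pd

    value = np.datetime64("NaT")
    try:
        missing = np.isnan(value)   # works on NumPy >= 1.18
    except TypeError:
        missing = pd.isna(value)    # version-independent fallback
    assert missing
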
diff --git a/pandas/tests/base/test_transpose.py b/pandas/tests/base/test_transpose.py
index 246f33d27476c..5ba278368834c 100644
--- a/pandas/tests/base/test_transpose.py
+++ b/pandas/tests/base/test_transpose.py
@@ -1,10 +1,6 @@
import numpy as np
import pytest
-from pandas import (
- CategoricalDtype,
- DataFrame,
-)
import pandas._testing as tm
@@ -29,28 +25,3 @@ def test_numpy_transpose(index_or_series_obj):
with pytest.raises(ValueError, match=msg):
np.transpose(obj, axes=1)
-
-
-@pytest.mark.parametrize(
- "data, transposed_data, index, columns, dtype",
- [
- ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
- ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
- ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
- ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
- ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
- (
- [[1, 2], [3, 4]],
- [[1, 3], [2, 4]],
- ["a", "a"],
- ["b", "b"],
- CategoricalDtype([1, 2, 3, 4]),
- ),
- ],
-)
-def test_duplicate_labels(data, transposed_data, index, columns, dtype):
- # GH 42380
- df = DataFrame(data, index=index, columns=columns, dtype=dtype)
- result = df.T
- expected = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype)
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index b6c6baf6cc7e4..7cf319e1d134c 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -21,6 +21,7 @@
from pandas import (
DataFrame,
Series,
+ compat,
date_range,
)
import pandas._testing as tm
@@ -1282,8 +1283,10 @@ def test_assignment_column(self):
msg = "left hand side of an assignment must be a single name"
with pytest.raises(SyntaxError, match=msg):
df.eval("d,c = a + b")
-
- msg = "cannot assign to function call"
+ if compat.PY38:
+ msg = "cannot assign to function call"
+ else:
+ msg = "can't assign to function call"
with pytest.raises(SyntaxError, match=msg):
df.eval('Timestamp("20131001") = a + b')
@@ -1968,7 +1971,9 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser):
"other",
[
"'x'",
- "...",
+ pytest.param(
+ "...", marks=pytest.mark.xfail(not compat.PY38, reason="GH-28116")
+ ),
],
)
def test_equals_various(other):
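Note: the ``compat.PY38`` branch above reflects a wording change in CPython itself: the SyntaxError for assigning to a call reads "can't assign to function call" before 3.8 and "cannot assign to function call" from 3.8 on. A version-agnostic check:

    import sys

    try:
        compile("f() = 1", "<example>", "exec")
    except SyntaxError as err:
        old = "can't assign to function call"
        new = "cannot assign to function call"
        assert (new if sys.version_info >= (3, 8) else old) in str(err)
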
diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py
deleted file mode 100644
index 13dc82d779f95..0000000000000
--- a/pandas/tests/dtypes/cast/test_dict_compat.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import numpy as np
-
-from pandas.core.dtypes.cast import dict_compat
-
-from pandas import Timestamp
-
-
-def test_dict_compat():
- data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2}
- data_unchanged = {1: 2, 3: 4, 5: 6}
- expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2}
- assert dict_compat(data_datetime64) == expected
- assert dict_compat(expected) == expected
- assert dict_compat(data_unchanged) == data_unchanged
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index 8f241679d5108..3e6b1cbfb311c 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -4,7 +4,6 @@
import pytest
from pandas.core.dtypes.common import (
- is_datetime64tz_dtype,
is_interval_dtype,
is_period_dtype,
)
@@ -329,9 +328,6 @@ def test_unstack(self, data, index, obj):
)
if obj == "series":
# TODO: special cases belong in dtype-specific tests
- if is_datetime64tz_dtype(data.dtype):
- assert expected.dtypes.apply(is_datetime64tz_dtype).all()
- expected = expected.astype(object)
if is_period_dtype(data.dtype):
assert expected.dtypes.apply(is_period_dtype).all()
expected = expected.astype(object)
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py
index 54e31e05e8b0e..bb8347f0a0122 100644
--- a/pandas/tests/extension/test_datetime.py
+++ b/pandas/tests/extension/test_datetime.py
@@ -193,6 +193,40 @@ def test_concat_mixed_dtypes(self, data):
# drops the tz.
super().test_concat_mixed_dtypes(data)
+ @pytest.mark.parametrize("obj", ["series", "frame"])
+ def test_unstack(self, obj):
+ # GH-13287: can't use base test, since building the expected fails.
+ dtype = DatetimeTZDtype(tz="US/Central")
+ data = DatetimeArray._from_sequence(
+ ["2000", "2001", "2002", "2003"],
+ dtype=dtype,
+ )
+ index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"])
+
+ if obj == "series":
+ ser = pd.Series(data, index=index)
+ expected = pd.DataFrame(
+ {"A": data.take([0, 1]), "B": data.take([2, 3])},
+ index=pd.Index(["a", "b"], name="b"),
+ )
+ expected.columns.name = "a"
+
+ else:
+ ser = pd.DataFrame({"A": data, "B": data}, index=index)
+ expected = pd.DataFrame(
+ {
+ ("A", "A"): data.take([0, 1]),
+ ("A", "B"): data.take([2, 3]),
+ ("B", "A"): data.take([0, 1]),
+ ("B", "B"): data.take([2, 3]),
+ },
+ index=pd.Index(["a", "b"], name="b"),
+ )
+ expected.columns.names = [None, "a"]
+
+ result = ser.unstack(0)
+ self.assert_equal(result, expected)
+
class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests):
pass
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index 9c21f717573c1..f0d3fb7ff9e1b 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -13,7 +13,6 @@
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
-
import numpy as np
import pytest
diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py
index 71e8f84b4ad01..073e7b0357124 100644
--- a/pandas/tests/frame/indexing/test_getitem.py
+++ b/pandas/tests/frame/indexing/test_getitem.py
@@ -299,7 +299,7 @@ def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_col
# boolean with the duplicate raises
df = df_dup_cols
- msg = "cannot reindex on an axis with duplicate labels"
+ msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df[df.A > 6]
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 077301613eb8b..e2121fa2318eb 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -1283,7 +1283,7 @@ def test_object_casting_indexing_wraps_datetimelike(using_array_manager):
assert isinstance(val, pd.Timedelta)
-msg1 = r"Cannot setitem on a Categorical with a new category( \(.*\))?, set the"
+msg1 = "Cannot setitem on a Categorical with a new category, set the categories first"
msg2 = "Cannot set a Categorical with another, without identical categories"
@@ -1348,7 +1348,7 @@ def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer):
tm.assert_frame_equal(df, exp_multi_row)
df = orig.copy()
- with pytest.raises(TypeError, match=msg1):
+ with pytest.raises(ValueError, match=msg1):
indexer(df)[key, :] = [["c", 2], ["c", 2]]
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat])
@@ -1367,7 +1367,7 @@ def test_loc_iloc_at_iat_setitem_single_value_in_categories(
tm.assert_frame_equal(df, exp_single_cats_value)
# "c" is not among the categories for df["cat"]
- with pytest.raises(TypeError, match=msg1):
+ with pytest.raises(ValueError, match=msg1):
indexer(df)[key] = "c"
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
@@ -1401,7 +1401,7 @@ def test_loc_iloc_setitem_full_row_non_categorical_rhs(
tm.assert_frame_equal(df, exp_single_row)
# "c" is not among the categories for df["cat"]
- with pytest.raises(TypeError, match=msg1):
+ with pytest.raises(ValueError, match=msg1):
indexer(df)[key, :] = ["c", 2]
@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
@@ -1423,14 +1423,14 @@ def test_loc_iloc_setitem_partial_col_categorical_rhs(
# categories do not match df["cat"]'s, but "b" is among them
semi_compat = Categorical(list("bb"), categories=list("abc"))
- with pytest.raises(TypeError, match=msg2):
+ with pytest.raises(ValueError, match=msg2):
# different categories but holdable values
# -> not sure if this should fail or pass
indexer(df)[key] = semi_compat
# categories do not match df["cat"]'s, and "c" is not among them
incompat = Categorical(list("cc"), categories=list("abc"))
- with pytest.raises(TypeError, match=msg2):
+ with pytest.raises(ValueError, match=msg2):
# different values
indexer(df)[key] = incompat
@@ -1450,5 +1450,5 @@ def test_loc_iloc_setitem_non_categorical_rhs(
tm.assert_frame_equal(df, exp_parts_cats_col)
# "c" not part of the categories
- with pytest.raises(TypeError, match=msg1):
+ with pytest.raises(ValueError, match=msg1):
indexer(df)[key] = ["c", "c"]
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index 25682330fe19a..62d7535159f13 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -68,7 +68,7 @@ def test_setitem_error_msmgs(self):
index=Index(["a", "b", "c", "a"], name="foo"),
name="fiz",
)
- msg = "cannot reindex on an axis with duplicate labels"
+ msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df["newcol"] = ser
diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py
index d2704876c31c5..ccd989e2de411 100644
--- a/pandas/tests/frame/indexing/test_xs.py
+++ b/pandas/tests/frame/indexing/test_xs.py
@@ -129,23 +129,6 @@ def test_xs_view(self, using_array_manager):
class TestXSWithMultiIndex:
- def test_xs_doc_example(self):
- # TODO: more descriptive name
- # based on example in advanced.rst
- arrays = [
- ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
- ["one", "two", "one", "two", "one", "two", "one", "two"],
- ]
- tuples = list(zip(*arrays))
-
- index = MultiIndex.from_tuples(tuples, names=["first", "second"])
- df = DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index)
-
- result = df.xs(("one", "bar"), level=("second", "first"), axis=1)
-
- expected = df.iloc[:, [0]]
- tm.assert_frame_equal(result, expected)
-
def test_xs_integer_key(self):
# see GH#2107
dates = range(20111201, 20111205)
@@ -318,13 +301,12 @@ def test_xs_IndexSlice_argument_not_implemented(self, klass):
if klass is Series:
obj = obj[0]
- expected = obj.iloc[-2:].droplevel(0)
-
- result = obj.xs(IndexSlice[("foo", "qux", 0), :])
- tm.assert_equal(result, expected)
-
- result = obj.loc[IndexSlice[("foo", "qux", 0), :]]
- tm.assert_equal(result, expected)
+ msg = (
+ "Expected label or tuple of labels, got "
+ r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)"
+ )
+ with pytest.raises(TypeError, match=msg):
+ obj.xs(IndexSlice[("foo", "qux", 0), :])
@pytest.mark.parametrize("klass", [DataFrame, Series])
def test_xs_levels_raises(self, klass):
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 1f1991214aad0..881f8db305240 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -632,9 +632,13 @@ def test_astype_tz_object_conversion(self, tz):
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
tm.assert_frame_equal(result, expected)
- def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
- # GH#41409
+ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request):
tz = tz_naive_fixture
+ if tz is None:
+ mark = pytest.mark.xfail(
+ reason="GH#36153 uses ndarray formatting instead of DTA formatting"
+ )
+ request.node.add_marker(mark)
dti = date_range("2016-01-01", periods=3, tz=tz)
dta = dti._data
@@ -656,40 +660,11 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
alt = obj.astype(str)
assert np.all(alt.iloc[1:] == result.iloc[1:])
- def test_astype_td64_to_string(self, frame_or_series):
- # GH#41409
- tdi = pd.timedelta_range("1 Day", periods=3)
- obj = frame_or_series(tdi)
-
- expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
- result = obj.astype("string")
- tm.assert_equal(result, expected)
-
def test_astype_bytes(self):
# GH#39474
result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
assert result.dtypes[0] == np.dtype("S3")
- @pytest.mark.parametrize(
- "index_slice",
- [
- np.s_[:2, :2],
- np.s_[:1, :2],
- np.s_[:2, :1],
- np.s_[::2, ::2],
- np.s_[::1, ::2],
- np.s_[::2, ::1],
- ],
- )
- def test_astype_noncontiguous(self, index_slice):
- # GH#42396
- data = np.arange(16).reshape(4, 4)
- df = DataFrame(data)
-
- result = df.iloc[index_slice].astype("int16")
- expected = df.iloc[index_slice]
- tm.assert_frame_equal(result, expected, check_dtype=False)
-
class TestAstypeCategorical:
def test_astype_from_categorical3(self):
@@ -718,11 +693,3 @@ def test_categorical_astype_to_int(self, any_int_or_nullable_int_dtype):
{"col1": pd.array([2, 1, 3], dtype=any_int_or_nullable_int_dtype)}
)
tm.assert_frame_equal(df, expected)
-
- def test_astype_categorical_to_string_missing(self):
- # https://github.com/pandas-dev/pandas/issues/41797
- df = DataFrame(["a", "b", np.nan])
- expected = df.astype(str)
- cat = df.astype("category")
- result = cat.astype(str)
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
index 3a1228ee5c4a5..fa91eb928e35c 100644
--- a/pandas/tests/frame/methods/test_describe.py
+++ b/pandas/tests/frame/methods/test_describe.py
@@ -346,7 +346,7 @@ def test_describe_percentiles_integer_idx(self):
result = df.describe(percentiles=pct)
expected = DataFrame(
- {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]},
+ {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]},
index=[
"count",
"mean",
diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py
index 6fdf5d806ac6b..bd0901387eeed 100644
--- a/pandas/tests/frame/methods/test_explode.py
+++ b/pandas/tests/frame/methods/test_explode.py
@@ -9,12 +9,7 @@ def test_error():
df = pd.DataFrame(
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
)
- with pytest.raises(
- ValueError, match="column must be a scalar, tuple, or list thereof"
- ):
- df.explode([list("AA")])
-
- with pytest.raises(ValueError, match="column must be unique"):
+ with pytest.raises(ValueError, match="column must be a scalar"):
df.explode(list("AA"))
df.columns = list("AA")
@@ -22,37 +17,6 @@ def test_error():
df.explode("A")
-@pytest.mark.parametrize(
- "input_subset, error_message",
- [
- (
- list("AC"),
- "columns must have matching element counts",
- ),
- (
- [],
- "column must be nonempty",
- ),
- (
- list("AC"),
- "columns must have matching element counts",
- ),
- ],
-)
-def test_error_multi_columns(input_subset, error_message):
- # GH 39240
- df = pd.DataFrame(
- {
- "A": [[0, 1, 2], np.nan, [], (3, 4)],
- "B": 1,
- "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
- },
- index=list("abcd"),
- )
- with pytest.raises(ValueError, match=error_message):
- df.explode(input_subset)
-
-
def test_basic():
df = pd.DataFrame(
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
@@ -216,58 +180,3 @@ def test_explode_sets():
result = df.explode(column="a").sort_values(by="a")
expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "input_subset, expected_dict, expected_index",
- [
- (
- list("AC"),
- {
- "A": pd.Series(
- [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
- index=list("aaabcdde"),
- dtype=object,
- ),
- "B": 1,
- "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
- },
- list("aaabcdde"),
- ),
- (
- list("A"),
- {
- "A": pd.Series(
- [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
- index=list("aaabcdde"),
- dtype=object,
- ),
- "B": 1,
- "C": [
- ["a", "b", "c"],
- ["a", "b", "c"],
- ["a", "b", "c"],
- "foo",
- [],
- ["d", "e"],
- ["d", "e"],
- np.nan,
- ],
- },
- list("aaabcdde"),
- ),
- ],
-)
-def test_multi_columns(input_subset, expected_dict, expected_index):
- # GH 39240
- df = pd.DataFrame(
- {
- "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
- "B": 1,
- "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
- },
- index=list("abcde"),
- )
- result = df.explode(input_subset)
- expected = pd.DataFrame(expected_dict, expected_index)
- tm.assert_frame_equal(result, expected)
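Note: with the multi-column cases gone, the remaining explode tests exercise the single-column behaviour: every list element becomes its own row, empty lists and NaN collapse to a single NaN row, and index labels repeat. A short illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )
    out = df.explode("A")
    # index: a, a, a, b, c, d, d -- with NaN in "A" for rows b and c
    assert list(out.index) == list("aaabcdd")
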
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
index b1ce511fc3e4c..065d074eef6e8 100644
--- a/pandas/tests/frame/methods/test_fillna.py
+++ b/pandas/tests/frame/methods/test_fillna.py
@@ -173,7 +173,7 @@ def test_na_actions_categorical(self):
tm.assert_frame_equal(res, df_exp_fill)
msg = "Cannot setitem on a Categorical with a new category"
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
df.fillna(value={"cats": 4, "vals": "c"})
res = df.fillna(method="pad")
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index 6c5831ad897d1..5ba4ab4408f11 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -246,11 +246,13 @@ def test_rank_methods_frame(self):
expected = DataFrame(sprank, columns=cols).astype("float64")
tm.assert_frame_equal(result, expected)
+ @td.skip_array_manager_not_yet_implemented
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_rank_descending(self, method, dtype):
+
if "i" in dtype:
- df = self.df.dropna().astype(dtype)
+ df = self.df.dropna()
else:
df = self.df.astype(dtype)
@@ -258,6 +260,9 @@ def test_rank_descending(self, method, dtype):
expected = (df.max() - df).rank()
tm.assert_frame_equal(res, expected)
+ if method == "first" and dtype == "O":
+ return
+
expected = (df.max() - df).rank(method=method)
if dtype != "O":
@@ -282,6 +287,9 @@ def _check2d(df, expected, method="average", axis=0):
result = df.rank(method=method, axis=axis)
tm.assert_frame_equal(result, exp_df)
+ disabled = {(object, "first")}
+ if (dtype, method) in disabled:
+ return
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, self.results[method], method=method, axis=axis)
@@ -448,38 +456,6 @@ def test_rank_both_inf(self):
result = df.rank()
tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "na_option,ascending,expected",
- [
- ("top", True, [3.0, 1.0, 2.0]),
- ("top", False, [2.0, 1.0, 3.0]),
- ("bottom", True, [2.0, 3.0, 1.0]),
- ("bottom", False, [1.0, 3.0, 2.0]),
- ],
- )
- def test_rank_inf_nans_na_option(
- self, frame_or_series, method, na_option, ascending, expected
- ):
- obj = frame_or_series([np.inf, np.nan, -np.inf])
- result = obj.rank(method=method, na_option=na_option, ascending=ascending)
- expected = frame_or_series(expected)
- tm.assert_equal(result, expected)
-
- @pytest.mark.parametrize(
- "na_option,ascending,expected",
- [
- ("bottom", True, [1.0, 2.0, 4.0, 3.0]),
- ("bottom", False, [1.0, 2.0, 4.0, 3.0]),
- ("top", True, [2.0, 3.0, 1.0, 4.0]),
- ("top", False, [2.0, 3.0, 1.0, 4.0]),
- ],
- )
- def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
- obj = frame_or_series(["foo", "foo", None, "foo"])
- result = obj.rank(method="first", na_option=na_option, ascending=ascending)
- expected = frame_or_series(expected)
- tm.assert_equal(result, expected)
-
@pytest.mark.parametrize(
"data,expected",
[
diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index d0765084adfa9..84992982a104a 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -658,7 +658,7 @@ def test_reindex_dups(self):
tm.assert_frame_equal(result, expected)
# reindex fails
- msg = "cannot reindex on an axis with duplicate labels"
+ msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df.reindex(index=list(range(len(df))))
@@ -668,7 +668,7 @@ def test_reindex_with_duplicate_columns(self):
df = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
)
- msg = "cannot reindex on an axis with duplicate labels"
+ msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df.reindex(columns=["bar"])
with pytest.raises(ValueError, match=msg):
@@ -942,7 +942,7 @@ def test_reindex_with_categoricalindex(self):
index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
)
# passed duplicate indexers are not allowed
- msg = "cannot reindex on an axis with duplicate labels"
+ msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df2.reindex(["a", "b"])
diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py
index 366722531329a..55ef665c55241 100644
--- a/pandas/tests/frame/methods/test_sample.py
+++ b/pandas/tests/frame/methods/test_sample.py
@@ -1,9 +1,10 @@
import numpy as np
import pytest
+from pandas.compat import np_version_under1p18
+
from pandas import (
DataFrame,
- Index,
Series,
)
import pandas._testing as tm
@@ -69,8 +70,8 @@ def test_sample_lengths(self, obj):
def test_sample_invalid_random_state(self, obj):
# Check for error when random_state argument invalid.
msg = (
- "random_state must be an integer, array-like, a BitGenerator, Generator, "
- "a numpy RandomState, or None"
+ "random_state must be an integer, array-like, a BitGenerator, a numpy "
+ "RandomState, or None"
)
with pytest.raises(ValueError, match=msg):
obj.sample(random_state="a_string")
@@ -82,15 +83,10 @@ def test_sample_wont_accept_n_and_frac(self, obj):
obj.sample(n=3, frac=0.3)
def test_sample_requires_positive_n_frac(self, obj):
- with pytest.raises(
- ValueError,
- match="A negative number of rows requested. Please provide `n` >= 0",
- ):
+ msg = "A negative number of rows requested. Please provide positive value."
+ with pytest.raises(ValueError, match=msg):
obj.sample(n=-3)
- with pytest.raises(
- ValueError,
- match="A negative number of rows requested. Please provide `frac` >= 0",
- ):
+ with pytest.raises(ValueError, match=msg):
obj.sample(frac=-0.3)
def test_sample_requires_integer_n(self, obj):
@@ -159,8 +155,16 @@ def test_sample_none_weights(self, obj):
"func_str,arg",
[
("np.array", [2, 3, 1, 0]),
- ("np.random.MT19937", 3),
- ("np.random.PCG64", 11),
+ pytest.param(
+ "np.random.MT19937",
+ 3,
+ marks=pytest.mark.skipif(np_version_under1p18, reason="NumPy<1.18"),
+ ),
+ pytest.param(
+ "np.random.PCG64",
+ 11,
+ marks=pytest.mark.skipif(np_version_under1p18, reason="NumPy<1.18"),
+ ),
],
)
def test_sample_random_state(self, func_str, arg, frame_or_series):
@@ -172,22 +176,6 @@ def test_sample_random_state(self, func_str, arg, frame_or_series):
expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
tm.assert_equal(result, expected)
- def test_sample_generator(self, frame_or_series):
- # GH#38100
- obj = frame_or_series(np.arange(100))
- rng = np.random.default_rng()
-
- # Consecutive calls should advance the seed
- result1 = obj.sample(n=50, random_state=rng)
- result2 = obj.sample(n=50, random_state=rng)
- assert not (result1.index.values == result2.index.values).all()
-
- # Matching generator initialization must give same result
- # Consecutive calls should advance the seed
- result1 = obj.sample(n=50, random_state=np.random.default_rng(11))
- result2 = obj.sample(n=50, random_state=np.random.default_rng(11))
- tm.assert_equal(result1, result2)
-
def test_sample_upsampling_without_replacement(self, frame_or_series):
# GH#27451
@@ -338,12 +326,3 @@ def test_sample_is_copy(self):
with tm.assert_produces_warning(None):
df2["d"] = 1
-
- def test_sample_ignore_index(self):
- # GH 38581
- df = DataFrame(
- {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
- )
- result = df.sample(3, ignore_index=True)
- expected_index = Index([0, 1, 2])
- tm.assert_index_equal(result.index, expected_index)
diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py
index 2c96cf291c154..ba8fe25401e8c 100644
--- a/pandas/tests/frame/methods/test_to_records.py
+++ b/pandas/tests/frame/methods/test_to_records.py
@@ -3,6 +3,8 @@
import numpy as np
import pytest
+from pandas.compat import is_numpy_dev
+
from pandas import (
CategoricalDtype,
DataFrame,
@@ -171,20 +173,28 @@ def test_to_records_with_categorical(self):
),
),
# Pass in a type instance.
- (
+ pytest.param(
{"column_dtypes": str},
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
-                dtype=[("index", "
        # non boolean accessors -> return Index
- for accessor in DatetimeArray._field_ops:
+ for accessor in DatetimeIndex._field_ops:
if accessor in ["week", "weekofyear"]:
# GH#33595 Deprecate week and weekofyear
continue
@@ -234,7 +233,7 @@ def test_datetimeindex_accessors(self):
assert res.name == "name"
# boolean accessors -> return array
- for accessor in DatetimeArray._bool_ops:
+ for accessor in DatetimeIndex._bool_ops:
res = getattr(dti, accessor)
assert len(res) == 365
assert isinstance(res, np.ndarray)
diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py
index c5b47053471eb..882515799f943 100644
--- a/pandas/tests/indexes/datetimes/test_partial_slicing.py
+++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py
@@ -16,58 +16,10 @@
date_range,
)
import pandas._testing as tm
+from pandas.core.indexing import IndexingError
class TestSlicing:
- def test_return_type_doesnt_depend_on_monotonicity(self):
- # GH#24892 we get Series back regardless of whether our DTI is monotonic
- dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3)
- ser = Series(range(3), index=dti)
-
- # non-monotonic index
- ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]])
-
- # key with resolution strictly lower than "min"
- key = "2015-5-14 00"
-
- # monotonic increasing index
- result = ser.loc[key]
- expected = ser.iloc[1:]
- tm.assert_series_equal(result, expected)
-
- # monotonic decreasing index
- result = ser.iloc[::-1].loc[key]
- expected = ser.iloc[::-1][:-1]
- tm.assert_series_equal(result, expected)
-
- # non-monotonic index
- result2 = ser2.loc[key]
- expected2 = ser2.iloc[::2]
- tm.assert_series_equal(result2, expected2)
-
- def test_return_type_doesnt_depend_on_monotonicity_higher_reso(self):
- # GH#24892 we get Series back regardless of whether our DTI is monotonic
- dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3)
- ser = Series(range(3), index=dti)
-
- # non-monotonic index
- ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]])
-
- # key with resolution strictly *higher) than "min"
- key = "2015-5-14 00:00:00"
-
- # monotonic increasing index
- result = ser.loc[key]
- assert result == 1
-
- # monotonic decreasing index
- result = ser.iloc[::-1].loc[key]
- assert result == 1
-
- # non-monotonic index
- result2 = ser2.loc[key]
- assert result2 == 0
-
def test_monotone_DTI_indexing_bug(self):
# GH 19362
# Testing accessing the first element in a monotonic descending
@@ -86,19 +38,9 @@ def test_monotone_DTI_indexing_bug(self):
expected = DataFrame({0: list(range(5)), "date": date_index})
tm.assert_frame_equal(df, expected)
- # We get a slice because df.index's resolution is hourly and we
- # are slicing with a daily-resolution string. If both were daily,
- # we would get a single item back
- dti = date_range("20170101 01:00:00", periods=3)
- df = DataFrame({"A": [1, 2, 3]}, index=dti[::-1])
-
- expected = DataFrame({"A": 1}, index=dti[-1:][::-1])
- result = df.loc["2017-01-03"]
- tm.assert_frame_equal(result, expected)
-
- result2 = df.iloc[::-1].loc["2017-01-03"]
- expected2 = expected.iloc[::-1]
- tm.assert_frame_equal(result2, expected2)
+ df = DataFrame({"A": [1, 2, 3]}, index=date_range("20170101", periods=3)[::-1])
+ expected = DataFrame({"A": 1}, index=date_range("20170103", periods=1)[::-1])
+ tm.assert_frame_equal(df.loc["2017-01-03"], expected)
def test_slice_year(self):
dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500)
@@ -336,28 +278,28 @@ def test_partial_slicing_with_multiindex(self):
result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")]
tm.assert_series_equal(result, expected)
- # partial string indexing on first level, scalar indexing on the other two
- result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")]
- expected = df_multi.iloc[:1].droplevel([1, 2])
- tm.assert_frame_equal(result, expected)
+ # this is an IndexingError as we don't do partial string selection on
+ # multi-levels.
+ msg = "Too many indexers"
+ with pytest.raises(IndexingError, match=msg):
+ df_multi.loc[("2013-06-19", "ACCT1", "ABC")]
- def test_partial_slicing_with_multiindex_series(self):
# GH 4294
# partial slice on a series mi
- ser = DataFrame(
+ s = DataFrame(
np.random.rand(1000, 1000), index=date_range("2000-1-1", periods=1000)
).stack()
- s2 = ser[:-1].copy()
+ s2 = s[:-1].copy()
expected = s2["2000-1-4"]
result = s2[Timestamp("2000-1-4")]
tm.assert_series_equal(result, expected)
- result = ser[Timestamp("2000-1-4")]
- expected = ser["2000-1-4"]
+ result = s[Timestamp("2000-1-4")]
+ expected = s["2000-1-4"]
tm.assert_series_equal(result, expected)
- df2 = DataFrame(ser)
+ df2 = DataFrame(s)
expected = df2.xs("2000-1-4")
result = df2.loc[Timestamp("2000-1-4")]
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py
index 62663c8c6b810..513a47d6be7ab 100644
--- a/pandas/tests/indexes/datetimes/test_setops.py
+++ b/pandas/tests/indexes/datetimes/test_setops.py
@@ -391,23 +391,6 @@ def test_setops_preserve_freq(self, tz):
assert result.freq == rng.freq
assert result.tz == rng.tz
- def test_intersection_non_tick_no_fastpath(self):
- # GH#42104
- dti = DatetimeIndex(
- [
- "2018-12-31",
- "2019-03-31",
- "2019-06-30",
- "2019-09-30",
- "2019-12-31",
- "2020-03-31",
- ],
- freq="Q-DEC",
- )
- result = dti[::2].intersection(dti[1::2])
- expected = dti[:0]
- tm.assert_index_equal(result, expected)
-
class TestBusinessDatetimeIndex:
def setup_method(self, method):
diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
index aa3359d775c5a..a5a921f42c3ef 100644
--- a/pandas/tests/indexes/interval/test_indexing.py
+++ b/pandas/tests/indexes/interval/test_indexing.py
@@ -275,26 +275,6 @@ def test_get_indexer_categorical(self, target, ordered):
expected = index.get_indexer(target)
tm.assert_numpy_array_equal(result, expected)
- def test_get_indexer_categorical_with_nans(self):
- # GH#41934 nans in both index and in target
- ii = IntervalIndex.from_breaks(range(5))
- ii2 = ii.append(IntervalIndex([np.nan]))
- ci2 = CategoricalIndex(ii2)
-
- result = ii2.get_indexer(ci2)
- expected = np.arange(5, dtype=np.intp)
- tm.assert_numpy_array_equal(result, expected)
-
- # not-all-matches
- result = ii2[1:].get_indexer(ci2[::-1])
- expected = np.array([3, 2, 1, 0, -1], dtype=np.intp)
- tm.assert_numpy_array_equal(result, expected)
-
- # non-unique target, non-unique nans
- result = ii2.get_indexer(ci2.append(ci2))
- expected = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=np.intp)
- tm.assert_numpy_array_equal(result, expected)
-
@pytest.mark.parametrize(
"tuples, closed",
[
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
index 1fd8b0f8b837a..c2b3647379234 100644
--- a/pandas/tests/indexes/multi/test_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -74,6 +74,15 @@ def test_unique_level(idx, level):
tm.assert_index_equal(result, expected)
+def test_get_unique_index(idx):
+ mi = idx[[0, 1, 0, 1, 1, 0, 0]]
+ expected = mi._shallow_copy(mi[[0, 1]])
+
+ result = mi._get_unique_index()
+ assert result.unique
+ tm.assert_index_equal(result, expected)
+
+
def test_duplicate_multiindex_codes():
# GH 17464
# Make sure that a MultiIndex with duplicate levels throws a ValueError
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index ec7ddf8b4d67a..9e1097ce5951f 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -457,15 +457,6 @@ def test_get_indexer_kwarg_validation(self):
with pytest.raises(ValueError, match=msg):
mi.get_indexer(mi[:-1], tolerance="piano")
- def test_get_indexer_mismatched_nlevels(self):
- mi = MultiIndex.from_product([range(3), ["A", "B"]])
-
- other = MultiIndex.from_product([range(3), ["A", "B"], range(2)])
-
- msg = "tuples of different lengths"
- with pytest.raises(TypeError, match=msg):
- mi.get_indexer(other, method="pad")
-
def test_getitem(idx):
# scalar
diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py
index 695458273d16e..97eb34e28764b 100644
--- a/pandas/tests/indexes/multi/test_isin.py
+++ b/pandas/tests/indexes/multi/test_isin.py
@@ -1,11 +1,14 @@
import numpy as np
import pytest
+from pandas.compat import PYPY
+
from pandas import MultiIndex
import pandas._testing as tm
-def test_isin_nan():
+@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
+def test_isin_nan_pypy():
idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True]))
tm.assert_numpy_array_equal(
@@ -28,6 +31,15 @@ def test_isin():
assert result.dtype == np.bool_
+@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy")
+def test_isin_nan_not_pypy():
+ idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
+ tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False]))
+ tm.assert_numpy_array_equal(
+ idx.isin([("bar", float("nan"))]), np.array([False, False])
+ )
+
+
def test_isin_level_kwarg():
idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)])
diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py
index 47efc43d5eae0..286522f6b946d 100644
--- a/pandas/tests/indexes/multi/test_partial_indexing.py
+++ b/pandas/tests/indexes/multi/test_partial_indexing.py
@@ -1,4 +1,3 @@
-import numpy as np
import pytest
from pandas import (
@@ -46,42 +45,6 @@ def test_partial_string_matching_single_index(df):
tm.assert_frame_equal(result, expected)
-def test_get_loc_partial_timestamp_multiindex(df):
- mi = df.index
- key = ("2016-01-01", "a")
- loc = mi.get_loc(key)
-
- expected = np.zeros(len(mi), dtype=bool)
- expected[[0, 3]] = True
- tm.assert_numpy_array_equal(loc, expected)
-
- key2 = ("2016-01-02", "a")
- loc2 = mi.get_loc(key2)
- expected2 = np.zeros(len(mi), dtype=bool)
- expected2[[6, 9]] = True
- tm.assert_numpy_array_equal(loc2, expected2)
-
- key3 = ("2016-01", "a")
- loc3 = mi.get_loc(key3)
- expected3 = np.zeros(len(mi), dtype=bool)
- expected3[mi.get_level_values(1).get_loc("a")] = True
- tm.assert_numpy_array_equal(loc3, expected3)
-
- key4 = ("2016", "a")
- loc4 = mi.get_loc(key4)
- expected4 = expected3
- tm.assert_numpy_array_equal(loc4, expected4)
-
- # non-monotonic
- taker = np.arange(len(mi), dtype=np.intp)
- taker[::2] = taker[::-2]
- mi2 = mi.take(taker)
- loc5 = mi2.get_loc(key)
- expected5 = np.zeros(len(mi2), dtype=bool)
- expected5[[3, 14]] = True
- tm.assert_numpy_array_equal(loc5, expected5)
-
-
def test_partial_string_timestamp_multiindex(df):
# GH10331
df_swap = df.swaplevel(0, 1).sort_index()
@@ -109,9 +72,7 @@ def test_partial_string_timestamp_multiindex(df):
# partial string match on date and hour, from middle
result = df.loc["2016-01-02 12"]
- # hourly resolution, same as index.levels[0], so we are _not_ slicing on
- # that level, so that level gets dropped
- expected = df.iloc[9:12].droplevel(0)
+ expected = df.iloc[9:12]
tm.assert_frame_equal(result, expected)
# partial string match on secondary index
@@ -120,14 +81,11 @@ def test_partial_string_timestamp_multiindex(df):
tm.assert_frame_equal(result, expected)
# tuple selector with partial string match on date
- # "2016-01-01" has daily resolution, so _is_ a slice on the first level.
result = df.loc[("2016-01-01", "a"), :]
expected = df.iloc[[0, 3]]
- expected = df.iloc[[0, 3]].droplevel(1)
tm.assert_frame_equal(result, expected)
- # Slicing date on first level should break (of course) bc the DTI is the
- # second level on df_swap
+ # Slicing date on first level should break (of course)
with pytest.raises(KeyError, match="'2016-01-01'"):
df_swap.loc["2016-01-01"]
diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py
index 340b546125d8d..38ff6efec40c9 100644
--- a/pandas/tests/indexes/multi/test_reindex.py
+++ b/pandas/tests/indexes/multi/test_reindex.py
@@ -84,13 +84,6 @@ def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array():
assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64
assert idx.reindex([], level=1)[0].levels[1].dtype.type == np.object_
- # case with EA levels
- cat = pd.Categorical(["foo", "bar"])
- dti = pd.date_range("2016-01-01", periods=2, tz="US/Pacific")
- mi = MultiIndex.from_product([cat, dti])
- assert mi.reindex([], level=0)[0].levels[0].dtype == cat.dtype
- assert mi.reindex([], level=1)[0].levels[1].dtype == dti.dtype
-
def test_reindex_base(idx):
idx = idx
@@ -133,31 +126,3 @@ def test_reindex_not_all_tuples():
tm.assert_index_equal(res, idx)
expected = np.array([0, 1, 2, -1], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected)
-
-
-def test_reindex_limit_arg_with_multiindex():
- # GH21247
-
- idx = MultiIndex.from_tuples([(3, "A"), (4, "A"), (4, "B")])
-
- df = pd.Series([0.02, 0.01, 0.012], index=idx)
-
- new_idx = MultiIndex.from_tuples(
- [
- (3, "A"),
- (3, "B"),
- (4, "A"),
- (4, "B"),
- (4, "C"),
- (5, "B"),
- (5, "C"),
- (6, "B"),
- (6, "C"),
- ]
- )
-
- with pytest.raises(
- ValueError,
- match="limit argument only valid if doing pad, backfill or nearest reindexing",
- ):
- df.reindex(new_idx, fill_value=0, limit=1)
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index f43e3104c64d7..eb456bee39dbf 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -216,10 +216,11 @@ def test_difference_sort_incomparable():
other = MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]])
# sort=None, the default
- msg = "sort order is undefined for incomparable objects"
- with tm.assert_produces_warning(RuntimeWarning, match=msg):
+ # MultiIndex.difference deviates here from other difference
+ # implementations in not catching the TypeError
+ msg = "'<' not supported between instances of 'Timestamp' and 'int'"
+ with pytest.raises(TypeError, match=msg):
result = idx.difference(other)
- tm.assert_index_equal(result, idx)
# sort=False
result = idx.difference(other, sort=False)
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index e6b418868dbeb..5f2f8f75045bb 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -24,17 +24,12 @@ class TestGetLoc:
@pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"])
def test_get_loc(self, method):
index = Index([0, 1, 2])
- warn = None if method is None else FutureWarning
-
- with tm.assert_produces_warning(warn, match="deprecated"):
- assert index.get_loc(1, method=method) == 1
+ assert index.get_loc(1, method=method) == 1
if method:
- with tm.assert_produces_warning(warn, match="deprecated"):
- assert index.get_loc(1, method=method, tolerance=0) == 1
+ assert index.get_loc(1, method=method, tolerance=0) == 1
@pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"])
- @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning")
def test_get_loc_raises_bad_label(self, method):
index = Index([0, 1, 2])
if method:
@@ -48,7 +43,6 @@ def test_get_loc_raises_bad_label(self, method):
@pytest.mark.parametrize(
"method,loc", [("pad", 1), ("backfill", 2), ("nearest", 1)]
)
- @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning")
def test_get_loc_tolerance(self, method, loc):
index = Index([0, 1, 2])
assert index.get_loc(1.1, method) == loc
@@ -58,14 +52,12 @@ def test_get_loc_tolerance(self, method, loc):
def test_get_loc_outside_tolerance_raises(self, method):
index = Index([0, 1, 2])
with pytest.raises(KeyError, match="1.1"):
- with tm.assert_produces_warning(FutureWarning, match="deprecated"):
- index.get_loc(1.1, method, tolerance=0.05)
+ index.get_loc(1.1, method, tolerance=0.05)
def test_get_loc_bad_tolerance_raises(self):
index = Index([0, 1, 2])
with pytest.raises(ValueError, match="must be numeric"):
- with tm.assert_produces_warning(FutureWarning, match="deprecated"):
- index.get_loc(1.1, "nearest", tolerance="invalid")
+ index.get_loc(1.1, "nearest", tolerance="invalid")
def test_get_loc_tolerance_no_method_raises(self):
index = Index([0, 1, 2])
@@ -75,10 +67,8 @@ def test_get_loc_tolerance_no_method_raises(self):
def test_get_loc_raises_missized_tolerance(self):
index = Index([0, 1, 2])
with pytest.raises(ValueError, match="tolerance size must match"):
- with tm.assert_produces_warning(FutureWarning, match="deprecated"):
- index.get_loc(1.1, "nearest", tolerance=[1, 1])
+ index.get_loc(1.1, "nearest", tolerance=[1, 1])
- @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning")
def test_get_loc_float64(self):
idx = Float64Index([0.0, 1.0, 2.0])
for method in [None, "pad", "backfill", "nearest"]:
@@ -149,8 +139,7 @@ def test_get_loc_float_index_nan_with_method(self, vals, method):
# GH#39382
idx = Index(vals)
with pytest.raises(KeyError, match="nan"):
- with tm.assert_produces_warning(FutureWarning, match="deprecated"):
- idx.get_loc(np.nan, method=method)
+ idx.get_loc(np.nan, method=method)
class TestGetIndexer:
@@ -387,19 +376,6 @@ def test_where(self, klass, index):
result = index.where(klass(cond))
tm.assert_index_equal(result, expected)
- def test_where_uin64(self):
- idx = UInt64Index([0, 6, 2])
- mask = np.array([False, True, False])
- other = np.array([1], dtype=np.int64)
-
- expected = UInt64Index([1, 6, 1])
-
- result = idx.where(mask, other)
- tm.assert_index_equal(result, expected)
-
- result = idx.putmask(~mask, other)
- tm.assert_index_equal(result, expected)
-
class TestTake:
@pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index])
diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py
index 8cbca0ba8eb65..9572aeaf41c91 100644
--- a/pandas/tests/indexes/numeric/test_numeric.py
+++ b/pandas/tests/indexes/numeric/test_numeric.py
@@ -531,6 +531,7 @@ def test_constructor(self, dtype):
res = Index([1, 2 ** 63 + 1], dtype=dtype)
tm.assert_index_equal(res, idx)
+ @pytest.mark.xfail(reason="https://github.com/numpy/numpy/issues/19146")
def test_constructor_does_not_cast_to_float(self):
# https://github.com/numpy/numpy/issues/19146
values = [0, np.iinfo(np.uint64).max]
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index b26676a0d83cf..a683e9faed1f2 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -10,14 +10,12 @@ class TestGetLoc:
def test_get_loc_raises_object_nearest(self):
index = Index(["a", "c"])
with pytest.raises(TypeError, match="unsupported operand type"):
- with tm.assert_produces_warning(FutureWarning, match="deprecated"):
- index.get_loc("a", method="nearest")
+ index.get_loc("a", method="nearest")
def test_get_loc_raises_object_tolerance(self):
index = Index(["a", "c"])
with pytest.raises(TypeError, match="unsupported operand type"):
- with tm.assert_produces_warning(FutureWarning, match="deprecated"):
- index.get_loc("a", method="pad", tolerance="invalid")
+ index.get_loc("a", method="pad", tolerance="invalid")
class TestGetIndexer:
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index 3b7b738bec410..a41d02cfbd394 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -339,7 +339,6 @@ def test_get_loc_integer(self):
# TODO: This method came from test_period; de-dup with version above
@pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"])
- @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning")
def test_get_loc_method(self, method):
idx = period_range("2000-01-01", periods=3)
@@ -353,7 +352,6 @@ def test_get_loc_method(self, method):
idx.get_loc(key, method=method)
# TODO: This method came from test_period; de-dup with version above
- @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning")
def test_get_loc3(self):
idx = period_range("2000-01-01", periods=5)[::2]
@@ -515,13 +513,11 @@ def test_get_indexer_mismatched_dtype_with_method(self, non_comparable_idx, meth
continue
# Two different error message patterns depending on dtypes
msg = "|".join(
- [
- re.escape(msg)
- for msg in (
- f"Cannot compare dtypes {pi.dtype} and {other.dtype}",
- " not supported between instances of ",
- )
- ]
+ re.escape(msg)
+ for msg in (
+ f"Cannot compare dtypes {pi.dtype} and {other.dtype}",
+ " not supported between instances of ",
+ )
)
with pytest.raises(TypeError, match=msg):
pi.get_indexer(other2, method=method)
diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py
index 27e998284c189..af243eeccc7a4 100644
--- a/pandas/tests/indexes/period/test_searchsorted.py
+++ b/pandas/tests/indexes/period/test_searchsorted.py
@@ -2,6 +2,7 @@
import pytest
from pandas._libs.tslibs import IncompatibleFrequency
+from pandas.compat import np_version_under1p18
from pandas import (
NaT,
@@ -27,7 +28,13 @@ def test_searchsorted(self, freq):
p2 = Period("2014-01-04", freq=freq)
assert pidx.searchsorted(p2) == 3
- assert pidx.searchsorted(NaT) == 5
+ if np_version_under1p18:
+ # GH#36254
+ # Following numpy convention, NaT goes at the beginning
+ # (unlike NaN which goes at the end)
+ assert pidx.searchsorted(NaT) == 0
+ else:
+ assert pidx.searchsorted(NaT) == 5
msg = "Input has different freq=H from PeriodArray"
with pytest.raises(IncompatibleFrequency, match=msg):
diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py
index f7dcaa628228b..60fa8f1a0c083 100644
--- a/pandas/tests/indexes/test_any_index.py
+++ b/pandas/tests/indexes/test_any_index.py
@@ -66,13 +66,6 @@ def test_ravel_deprecation(index):
index.ravel()
-def test_is_type_compatible_deprecation(index):
- # GH#42113
- msg = "is_type_compatible is deprecated"
- with tm.assert_produces_warning(FutureWarning, match=msg):
- index.is_type_compatible(index.inferred_type)
-
-
class TestConversion:
def test_to_series(self, index):
# assert that we are creating a copy of the index
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 826649358e663..d7abaf0b5dfbe 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -992,7 +992,7 @@ def test_isin(self, values, index, expected):
result = index.isin(values)
tm.assert_numpy_array_equal(result, expected)
- def test_isin_nan_common_object(self, request, nulls_fixture, nulls_fixture2):
+ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2):
# Test cartesian product of null fixtures and ensure that we don't
# mangle the various types (save a corner case with PyPy)
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 882e708a357c8..ec01e35673647 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -126,7 +126,7 @@ def test_copy_and_deepcopy(self, index_flat):
new_copy = index.copy(deep=True, name="banana")
assert new_copy.name == "banana"
- def test_unique_level(self, index_flat):
+ def test_unique(self, index_flat):
# don't test a MultiIndex here (as its tested separated)
index = index_flat
@@ -147,7 +147,7 @@ def test_unique_level(self, index_flat):
with pytest.raises(KeyError, match=msg):
index.unique(level="wrong")
- def test_unique(self, index_flat):
+ def test_get_unique_index(self, index_flat):
# MultiIndex tested separately
index = index_flat
if not len(index):
@@ -164,7 +164,7 @@ def test_unique(self, index_flat):
except NotImplementedError:
pass
- result = idx.unique()
+ result = idx._get_unique_index()
tm.assert_index_equal(result, idx_unique)
# nans:
@@ -195,7 +195,7 @@ def test_unique(self, index_flat):
expected = idx_unique_nan
for i in [idx_nan, idx_unique_nan]:
- result = i.unique()
+ result = i._get_unique_index()
tm.assert_index_equal(result, expected)
def test_searchsorted_monotonic(self, index_flat):
diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py
index 5f6d0155ae6cf..379c766b94d6c 100644
--- a/pandas/tests/indexes/test_indexing.py
+++ b/pandas/tests/indexes/test_indexing.py
@@ -26,7 +26,6 @@
IntervalIndex,
MultiIndex,
PeriodIndex,
- RangeIndex,
Series,
TimedeltaIndex,
UInt64Index,
@@ -182,27 +181,6 @@ def test_get_value(self, index):
tm.assert_almost_equal(result, values[67])
-class TestGetLoc:
- def test_get_loc_non_hashable(self, index):
- # MultiIndex and Index raise TypeError, others InvalidIndexError
-
- with pytest.raises((TypeError, InvalidIndexError), match="slice"):
- index.get_loc(slice(0, 1))
-
- def test_get_loc_generator(self, index):
-
- exc = KeyError
- if isinstance(
- index,
- (DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, IntervalIndex),
- ):
- # TODO: make these more consistent?
- exc = InvalidIndexError
- with pytest.raises(exc, match="generator object"):
- # MultiIndex specifically checks for generator; others for scalar
- index.get_loc(x for x in range(5))
-
-
class TestGetIndexer:
def test_get_indexer_base(self, index):
diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 92adc0570dee1..f2ed96d0b65b8 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest
+from pandas.compat import np_version_under1p18
+
from pandas import (
DatetimeIndex,
Float64Index,
@@ -80,12 +82,22 @@ def test_numpy_ufuncs_other(index, func, request):
isinstance(index, DatetimeIndex)
and index.tz is not None
and func in [np.isfinite, np.isnan, np.isinf]
+ and (
+ not np_version_under1p18
+ or (np_version_under1p18 and func is np.isfinite)
+ )
):
mark = pytest.mark.xfail(reason="__array_ufunc__ is not defined")
request.node.add_marker(mark)
- if func in (np.isfinite, np.isinf, np.isnan):
- # numpy 1.18 changed isinf and isnan to not raise on dt64/tfd64
+ if not np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]:
+ # numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64
+ result = func(index)
+ assert isinstance(result, np.ndarray)
+
+ elif func is np.isfinite:
+ # ok under numpy >= 1.17
+ # Results in bool array
result = func(index)
assert isinstance(result, np.ndarray)
else:
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 20174beacf1d3..087ccbef7b778 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -178,7 +178,7 @@ def test_intersection_base(self, index):
return
# GH#10149
- cases = [second.to_numpy(), second.to_series(), second.to_list()]
+ cases = [klass(second.values) for klass in [np.array, Series, list]]
for case in cases:
result = first.intersection(case)
assert tm.equalContents(result, second)
@@ -201,10 +201,15 @@ def test_union_base(self, index):
return
# GH#10149
- cases = [second.to_numpy(), second.to_series(), second.to_list()]
+ cases = [klass(second.values) for klass in [np.array, Series, list]]
for case in cases:
- result = first.union(case)
- assert tm.equalContents(result, everything)
+ if not isinstance(index, CategoricalIndex):
+ result = first.union(case)
+ assert tm.equalContents(result, everything), (
+ result,
+ everything,
+ type(case),
+ )
if isinstance(index, MultiIndex):
msg = "other must be a MultiIndex or a list of tuples"
@@ -222,10 +227,16 @@ def test_difference_base(self, sort, index):
assert tm.equalContents(result, answer)
# GH#10149
- cases = [second.to_numpy(), second.to_series(), second.to_list()]
+ cases = [klass(second.values) for klass in [np.array, Series, list]]
for case in cases:
- result = first.difference(case, sort)
- assert tm.equalContents(result, answer)
+ if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
+ assert type(result) == type(answer)
+ tm.assert_numpy_array_equal(
+ result.sort_values().asi8, answer.sort_values().asi8
+ )
+ else:
+ result = first.difference(case, sort)
+ assert tm.equalContents(result, answer)
if isinstance(index, MultiIndex):
msg = "other must be a MultiIndex or a list of tuples"
@@ -249,9 +260,16 @@ def test_symmetric_difference(self, index):
assert tm.equalContents(result, answer)
# GH#10149
- cases = [second.to_numpy(), second.to_series(), second.to_list()]
+ cases = [klass(second.values) for klass in [np.array, Series, list]]
for case in cases:
result = first.symmetric_difference(case)
+
+ if is_datetime64tz_dtype(first):
+ # second.values casts to tznaive
+ expected = first.union(case)
+ tm.assert_index_equal(result, expected)
+ continue
+
assert tm.equalContents(result, answer)
if isinstance(index, MultiIndex):
diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py
index 669bbe23af559..5f0101eb4478c 100644
--- a/pandas/tests/indexes/timedeltas/test_indexing.py
+++ b/pandas/tests/indexes/timedeltas/test_indexing.py
@@ -82,7 +82,6 @@ def test_timestamp_invalid_key(self, key):
class TestGetLoc:
- @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning")
def test_get_loc(self):
idx = to_timedelta(["0 days", "1 days", "2 days"])
@@ -292,52 +291,3 @@ def test_take_fill_value(self):
msg = "index -5 is out of bounds for (axis 0 with )?size 3"
with pytest.raises(IndexError, match=msg):
idx.take(np.array([1, -5]))
-
-
-class TestMaybeCastSliceBound:
- @pytest.fixture(params=["increasing", "decreasing", None])
- def monotonic(self, request):
- return request.param
-
- @pytest.fixture
- def tdi(self, monotonic):
- tdi = timedelta_range("1 Day", periods=10)
- if monotonic == "decreasing":
- tdi = tdi[::-1]
- elif monotonic is None:
- taker = np.arange(10, dtype=np.intp)
- np.random.shuffle(taker)
- tdi = tdi.take(taker)
- return tdi
-
- def test_maybe_cast_slice_bound_invalid_str(self, tdi):
- # test the low-level _maybe_cast_slice_bound and that we get the
- # expected exception+message all the way up the stack
- msg = (
- "cannot do slice indexing on TimedeltaIndex with these "
- r"indexers \[foo\] of type str"
- )
- with pytest.raises(TypeError, match=msg):
- tdi._maybe_cast_slice_bound("foo", side="left")
- with pytest.raises(TypeError, match=msg):
- tdi.get_slice_bound("foo", side="left")
- with pytest.raises(TypeError, match=msg):
- tdi.slice_locs("foo", None, None)
-
- def test_slice_invalid_str_with_timedeltaindex(
- self, tdi, frame_or_series, indexer_sl
- ):
- obj = frame_or_series(range(10), index=tdi)
-
- msg = (
- "cannot do slice indexing on TimedeltaIndex with these "
- r"indexers \[foo\] of type str"
- )
- with pytest.raises(TypeError, match=msg):
- indexer_sl(obj)["foo":]
- with pytest.raises(TypeError, match=msg):
- indexer_sl(obj)["foo":-1]
- with pytest.raises(TypeError, match=msg):
- indexer_sl(obj)[:"foo"]
- with pytest.raises(TypeError, match=msg):
- indexer_sl(obj)[tdi[0] : "foo"]
diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py
index 3790a6e9a5319..f1fbe0c5a6b9c 100644
--- a/pandas/tests/indexing/multiindex/test_getitem.py
+++ b/pandas/tests/indexing/multiindex/test_getitem.py
@@ -28,11 +28,9 @@ def test_series_getitem_multiindex(access_method, level1_value, expected):
# GH 6018
# series regression getitem with a multi-index
- mi = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)], names=["A", "B"])
- ser = Series([1, 2, 3], index=mi)
- expected.index.name = "A"
-
- result = access_method(ser, level1_value)
+ s = Series([1, 2, 3])
+ s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)])
+ result = access_method(s, level1_value)
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py
index e8c766d489813..a38b5f6cc449a 100644
--- a/pandas/tests/indexing/multiindex/test_indexing_slow.py
+++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py
@@ -1,7 +1,3 @@
-from typing import (
- Any,
- List,
-)
import warnings
import numpy as np
@@ -18,7 +14,7 @@
n = 1000
cols = ["jim", "joe", "jolie", "joline", "jolia"]
-vals: List[Any] = [
+vals = [
np.random.randint(0, 10, n),
np.random.choice(list("abcdefghij"), n),
np.random.choice(pd.date_range("20141009", periods=10).tolist(), n),
@@ -28,7 +24,7 @@
vals = list(map(tuple, zip(*vals)))
# bunch of keys for testing
-keys: List[Any] = [
+keys = [
np.random.randint(0, 11, m),
np.random.choice(list("abcdefghijk"), m),
np.random.choice(pd.date_range("20141009", periods=11).tolist(), m),
diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py
index 104fa2da7a67e..afcff6db5e3dd 100644
--- a/pandas/tests/indexing/multiindex/test_loc.py
+++ b/pandas/tests/indexing/multiindex/test_loc.py
@@ -398,21 +398,14 @@ def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, pos):
idx = MultiIndex.from_product(
[["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"]
)
- ser = Series(np.arange(9, dtype="int64"), index=idx).sort_index()
- expected = ser.iloc[pos]
+ s = Series(np.arange(9, dtype="int64"), index=idx).sort_index()
+ expected = s.iloc[pos]
if expected.size == 0 and indexer != []:
with pytest.raises(KeyError, match=str(indexer)):
- ser.loc[indexer]
+ s.loc[indexer]
else:
- warn = None
- msg = "MultiIndex with a nested sequence"
- if indexer == (slice(None), ["foo", "bah"]):
- # "bah" is not in idx.levels[1], so is ignored, will raise KeyError
- warn = FutureWarning
-
- with tm.assert_produces_warning(warn, match=msg):
- result = ser.loc[indexer]
+ result = s.loc[indexer]
tm.assert_series_equal(result, expected)
@@ -554,17 +547,15 @@ def test_loc_period_string_indexing():
),
)
result = df.loc[("2013Q1", 1111), "OMS"]
-
- alt = df.loc[(a[0], 1111), "OMS"]
- assert np.isnan(alt)
-
- # Because the resolution of the string matches, it is an exact lookup,
- # not a slice
- assert np.isnan(result)
-
- # TODO: should it figure this out?
- # alt = df.loc["2013Q1", 1111, "OMS"]
- # assert np.isnan(alt)
+ expected = Series(
+ [np.nan],
+ dtype=object,
+ name="OMS",
+ index=MultiIndex.from_tuples(
+ [(pd.Period("2013Q1"), 1111)], names=["Period", "CVR"]
+ ),
+ )
+ tm.assert_series_equal(result, expected)
def test_loc_datetime_mask_slicing():
@@ -745,19 +736,6 @@ def test_get_loc_datetime_index():
assert mi.get_loc("2001-01") == slice(0, 31, None)
assert index.get_loc("2001-01") == slice(0, 31, None)
- loc = mi[::2].get_loc("2001-01")
- expected = index[::2].get_loc("2001-01")
- assert loc == expected
-
- loc = mi.repeat(2).get_loc("2001-01")
- expected = index.repeat(2).get_loc("2001-01")
- assert loc == expected
-
- loc = mi.append(mi).get_loc("2001-01")
- expected = index.append(index).get_loc("2001-01")
- # TODO: standardize return type for MultiIndex.get_loc
- tm.assert_numpy_array_equal(loc.nonzero()[0], expected)
-
def test_loc_setitem_indexer_differently_ordered():
# GH#34603
@@ -809,10 +787,10 @@ def test_loc_getitem_index_differently_ordered_slice_none_duplicates(indexer):
def test_loc_getitem_drops_levels_for_one_row_dataframe():
- # GH#10521 "x" and "z" are both scalar indexing, so those levels are dropped
+ # GH#10521
mi = MultiIndex.from_arrays([["x"], ["y"], ["z"]], names=["a", "b", "c"])
df = DataFrame({"d": [0]}, index=mi)
- expected = df.droplevel([0, 2])
+ expected = df.copy()
result = df.loc["x", :, "z"]
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py
index 50a31f2fd22c6..a99f09143e282 100644
--- a/pandas/tests/indexing/multiindex/test_partial.py
+++ b/pandas/tests/indexing/multiindex/test_partial.py
@@ -158,8 +158,8 @@ def test_getitem_intkey_leading_level(
assert isinstance(mi.levels[0], Float64Index)
assert 14 not in mi.levels[0]
- assert not mi.levels[0]._should_fallback_to_positional
- assert not mi._should_fallback_to_positional
+ assert not mi.levels[0]._should_fallback_to_positional()
+ assert not mi._should_fallback_to_positional()
with pytest.raises(KeyError, match="14"):
ser[14]
diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py
index 23d2bee612243..77cfb94bf4629 100644
--- a/pandas/tests/indexing/test_at.py
+++ b/pandas/tests/indexing/test_at.py
@@ -8,7 +8,6 @@
from pandas import (
CategoricalDtype,
- CategoricalIndex,
DataFrame,
Series,
Timestamp,
@@ -142,16 +141,3 @@ def test_at_getitem_mixed_index_no_fallback(self):
ser.at[0]
with pytest.raises(KeyError, match="^4$"):
ser.at[4]
-
- def test_at_categorical_integers(self):
- # CategoricalIndex with integer categories that don't happen to match
- # the Categorical's codes
- ci = CategoricalIndex([3, 4])
-
- arr = np.arange(4).reshape(2, 2)
- frame = DataFrame(arr, index=ci)
-
- for df in [frame, frame.T]:
- for key in [0, 1]:
- with pytest.raises(KeyError, match=str(key)):
- df.at[key, key]
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 9908f79208088..cd49620f45fae 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -485,9 +485,9 @@ def test_loc_and_at_with_categorical_index(self):
[1.5, 2.5, 3.5],
[-1.5, -2.5, -3.5],
# numpy int/uint
- *(np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_DTYPES),
+ *[np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_DTYPES],
# numpy floats
- *(np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_DTYPES),
+ *[np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_DTYPES],
# numpy object
np.array([1, "b", 3.5], dtype=object),
# pandas scalars
@@ -495,7 +495,7 @@ def test_loc_and_at_with_categorical_index(self):
[Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)],
[Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")],
# pandas Integer arrays
- *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES),
+ *[pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES],
# other pandas arrays
pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
pd.date_range("2019-01-01", periods=3).array,
@@ -540,16 +540,3 @@ def test_loc_getitem_with_non_string_categories(self, idx_values, ordered):
result.loc[sl, "A"] = ["qux", "qux2"]
expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
tm.assert_frame_equal(result, expected)
-
- def test_getitem_categorical_with_nan(self):
- # GH#41933
- ci = CategoricalIndex(["A", "B", np.nan])
-
- ser = Series(range(3), index=ci)
-
- assert ser[np.nan] == 2
- assert ser.loc[np.nan] == 2
-
- df = DataFrame(ser)
- assert df.loc[np.nan, 0] == 2
- assert df.loc[np.nan][0] == 2
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
index 761e67bedbf8c..7911cd7f12e0c 100644
--- a/pandas/tests/indexing/test_coercion.py
+++ b/pandas/tests/indexing/test_coercion.py
@@ -273,12 +273,7 @@ def _assert_setitem_index_conversion(
):
"""test index's coercion triggered by assign key"""
temp = original_series.copy()
- warn = None
- if isinstance(loc_key, int) and temp.index.dtype == np.float64:
- # GH#33469
- warn = FutureWarning
- with tm.assert_produces_warning(warn):
- temp[loc_key] = 5
+ temp[loc_key] = 5
exp = pd.Series([1, 2, 3, 4, 5], index=expected_index)
tm.assert_series_equal(temp, exp)
# check dtype explicitly for sure
@@ -329,10 +324,7 @@ def test_setitem_index_float64(self, val, exp_dtype, request):
temp = obj.copy()
msg = "index 5 is out of bounds for axis 0 with size 4"
with pytest.raises(exp_dtype, match=msg):
- # GH#33469
- depr_msg = "Treating integers as positional"
- with tm.assert_produces_warning(FutureWarning, match=depr_msg):
- temp[5] = 5
+ temp[5] = 5
mark = pytest.mark.xfail(reason="TODO_GH12747 The result must be float")
request.node.add_marker(mark)
exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, val])
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index b04a2c86a79d7..fc07c14f1e179 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -596,7 +596,7 @@ def test_iloc_getitem_labelled_frame(self):
assert result == exp
# out-of-bounds exception
- msg = "index 5 is out of bounds for axis 0 with size 4"
+ msg = "single positional indexer is out-of-bounds"
with pytest.raises(IndexError, match=msg):
df.iloc[10, 5]
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 7243f2cddfec6..c945bd6b95ee1 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -113,6 +113,15 @@ def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli):
if indexer_sli is tm.iloc:
err = ValueError
msg = f"Cannot set values with ndim > {obj.ndim}"
+ elif (
+ isinstance(index, pd.IntervalIndex)
+ and indexer_sli is tm.setitem
+ and obj.ndim == 1
+ ):
+ err = AttributeError
+ msg = (
+ "'pandas._libs.interval.IntervalTree' object has no attribute 'get_loc'"
+ )
else:
err = ValueError
msg = "|".join(
@@ -518,7 +527,7 @@ def test_string_slice_empty(self):
with pytest.raises(KeyError, match="'2011'"):
df["2011"]
- with pytest.raises(KeyError, match="^0$"):
+ with pytest.raises(KeyError, match="'2011'"):
df.loc["2011", 0]
def test_astype_assignment(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 6de83e34122c2..a8a2055ffb093 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1663,30 +1663,6 @@ def test_loc_multiindex_levels_contain_values_not_in_index_anymore(self, lt_valu
with pytest.raises(KeyError, match=r"\['b'\] not in index"):
df.loc[df["a"] < lt_value, :].loc[["b"], :]
- def test_loc_multiindex_null_slice_na_level(self):
- # GH#42055
- lev1 = np.array([np.nan, np.nan])
- lev2 = ["bar", "baz"]
- mi = MultiIndex.from_arrays([lev1, lev2])
- ser = Series([0, 1], index=mi)
- result = ser.loc[:, "bar"]
-
- # TODO: should we have name="bar"?
- expected = Series([0], index=[np.nan])
- tm.assert_series_equal(result, expected)
-
- def test_loc_drops_level(self):
- # Based on test_series_varied_multiindex_alignment, where
- # this used to fail to drop the first level
- mi = MultiIndex.from_product(
- [list("ab"), list("xy"), [1, 2]], names=["ab", "xy", "num"]
- )
- ser = Series(range(8), index=mi)
-
- loc_result = ser.loc["a", :, :]
- expected = ser.index.droplevel(0)[:4]
- tm.assert_index_equal(loc_result.index, expected)
-
class TestLocSetitemWithExpansion:
@pytest.mark.slow
@@ -1854,23 +1830,6 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
)
tm.assert_frame_equal(df, expected)
- @pytest.mark.parametrize(
- "dtype", ["Int32", "Int64", "UInt32", "UInt64", "Float32", "Float64"]
- )
- def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype):
- # GH#42099
- ser = Series([0, 1, 2, 3], dtype=dtype)
- df = DataFrame({"data": ser})
-
- result = DataFrame(index=df.index)
- result.loc[df.index, "data"] = ser
-
- tm.assert_frame_equal(result, df)
-
- result = DataFrame(index=df.index)
- result.loc[df.index, "data"] = ser._values
- tm.assert_frame_equal(result, df)
-
class TestLocCallable:
def test_frame_loc_getitem_callable(self):
diff --git a/pandas/tests/io/data/excel/chartsheet.xls b/pandas/tests/io/data/excel/chartsheet.xls
deleted file mode 100644
index 7d027400fbd52..0000000000000
Binary files a/pandas/tests/io/data/excel/chartsheet.xls and /dev/null differ
diff --git a/pandas/tests/io/data/excel/chartsheet.xlsb b/pandas/tests/io/data/excel/chartsheet.xlsb
deleted file mode 100644
index 805087280f851..0000000000000
Binary files a/pandas/tests/io/data/excel/chartsheet.xlsb and /dev/null differ
diff --git a/pandas/tests/io/data/excel/chartsheet.xlsm b/pandas/tests/io/data/excel/chartsheet.xlsm
deleted file mode 100644
index aadb48d6f4824..0000000000000
Binary files a/pandas/tests/io/data/excel/chartsheet.xlsm and /dev/null differ
diff --git a/pandas/tests/io/data/excel/chartsheet.xlsx b/pandas/tests/io/data/excel/chartsheet.xlsx
deleted file mode 100644
index c8d5e7afb3d07..0000000000000
Binary files a/pandas/tests/io/data/excel/chartsheet.xlsx and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl b/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl
deleted file mode 100644
index 255a745dd9021..0000000000000
Binary files a/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl and /dev/null differ
diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py
index 4bf6051fd36ef..b50c641ebf0c0 100644
--- a/pandas/tests/io/excel/test_odswriter.py
+++ b/pandas/tests/io/excel/test_odswriter.py
@@ -1,5 +1,3 @@
-import re
-
import pytest
import pandas._testing as tm
@@ -17,25 +15,3 @@ def test_write_append_mode_raises(ext):
with tm.ensure_clean(ext) as f:
with pytest.raises(ValueError, match=msg):
ExcelWriter(f, engine="odf", mode="a")
-
-
-@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
-def test_kwargs(ext, nan_inf_to_errors):
- # GH 42286
- # odswriter doesn't utilize kwargs, nothing to check except that it works
- kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}}
- with tm.ensure_clean(ext) as f:
- msg = re.escape("Use of **kwargs is deprecated")
- with tm.assert_produces_warning(FutureWarning, match=msg):
- with ExcelWriter(f, engine="odf", **kwargs) as _:
- pass
-
-
-@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
-def test_engine_kwargs(ext, nan_inf_to_errors):
- # GH 42286
- # odswriter doesn't utilize engine_kwargs, nothing to check except that it works
- engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}}
- with tm.ensure_clean(ext) as f:
- with ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) as _:
- pass
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
index cd773957c9043..62f567457c3ab 100644
--- a/pandas/tests/io/excel/test_openpyxl.py
+++ b/pandas/tests/io/excel/test_openpyxl.py
@@ -85,30 +85,6 @@ def test_write_cells_merge_styled(ext):
assert xcell_a2.font == openpyxl_sty_merged
-@pytest.mark.parametrize("write_only", [True, False])
-def test_kwargs(ext, write_only):
- # GH 42286
- # openpyxl doesn't utilize kwargs, only test that supplying a kwarg works
- kwargs = {"write_only": write_only}
- with tm.ensure_clean(ext) as f:
- msg = re.escape("Use of **kwargs is deprecated")
- with tm.assert_produces_warning(FutureWarning, match=msg):
- with ExcelWriter(f, engine="openpyxl", **kwargs) as writer:
- # ExcelWriter won't allow us to close without writing something
- DataFrame().to_excel(writer)
-
-
-@pytest.mark.parametrize("write_only", [True, False])
-def test_engine_kwargs(ext, write_only):
- # GH 42286
- # openpyxl doesn't utilize kwargs, only test that supplying a engine_kwarg works
- engine_kwargs = {"write_only": write_only}
- with tm.ensure_clean(ext) as f:
- with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer:
- # ExcelWriter won't allow us to close without writing something
- DataFrame().to_excel(writer)
-
-
@pytest.mark.parametrize(
"mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])]
)
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index cbd241ceda0b1..d40fb3ce4a135 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1250,34 +1250,6 @@ def test_trailing_blanks(self, read_ext):
result = pd.read_excel(file_name)
assert result.shape == (3, 3)
- def test_ignore_chartsheets_by_str(self, request, read_ext):
- # GH 41448
- if pd.read_excel.keywords["engine"] == "odf":
- pytest.skip("chartsheets do not exist in the ODF format")
- if pd.read_excel.keywords["engine"] == "pyxlsb":
- request.node.add_marker(
- pytest.mark.xfail(
- reason="pyxlsb can't distinguish chartsheets from worksheets"
- )
- )
- with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"):
- pd.read_excel("chartsheet" + read_ext, sheet_name="Chart1")
-
- def test_ignore_chartsheets_by_int(self, request, read_ext):
- # GH 41448
- if pd.read_excel.keywords["engine"] == "odf":
- pytest.skip("chartsheets do not exist in the ODF format")
- if pd.read_excel.keywords["engine"] == "pyxlsb":
- request.node.add_marker(
- pytest.mark.xfail(
- reason="pyxlsb can't distinguish chartsheets from worksheets"
- )
- )
- with pytest.raises(
- ValueError, match="Worksheet index 1 is invalid, 1 worksheets found"
- ):
- pd.read_excel("chartsheet" + read_ext, sheet_name=1)
-
class TestExcelFileRead:
@pytest.fixture(autouse=True)
@@ -1529,19 +1501,6 @@ def test_engine_invalid_option(self, read_ext):
with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
pass
- def test_ignore_chartsheets(self, request, engine, read_ext):
- # GH 41448
- if engine == "odf":
- pytest.skip("chartsheets do not exist in the ODF format")
- if engine == "pyxlsb":
- request.node.add_marker(
- pytest.mark.xfail(
- reason="pyxlsb can't distinguish chartsheets from worksheets"
- )
- )
- with pd.ExcelFile("chartsheet" + read_ext) as excel:
- assert excel.sheet_names == ["Sheet1"]
-
def test_corrupt_files_closed(self, request, engine, read_ext):
# GH41778
errors = (BadZipFile,)
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index 508e767a47004..77837bea3e48a 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -1399,6 +1399,25 @@ def check_called(func):
with tm.ensure_clean("something.xls") as filepath:
check_called(lambda: df.to_excel(filepath, engine="dummy"))
+ @pytest.mark.parametrize(
+ "ext",
+ [
+ pytest.param(".xlsx", marks=td.skip_if_no("xlsxwriter")),
+ pytest.param(".xlsx", marks=td.skip_if_no("openpyxl")),
+ pytest.param(".ods", marks=td.skip_if_no("odf")),
+ ],
+ )
+ def test_kwargs_deprecated(self, ext):
+ # GH 40430
+ msg = re.escape("Use of **kwargs is deprecated")
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ with tm.ensure_clean(ext) as path:
+ try:
+ with ExcelWriter(path, kwarg=1):
+ pass
+ except TypeError:
+ pass
+
@pytest.mark.parametrize(
"ext",
[
diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py
index 79d2f55a9b8ff..6de378f6a3d3e 100644
--- a/pandas/tests/io/excel/test_xlsxwriter.py
+++ b/pandas/tests/io/excel/test_xlsxwriter.py
@@ -1,4 +1,3 @@
-import re
import warnings
import pytest
@@ -62,23 +61,3 @@ def test_write_append_mode_raises(ext):
with tm.ensure_clean(ext) as f:
with pytest.raises(ValueError, match=msg):
ExcelWriter(f, engine="xlsxwriter", mode="a")
-
-
-@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
-def test_kwargs(ext, nan_inf_to_errors):
- # GH 42286
- kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}}
- with tm.ensure_clean(ext) as f:
- msg = re.escape("Use of **kwargs is deprecated")
- with tm.assert_produces_warning(FutureWarning, match=msg):
- with ExcelWriter(f, engine="xlsxwriter", **kwargs) as writer:
- assert writer.book.nan_inf_to_errors == nan_inf_to_errors
-
-
-@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
-def test_engine_kwargs(ext, nan_inf_to_errors):
- # GH 42286
- engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}}
- with tm.ensure_clean(ext) as f:
- with ExcelWriter(f, engine="xlsxwriter", engine_kwargs=engine_kwargs) as writer:
- assert writer.book.nan_inf_to_errors == nan_inf_to_errors
diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py
index c58b9763f9618..7e1787d8c55d4 100644
--- a/pandas/tests/io/excel/test_xlwt.py
+++ b/pandas/tests/io/excel/test_xlwt.py
@@ -1,5 +1,3 @@
-import re
-
import numpy as np
import pytest
@@ -99,27 +97,3 @@ def test_option_xls_writer_deprecated(ext):
check_stacklevel=False,
):
options.io.excel.xls.writer = "xlwt"
-
-
-@pytest.mark.parametrize("write_only", [True, False])
-def test_kwargs(ext, write_only):
- # GH 42286
- # xlwt doesn't utilize kwargs, only test that supplying a kwarg works
- kwargs = {"write_only": write_only}
- with tm.ensure_clean(ext) as f:
- msg = re.escape("Use of **kwargs is deprecated")
- with tm.assert_produces_warning(FutureWarning, match=msg):
- with ExcelWriter(f, engine="openpyxl", **kwargs) as writer:
- # xlwt won't allow us to close without writing something
- DataFrame().to_excel(writer)
-
-
-@pytest.mark.parametrize("write_only", [True, False])
-def test_engine_kwargs(ext, write_only):
- # GH 42286
- # xlwt doesn't utilize kwargs, only test that supplying a engine_kwarg works
- engine_kwargs = {"write_only": write_only}
- with tm.ensure_clean(ext) as f:
- with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer:
- # xlwt won't allow us to close without writing something
- DataFrame().to_excel(writer)
diff --git a/pandas/tests/io/formats/style/test_align.py b/pandas/tests/io/formats/style/test_align.py
new file mode 100644
index 0000000000000..f81c1fbd6d85e
--- /dev/null
+++ b/pandas/tests/io/formats/style/test_align.py
@@ -0,0 +1,406 @@
+import pytest
+
+from pandas import DataFrame
+
+pytest.importorskip("jinja2")
+
+
+def bar_grad(a=None, b=None, c=None, d=None):
+ """Used in multiple tests to simplify formatting of expected result"""
+ ret = [("width", "10em"), ("height", "80%")]
+ if all(x is None for x in [a, b, c, d]):
+ return ret
+ return ret + [
+ (
+ "background",
+ f"linear-gradient(90deg,{','.join(x for x in [a, b, c, d] if x)})",
+ )
+ ]
+
+
+class TestStylerBarAlign:
+ def test_bar_align_left(self):
+ df = DataFrame({"A": [0, 1, 2]})
+ result = df.style.bar()._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (2, 0): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ }
+ assert result == expected
+
+ result = df.style.bar(color="red", width=50)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad("red 25.0%", " transparent 25.0%"),
+ (2, 0): bar_grad("red 50.0%", " transparent 50.0%"),
+ }
+ assert result == expected
+
+ df["C"] = ["a"] * len(df)
+ result = df.style.bar(color="red", width=50)._compute().ctx
+ assert result == expected
+ df["C"] = df["C"].astype("category")
+ result = df.style.bar(color="red", width=50)._compute().ctx
+ assert result == expected
+
+ def test_bar_align_left_0points(self):
+ df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+ result = df.style.bar()._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (0, 1): bar_grad(),
+ (0, 2): bar_grad(),
+ (1, 0): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (1, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (1, 2): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (2, 0): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ (2, 1): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ (2, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ }
+ assert result == expected
+
+ result = df.style.bar(axis=1)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (0, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (0, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ (1, 0): bar_grad(),
+ (1, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (1, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ (2, 0): bar_grad(),
+ (2, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"),
+ (2, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_pos_and_neg(self):
+ df = DataFrame({"A": [-10, 0, 20, 90]})
+ result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx
+ expected = {
+ (0, 0): bar_grad(
+ "#d65f5f 10.0%",
+ " transparent 10.0%",
+ ),
+ (1, 0): bar_grad(),
+ (2, 0): bar_grad(
+ " transparent 10.0%",
+ " #5fba7d 10.0%",
+ " #5fba7d 30.0%",
+ " transparent 30.0%",
+ ),
+ (3, 0): bar_grad(
+ " transparent 10.0%",
+ " #5fba7d 10.0%",
+ " #5fba7d 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_all_pos(self):
+ df = DataFrame({"A": [10, 20, 50, 100]})
+
+ result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx
+
+ expected = {
+ (0, 0): bar_grad(
+ "#5fba7d 10.0%",
+ " transparent 10.0%",
+ ),
+ (1, 0): bar_grad(
+ "#5fba7d 20.0%",
+ " transparent 20.0%",
+ ),
+ (2, 0): bar_grad(
+ "#5fba7d 50.0%",
+ " transparent 50.0%",
+ ),
+ (3, 0): bar_grad(
+ "#5fba7d 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+
+ assert result == expected
+
+ def test_bar_align_mid_all_neg(self):
+ df = DataFrame({"A": [-100, -60, -30, -20]})
+
+ result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx
+
+ expected = {
+ (0, 0): bar_grad(
+ "#d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ (1, 0): bar_grad(
+ " transparent 40.0%",
+ " #d65f5f 40.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ (2, 0): bar_grad(
+ " transparent 70.0%",
+ " #d65f5f 70.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ (3, 0): bar_grad(
+ " transparent 80.0%",
+ " #d65f5f 80.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_zero_pos_and_neg(self):
+ # See https://github.com/pandas-dev/pandas/pull/14757
+ df = DataFrame({"A": [-10, 0, 20, 90]})
+
+ result = (
+ df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"], width=90)
+ ._compute()
+ .ctx
+ )
+ expected = {
+ (0, 0): bar_grad(
+ " transparent 40.0%",
+ " #d65f5f 40.0%",
+ " #d65f5f 45.0%",
+ " transparent 45.0%",
+ ),
+ (1, 0): bar_grad(),
+ (2, 0): bar_grad(
+ " transparent 45.0%",
+ " #5fba7d 45.0%",
+ " #5fba7d 55.0%",
+ " transparent 55.0%",
+ ),
+ (3, 0): bar_grad(
+ " transparent 45.0%",
+ " #5fba7d 45.0%",
+ " #5fba7d 90.0%",
+ " transparent 90.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_left_axis_none(self):
+ df = DataFrame({"A": [0, 1], "B": [2, 4]})
+ result = df.style.bar(axis=None)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ "#d65f5f 25.0%",
+ " transparent 25.0%",
+ ),
+ (0, 1): bar_grad(
+ "#d65f5f 50.0%",
+ " transparent 50.0%",
+ ),
+ (1, 1): bar_grad(
+ "#d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_zero_axis_none(self):
+ df = DataFrame({"A": [0, 1], "B": [-2, 4]})
+ result = df.style.bar(align="zero", axis=None)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ " transparent 50.0%",
+ " #d65f5f 50.0%",
+ " #d65f5f 62.5%",
+ " transparent 62.5%",
+ ),
+ (0, 1): bar_grad(
+ " transparent 25.0%",
+ " #d65f5f 25.0%",
+ " #d65f5f 50.0%",
+ " transparent 50.0%",
+ ),
+ (1, 1): bar_grad(
+ " transparent 50.0%",
+ " #d65f5f 50.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_axis_none(self):
+ df = DataFrame({"A": [0, 1], "B": [-2, 4]})
+ result = df.style.bar(align="mid", axis=None)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ " transparent 33.3%",
+ " #d65f5f 33.3%",
+ " #d65f5f 50.0%",
+ " transparent 50.0%",
+ ),
+ (0, 1): bar_grad(
+ "#d65f5f 33.3%",
+ " transparent 33.3%",
+ ),
+ (1, 1): bar_grad(
+ " transparent 33.3%",
+ " #d65f5f 33.3%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmin(self):
+ df = DataFrame({"A": [0, 1], "B": [-2, 4]})
+ result = df.style.bar(align="mid", axis=None, vmin=-6)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ " transparent 60.0%",
+ " #d65f5f 60.0%",
+ " #d65f5f 70.0%",
+ " transparent 70.0%",
+ ),
+ (0, 1): bar_grad(
+ " transparent 40.0%",
+ " #d65f5f 40.0%",
+ " #d65f5f 60.0%",
+ " transparent 60.0%",
+ ),
+ (1, 1): bar_grad(
+ " transparent 60.0%",
+ " #d65f5f 60.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmax(self):
+ df = DataFrame({"A": [0, 1], "B": [-2, 4]})
+ result = df.style.bar(align="mid", axis=None, vmax=8)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ " transparent 20.0%",
+ " #d65f5f 20.0%",
+ " #d65f5f 30.0%",
+ " transparent 30.0%",
+ ),
+ (0, 1): bar_grad(
+ "#d65f5f 20.0%",
+ " transparent 20.0%",
+ ),
+ (1, 1): bar_grad(
+ " transparent 20.0%",
+ " #d65f5f 20.0%",
+ " #d65f5f 60.0%",
+ " transparent 60.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmin_vmax_wide(self):
+ df = DataFrame({"A": [0, 1], "B": [-2, 4]})
+ result = df.style.bar(align="mid", axis=None, vmin=-3, vmax=7)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ " transparent 30.0%",
+ " #d65f5f 30.0%",
+ " #d65f5f 40.0%",
+ " transparent 40.0%",
+ ),
+ (0, 1): bar_grad(
+ " transparent 10.0%",
+ " #d65f5f 10.0%",
+ " #d65f5f 30.0%",
+ " transparent 30.0%",
+ ),
+ (1, 1): bar_grad(
+ " transparent 30.0%",
+ " #d65f5f 30.0%",
+ " #d65f5f 70.0%",
+ " transparent 70.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmin_vmax_clipping(self):
+ df = DataFrame({"A": [0, 1], "B": [-2, 4]})
+ result = df.style.bar(align="mid", axis=None, vmin=-1, vmax=3)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(),
+ (1, 0): bar_grad(
+ " transparent 25.0%",
+ " #d65f5f 25.0%",
+ " #d65f5f 50.0%",
+ " transparent 50.0%",
+ ),
+ (0, 1): bar_grad("#d65f5f 25.0%", " transparent 25.0%"),
+ (1, 1): bar_grad(
+ " transparent 25.0%",
+ " #d65f5f 25.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_mid_nans(self):
+ df = DataFrame({"A": [1, None], "B": [-1, 3]})
+ result = df.style.bar(align="mid", axis=None)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(
+ " transparent 25.0%",
+ " #d65f5f 25.0%",
+ " #d65f5f 50.0%",
+ " transparent 50.0%",
+ ),
+ (0, 1): bar_grad("#d65f5f 25.0%", " transparent 25.0%"),
+ (1, 1): bar_grad(
+ " transparent 25.0%",
+ " #d65f5f 25.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_align_zero_nans(self):
+ df = DataFrame({"A": [1, None], "B": [-1, 2]})
+ result = df.style.bar(align="zero", axis=None)._compute().ctx
+ expected = {
+ (0, 0): bar_grad(
+ " transparent 50.0%",
+ " #d65f5f 50.0%",
+ " #d65f5f 75.0%",
+ " transparent 75.0%",
+ ),
+ (0, 1): bar_grad(
+ " transparent 25.0%",
+ " #d65f5f 25.0%",
+ " #d65f5f 50.0%",
+ " transparent 50.0%",
+ ),
+ (1, 1): bar_grad(
+ " transparent 50.0%",
+ " #d65f5f 50.0%",
+ " #d65f5f 100.0%",
+ " transparent 100.0%",
+ ),
+ }
+ assert result == expected
+
+ def test_bar_bad_align_raises(self):
+ df = DataFrame({"A": [-100, -60, -30, -20]})
+ msg = "`align` must be one of {'left', 'zero',' mid'}"
+ with pytest.raises(ValueError, match=msg):
+ df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"])
diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py
deleted file mode 100644
index 19884aaac86a7..0000000000000
--- a/pandas/tests/io/formats/style/test_bar.py
+++ /dev/null
@@ -1,307 +0,0 @@
-import numpy as np
-import pytest
-
-from pandas import DataFrame
-
-pytest.importorskip("jinja2")
-
-
-def bar_grad(a=None, b=None, c=None, d=None):
- """Used in multiple tests to simplify formatting of expected result"""
- ret = [("width", "10em")]
- if all(x is None for x in [a, b, c, d]):
- return ret
- return ret + [
- (
- "background",
- f"linear-gradient(90deg,{','.join([x for x in [a, b, c, d] if x])})",
- )
- ]
-
-
-def no_bar():
- return bar_grad()
-
-
-def bar_to(x, color="#d65f5f"):
- return bar_grad(f" {color} {x:.1f}%", f" transparent {x:.1f}%")
-
-
-def bar_from_to(x, y, color="#d65f5f"):
- return bar_grad(
- f" transparent {x:.1f}%",
- f" {color} {x:.1f}%",
- f" {color} {y:.1f}%",
- f" transparent {y:.1f}%",
- )
-
-
-@pytest.fixture
-def df_pos():
- return DataFrame([[1], [2], [3]])
-
-
-@pytest.fixture
-def df_neg():
- return DataFrame([[-1], [-2], [-3]])
-
-
-@pytest.fixture
-def df_mix():
- return DataFrame([[-3], [1], [2]])
-
-
-@pytest.mark.parametrize(
- "align, exp",
- [
- ("left", [no_bar(), bar_to(50), bar_to(100)]),
- ("right", [bar_to(100), bar_from_to(50, 100), no_bar()]),
- ("mid", [bar_to(33.33), bar_to(66.66), bar_to(100)]),
- ("zero", [bar_from_to(50, 66.7), bar_from_to(50, 83.3), bar_from_to(50, 100)]),
- ("mean", [bar_to(50), no_bar(), bar_from_to(50, 100)]),
- (2.0, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
- (np.median, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
- ],
-)
-def test_align_positive_cases(df_pos, align, exp):
- # test different align cases for all positive values
- result = df_pos.style.bar(align=align)._compute().ctx
- expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
- assert result == expected
-
-
-@pytest.mark.parametrize(
- "align, exp",
- [
- ("left", [bar_to(100), bar_to(50), no_bar()]),
- ("right", [no_bar(), bar_from_to(50, 100), bar_to(100)]),
- ("mid", [bar_from_to(66.66, 100), bar_from_to(33.33, 100), bar_to(100)]),
- ("zero", [bar_from_to(33.33, 50), bar_from_to(16.66, 50), bar_to(50)]),
- ("mean", [bar_from_to(50, 100), no_bar(), bar_to(50)]),
- (-2.0, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
- (np.median, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
- ],
-)
-def test_align_negative_cases(df_neg, align, exp):
- # test different align cases for all negative values
- result = df_neg.style.bar(align=align)._compute().ctx
- expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
- assert result == expected
-
-
-@pytest.mark.parametrize(
- "align, exp",
- [
- ("left", [no_bar(), bar_to(80), bar_to(100)]),
- ("right", [bar_to(100), bar_from_to(80, 100), no_bar()]),
- ("mid", [bar_to(60), bar_from_to(60, 80), bar_from_to(60, 100)]),
- ("zero", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
- ("mean", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
- (-0.0, [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
- (np.nanmedian, [bar_to(50), no_bar(), bar_from_to(50, 62.5)]),
- ],
-)
-@pytest.mark.parametrize("nans", [True, False])
-def test_align_mixed_cases(df_mix, align, exp, nans):
- # test different align cases for mixed positive and negative values
- # also test no impact of NaNs and no_bar
- expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
- if nans:
- df_mix.loc[3, :] = np.nan
- expected.update({(3, 0): no_bar()})
- result = df_mix.style.bar(align=align)._compute().ctx
- assert result == expected
-
-
-@pytest.mark.parametrize(
- "align, exp",
- [
- (
- "left",
- {
- "index": [[no_bar(), no_bar()], [bar_to(100), bar_to(100)]],
- "columns": [[no_bar(), bar_to(100)], [no_bar(), bar_to(100)]],
- "none": [[no_bar(), bar_to(33.33)], [bar_to(66.66), bar_to(100)]],
- },
- ),
- (
- "mid",
- {
- "index": [[bar_to(33.33), bar_to(50)], [bar_to(100), bar_to(100)]],
- "columns": [[bar_to(50), bar_to(100)], [bar_to(75), bar_to(100)]],
- "none": [[bar_to(25), bar_to(50)], [bar_to(75), bar_to(100)]],
- },
- ),
- (
- "zero",
- {
- "index": [
- [bar_from_to(50, 66.66), bar_from_to(50, 75)],
- [bar_from_to(50, 100), bar_from_to(50, 100)],
- ],
- "columns": [
- [bar_from_to(50, 75), bar_from_to(50, 100)],
- [bar_from_to(50, 87.5), bar_from_to(50, 100)],
- ],
- "none": [
- [bar_from_to(50, 62.5), bar_from_to(50, 75)],
- [bar_from_to(50, 87.5), bar_from_to(50, 100)],
- ],
- },
- ),
- (
- 2,
- {
- "index": [
- [bar_to(50), no_bar()],
- [bar_from_to(50, 100), bar_from_to(50, 100)],
- ],
- "columns": [
- [bar_to(50), no_bar()],
- [bar_from_to(50, 75), bar_from_to(50, 100)],
- ],
- "none": [
- [bar_from_to(25, 50), no_bar()],
- [bar_from_to(50, 75), bar_from_to(50, 100)],
- ],
- },
- ),
- ],
-)
-@pytest.mark.parametrize("axis", ["index", "columns", "none"])
-def test_align_axis(align, exp, axis):
- # test all axis combinations with positive values and different aligns
- data = DataFrame([[1, 2], [3, 4]])
- result = (
- data.style.bar(align=align, axis=None if axis == "none" else axis)
- ._compute()
- .ctx
- )
- expected = {
- (0, 0): exp[axis][0][0],
- (0, 1): exp[axis][0][1],
- (1, 0): exp[axis][1][0],
- (1, 1): exp[axis][1][1],
- }
- assert result == expected
-
-
-@pytest.mark.parametrize(
- "values, vmin, vmax",
- [
- ("positive", 1.5, 2.5),
- ("negative", -2.5, -1.5),
- ("mixed", -2.5, 1.5),
- ],
-)
-@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately
-@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
-def test_vmin_vmax_clipping(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
- # test that clipping occurs if any vmin > data_values or vmax < data_values
- if align == "mid": # mid acts as left or right in each case
- if values == "positive":
- align = "left"
- elif values == "negative":
- align = "right"
- df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values]
- vmin = None if nullify == "vmin" else vmin
- vmax = None if nullify == "vmax" else vmax
-
- clip_df = df.where(df <= (vmax if vmax else 999), other=vmax)
- clip_df = clip_df.where(clip_df >= (vmin if vmin else -999), other=vmin)
-
- result = (
- df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
- ._compute()
- .ctx
- )
- expected = clip_df.style.bar(align=align, color=["red", "green"])._compute().ctx
- assert result == expected
-
-
-@pytest.mark.parametrize(
- "values, vmin, vmax",
- [
- ("positive", 0.5, 4.5),
- ("negative", -4.5, -0.5),
- ("mixed", -4.5, 4.5),
- ],
-)
-@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately
-@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
-def test_vmin_vmax_widening(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
- # test that widening occurs if any vmax > data_values or vmin < data_values
- if align == "mid": # mid acts as left or right in each case
- if values == "positive":
- align = "left"
- elif values == "negative":
- align = "right"
- df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values]
- vmin = None if nullify == "vmin" else vmin
- vmax = None if nullify == "vmax" else vmax
-
- expand_df = df.copy()
- expand_df.loc[3, :], expand_df.loc[4, :] = vmin, vmax
-
- result = (
- df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
- ._compute()
- .ctx
- )
- expected = expand_df.style.bar(align=align, color=["red", "green"])._compute().ctx
- assert result.items() <= expected.items()
-
-
-def test_numerics():
- # test data is pre-selected for numeric values
- data = DataFrame([[1, "a"], [2, "b"]])
- result = data.style.bar()._compute().ctx
- assert (0, 1) not in result
- assert (1, 1) not in result
-
-
-@pytest.mark.parametrize(
- "align, exp",
- [
- ("left", [no_bar(), bar_to(100, "green")]),
- ("right", [bar_to(100, "red"), no_bar()]),
- ("mid", [bar_to(25, "red"), bar_from_to(25, 100, "green")]),
- ("zero", [bar_from_to(33.33, 50, "red"), bar_from_to(50, 100, "green")]),
- ],
-)
-def test_colors_mixed(align, exp):
- data = DataFrame([[-1], [3]])
- result = data.style.bar(align=align, color=["red", "green"])._compute().ctx
- assert result == {(0, 0): exp[0], (1, 0): exp[1]}
-
-
-def test_bar_align_height():
- # test when keyword height is used 'no-repeat center' and 'background-size' present
- data = DataFrame([[1], [2]])
- result = data.style.bar(align="left", height=50)._compute().ctx
- bg_s = "linear-gradient(90deg, #d65f5f 100.0%, transparent 100.0%) no-repeat center"
- expected = {
- (0, 0): [("width", "10em")],
- (1, 0): [
- ("width", "10em"),
- ("background", bg_s),
- ("background-size", "100% 50.0%"),
- ],
- }
- assert result == expected
-
-
-def test_bar_value_error_raises():
- df = DataFrame({"A": [-100, -60, -30, -20]})
-
- msg = "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or"
- with pytest.raises(ValueError, match=msg):
- df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]).to_html()
-
- msg = r"`width` must be a value in \[0, 100\]"
- with pytest.raises(ValueError, match=msg):
- df.style.bar(width=200).to_html()
-
- msg = r"`height` must be a value in \[0, 100\]"
- with pytest.raises(ValueError, match=msg):
- df.style.bar(height=200).to_html()
diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py
index 495dc82f0e7bd..74b4c7ea3977c 100644
--- a/pandas/tests/io/formats/style/test_html.py
+++ b/pandas/tests/io/formats/style/test_html.py
@@ -1,12 +1,8 @@
from textwrap import dedent
-import numpy as np
import pytest
-from pandas import (
- DataFrame,
- MultiIndex,
-)
+from pandas import DataFrame
jinja2 = pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@@ -20,12 +16,6 @@ def styler():
return Styler(DataFrame([[2.61], [2.69]], index=["a", "b"], columns=["A"]))
-@pytest.fixture
-def styler_mi():
- midx = MultiIndex.from_product([["a", "b"], ["c", "d"]])
- return Styler(DataFrame(np.arange(16).reshape(4, 4), index=midx, columns=midx))
-
-
@pytest.fixture
def tpl_style():
return env.get_template("html_style.tpl")
@@ -41,8 +31,8 @@ def test_html_template_extends_options():
# to understand the dependency
with open("pandas/io/formats/templates/html.tpl") as file:
result = file.read()
- assert "{% include html_style_tpl %}" in result
- assert "{% include html_table_tpl %}" in result
+ assert '{% include "html_style.tpl" %}' in result
+ assert '{% include "html_table.tpl" %}' in result
def test_exclude_styles(styler):
@@ -223,191 +213,26 @@ def test_block_names(tpl_style, tpl_table):
assert result2 == expected_table
-def test_from_custom_template_table(tmpdir):
- p = tmpdir.mkdir("tpl").join("myhtml_table.tpl")
- p.write(
- dedent(
- """\
- {% extends "html_table.tpl" %}
- {% block table %}
-            <h1>{{custom_title}}</h1>
- {{ super() }}
- {% endblock table %}"""
- )
- )
- result = Styler.from_custom_template(str(tmpdir.join("tpl")), "myhtml_table.tpl")
- assert issubclass(result, Styler)
- assert result.env is not Styler.env
- assert result.template_html_table is not Styler.template_html_table
- styler = result(DataFrame({"A": [1, 2]}))
-    assert "<h1>My Title</h1>\n\n\n<table" in styler.render(custom_title="My Title")
-
-
-def test_from_custom_template_style(tmpdir):
-    p = tmpdir.mkdir("tpl").join("myhtml_style.tpl")
+def test_from_custom_template(tmpdir):
+    p = tmpdir.mkdir("templates").join("myhtml.tpl")
    p.write(
        dedent(
            """\
-            {% extends "html_style.tpl" %}
-            {% block style %}
-            <link rel="stylesheet" href="mystyle.css">
-            {{ super() }}
-            {% endblock style %}"""
+            {% extends "html.tpl" %}
+            {% block table %}
+            <h1>{{ table_title|default("My Table") }}</h1>
+            {{ super() }}
+            {% endblock table %}"""
        )
    )
- result = Styler.from_custom_template(
- str(tmpdir.join("tpl")), html_style="myhtml_style.tpl"
- )
+ result = Styler.from_custom_template(str(tmpdir.join("templates")), "myhtml.tpl")
assert issubclass(result, Styler)
assert result.env is not Styler.env
- assert result.template_html_style is not Styler.template_html_style
+ assert result.template_html is not Styler.template_html
styler = result(DataFrame({"A": [1, 2]}))
-    assert '<link rel="stylesheet" href="mystyle.css">\n\n<style' in styler.render()