pandas-dev
diff --git a/‎.github/workflows/wheels.yml
+4-2 b/‎.github/workflows/wheels.yml
+4-2
diff --git a/‎.pre-commit-config.yaml
+2-2 b/‎.pre-commit-config.yaml
+2-2
diff --git a/‎LICENSE
+1-1 b/‎LICENSE
+1-1
diff --git a/‎asv_bench/benchmarks/indexing.py
+32 b/‎asv_bench/benchmarks/indexing.py
+32
diff --git a/‎asv_bench/benchmarks/indexing_engines.py
+81-1 b/‎asv_bench/benchmarks/indexing_engines.py
+81-1
diff --git a/‎ci/code_checks.sh
+66-3 b/‎ci/code_checks.sh
+66-3
diff --git a/‎doc/source/development/contributing_docstring.rst
-2 b/‎doc/source/development/contributing_docstring.rst
-2
diff --git a/‎doc/source/development/internals.rst
+23-23 b/‎doc/source/development/internals.rst
+23-23
@@ -86,7 +86,8 @@ jobs:
           activate-environment: test
           channels: conda-forge, anaconda
           channel-priority: true
-          mamba-version: "*"
+          # mamba fails to solve, also we really don't need this since we're just installing python
+          # mamba-version: "*"
 
       - name: Test wheels (Windows 64-bit only)
         if: ${{ matrix.buildplat[1] == 'win_amd64' }}
@@ -154,7 +155,8 @@ jobs:
           python-version: '3.8'
           channels: conda-forge
           channel-priority: true
-          mamba-version: "*"
+          # mamba fails to solve, also we really don't need this since we're just installing python
+          # mamba-version: "*"
 
       - name: Build sdist
         run: |
 
@@ -92,7 +92,7 @@ repos:
         args: [--disable=all, --enable=redefined-outer-name]
         stages: [manual]
 -   repo: https://github.com/PyCQA/isort
-    rev: 5.11.4
+    rev: 5.12.0
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
@@ -135,7 +135,7 @@ repos:
         types: [python]
         stages: [manual]
         additional_dependencies: &pyright_dependencies
-        - pyright@1.1.276
+        - pyright@1.1.284
     -   id: pyright_reportGeneralTypeIssues
         # note: assumes python env is setup and activated
         name: pyright reportGeneralTypeIssues
 
@@ -3,7 +3,7 @@ BSD 3-Clause License
 Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
 All rights reserved.
 
-Copyright (c) 2011-2022, Open source contributors.
+Copyright (c) 2011-2023, Open source contributors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
@@ -8,6 +8,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     CategoricalIndex,
     DataFrame,
     Index,
@@ -83,6 +84,37 @@ def time_loc_slice(self, index, index_structure):
         self.data.loc[:800000]
 
 
+class NumericMaskedIndexing:
+    monotonic_list = list(range(10**6))
+    non_monotonic_list = (
+        list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
+    )
+
+    params = [
+        ("Int64", "UInt64", "Float64"),
+        (True, False),
+    ]
+    param_names = ["dtype", "monotonic"]
+
+    def setup(self, dtype, monotonic):
+
+        indices = {
+            True: Index(self.monotonic_list, dtype=dtype),
+            False: Index(self.non_monotonic_list, dtype=dtype).append(
+                Index([NA], dtype=dtype)
+            ),
+        }
+        self.data = indices[monotonic]
+        self.indexer = np.arange(300, 1_000)
+        self.data_dups = self.data.append(self.data)
+
+    def time_get_indexer(self, dtype, monotonic):
+        self.data.get_indexer(self.indexer)
+
+    def time_get_indexer_dups(self, dtype, monotonic):
+        self.data.get_indexer_for(self.indexer)
+
+
 class NonNumericSeriesIndexing:
 
     params = [
 
@@ -1,5 +1,8 @@
 """
-Benchmarks in this file depend exclusively on code in _libs/
+Benchmarks in this file depend mostly on code in _libs/
+
+We have to create masked arrays to test the masked engine though. The
+array is unpacked on the Cython level.
 
 If a PR does not edit anything in _libs, it is very unlikely that benchmarks
 in this file will be affected.
@@ -9,6 +12,8 @@
 
 from pandas._libs import index as libindex
 
+from pandas.core.arrays import BaseMaskedArray
+
 
 def _get_numeric_engines():
     engine_names = [
@@ -30,6 +35,26 @@ def _get_numeric_engines():
     ]
 
 
+def _get_masked_engines():
+    engine_names = [
+        ("MaskedInt64Engine", "Int64"),
+        ("MaskedInt32Engine", "Int32"),
+        ("MaskedInt16Engine", "Int16"),
+        ("MaskedInt8Engine", "Int8"),
+        ("MaskedUInt64Engine", "UInt64"),
+        ("MaskedUInt32Engine", "UInt32"),
+        ("MaskedUInt16engine", "UInt16"),
+        ("MaskedUInt8Engine", "UInt8"),
+        ("MaskedFloat64Engine", "Float64"),
+        ("MaskedFloat32Engine", "Float32"),
+    ]
+    return [
+        (getattr(libindex, engine_name), dtype)
+        for engine_name, dtype in engine_names
+        if hasattr(libindex, engine_name)
+    ]
+
+
 class NumericEngineIndexing:
 
     params = [
@@ -80,6 +105,61 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
         self.data.get_loc(self.key_middle)
 
 
+class MaskedNumericEngineIndexing:
+
+    params = [
+        _get_masked_engines(),
+        ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+        [True, False],
+        [10**5, 2 * 10**6],  # 2e6 is above SIZE_CUTOFF
+    ]
+    param_names = ["engine_and_dtype", "index_type", "unique", "N"]
+
+    def setup(self, engine_and_dtype, index_type, unique, N):
+        engine, dtype = engine_and_dtype
+
+        if index_type == "monotonic_incr":
+            if unique:
+                arr = np.arange(N * 3, dtype=dtype.lower())
+            else:
+                values = list([1] * N + [2] * N + [3] * N)
+                arr = np.array(values, dtype=dtype.lower())
+            mask = np.zeros(N * 3, dtype=np.bool_)
+        elif index_type == "monotonic_decr":
+            if unique:
+                arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
+            else:
+                values = list([1] * N + [2] * N + [3] * N)
+                arr = np.array(values, dtype=dtype.lower())[::-1]
+            mask = np.zeros(N * 3, dtype=np.bool_)
+        else:
+            assert index_type == "non_monotonic"
+            if unique:
+                arr = np.zeros(N * 3, dtype=dtype.lower())
+                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
+                arr[N:] = np.arange(N * 2, dtype=dtype.lower())
+
+            else:
+                arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
+            mask = np.zeros(N * 3, dtype=np.bool_)
+            mask[-1] = True
+
+        self.data = engine(BaseMaskedArray(arr, mask))
+        # code belows avoids populating the mapping etc. while timing.
+        self.data.get_loc(2)
+
+        self.key_middle = arr[len(arr) // 2]
+        self.key_early = arr[2]
+
+    def time_get_loc(self, engine_and_dtype, index_type, unique, N):
+        self.data.get_loc(self.key_early)
+
+    def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
+        # searchsorted performance may be different near the middle of a range
+        #  vs near an endpoint
+        self.data.get_loc(self.key_middle)
+
+
 class ObjectEngineIndexing:
 
     params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]
 
@@ -83,7 +83,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
-    MSG='Partially validate docstrings (EX01)' ; echo $MSG
+    MSG='Partially validate docstrings (EX01)' ;  echo $MSG
     $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \
         pandas.Series.index \
         pandas.Series.dtype \
@@ -187,7 +187,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.show_versions \
         pandas.test \
         pandas.NaT \
-        pandas.Timestamp.unit \
         pandas.Timestamp.as_unit \
         pandas.Timestamp.ctime \
         pandas.Timestamp.date \
@@ -574,7 +573,71 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.DataFrame.sparse.to_coo \
         pandas.DataFrame.to_gbq \
         pandas.DataFrame.style \
-        pandas.DataFrame.__dataframe__ \
+        pandas.DataFrame.__dataframe__
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+    MSG='Partially validate docstrings (EX02)' ;  echo $MSG
+    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX02 --ignore_functions \
+        pandas.DataFrame.plot.line \
+        pandas.DataFrame.std \
+        pandas.DataFrame.var \
+        pandas.Index.factorize \
+        pandas.Period.strftime \
+        pandas.Series.factorize \
+        pandas.Series.floordiv \
+        pandas.Series.plot.line \
+        pandas.Series.rfloordiv \
+        pandas.Series.sparse.density \
+        pandas.Series.sparse.npoints \
+        pandas.Series.sparse.sp_values \
+        pandas.Series.std \
+        pandas.Series.var \
+        pandas.Timestamp.fromtimestamp \
+        pandas.api.types.infer_dtype \
+        pandas.api.types.is_bool_dtype \
+        pandas.api.types.is_categorical_dtype \
+        pandas.api.types.is_complex_dtype \
+        pandas.api.types.is_datetime64_any_dtype \
+        pandas.api.types.is_datetime64_dtype \
+        pandas.api.types.is_datetime64_ns_dtype \
+        pandas.api.types.is_datetime64tz_dtype \
+        pandas.api.types.is_dict_like \
+        pandas.api.types.is_file_like \
+        pandas.api.types.is_float_dtype \
+        pandas.api.types.is_hashable \
+        pandas.api.types.is_int64_dtype \
+        pandas.api.types.is_integer_dtype \
+        pandas.api.types.is_interval_dtype \
+        pandas.api.types.is_iterator \
+        pandas.api.types.is_list_like \
+        pandas.api.types.is_named_tuple \
+        pandas.api.types.is_numeric_dtype \
+        pandas.api.types.is_object_dtype \
+        pandas.api.types.is_period_dtype \
+        pandas.api.types.is_re \
+        pandas.api.types.is_re_compilable \
+        pandas.api.types.is_signed_integer_dtype \
+        pandas.api.types.is_sparse \
+        pandas.api.types.is_string_dtype \
+        pandas.api.types.is_timedelta64_dtype \
+        pandas.api.types.is_timedelta64_ns_dtype \
+        pandas.api.types.is_unsigned_integer_dtype \
+        pandas.core.groupby.DataFrameGroupBy.take \
+        pandas.core.groupby.SeriesGroupBy.take \
+        pandas.factorize \
+        pandas.io.formats.style.Styler.concat \
+        pandas.io.formats.style.Styler.export \
+        pandas.io.formats.style.Styler.set_td_classes \
+        pandas.io.formats.style.Styler.use \
+        pandas.io.json.build_table_schema \
+        pandas.merge_ordered \
+        pandas.option_context \
+        pandas.plotting.andrews_curves \
+        pandas.plotting.autocorrelation_plot \
+        pandas.plotting.lag_plot \
+        pandas.plotting.parallel_coordinates \
+        pandas.plotting.radviz \
+        pandas.tseries.frequencies.to_offset
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
 fi
 
@@ -67,8 +67,6 @@ case of pandas, the NumPy docstring convention is followed. These conventions ar
 explained in this document:
 
 * `numpydoc docstring guide <https://numpydoc.readthedocs.io/en/latest/format.html>`_
-  (which is based in the original `Guide to NumPy/SciPy documentation
-  <https://github.com/numpy/numpy/blob/main/doc/HOWTO_DOCUMENT.rst.txt>`_)
 
 numpydoc is a Sphinx extension to support the NumPy docstring convention.
 
 
@@ -15,24 +15,24 @@ Indexing
 In pandas there are a few objects implemented which can serve as valid
 containers for the axis labels:
 
-* ``Index``: the generic "ordered set" object, an ndarray of object dtype
+* :class:`Index`: the generic "ordered set" object, an ndarray of object dtype
   assuming nothing about its contents. The labels must be hashable (and
   likely immutable) and unique. Populates a dict of label to location in
   Cython to do ``O(1)`` lookups.
 * ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer
   data, such as time stamps
 * ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data
-* ``MultiIndex``: the standard hierarchical index object
-* ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values)
-* ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values)
-* ``PeriodIndex``: An Index object with Period elements
+* :class:`MultiIndex`: the standard hierarchical index object
+* :class:`DatetimeIndex`: An Index object with :class:`Timestamp` boxed elements (impl are the int64 values)
+* :class:`TimedeltaIndex`: An Index object with :class:`Timedelta` boxed elements (impl are the int64 values)
+* :class:`PeriodIndex`: An Index object with Period elements
 
 There are functions that make the creation of a regular index easy:
 
-* ``date_range``: fixed frequency date range generated from a time rule or
+* :func:`date_range`: fixed frequency date range generated from a time rule or
   DateOffset. An ndarray of Python datetime objects
-* ``period_range``: fixed frequency date range generated from a time rule or
-  DateOffset. An ndarray of ``Period`` objects, representing timespans
+* :func:`period_range`: fixed frequency date range generated from a time rule or
+  DateOffset. An ndarray of :class:`Period` objects, representing timespans
 
 The motivation for having an ``Index`` class in the first place was to enable
 different implementations of indexing. This means that it's possible for you,
@@ -43,28 +43,28 @@ From an internal implementation point of view, the relevant methods that an
 ``Index`` must define are one or more of the following (depending on how
 incompatible the new object internals are with the ``Index`` functions):
 
-* ``get_loc``: returns an "indexer" (an integer, or in some cases a
+* :meth:`~Index.get_loc`: returns an "indexer" (an integer, or in some cases a
   slice object) for a label
-* ``slice_locs``: returns the "range" to slice between two labels
-* ``get_indexer``: Computes the indexing vector for reindexing / data
+* :meth:`~Index.slice_locs`: returns the "range" to slice between two labels
+* :meth:`~Index.get_indexer`: Computes the indexing vector for reindexing / data
   alignment purposes. See the source / docstrings for more on this
-* ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
+* :meth:`~Index.get_indexer_non_unique`: Computes the indexing vector for reindexing / data
   alignment purposes when the index is non-unique. See the source / docstrings
   for more on this
-* ``reindex``: Does any pre-conversion of the input index then calls
+* :meth:`~Index.reindex`: Does any pre-conversion of the input index then calls
   ``get_indexer``
-* ``union``, ``intersection``: computes the union or intersection of two
+* :meth:`~Index.union`, :meth:`~Index.intersection`: computes the union or intersection of two
   Index objects
-* ``insert``: Inserts a new label into an Index, yielding a new object
-* ``delete``: Delete a label, yielding a new object
-* ``drop``: Deletes a set of labels
-* ``take``: Analogous to ndarray.take
+* :meth:`~Index.insert`: Inserts a new label into an Index, yielding a new object
+* :meth:`~Index.delete`: Delete a label, yielding a new object
+* :meth:`~Index.drop`: Deletes a set of labels
+* :meth:`~Index.take`: Analogous to ndarray.take
 
 MultiIndex
 ~~~~~~~~~~
 
-Internally, the ``MultiIndex`` consists of a few things: the **levels**, the
-integer **codes** (until version 0.24 named *labels*), and the level **names**:
+Internally, the :class:`MultiIndex` consists of a few things: the **levels**, the
+integer **codes**, and the level **names**:
 
 .. ipython:: python
 
@@ -80,13 +80,13 @@ You can probably guess that the codes determine which unique element is
 identified with that location at each layer of the index. It's important to
 note that sortedness is determined **solely** from the integer codes and does
 not check (or care) whether the levels themselves are sorted. Fortunately, the
-constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
-if you compute the levels and codes yourself, please be careful.
+constructors :meth:`~MultiIndex.from_tuples` and :meth:`~MultiIndex.from_arrays` ensure
+that this is true, but if you compute the levels and codes yourself, please be careful.
 
 Values
 ~~~~~~
 
-pandas extends NumPy's type system with custom types, like ``Categorical`` or
+pandas extends NumPy's type system with custom types, like :class:`Categorical` or
 datetimes with a timezone, so we have multiple notions of "values". For 1-D
 containers (``Index`` classes and ``Series``) we have the following convention: