Skip to content

Commit 4a61e6a

Browse files
author
MarcoGorelli
committed
Merge remote-tracking branch 'upstream/main' into allow-mixed-iso
2 parents e01b6ee + 0105aa2 commit 4a61e6a

File tree

206 files changed

+3335
-2047
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

206 files changed

+3335
-2047
lines changed

.github/workflows/wheels.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ jobs:
8686
activate-environment: test
8787
channels: conda-forge, anaconda
8888
channel-priority: true
89-
mamba-version: "*"
89+
# mamba fails to solve, also we really don't need this since we're just installing python
90+
# mamba-version: "*"
9091

9192
- name: Test wheels (Windows 64-bit only)
9293
if: ${{ matrix.buildplat[1] == 'win_amd64' }}
@@ -154,7 +155,8 @@ jobs:
154155
python-version: '3.8'
155156
channels: conda-forge
156157
channel-priority: true
157-
mamba-version: "*"
158+
# mamba fails to solve, also we really don't need this since we're just installing python
159+
# mamba-version: "*"
158160

159161
- name: Build sdist
160162
run: |

.pre-commit-config.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ repos:
9292
args: [--disable=all, --enable=redefined-outer-name]
9393
stages: [manual]
9494
- repo: https://github.com/PyCQA/isort
95-
rev: 5.11.4
95+
rev: 5.12.0
9696
hooks:
9797
- id: isort
9898
- repo: https://github.com/asottile/pyupgrade
@@ -135,7 +135,7 @@ repos:
135135
types: [python]
136136
stages: [manual]
137137
additional_dependencies: &pyright_dependencies
138-
138+
139139
- id: pyright_reportGeneralTypeIssues
140140
# note: assumes python env is setup and activated
141141
name: pyright reportGeneralTypeIssues

LICENSE

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ BSD 3-Clause License
33
Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
44
All rights reserved.
55

6-
Copyright (c) 2011-2022, Open source contributors.
6+
Copyright (c) 2011-2023, Open source contributors.
77

88
Redistribution and use in source and binary forms, with or without
99
modification, are permitted provided that the following conditions are met:

asv_bench/benchmarks/indexing.py

+32
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import numpy as np
99

1010
from pandas import (
11+
NA,
1112
CategoricalIndex,
1213
DataFrame,
1314
Index,
@@ -83,6 +84,37 @@ def time_loc_slice(self, index, index_structure):
8384
self.data.loc[:800000]
8485

8586

87+
class NumericMaskedIndexing:
88+
monotonic_list = list(range(10**6))
89+
non_monotonic_list = (
90+
list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
91+
)
92+
93+
params = [
94+
("Int64", "UInt64", "Float64"),
95+
(True, False),
96+
]
97+
param_names = ["dtype", "monotonic"]
98+
99+
def setup(self, dtype, monotonic):
100+
101+
indices = {
102+
True: Index(self.monotonic_list, dtype=dtype),
103+
False: Index(self.non_monotonic_list, dtype=dtype).append(
104+
Index([NA], dtype=dtype)
105+
),
106+
}
107+
self.data = indices[monotonic]
108+
self.indexer = np.arange(300, 1_000)
109+
self.data_dups = self.data.append(self.data)
110+
111+
def time_get_indexer(self, dtype, monotonic):
112+
self.data.get_indexer(self.indexer)
113+
114+
def time_get_indexer_dups(self, dtype, monotonic):
115+
self.data.get_indexer_for(self.indexer)
116+
117+
86118
class NonNumericSeriesIndexing:
87119

88120
params = [

asv_bench/benchmarks/indexing_engines.py

+81-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
"""
2-
Benchmarks in this file depend exclusively on code in _libs/
2+
Benchmarks in this file depend mostly on code in _libs/
3+
4+
We have to create masked arrays to test the masked engine, though. The
5+
array is unpacked on the Cython level.
36
47
If a PR does not edit anything in _libs, it is very unlikely that benchmarks
58
in this file will be affected.
@@ -9,6 +12,8 @@
912

1013
from pandas._libs import index as libindex
1114

15+
from pandas.core.arrays import BaseMaskedArray
16+
1217

1318
def _get_numeric_engines():
1419
engine_names = [
@@ -30,6 +35,26 @@ def _get_numeric_engines():
3035
]
3136

3237

38+
def _get_masked_engines():
39+
engine_names = [
40+
("MaskedInt64Engine", "Int64"),
41+
("MaskedInt32Engine", "Int32"),
42+
("MaskedInt16Engine", "Int16"),
43+
("MaskedInt8Engine", "Int8"),
44+
("MaskedUInt64Engine", "UInt64"),
45+
("MaskedUInt32Engine", "UInt32"),
46+
("MaskedUInt16Engine", "UInt16"),
47+
("MaskedUInt8Engine", "UInt8"),
48+
("MaskedFloat64Engine", "Float64"),
49+
("MaskedFloat32Engine", "Float32"),
50+
]
51+
return [
52+
(getattr(libindex, engine_name), dtype)
53+
for engine_name, dtype in engine_names
54+
if hasattr(libindex, engine_name)
55+
]
56+
57+
3358
class NumericEngineIndexing:
3459

3560
params = [
@@ -80,6 +105,61 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
80105
self.data.get_loc(self.key_middle)
81106

82107

108+
class MaskedNumericEngineIndexing:
109+
110+
params = [
111+
_get_masked_engines(),
112+
["monotonic_incr", "monotonic_decr", "non_monotonic"],
113+
[True, False],
114+
[10**5, 2 * 10**6], # 2e6 is above SIZE_CUTOFF
115+
]
116+
param_names = ["engine_and_dtype", "index_type", "unique", "N"]
117+
118+
def setup(self, engine_and_dtype, index_type, unique, N):
119+
engine, dtype = engine_and_dtype
120+
121+
if index_type == "monotonic_incr":
122+
if unique:
123+
arr = np.arange(N * 3, dtype=dtype.lower())
124+
else:
125+
values = list([1] * N + [2] * N + [3] * N)
126+
arr = np.array(values, dtype=dtype.lower())
127+
mask = np.zeros(N * 3, dtype=np.bool_)
128+
elif index_type == "monotonic_decr":
129+
if unique:
130+
arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
131+
else:
132+
values = list([1] * N + [2] * N + [3] * N)
133+
arr = np.array(values, dtype=dtype.lower())[::-1]
134+
mask = np.zeros(N * 3, dtype=np.bool_)
135+
else:
136+
assert index_type == "non_monotonic"
137+
if unique:
138+
arr = np.zeros(N * 3, dtype=dtype.lower())
139+
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
140+
arr[N:] = np.arange(N * 2, dtype=dtype.lower())
141+
142+
else:
143+
arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
144+
mask = np.zeros(N * 3, dtype=np.bool_)
145+
mask[-1] = True
146+
147+
self.data = engine(BaseMaskedArray(arr, mask))
148+
# code below avoids populating the mapping etc. while timing.
149+
self.data.get_loc(2)
150+
151+
self.key_middle = arr[len(arr) // 2]
152+
self.key_early = arr[2]
153+
154+
def time_get_loc(self, engine_and_dtype, index_type, unique, N):
155+
self.data.get_loc(self.key_early)
156+
157+
def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
158+
# searchsorted performance may be different near the middle of a range
159+
# vs near an endpoint
160+
self.data.get_loc(self.key_middle)
161+
162+
83163
class ObjectEngineIndexing:
84164

85165
params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]

ci/code_checks.sh

+66-3
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8383
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
8484
RET=$(($RET + $?)) ; echo $MSG "DONE"
8585

86-
MSG='Partially validate docstrings (EX01)' ; echo $MSG
86+
MSG='Partially validate docstrings (EX01)' ; echo $MSG
8787
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \
8888
pandas.Series.index \
8989
pandas.Series.dtype \
@@ -187,7 +187,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
187187
pandas.show_versions \
188188
pandas.test \
189189
pandas.NaT \
190-
pandas.Timestamp.unit \
191190
pandas.Timestamp.as_unit \
192191
pandas.Timestamp.ctime \
193192
pandas.Timestamp.date \
@@ -574,7 +573,71 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
574573
pandas.DataFrame.sparse.to_coo \
575574
pandas.DataFrame.to_gbq \
576575
pandas.DataFrame.style \
577-
pandas.DataFrame.__dataframe__ \
576+
pandas.DataFrame.__dataframe__
577+
RET=$(($RET + $?)) ; echo $MSG "DONE"
578+
579+
MSG='Partially validate docstrings (EX02)' ; echo $MSG
580+
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX02 --ignore_functions \
581+
pandas.DataFrame.plot.line \
582+
pandas.DataFrame.std \
583+
pandas.DataFrame.var \
584+
pandas.Index.factorize \
585+
pandas.Period.strftime \
586+
pandas.Series.factorize \
587+
pandas.Series.floordiv \
588+
pandas.Series.plot.line \
589+
pandas.Series.rfloordiv \
590+
pandas.Series.sparse.density \
591+
pandas.Series.sparse.npoints \
592+
pandas.Series.sparse.sp_values \
593+
pandas.Series.std \
594+
pandas.Series.var \
595+
pandas.Timestamp.fromtimestamp \
596+
pandas.api.types.infer_dtype \
597+
pandas.api.types.is_bool_dtype \
598+
pandas.api.types.is_categorical_dtype \
599+
pandas.api.types.is_complex_dtype \
600+
pandas.api.types.is_datetime64_any_dtype \
601+
pandas.api.types.is_datetime64_dtype \
602+
pandas.api.types.is_datetime64_ns_dtype \
603+
pandas.api.types.is_datetime64tz_dtype \
604+
pandas.api.types.is_dict_like \
605+
pandas.api.types.is_file_like \
606+
pandas.api.types.is_float_dtype \
607+
pandas.api.types.is_hashable \
608+
pandas.api.types.is_int64_dtype \
609+
pandas.api.types.is_integer_dtype \
610+
pandas.api.types.is_interval_dtype \
611+
pandas.api.types.is_iterator \
612+
pandas.api.types.is_list_like \
613+
pandas.api.types.is_named_tuple \
614+
pandas.api.types.is_numeric_dtype \
615+
pandas.api.types.is_object_dtype \
616+
pandas.api.types.is_period_dtype \
617+
pandas.api.types.is_re \
618+
pandas.api.types.is_re_compilable \
619+
pandas.api.types.is_signed_integer_dtype \
620+
pandas.api.types.is_sparse \
621+
pandas.api.types.is_string_dtype \
622+
pandas.api.types.is_timedelta64_dtype \
623+
pandas.api.types.is_timedelta64_ns_dtype \
624+
pandas.api.types.is_unsigned_integer_dtype \
625+
pandas.core.groupby.DataFrameGroupBy.take \
626+
pandas.core.groupby.SeriesGroupBy.take \
627+
pandas.factorize \
628+
pandas.io.formats.style.Styler.concat \
629+
pandas.io.formats.style.Styler.export \
630+
pandas.io.formats.style.Styler.set_td_classes \
631+
pandas.io.formats.style.Styler.use \
632+
pandas.io.json.build_table_schema \
633+
pandas.merge_ordered \
634+
pandas.option_context \
635+
pandas.plotting.andrews_curves \
636+
pandas.plotting.autocorrelation_plot \
637+
pandas.plotting.lag_plot \
638+
pandas.plotting.parallel_coordinates \
639+
pandas.plotting.radviz \
640+
pandas.tseries.frequencies.to_offset
578641
RET=$(($RET + $?)) ; echo $MSG "DONE"
579642

580643
fi

doc/source/development/contributing_docstring.rst

-2
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,6 @@ case of pandas, the NumPy docstring convention is followed. These conventions ar
6767
explained in this document:
6868

6969
* `numpydoc docstring guide <https://numpydoc.readthedocs.io/en/latest/format.html>`_
70-
(which is based in the original `Guide to NumPy/SciPy documentation
71-
<https://github.com/numpy/numpy/blob/main/doc/HOWTO_DOCUMENT.rst.txt>`_)
7270

7371
numpydoc is a Sphinx extension to support the NumPy docstring convention.
7472

doc/source/development/internals.rst

+23-23
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,24 @@ Indexing
1515
In pandas there are a few objects implemented which can serve as valid
1616
containers for the axis labels:
1717

18-
* ``Index``: the generic "ordered set" object, an ndarray of object dtype
18+
* :class:`Index`: the generic "ordered set" object, an ndarray of object dtype
1919
assuming nothing about its contents. The labels must be hashable (and
2020
likely immutable) and unique. Populates a dict of label to location in
2121
Cython to do ``O(1)`` lookups.
2222
* ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer
2323
data, such as time stamps
2424
* ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data
25-
* ``MultiIndex``: the standard hierarchical index object
26-
* ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values)
27-
* ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values)
28-
* ``PeriodIndex``: An Index object with Period elements
25+
* :class:`MultiIndex`: the standard hierarchical index object
26+
* :class:`DatetimeIndex`: An Index object with :class:`Timestamp` boxed elements (impl are the int64 values)
27+
* :class:`TimedeltaIndex`: An Index object with :class:`Timedelta` boxed elements (impl are the int64 values)
28+
* :class:`PeriodIndex`: An Index object with Period elements
2929

3030
There are functions that make the creation of a regular index easy:
3131

32-
* ``date_range``: fixed frequency date range generated from a time rule or
32+
* :func:`date_range`: fixed frequency date range generated from a time rule or
3333
DateOffset. An ndarray of Python datetime objects
34-
* ``period_range``: fixed frequency date range generated from a time rule or
35-
DateOffset. An ndarray of ``Period`` objects, representing timespans
34+
* :func:`period_range`: fixed frequency date range generated from a time rule or
35+
DateOffset. An ndarray of :class:`Period` objects, representing timespans
3636

3737
The motivation for having an ``Index`` class in the first place was to enable
3838
different implementations of indexing. This means that it's possible for you,
@@ -43,28 +43,28 @@ From an internal implementation point of view, the relevant methods that an
4343
``Index`` must define are one or more of the following (depending on how
4444
incompatible the new object internals are with the ``Index`` functions):
4545

46-
* ``get_loc``: returns an "indexer" (an integer, or in some cases a
46+
* :meth:`~Index.get_loc`: returns an "indexer" (an integer, or in some cases a
4747
slice object) for a label
48-
* ``slice_locs``: returns the "range" to slice between two labels
49-
* ``get_indexer``: Computes the indexing vector for reindexing / data
48+
* :meth:`~Index.slice_locs`: returns the "range" to slice between two labels
49+
* :meth:`~Index.get_indexer`: Computes the indexing vector for reindexing / data
5050
alignment purposes. See the source / docstrings for more on this
51-
* ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
51+
* :meth:`~Index.get_indexer_non_unique`: Computes the indexing vector for reindexing / data
5252
alignment purposes when the index is non-unique. See the source / docstrings
5353
for more on this
54-
* ``reindex``: Does any pre-conversion of the input index then calls
54+
* :meth:`~Index.reindex`: Does any pre-conversion of the input index then calls
5555
``get_indexer``
56-
* ``union``, ``intersection``: computes the union or intersection of two
56+
* :meth:`~Index.union`, :meth:`~Index.intersection`: computes the union or intersection of two
5757
Index objects
58-
* ``insert``: Inserts a new label into an Index, yielding a new object
59-
* ``delete``: Delete a label, yielding a new object
60-
* ``drop``: Deletes a set of labels
61-
* ``take``: Analogous to ndarray.take
58+
* :meth:`~Index.insert`: Inserts a new label into an Index, yielding a new object
59+
* :meth:`~Index.delete`: Delete a label, yielding a new object
60+
* :meth:`~Index.drop`: Deletes a set of labels
61+
* :meth:`~Index.take`: Analogous to ndarray.take
6262

6363
MultiIndex
6464
~~~~~~~~~~
6565

66-
Internally, the ``MultiIndex`` consists of a few things: the **levels**, the
67-
integer **codes** (until version 0.24 named *labels*), and the level **names**:
66+
Internally, the :class:`MultiIndex` consists of a few things: the **levels**, the
67+
integer **codes**, and the level **names**:
6868

6969
.. ipython:: python
7070
@@ -80,13 +80,13 @@ You can probably guess that the codes determine which unique element is
8080
identified with that location at each layer of the index. It's important to
8181
note that sortedness is determined **solely** from the integer codes and does
8282
not check (or care) whether the levels themselves are sorted. Fortunately, the
83-
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
84-
if you compute the levels and codes yourself, please be careful.
83+
constructors :meth:`~MultiIndex.from_tuples` and :meth:`~MultiIndex.from_arrays` ensure
84+
that this is true, but if you compute the levels and codes yourself, please be careful.
8585

8686
Values
8787
~~~~~~
8888

89-
pandas extends NumPy's type system with custom types, like ``Categorical`` or
89+
pandas extends NumPy's type system with custom types, like :class:`Categorical` or
9090
datetimes with a timezone, so we have multiple notions of "values". For 1-D
9191
containers (``Index`` classes and ``Series``) we have the following convention:
9292

0 commit comments

Comments
 (0)