Skip to content

Commit eedb617

Browse files
authored
Merge branch 'main' into fix-scatter-norm-keyword
2 parents a15bd69 + 3e718e3 commit eedb617

File tree

157 files changed

+3772
-1818
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

157 files changed

+3772
-1818
lines changed

.github/workflows/posix.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
pattern: ["not single_cpu", "single_cpu"]
2929
# Don't test pyarrow v2/3: Causes timeouts in read_csv engine
3030
# even if tests are skipped/xfailed
31-
pyarrow_version: ["5", "6", "7"]
31+
pyarrow_version: ["5", "7"]
3232
include:
3333
- env_file: actions-38-downstream_compat.yaml
3434
pattern: "not slow and not network and not single_cpu"

.pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ repos:
178178
language: python
179179
files: ^pandas/core/generic\.py$
180180
- id: pandas-errors-documented
181-
name: Ensure pandas errors are documented in doc/source/reference/general_utility_functions.rst
181+
name: Ensure pandas errors are documented in doc/source/reference/testing.rst
182182
entry: python scripts/pandas_errors_documented.py
183183
language: python
184184
files: ^pandas/errors/__init__.py$

Dockerfile

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM quay.io/condaforge/miniforge3:4.11.0-0
1+
FROM quay.io/condaforge/miniforge3
22

33
# if you forked pandas, you can pass in your own GitHub username to use your fork
44
# i.e. gh_username=myname
@@ -12,6 +12,11 @@ ENV DEBIAN_FRONTEND=noninteractive
1212
RUN apt-get update \
1313
&& apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
1414
#
15+
# Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime")
16+
&& apt-get -y install tzdata \
17+
&& ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \
18+
&& dpkg-reconfigure -f noninteractive tzdata \
19+
#
1520
# Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
1621
&& apt-get -y install git iproute2 procps iproute2 lsb-release \
1722
#

asv_bench/benchmarks/groupby.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
method_blocklist = {
2020
"object": {
21+
"diff",
2122
"median",
2223
"prod",
2324
"sem",
@@ -405,7 +406,7 @@ class GroupByMethods:
405406

406407
param_names = ["dtype", "method", "application", "ncols"]
407408
params = [
408-
["int", "float", "object", "datetime", "uint"],
409+
["int", "int16", "float", "object", "datetime", "uint"],
409410
[
410411
"all",
411412
"any",
@@ -417,6 +418,7 @@ class GroupByMethods:
417418
"cumprod",
418419
"cumsum",
419420
"describe",
421+
"diff",
420422
"ffill",
421423
"first",
422424
"head",
@@ -478,7 +480,7 @@ def setup(self, dtype, method, application, ncols):
478480
values = rng.take(taker, axis=0)
479481
if dtype == "int":
480482
key = np.random.randint(0, size, size=size)
481-
elif dtype == "uint":
483+
elif dtype in ("int16", "uint"):
482484
key = np.random.randint(0, size, size=size, dtype=dtype)
483485
elif dtype == "float":
484486
key = np.concatenate(

asv_bench/benchmarks/indexing.py

+39-9
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,11 @@ class MultiIndexing:
204204
param_names = ["unique_levels"]
205205

206206
def setup(self, unique_levels):
207-
self.ndim = 2
207+
self.nlevels = 2
208208
if unique_levels:
209-
mi = MultiIndex.from_arrays([range(1000000)] * self.ndim)
209+
mi = MultiIndex.from_arrays([range(1000000)] * self.nlevels)
210210
else:
211-
mi = MultiIndex.from_product([range(1000)] * self.ndim)
211+
mi = MultiIndex.from_product([range(1000)] * self.nlevels)
212212
self.df = DataFrame(np.random.randn(len(mi)), index=mi)
213213

214214
self.tgt_slice = slice(200, 800)
@@ -232,27 +232,27 @@ def time_loc_partial_key_list(self, unique_levels):
232232
def time_loc_partial_key_scalar(self, unique_levels):
233233
self.df.loc[self.tgt_scalar, :]
234234

235-
def time_loc_partial_bool_indexer(self, unique_levels):
235+
def time_loc_partial_key_bool_indexer(self, unique_levels):
236236
self.df.loc[self.tgt_bool_indexer, :]
237237

238238
def time_loc_all_slices(self, unique_levels):
239-
target = tuple([self.tgt_slice] * self.ndim)
239+
target = tuple([self.tgt_slice] * self.nlevels)
240240
self.df.loc[target, :]
241241

242242
def time_loc_all_null_slices(self, unique_levels):
243-
target = tuple([self.tgt_null_slice] * self.ndim)
243+
target = tuple([self.tgt_null_slice] * self.nlevels)
244244
self.df.loc[target, :]
245245

246246
def time_loc_all_lists(self, unique_levels):
247-
target = tuple([self.tgt_list] * self.ndim)
247+
target = tuple([self.tgt_list] * self.nlevels)
248248
self.df.loc[target, :]
249249

250250
def time_loc_all_scalars(self, unique_levels):
251-
target = tuple([self.tgt_scalar] * self.ndim)
251+
target = tuple([self.tgt_scalar] * self.nlevels)
252252
self.df.loc[target, :]
253253

254254
def time_loc_all_bool_indexers(self, unique_levels):
255-
target = tuple([self.tgt_bool_indexer] * self.ndim)
255+
target = tuple([self.tgt_bool_indexer] * self.nlevels)
256256
self.df.loc[target, :]
257257

258258
def time_loc_slice_plus_null_slice(self, unique_levels):
@@ -263,6 +263,18 @@ def time_loc_null_slice_plus_slice(self, unique_levels):
263263
target = (self.tgt_null_slice, self.tgt_slice)
264264
self.df.loc[target, :]
265265

266+
def time_xs_level_0(self, unique_levels):
267+
target = self.tgt_scalar
268+
self.df.xs(target, level=0)
269+
270+
def time_xs_level_1(self, unique_levels):
271+
target = self.tgt_scalar
272+
self.df.xs(target, level=1)
273+
274+
def time_xs_full_key(self, unique_levels):
275+
target = tuple([self.tgt_scalar] * self.nlevels)
276+
self.df.xs(target)
277+
266278

267279
class IntervalIndexing:
268280
def setup_cache(self):
@@ -297,6 +309,24 @@ def time_get_indexer_mismatched_tz(self):
297309
self.dti.get_indexer(self.dti2)
298310

299311

312+
class SortedAndUnsortedDatetimeIndexLoc:
313+
def setup(self):
314+
dti = date_range("2016-01-01", periods=10000, tz="US/Pacific")
315+
index = np.array(dti)
316+
317+
unsorted_index = index.copy()
318+
unsorted_index[10] = unsorted_index[20]
319+
320+
self.df_unsorted = DataFrame(index=unsorted_index, data={"a": 1})
321+
self.df_sort = DataFrame(index=index, data={"a": 1})
322+
323+
def time_loc_unsorted(self):
324+
self.df_unsorted.loc["2016-6-11"]
325+
326+
def time_loc_sorted(self):
327+
self.df_sort.loc["2016-6-11"]
328+
329+
300330
class CategoricalIndexIndexing:
301331

302332
params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]

asv_bench/benchmarks/reindex.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,22 @@ def setup(self):
2828
index = MultiIndex.from_arrays([level1, level2])
2929
self.s = Series(np.random.randn(N * K), index=index)
3030
self.s_subset = self.s[::2]
31+
self.s_subset_no_cache = self.s[::2].copy()
3132

3233
def time_reindex_dates(self):
3334
self.df.reindex(self.rng_subset)
3435

3536
def time_reindex_columns(self):
3637
self.df2.reindex(columns=self.df.columns[1:5])
3738

38-
def time_reindex_multiindex(self):
39+
def time_reindex_multiindex_with_cache(self):
40+
# MultiIndex._values gets cached
3941
self.s.reindex(self.s_subset.index)
4042

43+
def time_reindex_multiindex_no_cache(self):
44+
# Copy to avoid MultiIndex._values getting cached
45+
self.s.reindex(self.s_subset_no_cache.index.copy())
46+
4147

4248
class ReindexMethod:
4349

12 KB
Loading
8.51 KB
Loading

doc/source/reference/arrays.rst

+89-16
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
pandas arrays, scalars, and data types
77
======================================
88

9+
*******
10+
Objects
11+
*******
12+
913
.. currentmodule:: pandas
1014

1115
For most data types, pandas uses NumPy arrays as the concrete
@@ -40,8 +44,8 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra
4044

4145
.. _api.arrays.datetime:
4246

43-
Datetime data
44-
-------------
47+
Datetimes
48+
---------
4549

4650
NumPy cannot natively represent timezone-aware datetimes. pandas supports this
4751
with the :class:`arrays.DatetimeArray` extension array, which can hold timezone-naive
@@ -161,8 +165,8 @@ If the data are timezone-aware, then every value in the array must have the same
161165

162166
.. _api.arrays.timedelta:
163167

164-
Timedelta data
165-
--------------
168+
Timedeltas
169+
----------
166170

167171
NumPy can natively represent timedeltas. pandas provides :class:`Timedelta`
168172
for symmetry with :class:`Timestamp`.
@@ -216,8 +220,8 @@ A collection of :class:`Timedelta` may be stored in a :class:`TimedeltaArray`.
216220

217221
.. _api.arrays.period:
218222

219-
Timespan data
220-
-------------
223+
Periods
224+
-------
221225

222226
pandas represents spans of times as :class:`Period` objects.
223227

@@ -284,8 +288,8 @@ Every period in a :class:`arrays.PeriodArray` must have the same ``freq``.
284288

285289
.. _api.arrays.interval:
286290

287-
Interval data
288-
-------------
291+
Intervals
292+
---------
289293

290294
Arbitrary intervals can be represented as :class:`Interval` objects.
291295

@@ -379,8 +383,8 @@ pandas provides this through :class:`arrays.IntegerArray`.
379383

380384
.. _api.arrays.categorical:
381385

382-
Categorical data
383-
----------------
386+
Categoricals
387+
------------
384388

385389
pandas defines a custom data type for representing data that can take only a
386390
limited, fixed set of values. The dtype of a :class:`Categorical` can be described by
@@ -444,8 +448,8 @@ data. See :ref:`api.series.cat` for more.
444448

445449
.. _api.arrays.sparse:
446450

447-
Sparse data
448-
-----------
451+
Sparse
452+
------
449453

450454
Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may
451455
be stored efficiently as a :class:`arrays.SparseArray`.
@@ -469,8 +473,8 @@ and methods if the :class:`Series` contains sparse values. See
469473

470474
.. _api.arrays.string:
471475

472-
Text data
473-
---------
476+
Strings
477+
-------
474478

475479
When working with text data, where each valid element is a string or missing,
476480
we recommend using :class:`StringDtype` (with the alias ``"string"``).
@@ -494,8 +498,8 @@ See :ref:`api.series.str` for more.
494498

495499
.. _api.arrays.bool:
496500

497-
Boolean data with missing values
498-
--------------------------------
501+
Nullable Boolean
502+
----------------
499503

500504
The boolean dtype (with the alias ``"boolean"``) provides support for storing
501505
boolean data (``True``, ``False``) with missing values, which is not possible
@@ -525,3 +529,72 @@ with a bool :class:`numpy.ndarray`.
525529
DatetimeTZDtype.tz
526530
PeriodDtype.freq
527531
IntervalDtype.subtype
532+
533+
*********
534+
Utilities
535+
*********
536+
537+
Constructors
538+
------------
539+
.. autosummary::
540+
:toctree: api/
541+
542+
api.types.union_categoricals
543+
api.types.infer_dtype
544+
api.types.pandas_dtype
545+
546+
Data type introspection
547+
~~~~~~~~~~~~~~~~~~~~~~~
548+
.. autosummary::
549+
:toctree: api/
550+
551+
api.types.is_bool_dtype
552+
api.types.is_categorical_dtype
553+
api.types.is_complex_dtype
554+
api.types.is_datetime64_any_dtype
555+
api.types.is_datetime64_dtype
556+
api.types.is_datetime64_ns_dtype
557+
api.types.is_datetime64tz_dtype
558+
api.types.is_extension_type
559+
api.types.is_extension_array_dtype
560+
api.types.is_float_dtype
561+
api.types.is_int64_dtype
562+
api.types.is_integer_dtype
563+
api.types.is_interval_dtype
564+
api.types.is_numeric_dtype
565+
api.types.is_object_dtype
566+
api.types.is_period_dtype
567+
api.types.is_signed_integer_dtype
568+
api.types.is_string_dtype
569+
api.types.is_timedelta64_dtype
570+
api.types.is_timedelta64_ns_dtype
571+
api.types.is_unsigned_integer_dtype
572+
api.types.is_sparse
573+
574+
Iterable introspection
575+
~~~~~~~~~~~~~~~~~~~~~~
576+
.. autosummary::
577+
:toctree: api/
578+
579+
api.types.is_dict_like
580+
api.types.is_file_like
581+
api.types.is_list_like
582+
api.types.is_named_tuple
583+
api.types.is_iterator
584+
585+
Scalar introspection
586+
~~~~~~~~~~~~~~~~~~~~
587+
.. autosummary::
588+
:toctree: api/
589+
590+
api.types.is_bool
591+
api.types.is_categorical
592+
api.types.is_complex
593+
api.types.is_float
594+
api.types.is_hashable
595+
api.types.is_integer
596+
api.types.is_interval
597+
api.types.is_number
598+
api.types.is_re
599+
api.types.is_re_compilable
600+
api.types.is_scalar

doc/source/reference/general_functions.rst

-7
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,3 @@ Hashing
7878

7979
util.hash_array
8080
util.hash_pandas_object
81-
82-
Testing
83-
~~~~~~~
84-
.. autosummary::
85-
:toctree: api/
86-
87-
test

0 commit comments

Comments
 (0)