From 99b1c3da4c9e0b60ebb8b047de3b039d6d320c19 Mon Sep 17 00:00:00 2001
From: KevsterAmp
Date: Sat, 7 Sep 2024 11:17:25 +0800
Subject: [PATCH 01/11] sync to master

---
 .circleci/config.yml | 6 +-
 .github/actions/build_pandas/action.yml | 11 +-
 .github/workflows/unit-tests.yml | 18 +-
 .gitpod.yml | 4 +-
 .pre-commit-config.yaml | 4 +-
 README.md | 2 +-
 asv_bench/benchmarks/indexing_engines.py | 6 +-
 ci/code_checks.sh | 294 +--------
 ci/deps/actions-310-minimum_versions.yaml | 2 +-
 ci/deps/actions-310.yaml | 2 +-
 ci/deps/actions-311-downstream_compat.yaml | 2 +-
 ci/deps/actions-311-numpydev.yaml | 1 -
 ci/deps/actions-311-pyarrownightly.yaml | 1 -
 ci/deps/actions-311.yaml | 2 +-
 ci/deps/actions-312.yaml | 2 +-
 ci/deps/actions-pypy-39.yaml | 1 -
 ci/deps/circle-311-arm64.yaml | 2 +-
 ci/meta.yaml | 1 -
 .../development/contributing_codebase.rst | 5 +-
 .../development/contributing_docstring.rst | 2 +-
 .../development/contributing_environment.rst | 12 +-
 .../development/debugging_extensions.rst | 6 +-
 doc/source/development/maintaining.rst | 4 +-
 doc/source/getting_started/index.rst | 2 +-
 doc/source/getting_started/install.rst | 12 +-
 doc/source/user_guide/cookbook.rst | 2 +-
 doc/source/user_guide/io.rst | 4 +-
 doc/source/user_guide/style.ipynb | 2 +-
 doc/source/user_guide/timeseries.rst | 4 +-
 doc/source/user_guide/visualization.rst | 2 +-
 doc/source/whatsnew/index.rst | 7 +
 doc/source/whatsnew/v0.15.0.rst | 10 +-
 doc/source/whatsnew/v0.15.1.rst | 2 +-
 doc/source/whatsnew/v0.21.0.rst | 16 +-
 doc/source/whatsnew/v0.21.1.rst | 4 +-
 doc/source/whatsnew/v0.23.0.rst | 22 +-
 doc/source/whatsnew/v0.23.1.rst | 2 +-
 doc/source/whatsnew/v0.24.0.rst | 74 +--
 doc/source/whatsnew/v0.24.2.rst | 2 +-
 doc/source/whatsnew/v0.25.0.rst | 10 +-
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 doc/source/whatsnew/v1.1.0.rst | 12 +-
 doc/source/whatsnew/v1.2.1.rst | 2 +-
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 doc/source/whatsnew/v1.4.0.rst | 2 +-
 doc/source/whatsnew/v2.0.0.rst | 2 +-
 doc/source/whatsnew/v2.3.0.rst | 178 +++++
 doc/source/whatsnew/v3.0.0.rst | 52 +-
 environment.yml | 2 +-
 pandas/__init__.py | 6 +-
 pandas/_config/config.py | 5 +
 pandas/_libs/arrays.pyx | 4 +
 pandas/_libs/hashtable.pyx | 5 +-
 pandas/_libs/hashtable_class_helper.pxi.in | 18 +-
 pandas/_libs/internals.pyx | 25 +-
 pandas/_libs/lib.pyx | 8 +-
 .../src/vendored/numpy/datetime/np_datetime.c | 48 +-
 pandas/_libs/tslib.pyx | 2 +-
 pandas/_libs/tslibs/conversion.pyx | 13 +-
 pandas/_libs/tslibs/nattype.pyx | 376 ++++++++++-
 pandas/_libs/tslibs/np_datetime.pxd | 2 +-
 pandas/_libs/tslibs/offsets.pyx | 155 ++++-
 pandas/_libs/tslibs/period.pyx | 157 ++++-
 pandas/_libs/tslibs/strptime.pyx | 8 +-
 pandas/_libs/tslibs/timedeltas.pyx | 60 +-
 pandas/_libs/tslibs/timestamps.pyx | 617 +++++++++++++++++-
 pandas/_libs/tslibs/timezones.pyx | 32 +-
 pandas/_libs/tslibs/tzconversion.pyx | 20 +-
 pandas/_testing/asserters.py | 13 +-
 pandas/_typing.py | 4 +-
 pandas/_version.py | 2 +-
 pandas/compat/__init__.py | 2 +
 pandas/compat/_optional.py | 1 +
 pandas/compat/pyarrow.py | 2 +
 pandas/conftest.py | 48 +-
 pandas/core/algorithms.py | 12 +-
 pandas/core/array_algos/quantile.py | 2 +-
 pandas/core/arraylike.py | 2 +-
 pandas/core/arrays/_arrow_string_mixins.py | 166 ++++-
 pandas/core/arrays/_mixins.py | 19 +-
 pandas/core/arrays/_ranges.py | 14 +-
 pandas/core/arrays/arrow/accessors.py | 18 +
 pandas/core/arrays/arrow/array.py | 150 ++---
 pandas/core/arrays/base.py | 152 ++++-
 pandas/core/arrays/categorical.py | 5 -
 pandas/core/arrays/datetimelike.py | 9 +-
 pandas/core/arrays/datetimes.py | 34 +-
 pandas/core/arrays/numpy_.py | 7 -
 pandas/core/arrays/period.py | 148 ++++-
 pandas/core/arrays/sparse/accessor.py | 22 +
 pandas/core/arrays/sparse/array.py | 5 +
 pandas/core/arrays/string_.py | 137 ++--
 pandas/core/arrays/string_arrow.py | 267 +++----
 pandas/core/arrays/timedeltas.py | 15 +-
 pandas/core/computation/parsing.py | 133 +++-
 pandas/core/config_init.py | 7 +-
 pandas/core/dtypes/cast.py | 15 +-
 pandas/core/dtypes/common.py | 8 +
 pandas/core/dtypes/dtypes.py | 25 +-
 pandas/core/frame.py | 235 +++++--
 pandas/core/generic.py | 81 ++-
 pandas/core/groupby/categorical.py | 14 +-
 pandas/core/groupby/groupby.py | 88 ++-
 pandas/core/indexes/base.py | 6 +-
 pandas/core/indexes/datetimelike.py | 12 +
 pandas/core/indexes/datetimes.py | 7 +-
 pandas/core/indexes/extension.py | 4 +-
 pandas/core/indexes/multi.py | 80 ++-
 pandas/core/indexes/period.py | 18 +
 pandas/core/indexes/range.py | 2 +
 pandas/core/indexing.py | 4 +-
 pandas/core/interchange/from_dataframe.py | 7 +
 pandas/core/internals/blocks.py | 8 +-
 pandas/core/internals/construction.py | 2 +-
 pandas/core/internals/managers.py | 6 +-
 pandas/core/ops/docstrings.py | 6 +-
 pandas/core/resample.py | 2 +-
 pandas/core/reshape/concat.py | 2 +-
 pandas/core/reshape/merge.py | 8 +
 pandas/core/series.py | 59 +-
 pandas/core/shared_docs.py | 130 ----
 pandas/core/strings/accessor.py | 80 ++-
 pandas/core/strings/object_array.py | 36 +-
 pandas/core/tools/datetimes.py | 2 +-
 pandas/core/tools/numeric.py | 4 +-
 pandas/core/window/expanding.py | 5 +
 pandas/core/window/rolling.py | 28 +-
 pandas/errors/__init__.py | 26 +-
 pandas/io/_util.py | 2 +
 pandas/io/clipboards.py | 11 +-
 pandas/io/excel/_base.py | 22 +-
 pandas/io/feather_format.py | 13 +-
 pandas/io/formats/csvs.py | 6 +-
 pandas/io/formats/style.py | 108 ++-
 pandas/io/formats/style_render.py | 8 +-
 pandas/io/html.py | 11 +-
 pandas/io/json/_json.py | 11 +-
 pandas/io/json/_table_schema.py | 6 +-
 pandas/io/orc.py | 11 +-
 pandas/io/parquet.py | 11 +-
 pandas/io/parsers/readers.py | 24 +-
 pandas/io/pytables.py | 2 +-
 pandas/io/spss.py | 26 +-
 pandas/io/sql.py | 55 +-
 pandas/io/xml.py | 11 +-
 pandas/plotting/_matplotlib/core.py | 47 +-
 pandas/tests/apply/test_numba.py | 2 +-
 pandas/tests/arithmetic/test_object.py | 25 +-
 .../tests/arrays/boolean/test_arithmetic.py | 26 +-
 pandas/tests/arrays/boolean/test_logical.py | 9 +-
 pandas/tests/arrays/categorical/test_api.py | 3 -
 .../tests/arrays/floating/test_arithmetic.py | 23 +-
 .../tests/arrays/integer/test_arithmetic.py | 34 +-
 .../tests/arrays/masked/test_arrow_compat.py | 11 +-
 .../tests/arrays/sparse/test_arithmetics.py | 13 +
 pandas/tests/arrays/string_/test_concat.py | 73 +++
 pandas/tests/arrays/string_/test_string.py | 14 +-
 .../tests/arrays/string_/test_string_arrow.py | 26 +-
 pandas/tests/arrays/test_array.py | 34 +-
 pandas/tests/arrays/test_datetimelike.py | 31 +-
 pandas/tests/base/test_value_counts.py | 4 +-
 pandas/tests/copy_view/test_constructors.py | 5 +-
 pandas/tests/copy_view/test_functions.py | 32 +-
 pandas/tests/copy_view/test_internals.py | 10 +-
 .../dtypes/cast/test_construct_ndarray.py | 2 +-
 pandas/tests/dtypes/test_common.py | 13 +-
 pandas/tests/dtypes/test_dtypes.py | 3 -
 pandas/tests/dtypes/test_inference.py | 2 +-
 pandas/tests/extension/base/dtype.py | 2 +-
 pandas/tests/extension/base/io.py | 2 +-
 pandas/tests/extension/base/ops.py | 10 +-
 .../tests/extension/decimal/test_decimal.py | 2 +-
 pandas/tests/extension/test_arrow.py | 80 +--
 pandas/tests/extension/test_string.py | 68 +-
 .../frame/methods/test_convert_dtypes.py | 10 +-
 .../frame/methods/test_drop_duplicates.py | 38 ++
 pandas/tests/frame/methods/test_join.py | 4 +-
 pandas/tests/frame/methods/test_rank.py | 2 +-
 pandas/tests/frame/test_api.py | 2 +-
 pandas/tests/frame/test_block_internals.py | 2 +-
 pandas/tests/frame/test_logical_ops.py | 20 +-
 pandas/tests/frame/test_query_eval.py | 167 ++++-
 pandas/tests/frame/test_unary.py | 26 +-
 pandas/tests/groupby/methods/test_describe.py | 8 +-
 pandas/tests/groupby/methods/test_nth.py | 6 +-
 .../groupby/methods/test_value_counts.py | 37 +-
 pandas/tests/groupby/test_groupby_dropna.py | 19 -
 .../tests/groupby/transform/test_transform.py | 29 +-
 .../datetimes/methods/test_tz_localize.py | 17 +-
 .../indexes/datetimes/test_arithmetic.py | 2 +-
 .../indexes/datetimes/test_constructors.py | 12 +-
 .../indexes/datetimes/test_date_range.py | 23 +-
 .../tests/indexes/datetimes/test_timezones.py | 14 +-
 .../indexes/interval/test_interval_tree.py | 7 +-
 pandas/tests/indexes/object/test_indexing.py | 45 +-
 pandas/tests/indexes/test_old_base.py | 14 +-
 .../tests/indexing/interval/test_interval.py | 4 +-
 .../indexing/interval/test_interval_new.py | 4 +-
 .../multiindex/test_chaining_and_caching.py | 17 +
 pandas/tests/indexing/test_indexing.py | 2 +-
 pandas/tests/io/excel/test_readers.py | 46 +-
 pandas/tests/io/excel/test_writers.py | 1 -
 .../tests/io/formats/style/test_matplotlib.py | 4 +-
 pandas/tests/io/json/test_pandas.py | 40 +-
 .../io/parser/dtypes/test_dtypes_basic.py | 32 +-
 pandas/tests/io/parser/test_header.py | 4 +-
 pandas/tests/io/parser/test_network.py | 2 +-
 pandas/tests/io/parser/test_parse_dates.py | 4 +-
 pandas/tests/io/parser/test_read_fwf.py | 35 +-
 .../io/parser/usecols/test_usecols_basic.py | 3 -
 pandas/tests/io/test_clipboard.py | 30 +-
 pandas/tests/io/test_feather.py | 55 +-
 pandas/tests/io/test_fsspec.py | 6 +-
 pandas/tests/io/test_gcs.py | 2 +-
 pandas/tests/io/test_html.py | 40 +-
 pandas/tests/io/test_orc.py | 25 +-
 pandas/tests/io/test_parquet.py | 97 ++-
 pandas/tests/io/test_spss.py | 16 +
 pandas/tests/io/test_sql.py | 29 +-
 pandas/tests/io/xml/test_to_xml.py | 4 +-
 pandas/tests/io/xml/test_xml.py | 41 +-
 pandas/tests/plotting/frame/test_frame.py | 11 +-
 .../tests/plotting/frame/test_frame_color.py | 66 +-
 .../plotting/frame/test_frame_subplots.py | 2 +-
 pandas/tests/plotting/test_hist_method.py | 2 +-
 pandas/tests/resample/test_datetime_index.py | 13 +
 pandas/tests/reshape/merge/test_join.py | 26 +
 pandas/tests/reshape/merge/test_merge.py | 24 +
 .../scalar/timestamp/methods/test_round.py | 5 +-
 .../timestamp/methods/test_tz_localize.py | 16 +-
 .../scalar/timestamp/test_constructors.py | 11 +-
 .../series/accessors/test_dt_accessor.py | 13 +-
 .../series/methods/test_convert_dtypes.py | 9 +-
 pandas/tests/series/methods/test_replace.py | 9 +
 .../tests/series/methods/test_tz_localize.py | 9 +-
 pandas/tests/series/test_arithmetic.py | 26 +-
 pandas/tests/series/test_arrow_interface.py | 23 +
 pandas/tests/series/test_constructors.py | 2 +-
 pandas/tests/series/test_logical_ops.py | 66 +-
 pandas/tests/strings/test_case_justify.py | 6 +-
 pandas/tests/strings/test_find_replace.py | 46 +-
 pandas/tests/test_aggregation.py | 2 +-
 pandas/tests/test_downstream.py | 2 +-
 pandas/tests/test_sorting.py | 7 +
 pandas/tests/tools/test_to_datetime.py | 117 ++--
 pandas/tests/tseries/offsets/test_dst.py | 10 +-
 .../offsets/test_offsets_properties.py | 5 +-
 pandas/tests/tslibs/test_array_to_datetime.py | 2 +-
 pandas/tests/tslibs/test_tzconversion.py | 17 +-
 pandas/tests/util/test_assert_frame_equal.py | 2 +-
 .../util/test_assert_interval_array_equal.py | 21 +-
 pandas/tests/util/test_assert_series_equal.py | 2 +-
 pandas/tests/window/test_rolling.py | 6 -
 pandas/util/_print_versions.py | 1 -
 pandas/util/version/__init__.py | 238 ++-----
 pyproject.toml | 3 +-
 requirements-dev.txt | 2 +-
 web/pandas/_templates/layout.html | 4 +-
 web/pandas/community/blog/2019-user-survey.md | 2 +-
 web/pandas/community/ecosystem.md | 16 +-
 web/pandas/index.html | 2 +-
 .../pdeps/0010-required-pyarrow-dependency.md | 4 +-
 ...2-compact-and-reversible-JSON-interface.md | 2 +-
 263 files changed, 5023 insertions(+), 2608 deletions(-)
 create mode 100644 doc/source/whatsnew/v2.3.0.rst
 create mode 100644 pandas/tests/arrays/string_/test_concat.py
 create mode 100644 pandas/tests/series/test_arrow_interface.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index b6a5a00429d9a..27b6829dcda70 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -32,7 +32,7 @@ jobs:
       if pip show pandas 1>/dev/null; then
           pip uninstall -y pandas
       fi
-      python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror"
+      python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
       PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
       sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
       ci/run_tests.sh
@@ -56,8 +56,8 @@ jobs:
       /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
       . ~/virtualenvs/pandas-dev/bin/activate
       python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
-      python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
-      python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+      python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+      python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
       python -m pip list --no-cache-dir
       export PANDAS_CI=1
       python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml

diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index 460ae2f8594c0..9dd0679d62f3e 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -22,13 +22,20 @@ runs:
       fi
     shell: bash -el {0}

+  - name: Uninstall nomkl
+    run: |
+      if conda list nomkl | grep nomkl 1>/dev/null; then
+        conda remove nomkl -y
+      fi
+    shell: bash -el {0}
+
   - name: Build Pandas
     run: |
       if [[ ${{ inputs.editable }} == "true" ]]; then
         pip install -e . --no-build-isolation -v --no-deps \
-          --config-settings=setup-args="--werror"
+          -Csetup-args="--werror"
       else
         pip install . --no-build-isolation -v --no-deps \
-          --config-settings=setup-args="--werror"
+          -Csetup-args="--werror"
       fi
     shell: bash -el {0}

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 166c06acccc49..d392c84be66fe 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -235,9 +235,9 @@ jobs:
       /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
      . ~/virtualenvs/pandas-dev/bin/activate
      python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
-      python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
-      python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
-      python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+      python -m pip install numpy -Csetup-args="-Dallow-noblas=true"
+      python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+      python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
       python -m pip list --no-cache-dir
       export PANDAS_CI=1
       python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -274,8 +274,8 @@ jobs:
       /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
       . ~/virtualenvs/pandas-dev/bin/activate
       python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
-      python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
-      python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+      python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+      python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
       python -m pip list --no-cache-dir

   - name: Run Tests
@@ -295,7 +295,7 @@
   # In general, this will remain frozen(present, but not running) until:
   # - The next unreleased Python version has released beta 1
   # - This version should be available on GitHub Actions.
-  # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil)
+  # - Our required build/runtime dependencies(numpy, Cython, python-dateutil)
   #   support that unreleased Python version.
   # To unfreeze, comment out the ``if: false`` condition, and make sure you update
   # the name of the workflow and Python version in actions/setup-python ``python-version:``
@@ -348,8 +348,8 @@ jobs:
       python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
       python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
       python -m pip install versioneer[toml]
-      python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
-      python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
+      python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
+      python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"
       python -m pip list

   - name: Run Tests
@@ -392,7 +392,7 @@ jobs:
       python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython
       python -m pip install versioneer[toml]
       python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
-      python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
+      python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"
       python -m pip list

   - name: Run Tests

diff --git a/.gitpod.yml b/.gitpod.yml
index 9ff349747a33e..5bf028750f30f 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -13,10 +13,10 @@ tasks:
       mkdir -p .vscode
       cp gitpod/settings.json .vscode/settings.json
       git fetch --tags
-      python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
+      python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true
       pre-commit install --install-hooks
     command: |
-      python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
+      python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true
       echo "✨ Pre-build complete! You can close this terminal ✨ "

 # --------------------------------------------------------

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b81b9ba070a44..f6717dd503c9b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,6 +23,7 @@ repos:
     hooks:
       - id: ruff
         args: [--exit-non-zero-on-fix]
+        exclude: ^pandas/tests/frame/test_query_eval.py
       - id: ruff
         # TODO: remove autofixe-only rules when they are checked by ruff
         name: ruff-selected-autofixes
@@ -31,7 +32,7 @@ repos:
         exclude: ^pandas/tests
         args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
       - id: ruff-format
-        exclude: ^scripts
+        exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
   - repo: https://github.com/jendrikseipp/vulture
     rev: 'v2.11'
     hooks:
@@ -85,6 +86,7 @@ repos:
         types: [text]  # overwrite types: [rst]
         types_or: [python, rst]
       - id: rst-inline-touching-normal
+        exclude: ^pandas/tests/frame/test_query_eval.py
         types: [text]  # overwrite types: [rst]
         types_or: [python, rst]
   - repo: https://github.com/sphinx-contrib/sphinx-lint

diff --git a/README.md b/README.md
index 715b0c9dc459c..1a273fdb896c5 100644
--- a/README.md
+++ b/README.md
@@ -138,7 +138,7 @@ or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_in

 ```sh
-python -m pip install -ve . --no-build-isolation --config-settings=editable-verbose=true
+python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true
 ```

 See the full instructions for [installing from source](https://pandas.pydata.org/docs/dev/development/contributing_environment.html).

diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index fd3d0f0b9cf2e..5e3c593e269cb 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -87,7 +87,7 @@ def setup(self, engine_and_dtype, index_type, unique, N):
             arr = np.array([1, 2, 3], dtype=dtype).repeat(N)

         self.data = engine(arr)
-        # code belows avoids populating the mapping etc. while timing.
+        # code below avoids populating the mapping etc. while timing.
         self.data.get_loc(2)

         self.key_middle = arr[len(arr) // 2]
@@ -140,7 +140,7 @@ def setup(self, engine_and_dtype, index_type, unique, N):
             mask[-1] = True

         self.data = engine(BaseMaskedArray(arr, mask))
-        # code belows avoids populating the mapping etc. while timing.
+        # code below avoids populating the mapping etc. while timing.
         self.data.get_loc(2)

         self.key_middle = arr[len(arr) // 2]
@@ -169,7 +169,7 @@ def setup(self, index_type):
         }[index_type]

         self.data = libindex.ObjectEngine(arr)
-        # code belows avoids populating the mapping etc. while timing.
+        # code below avoids populating the mapping etc. while timing.
self.data.get_loc("b") def time_get_loc(self, index_type): diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 540bd59cd5924..7ed5103b3b796 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,44 +70,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.MultiIndex.names SA01" \ - -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ - -i "pandas.MultiIndex.sortlevel PR07,SA01" \ - -i "pandas.MultiIndex.to_frame RT03" \ -i "pandas.NA SA01" \ - -i "pandas.NaT SA01" \ - -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ - -i "pandas.Period.freqstr SA01" \ - -i "pandas.Period.month SA01" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.Period.strftime PR01,SA01" \ - -i "pandas.Period.to_timestamp SA01" \ - -i "pandas.Period.year SA01" \ - -i "pandas.PeriodDtype SA01" \ -i "pandas.PeriodDtype.freq SA01" \ - -i "pandas.PeriodIndex.day SA01" \ - -i "pandas.PeriodIndex.day_of_week SA01" \ - -i "pandas.PeriodIndex.day_of_year SA01" \ - -i "pandas.PeriodIndex.dayofweek SA01" \ - -i "pandas.PeriodIndex.dayofyear SA01" \ - -i "pandas.PeriodIndex.days_in_month SA01" \ - -i "pandas.PeriodIndex.daysinmonth SA01" \ - -i "pandas.PeriodIndex.from_fields PR07,SA01" \ - -i "pandas.PeriodIndex.from_ordinals SA01" \ - -i "pandas.PeriodIndex.hour SA01" \ - -i "pandas.PeriodIndex.is_leap_year SA01" \ - -i "pandas.PeriodIndex.minute SA01" \ - -i "pandas.PeriodIndex.month SA01" \ - -i "pandas.PeriodIndex.quarter SA01" \ - -i "pandas.PeriodIndex.qyear GL08" \ - -i "pandas.PeriodIndex.second SA01" \ - -i "pandas.PeriodIndex.to_timestamp RT03,SA01" \ - -i "pandas.PeriodIndex.week SA01" \ - -i "pandas.PeriodIndex.weekday SA01" \ - -i "pandas.PeriodIndex.weekofyear SA01" \ - -i "pandas.PeriodIndex.year SA01" \ - -i "pandas.RangeIndex PR07" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.RangeIndex.start SA01" \ -i "pandas.RangeIndex.step SA01" \ @@ -125,11 +91,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.day_name PR01,PR02" \ -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.qyear GL08" \ -i "pandas.Series.dt.round PR01,PR02" \ -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ @@ -138,130 +102,28 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.gt SA01" \ - -i "pandas.Series.list.__getitem__ SA01" \ - -i "pandas.Series.list.flatten SA01" \ - -i "pandas.Series.list.len SA01" \ - -i "pandas.Series.lt SA01" \ - -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.pop SA01" \ - -i "pandas.Series.prod RT03" \ - -i "pandas.Series.product RT03" \ - -i "pandas.Series.reorder_levels RT03,SA01" \ - -i "pandas.Series.sem PR01,RT03,SA01" \ - -i "pandas.Series.sparse PR01,SA01" \ - -i "pandas.Series.sparse.density SA01" \ -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Series.sparse.sp_values SA01" \ - -i 
"pandas.Series.sparse.to_coo PR07,RT03,SA01" \ - -i "pandas.Series.std PR01,RT03,SA01" \ - -i "pandas.Series.str.capitalize RT03" \ - -i "pandas.Series.str.casefold RT03" \ - -i "pandas.Series.str.center RT03,SA01" \ - -i "pandas.Series.str.decode PR07,RT03,SA01" \ - -i "pandas.Series.str.encode PR07,RT03,SA01" \ - -i "pandas.Series.str.ljust RT03,SA01" \ - -i "pandas.Series.str.lower RT03" \ - -i "pandas.Series.str.lstrip RT03" \ - -i "pandas.Series.str.match RT03" \ - -i "pandas.Series.str.normalize RT03,SA01" \ - -i "pandas.Series.str.partition RT03" \ - -i "pandas.Series.str.repeat SA01" \ - -i "pandas.Series.str.replace SA01" \ - -i "pandas.Series.str.rjust RT03,SA01" \ - -i "pandas.Series.str.rpartition RT03" \ - -i "pandas.Series.str.rstrip RT03" \ - -i "pandas.Series.str.strip RT03" \ - -i "pandas.Series.str.swapcase RT03" \ - -i "pandas.Series.str.title RT03" \ - -i "pandas.Series.str.upper RT03" \ - -i "pandas.Series.str.wrap RT03,SA01" \ - -i "pandas.Series.str.zfill RT03" \ - -i "pandas.Series.struct.dtypes SA01" \ - -i "pandas.Series.to_markdown SA01" \ - -i "pandas.Series.update PR07,SA01" \ - -i "pandas.Timedelta.asm8 SA01" \ - -i "pandas.Timedelta.ceil SA01" \ -i "pandas.Timedelta.components SA01" \ - -i "pandas.Timedelta.floor SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ - -i "pandas.Timedelta.round SA01" \ - -i "pandas.Timedelta.to_numpy PR01" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.Timedelta.view SA01" \ - -i "pandas.TimedeltaIndex.as_unit RT03,SA01" \ - -i "pandas.TimedeltaIndex.components SA01" \ - -i "pandas.TimedeltaIndex.microseconds SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ - -i "pandas.Timestamp.combine PR01,SA01" \ - -i "pandas.Timestamp.ctime SA01" \ - -i "pandas.Timestamp.date SA01" \ - -i "pandas.Timestamp.day GL08" \ - -i "pandas.Timestamp.fold GL08" \ - -i "pandas.Timestamp.fromordinal SA01" \ - -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ - -i "pandas.Timestamp.hour GL08" \ -i "pandas.Timestamp.max PR02" \ - -i "pandas.Timestamp.microsecond GL08" \ -i "pandas.Timestamp.min PR02" \ - -i "pandas.Timestamp.minute GL08" \ - -i "pandas.Timestamp.month GL08" \ - -i "pandas.Timestamp.month_name SA01" \ -i "pandas.Timestamp.nanosecond GL08" \ - -i "pandas.Timestamp.normalize SA01" \ - -i "pandas.Timestamp.quarter SA01" \ - -i "pandas.Timestamp.replace PR07,SA01" \ -i "pandas.Timestamp.resolution PR02" \ - -i "pandas.Timestamp.second GL08" \ - -i "pandas.Timestamp.strptime PR01,SA01" \ - -i "pandas.Timestamp.timestamp SA01" \ - -i "pandas.Timestamp.timetuple SA01" \ - -i "pandas.Timestamp.timetz SA01" \ - -i "pandas.Timestamp.to_datetime64 SA01" \ - -i "pandas.Timestamp.to_julian_date SA01" \ - -i "pandas.Timestamp.to_numpy PR01" \ - -i "pandas.Timestamp.to_period PR01,SA01" \ - -i "pandas.Timestamp.today SA01" \ - -i "pandas.Timestamp.toordinal SA01" \ - -i "pandas.Timestamp.tz_localize SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.Timestamp.tzname SA01" \ - -i "pandas.Timestamp.unit SA01" \ - -i "pandas.Timestamp.utcfromtimestamp PR01,SA01" \ - -i "pandas.Timestamp.utcoffset SA01" \ - -i "pandas.Timestamp.utctimetuple SA01" \ - -i "pandas.Timestamp.value GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.extensions.ExtensionArray._pad_or_backfill PR01,RT03,SA01" \ - -i 
"pandas.api.extensions.ExtensionArray._reduce RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray._values_for_factorize SA01" \ - -i "pandas.api.extensions.ExtensionArray.astype SA01" \ - -i "pandas.api.extensions.ExtensionArray.dropna RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.dtype SA01" \ - -i "pandas.api.extensions.ExtensionArray.duplicated RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.fillna SA01" \ - -i "pandas.api.extensions.ExtensionArray.insert PR07,RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ - -i "pandas.api.extensions.ExtensionArray.isin PR07,RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.isna SA01" \ - -i "pandas.api.extensions.ExtensionArray.nbytes SA01" \ - -i "pandas.api.extensions.ExtensionArray.ndim SA01" \ - -i "pandas.api.extensions.ExtensionArray.ravel RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.take RT03" \ - -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.view SA01" \ - -i "pandas.api.interchange.from_dataframe RT03,SA01" \ -i "pandas.api.types.is_bool PR01,SA01" \ - -i "pandas.api.types.is_bool_dtype SA01" \ -i "pandas.api.types.is_categorical_dtype SA01" \ -i "pandas.api.types.is_complex PR01,SA01" \ -i "pandas.api.types.is_complex_dtype SA01" \ @@ -312,7 +174,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ @@ -329,7 +190,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ @@ -348,13 +208,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.sum SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.core.window.expanding.Expanding.corr PR01" \ - -i "pandas.core.window.expanding.Expanding.count PR01" \ - -i "pandas.core.window.rolling.Rolling.max PR01" \ - -i "pandas.core.window.rolling.Window.std PR01" \ - -i "pandas.core.window.rolling.Window.var PR01" \ -i "pandas.date_range RT03" \ - -i "pandas.errors.AbstractMethodError PR01,SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.CategoricalConversionWarning SA01" \ @@ -382,34 +236,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ - -i "pandas.io.formats.style.Styler.apply RT03" \ - -i "pandas.io.formats.style.Styler.apply_index RT03" \ - -i "pandas.io.formats.style.Styler.background_gradient RT03" \ - -i "pandas.io.formats.style.Styler.bar RT03,SA01" \ - -i "pandas.io.formats.style.Styler.clear SA01" \ - -i "pandas.io.formats.style.Styler.concat 
RT03,SA01" \ - -i "pandas.io.formats.style.Styler.export RT03" \ - -i "pandas.io.formats.style.Styler.from_custom_template SA01" \ - -i "pandas.io.formats.style.Styler.hide RT03,SA01" \ - -i "pandas.io.formats.style.Styler.highlight_between RT03" \ - -i "pandas.io.formats.style.Styler.highlight_max RT03" \ - -i "pandas.io.formats.style.Styler.highlight_min RT03" \ - -i "pandas.io.formats.style.Styler.highlight_null RT03" \ - -i "pandas.io.formats.style.Styler.highlight_quantile RT03" \ - -i "pandas.io.formats.style.Styler.map RT03" \ - -i "pandas.io.formats.style.Styler.map_index RT03" \ - -i "pandas.io.formats.style.Styler.set_caption RT03,SA01" \ - -i "pandas.io.formats.style.Styler.set_properties RT03,SA01" \ - -i "pandas.io.formats.style.Styler.set_sticky RT03,SA01" \ - -i "pandas.io.formats.style.Styler.set_table_attributes PR07,RT03" \ - -i "pandas.io.formats.style.Styler.set_table_styles RT03" \ - -i "pandas.io.formats.style.Styler.set_td_classes RT03" \ - -i "pandas.io.formats.style.Styler.set_tooltips RT03,SA01" \ - -i "pandas.io.formats.style.Styler.set_uuid PR07,RT03,SA01" \ - -i "pandas.io.formats.style.Styler.text_gradient RT03" \ - -i "pandas.io.formats.style.Styler.to_excel PR01" \ - -i "pandas.io.formats.style.Styler.to_string SA01" \ - -i "pandas.io.formats.style.Styler.use RT03" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ -i "pandas.io.stata.StataReader.data_label SA01" \ -i "pandas.io.stata.StataReader.value_labels RT03,SA01" \ @@ -423,156 +249,103 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ - -i "pandas.tseries.offsets.BQuarterBegin PR02" \ - -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ - -i "pandas.tseries.offsets.BQuarterBegin.nanos GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \ - -i "pandas.tseries.offsets.BQuarterEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.n GL08" \ - -i "pandas.tseries.offsets.BQuarterEnd.nanos GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.normalize GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \ - -i "pandas.tseries.offsets.BYearBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BYearBegin.month GL08" \ -i "pandas.tseries.offsets.BYearBegin.n GL08" \ - -i "pandas.tseries.offsets.BYearBegin.nanos GL08" \ -i "pandas.tseries.offsets.BYearBegin.normalize GL08" \ - -i "pandas.tseries.offsets.BYearBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.BYearEnd PR02" \ - -i "pandas.tseries.offsets.BYearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BYearEnd.month GL08" \ -i "pandas.tseries.offsets.BYearEnd.n GL08" \ - -i "pandas.tseries.offsets.BYearEnd.nanos GL08" \ -i "pandas.tseries.offsets.BYearEnd.normalize GL08" \ - -i "pandas.tseries.offsets.BYearEnd.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessDay PR02,SA01" \ -i "pandas.tseries.offsets.BusinessDay.calendar GL08" \ - -i "pandas.tseries.offsets.BusinessDay.freqstr SA01" \ -i 
"pandas.tseries.offsets.BusinessDay.holidays GL08" \ -i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessDay.n GL08" \ - -i "pandas.tseries.offsets.BusinessDay.nanos GL08" \ -i "pandas.tseries.offsets.BusinessDay.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessDay.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \ -i "pandas.tseries.offsets.BusinessHour PR02,SA01" \ -i "pandas.tseries.offsets.BusinessHour.calendar GL08" \ -i "pandas.tseries.offsets.BusinessHour.end GL08" \ - -i "pandas.tseries.offsets.BusinessHour.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessHour.holidays GL08" \ -i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessHour.n GL08" \ - -i "pandas.tseries.offsets.BusinessHour.nanos GL08" \ -i "pandas.tseries.offsets.BusinessHour.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessHour.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessHour.start GL08" \ -i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \ - -i "pandas.tseries.offsets.BusinessMonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \ - -i "pandas.tseries.offsets.BusinessMonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessMonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.BusinessMonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \ - -i "pandas.tseries.offsets.BusinessMonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessMonthEnd.rule_code GL08" \ -i "pandas.tseries.offsets.CBMonthBegin PR02" \ -i "pandas.tseries.offsets.CBMonthEnd PR02" \ -i "pandas.tseries.offsets.CDay PR02,SA01" \ -i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \ -i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \ - -i "pandas.tseries.offsets.CustomBusinessDay.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessDay.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessDay.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \ -i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \ - -i "pandas.tseries.offsets.CustomBusinessHour.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessHour.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessHour.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.start GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.freqstr SA01" \ -i 
"pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \ - -i "pandas.tseries.offsets.DateOffset PR02" \ - -i "pandas.tseries.offsets.DateOffset.freqstr SA01" \ -i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \ -i "pandas.tseries.offsets.DateOffset.n GL08" \ - -i "pandas.tseries.offsets.DateOffset.nanos GL08" \ -i "pandas.tseries.offsets.DateOffset.normalize GL08" \ - -i "pandas.tseries.offsets.DateOffset.rule_code GL08" \ - -i "pandas.tseries.offsets.Day.freqstr SA01" \ -i "pandas.tseries.offsets.Day.is_on_offset GL08" \ -i "pandas.tseries.offsets.Day.n GL08" \ - -i "pandas.tseries.offsets.Day.nanos SA01" \ -i "pandas.tseries.offsets.Day.normalize GL08" \ - -i "pandas.tseries.offsets.Day.rule_code GL08" \ - -i "pandas.tseries.offsets.Easter PR02" \ - -i "pandas.tseries.offsets.Easter.freqstr SA01" \ -i "pandas.tseries.offsets.Easter.is_on_offset GL08" \ -i "pandas.tseries.offsets.Easter.n GL08" \ - -i "pandas.tseries.offsets.Easter.nanos GL08" \ -i "pandas.tseries.offsets.Easter.normalize GL08" \ - -i "pandas.tseries.offsets.Easter.rule_code GL08" \ - -i "pandas.tseries.offsets.FY5253 PR02" \ - -i "pandas.tseries.offsets.FY5253.freqstr SA01" \ -i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \ -i "pandas.tseries.offsets.FY5253.get_year_end GL08" \ -i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \ -i "pandas.tseries.offsets.FY5253.n GL08" \ - -i "pandas.tseries.offsets.FY5253.nanos GL08" \ -i "pandas.tseries.offsets.FY5253.normalize GL08" \ -i "pandas.tseries.offsets.FY5253.rule_code GL08" \ -i "pandas.tseries.offsets.FY5253.startingMonth GL08" \ -i "pandas.tseries.offsets.FY5253.variation GL08" \ -i "pandas.tseries.offsets.FY5253.weekday GL08" \ - -i "pandas.tseries.offsets.FY5253Quarter PR02" \ - -i "pandas.tseries.offsets.FY5253Quarter.freqstr SA01" \ -i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.n GL08" \ - -i "pandas.tseries.offsets.FY5253Quarter.nanos GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.rule_code GL08" \ @@ -580,139 +353,80 @@ if [[ -z 
"$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \ - -i "pandas.tseries.offsets.Hour PR02" \ - -i "pandas.tseries.offsets.Hour.freqstr SA01" \ -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ -i "pandas.tseries.offsets.Hour.n GL08" \ - -i "pandas.tseries.offsets.Hour.nanos SA01" \ -i "pandas.tseries.offsets.Hour.normalize GL08" \ - -i "pandas.tseries.offsets.Hour.rule_code GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth PR02,SA01" \ - -i "pandas.tseries.offsets.LastWeekOfMonth.freqstr SA01" \ + -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \ -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth.nanos GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth.rule_code GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.week GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \ - -i "pandas.tseries.offsets.Micro PR02" \ - -i "pandas.tseries.offsets.Micro.freqstr SA01" \ -i "pandas.tseries.offsets.Micro.is_on_offset GL08" \ -i "pandas.tseries.offsets.Micro.n GL08" \ - -i "pandas.tseries.offsets.Micro.nanos SA01" \ -i "pandas.tseries.offsets.Micro.normalize GL08" \ - -i "pandas.tseries.offsets.Micro.rule_code GL08" \ - -i "pandas.tseries.offsets.Milli PR02" \ - -i "pandas.tseries.offsets.Milli.freqstr SA01" \ -i "pandas.tseries.offsets.Milli.is_on_offset GL08" \ -i "pandas.tseries.offsets.Milli.n GL08" \ - -i "pandas.tseries.offsets.Milli.nanos SA01" \ -i "pandas.tseries.offsets.Milli.normalize GL08" \ - -i "pandas.tseries.offsets.Milli.rule_code GL08" \ - -i "pandas.tseries.offsets.Minute PR02" \ - -i "pandas.tseries.offsets.Minute.freqstr SA01" \ -i "pandas.tseries.offsets.Minute.is_on_offset GL08" \ -i "pandas.tseries.offsets.Minute.n GL08" \ - -i "pandas.tseries.offsets.Minute.nanos SA01" \ -i "pandas.tseries.offsets.Minute.normalize GL08" \ - -i "pandas.tseries.offsets.Minute.rule_code GL08" \ - -i "pandas.tseries.offsets.MonthBegin PR02" \ - -i "pandas.tseries.offsets.MonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.MonthBegin.n GL08" \ - -i "pandas.tseries.offsets.MonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.MonthBegin.normalize GL08" \ - -i "pandas.tseries.offsets.MonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.MonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.MonthEnd.n GL08" \ - -i "pandas.tseries.offsets.MonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.MonthEnd.normalize GL08" \ - -i "pandas.tseries.offsets.MonthEnd.rule_code GL08" \ - -i "pandas.tseries.offsets.Nano PR02" \ - -i "pandas.tseries.offsets.Nano.freqstr SA01" \ -i "pandas.tseries.offsets.Nano.is_on_offset GL08" \ - -i "pandas.tseries.offsets.Nano.n GL08" \ - -i "pandas.tseries.offsets.Nano.nanos SA01" \ -i "pandas.tseries.offsets.Nano.normalize GL08" \ - -i "pandas.tseries.offsets.Nano.rule_code GL08" \ - -i "pandas.tseries.offsets.QuarterBegin PR02" \ - -i "pandas.tseries.offsets.QuarterBegin.freqstr SA01" \ + -i "pandas.tseries.offsets.Nano.n GL08" \ -i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.QuarterBegin.n GL08" \ - -i "pandas.tseries.offsets.QuarterBegin.nanos GL08" \ -i 
"pandas.tseries.offsets.QuarterBegin.normalize GL08" \ -i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \ -i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \ - -i "pandas.tseries.offsets.QuarterEnd.freqstr SA01" \ -i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.QuarterEnd.n GL08" \ - -i "pandas.tseries.offsets.QuarterEnd.nanos GL08" \ -i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \ -i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \ -i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \ - -i "pandas.tseries.offsets.Second PR02" \ - -i "pandas.tseries.offsets.Second.freqstr SA01" \ -i "pandas.tseries.offsets.Second.is_on_offset GL08" \ -i "pandas.tseries.offsets.Second.n GL08" \ - -i "pandas.tseries.offsets.Second.nanos SA01" \ -i "pandas.tseries.offsets.Second.normalize GL08" \ - -i "pandas.tseries.offsets.Second.rule_code GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin PR02,SA01" \ + -i "pandas.tseries.offsets.SemiMonthBegin SA01" \ -i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \ -i "pandas.tseries.offsets.Tick GL08" \ - -i "pandas.tseries.offsets.Tick.freqstr SA01" \ -i "pandas.tseries.offsets.Tick.is_on_offset GL08" \ -i "pandas.tseries.offsets.Tick.n GL08" \ - -i "pandas.tseries.offsets.Tick.nanos SA01" \ -i "pandas.tseries.offsets.Tick.normalize GL08" \ - -i "pandas.tseries.offsets.Tick.rule_code GL08" \ - -i "pandas.tseries.offsets.Week PR02" \ - -i "pandas.tseries.offsets.Week.freqstr SA01" \ -i "pandas.tseries.offsets.Week.is_on_offset GL08" \ -i "pandas.tseries.offsets.Week.n GL08" \ - -i "pandas.tseries.offsets.Week.nanos GL08" \ -i "pandas.tseries.offsets.Week.normalize GL08" \ - -i "pandas.tseries.offsets.Week.rule_code GL08" \ -i "pandas.tseries.offsets.Week.weekday GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth PR02,SA01" \ - -i "pandas.tseries.offsets.WeekOfMonth.freqstr SA01" \ + -i "pandas.tseries.offsets.WeekOfMonth SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth.nanos GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth.rule_code GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.week GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \ - -i "pandas.tseries.offsets.YearBegin.freqstr SA01" \ -i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.YearBegin.month GL08" \ -i "pandas.tseries.offsets.YearBegin.n GL08" \ - -i "pandas.tseries.offsets.YearBegin.nanos GL08" \ -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ - -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \ - -i 
"pandas.tseries.offsets.YearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.YearEnd.month GL08" \ -i "pandas.tseries.offsets.YearEnd.n GL08" \ - -i "pandas.tseries.offsets.YearEnd.nanos GL08" \ -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ - -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 0c46f476893dd..e670356c95637 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -23,7 +23,6 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - numpy=1.23.5 - - pytz=2020.1 # optional dependencies - beautifulsoup4=4.11.2 @@ -49,6 +48,7 @@ dependencies: - pyreadstat=1.2.0 - pytables=3.8.0 - python-calamine=0.1.7 + - pytz=2023.4 - pyxlsb=1.0.10 - s3fs=2022.11.0 - scipy=1.10.0 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 0af46752f5b3d..c33c0344e742f 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 1a842c7212c1f..8692b6e35ab2d 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -22,7 +22,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -48,6 +47,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 748cfa861ec32..996ce5cd9ab94 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -18,7 +18,6 @@ dependencies: # pandas dependencies - python-dateutil - - pytz - pip - pip: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 469fb1bfb9138..434f1d4f7fed2 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -19,7 +19,6 @@ dependencies: # required dependencies - python-dateutil - numpy<2 - - pytz - pip - pip: diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 75394e2c8e109..8e7d9aba7878d 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index d4b43ddef3601..6c97960a62d40 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ 
dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index b0ae9f1e48473..c157d2e65c001 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -22,6 +22,5 @@ dependencies: # required - numpy - python-dateutil - - pytz - pip: - tzdata>=2022.7 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 18535d81e6985..c86534871b3d2 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/meta.yaml b/ci/meta.yaml index b76bef2f630b7..9d434991b12c1 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -37,7 +37,6 @@ requirements: - numpy >=1.21.6 # [py<311] - numpy >=1.23.2 # [py>=311] - python-dateutil >=2.8.2 - - pytz >=2020.1 - python-tzdata >=2022.7 test: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 28129440b86d7..9d5a992e911b6 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -605,7 +605,7 @@ The ``temp_file`` pytest fixture creates a temporary file :py:class:`Pathlib` ob pd.DataFrame([1]).to_csv(str(temp_file)) Please reference `pytest's documentation `_ -for the file retension policy. +for the file retention policy. Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -762,8 +762,7 @@ install pandas) by typing:: your installation is probably fine and you can start contributing! Often it is worth running only a subset of tests first around your changes before running the -entire suite (tip: you can use the `pandas-coverage app `_) -to find out which tests hit the lines of code you've modified, and then run only those). +entire suite. The easiest way to do this is with:: diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 0b8c1e16dce0e..e174eea00ca60 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -142,7 +142,7 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature ``def func():``. + It has a blank line after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 0691414f53306..643021db7b823 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -227,7 +227,7 @@ To compile pandas with meson, run:: # By default, this will print verbose output # showing the "rebuild" taking place on import (see section below for explanation) # If you do not want to see this, omit everything after --no-build-isolation - python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true .. 
note:: The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream @@ -242,15 +242,15 @@ To compile pandas with meson, run:: It is possible to pass options from the pip frontend to the meson backend if you would like to configure your install. Occasionally, you'll want to use this to adjust the build directory, and/or toggle debug/optimization levels. -You can pass a build directory to pandas by appending ``--config-settings builddir="your builddir here"`` to your pip command. +You can pass a build directory to pandas by appending ``-Cbuilddir="your builddir here"`` to your pip command. This option allows you to configure where meson stores your built C extensions, and allows for fast rebuilds. Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions. -Appending ``--config-settings setup-args="-Ddebug=true"`` will do the trick. +Appending ``-Csetup-args="-Ddebug=true"`` will do the trick. With pip, it is possible to chain together multiple config settings (for example specifying both a build directory and building with debug symbols would look like -``--config-settings builddir="your builddir here" --config-settings=setup-args="-Dbuildtype=debug"``. +``-Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug"``. **Compiling pandas with setup.py** @@ -291,7 +291,7 @@ At this point you may want to try When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified. By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson's -output when importing pandas, you can set the environment variable ``MESONPY_EDTIABLE_VERBOSE``. For example, this would be:: +output when importing pandas, you can set the environment variable ``MESONPY_EDITABLE_VERBOSE``. For example, this would be:: # On Linux/macOS MESONPY_EDITABLE_VERBOSE=1 python @@ -302,7 +302,7 @@ output when importing pandas, you can set the environment variable ``MESONPY_EDT If you would like to see this verbose output every time, you can set the ``editable-verbose`` config setting to ``true`` like so:: - python -m pip install -ve . --config-settings editable-verbose=true + python -m pip install -ve . -Ceditable-verbose=true .. tip:: If you ever find yourself wondering whether setuptools or meson was used to build your pandas, diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst index f09d73fa13b9a..0ea1c112cb55b 100644 --- a/doc/source/development/debugging_extensions.rst +++ b/doc/source/development/debugging_extensions.rst @@ -19,7 +19,7 @@ Debugging locally By default building pandas from source will generate a release build. To generate a development build you can type:: - pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" + pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" .. note:: @@ -30,7 +30,7 @@ By specifying ``builddir="debug"`` all of the targets will be built and placed i Using Docker ------------ -To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locallly. 
+To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locally. You can then mount your pandas repository into this image via: @@ -42,7 +42,7 @@ Inside the image, you can use meson to build/install pandas and place the build .. code-block:: sh - python -m pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" + python -m pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" If planning to use cygdb, the files required by that application are placed within the build folder. So you have to first ``cd`` to the build folder, then start that application. diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index fbcf017d608ce..50d380cab1d50 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -157,7 +157,7 @@ and then run:: git bisect start git bisect good v1.4.0 git bisect bad v1.5.0 - git bisect run bash -c "python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true; python t.py" + git bisect run bash -c "python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true; python t.py" This finds the first commit that changed the behavior. The C extensions have to be rebuilt at every step, so the search can take a while. @@ -165,7 +165,7 @@ rebuilt at every step, so the search can take a while. Exit bisect and rebuild the current version:: git bisect reset - python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true Report your findings under the corresponding issue and ping the commit author to get their input. diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index 9f29f7f4f4406..36ed553d9d88e 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -613,7 +613,7 @@ the pandas-equivalent operations compared to software you already know: Users of `Excel `__ or other spreadsheet programs will find that many of the concepts are - transferrable to pandas. + transferable to pandas. 
+ transferable to pandas.
+++ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 86ce05fde547b..8e6cb9e9a132d 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -205,7 +205,6 @@ Package Minimum support ================================================================ ========================== `NumPy `__ 1.23.5 `python-dateutil `__ 2.8.2 -`pytz `__ 2020.1 `tzdata `__ 2022.7 ================================================================ ========================== @@ -419,3 +418,14 @@ Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= Zstandard 0.19.0 compression Zstandard compression ========================= ================== =============== ============================================================= + +Timezone +^^^^^^^^ + +Installable with ``pip install "pandas[timezone]"`` + +========================= ================== =================== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =================== ============================================================= +pytz 2023.4 timezone Alternative timezone library to ``zoneinfo``. +========================= ================== =================== ============================================================= diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 3dfc6534f2b64..42430fb1fbba0 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -914,7 +914,7 @@ Using TimeGrouper and another grouping to create subgroups, then apply a custom `__ `Resample intraday frame without adding new days -`__ +`__ `Resample minute data `__ diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index be40710a9e307..fa64bce60caf4 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -169,7 +169,7 @@ dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFram implementation when "numpy_nullable" is set, pyarrow is used for all dtypes if "pyarrow" is set. - The dtype_backends are still experimential. + The dtype_backends are still experimental. .. versionadded:: 2.0 @@ -2893,7 +2893,7 @@ Read in the content of the "books.xml" as instance of ``StringIO`` or df Even read XML from AWS S3 buckets such as NIH NCBI PMC Article Datasets providing -Biomedical and Life Science Jorurnals: +Biomedical and Life Science Journals: .. code-block:: python diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f4a55280cd1f1..daecfce6ecebc 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1182,7 +1182,7 @@ "Some styling functions are common enough that we've \"built them in\" to the `Styler`, so you don't have to write them and apply them yourself. The current list of such functions is:\n", "\n", " - [.highlight_null][nullfunc]: for use with identifying missing data. 
\n", - " - [.highlight_min][minfunc] and [.highlight_max][maxfunc]: for use with identifying extremeties in data.\n", + " - [.highlight_min][minfunc] and [.highlight_max][maxfunc]: for use with identifying extremities in data.\n", " - [.highlight_between][betweenfunc] and [.highlight_quantile][quantilefunc]: for use with identifying classes within data.\n", " - [.background_gradient][bgfunc]: a flexible method for highlighting cells based on their, or other, values on a numeric scale.\n", " - [.text_gradient][textfunc]: similar method for highlighting text based on their, or other, values on a numeric scale.\n", diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0845417e4910d..4299dca4774b9 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2569,7 +2569,7 @@ Ambiguous times when localizing because daylight savings time (DST) in a local time zone causes some times to occur twice within one day ("clocks fall back"). The following options are available: -* ``'raise'``: Raises a ``pytz.AmbiguousTimeError`` (the default behavior) +* ``'raise'``: Raises a ``ValueError`` (the default behavior) * ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps * ``'NaT'``: Replaces ambiguous times with ``NaT`` * ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An array-like of ``bool`` values is supported for a sequence of times. @@ -2604,7 +2604,7 @@ A DST transition may also shift the local time ahead by 1 hour creating nonexist local times ("clocks spring forward"). The behavior of localizing a timeseries with nonexistent times can be controlled by the ``nonexistent`` argument. The following options are available: -* ``'raise'``: Raises a ``pytz.NonExistentTimeError`` (the default behavior) +* ``'raise'``: Raises a ``ValueError`` (the default behavior) * ``'NaT'``: Replaces nonexistent times with ``NaT`` * ``'shift_forward'``: Shifts nonexistent times forward to the closest real time * ``'shift_backward'``: Shifts nonexistent times backward to the closest real time diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 9081d13ef2cf1..66eeb74b363a3 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1504,7 +1504,7 @@ Plotting with error bars Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Series.plot`. -Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats: +Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot`. The error values can be specified using a variety of formats: * As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series`. * As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values. diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 1a1ecdd0effee..2f7ec52d117f8 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -18,6 +18,13 @@ Version 3.0 v3.0.0 +Version 2.3 +----------- + +.. 
toctree:: + :maxdepth: 2 + + v2.3.0 Version 2.2 ----------- diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 70982e723016f..1ee7c5cbc6b9e 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -490,7 +490,7 @@ Rolling/expanding moments improvements ``ddof`` argument (with a default value of ``1``) was previously undocumented. (:issue:`8064`) - :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` - now interpret ``min_periods`` in the same manner that the :func:`rolling_*()` and :func:`expanding_*()` functions do: + now interpret ``min_periods`` in the same manner that the :func:`rolling_*` and :func:`expanding_*` functions do: a given result entry will be ``NaN`` if the (expanding, in this case) window does not contain at least ``min_periods`` values. The previous behavior was to set to ``NaN`` the ``min_periods`` entries starting with the first non- ``NaN`` value. (:issue:`7977`) @@ -567,7 +567,7 @@ Rolling/expanding moments improvements .. warning:: - By default (``ignore_na=False``) the :func:`ewm*()` functions' weights calculation + By default (``ignore_na=False``) the :func:`ewm*` functions' weights calculation in the presence of missing values is different than in pre-0.15.0 versions. To reproduce the pre-0.15.0 calculation of weights in the presence of missing values one must specify explicitly ``ignore_na=True``. @@ -576,7 +576,7 @@ Rolling/expanding moments improvements returning results with columns sorted by name and producing an error for non-unique columns; now handles non-unique columns and returns columns in original order (except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`) -- Bug in :func:`rolling_count` and :func:`expanding_*()` functions unnecessarily producing error message for zero-length data (:issue:`8056`) +- Bug in :func:`rolling_count` and :func:`expanding_*` functions unnecessarily producing error message for zero-length data (:issue:`8056`) - Bug in :func:`rolling_apply` and :func:`expanding_apply` interpreting ``min_periods=0`` as ``min_periods=1`` (:issue:`8080`) - Bug in :func:`expanding_std` and :func:`expanding_var` for a single value producing a confusing error message (:issue:`7900`) - Bug in :func:`rolling_std` and :func:`rolling_var` for a single value producing ``0`` rather than ``NaN`` (:issue:`7900`) @@ -875,7 +875,7 @@ Other notable API changes: The behaviour of assigning a column to an existing dataframe as ``df['a'] = i`` remains unchanged (this already returned an ``object`` column with a timezone). -- When passing multiple levels to :meth:`~pandas.DataFrame.stack()`, it will now raise a ``ValueError`` when the +- When passing multiple levels to :meth:`~pandas.DataFrame.stack`, it will now raise a ``ValueError`` when the levels aren't all level names or all level numbers (:issue:`7660`). See :ref:`Reshaping by stacking and unstacking `. @@ -1110,7 +1110,7 @@ Other: - ``DataFrame.fillna`` can now accept a ``DataFrame`` as a fill value (:issue:`8377`) -- Passing multiple levels to :meth:`~pandas.DataFrame.stack()` will now work when multiple level +- Passing multiple levels to :meth:`~pandas.DataFrame.stack` will now work when multiple level numbers are passed (:issue:`7660`). See :ref:`Reshaping by stacking and unstacking `. 
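The ``doc/source/user_guide/timeseries.rst`` hunk above changes the documented ``ambiguous``/``nonexistent`` behavior from pytz exceptions to ``ValueError``. A minimal sketch of that behavior, assuming a pandas build that includes this change; the timestamps are illustrative, chosen around the 2015 US/Eastern DST transitions:

.. code-block:: python

    import pandas as pd

    # "Clocks fall back": 01:30 on 2015-11-01 occurs twice in US/Eastern,
    # so localizing it is ambiguous.
    try:
        pd.Timestamp("2015-11-01 01:30:00").tz_localize(
            "US/Eastern", ambiguous="raise"
        )
    except ValueError as err:  # previously pytz.AmbiguousTimeError
        print(err)

    # "Clocks spring forward": 02:30 on 2015-03-08 does not exist.
    try:
        pd.Timestamp("2015-03-08 02:30:00").tz_localize(
            "US/Eastern", nonexistent="raise"
        )
    except ValueError as err:  # previously pytz.NonExistentTimeError
        print(err)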
diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index 765201996d544..f16c9b3f5d45b 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -263,7 +263,7 @@ Enhancements - Raise errors in certain aggregation cases where an argument such as ``numeric_only`` is not handled (:issue:`8592`). -- Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download()` (:issue:`8482`) +- Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download` (:issue:`8482`) - World Bank data requests now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index dad69b99ee6a4..43719cd53b0ff 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -318,7 +318,7 @@ New keywords - :func:`Series.set_axis` and :func:`DataFrame.set_axis` now support the ``inplace`` parameter. (:issue:`14636`) - :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) -- :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) +- :func:`DataFrame.clip` and :func:`Series.clip` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`) @@ -977,10 +977,10 @@ Other API changes Deprecations ~~~~~~~~~~~~ -- :meth:`DataFrame.from_csv` and :meth:`Series.from_csv` have been deprecated in favor of :func:`read_csv()` (:issue:`4191`) -- :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). -- :func:`read_excel()` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`) -- :func:`read_csv()` has deprecated the ``tupleize_cols`` argument. Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`) +- :meth:`DataFrame.from_csv` and :meth:`Series.from_csv` have been deprecated in favor of :func:`read_csv` (:issue:`4191`) +- :func:`read_excel` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- :func:`read_excel` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`) +- :func:`read_csv` has deprecated the ``tupleize_cols`` argument. Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`) - :meth:`DataFrame.to_csv` has deprecated the ``tupleize_cols`` argument. 
MultiIndex columns will be always written as rows in the CSV file (:issue:`17060`) - The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`) - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). @@ -1045,7 +1045,7 @@ return the position of the maximum or minimum. Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- :func:`read_excel` has dropped the ``has_index_names`` parameter (:issue:`10967`) - The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) - The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) @@ -1154,7 +1154,7 @@ GroupBy/resample/rolling - Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) -- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) +- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile` and :func:`DataFrame.quantile` (:issue:`9413`, :issue:`16211`) - Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) - Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`) - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`) @@ -1194,7 +1194,7 @@ Reshaping Numeric ^^^^^^^ - Bug in ``.clip()`` with ``axis=1`` and a list-like for ``threshold`` is passed; previously this raised ``ValueError`` (:issue:`15390`) -- :func:`Series.clip()` and :func:`DataFrame.clip()` now treat NA values for upper and lower arguments as ``None`` instead of raising ``ValueError`` (:issue:`17276`). +- :func:`Series.clip` and :func:`DataFrame.clip` now treat NA values for upper and lower arguments as ``None`` instead of raising ``ValueError`` (:issue:`17276`). Categorical diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index e217e1a75efc5..a8f9f9c9e0840 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -141,7 +141,7 @@ IO Plotting ^^^^^^^^ -- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`) +- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not picklable in Python 3 (:issue:`18439`) GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -176,7 +176,7 @@ Categorical String ^^^^^^ -- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) +- :meth:`Series.str.split` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) .. 
_whatsnew_0.21.1.contributors: diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 663b47a4d2d55..7f7609edc27b6 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -574,7 +574,7 @@ Other enhancements - :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`). - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`) - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) -- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) +- :func:`read_excel` has gained the ``nrows`` parameter (:issue:`16645`) - :meth:`DataFrame.append` can now in more cases preserve the type of the calling dataframe's columns (e.g. if both are ``CategoricalIndex``) (:issue:`18359`) - :meth:`DataFrame.to_json` and :meth:`Series.to_json` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) - ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`) @@ -1092,10 +1092,10 @@ Other API changes - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) -- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) +- Rearranged the order of keyword arguments in :func:`read_excel` to align with :func:`read_csv` (:issue:`16672`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. 
Now they are cast to numeric if possible (:issue:`17627`) - In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) -- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) +- Rearranged the order of keyword arguments in :func:`read_excel` to align with :func:`read_csv` (:issue:`16672`) - The options ``html.border`` and ``mode.use_inf_as_null`` were deprecated in prior versions, these will now show ``FutureWarning`` rather than a ``DeprecationWarning`` (:issue:`19003`) - :class:`IntervalIndex` and ``IntervalDtype`` no longer support categorical, object, and string subtypes (:issue:`19016`) - ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`) @@ -1207,7 +1207,7 @@ Performance improvements - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) - Improved performance of :func:`Series.dt.time` and :func:`DatetimeIndex.time` (:issue:`18461`) -- Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) +- Improved performance of :func:`IntervalIndex.symmetric_difference` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) @@ -1326,7 +1326,7 @@ Timedelta - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) - Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`, :issue:`19834`) -- Bug in :func:`Timedelta.total_seconds()` causing precision errors, for example ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) +- Bug in :func:`Timedelta.total_seconds` causing precision errors, for example ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - Bug in :func:`Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) - Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mismatch (:issue:`19333`) - Bug in indexing a :class:`TimedeltaIndex` with a ``np.timedelta64`` object which was raising a ``TypeError`` (:issue:`20393`) @@ -1430,12 +1430,12 @@ IO - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) - Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. 
Now it gives an ``EmptyDataError`` (:issue:`18184`) -- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) -- Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`) -- Bug in :func:`DataFrame.to_latex()` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`) -- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`) -- Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`) -- Bug in :func:`DataFrame.to_latex()` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) +- Bug in :func:`DataFrame.to_latex` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) +- Bug in :func:`DataFrame.to_latex` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`) +- Bug in :func:`DataFrame.to_latex` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`) +- Bug in :func:`DataFrame.to_latex` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`) +- Bug in :func:`DataFrame.to_latex` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`) +- Bug in :func:`DataFrame.to_latex` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index 685fe1b3836bf..a98933e7f5969 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -106,7 +106,7 @@ Bug fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws ``TypeError`` on Python 3.5.2 (:issue:`21078`) +- Bug in :meth:`Series.str.replace` where the method throws ``TypeError`` on Python 3.5.2 (:issue:`21078`) - Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 8ddc8e5d058ca..60e77a8c5d8c5 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -418,7 +418,7 @@ Other enhancements - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. 
See :ref:`timeseries.timezone_nonexistent` (:issue:`8917`, :issue:`24466`) - :meth:`Index.difference`, :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference` now have an optional ``sort`` parameter to control whether the results should be sorted if possible (:issue:`17839`, :issue:`24471`) -- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) +- :meth:`read_excel` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) @@ -723,8 +723,8 @@ Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` The time values in :class:`Period` and :class:`PeriodIndex` objects are now set to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp` with ``how='end'`` (:issue:`17157`) *Previous behavior*: @@ -1289,15 +1289,15 @@ ways of adding operator support. - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) - :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- :meth:`Series.combine` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine` with scalar argument now works for any function type (:issue:`21248`) - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`). - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). -- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). +- Bug when grouping :meth:`Dataframe.groupby` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). - Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`). 
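The ``end_time`` values described in the ``v0.24.0`` hunk above are easy to check interactively. A small sketch, assuming any pandas version from 0.24 onward, where that change landed:

.. code-block:: python

    import pandas as pd

    p = pd.Period("2017-01", freq="M")

    # Both resolve to the last nanosecond of the period:
    # Timestamp('2017-01-31 23:59:59.999999999')
    print(p.end_time)
    print(p.to_timestamp(how="end"))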
@@ -1586,7 +1586,7 @@ Categorical - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). - In :meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`) -- Bug when resampling :meth:`DataFrame.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) +- Bug when resampling :meth:`DataFrame.resample` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) @@ -1656,7 +1656,7 @@ Timedelta - Fixed bug in adding a :class:`DataFrame` with all-``timedelta64[ns]`` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) - Bug in :class:`TimedeltaIndex` where adding a timezone-aware datetime scalar incorrectly returned a timezone-naive :class:`DatetimeIndex` (:issue:`23215`) - Bug in :class:`TimedeltaIndex` where adding ``np.timedelta64('NaT')`` incorrectly returned an all-``NaT`` :class:`DatetimeIndex` instead of an all-``NaT`` :class:`TimedeltaIndex` (:issue:`23215`) -- Bug in :class:`Timedelta` and :func:`to_timedelta()` have inconsistencies in supported unit string (:issue:`21762`) +- Bug in :class:`Timedelta` and :func:`to_timedelta` have inconsistencies in supported unit string (:issue:`21762`) - Bug in :class:`TimedeltaIndex` division where dividing by another :class:`TimedeltaIndex` raised ``TypeError`` instead of returning a :class:`Float64Index` (:issue:`23829`, :issue:`22631`) - Bug in :class:`TimedeltaIndex` comparison operations where comparing against non-``Timedelta``-like objects would raise ``TypeError`` instead of returning all-``False`` for ``__eq__`` and all-``True`` for ``__ne__`` (:issue:`24056`) - Bug in :class:`Timedelta` comparisons when comparing with a ``Tick`` object incorrectly raising ``TypeError`` (:issue:`24710`) @@ -1803,39 +1803,39 @@ IO - Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`) - Bug in :meth:`DataFrame.to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`) - Bug in :meth:`DataFrame.to_sql` where a naive :class:`DatetimeIndex` would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`) -- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) -- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. 
(:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) -- Bug in :func:`read_csv()` in unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :func:`read_sas()` in which an incorrect error was raised on an invalid file format. (:issue:`24548`) +- Bug in :meth:`read_excel` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) +- :func:`read_html` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv` and :func:`read_table` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv` will correctly parse timezone-aware datetimes (:issue:`22256`) +- Bug in :func:`read_csv` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) +- Bug in :func:`read_csv` in unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`) +- :func:`read_sas` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :func:`read_sas` in which an incorrect error was raised on an invalid file format. (:issue:`24548`) - Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) -- Bug in :func:`DataFrame.to_html()` with ``index=False`` misses truncation indicators (...) 
on truncated DataFrame (:issue:`15019`, :issue:`22783`) -- Bug in :func:`DataFrame.to_html()` with ``index=False`` when both columns and row index are ``MultiIndex`` (:issue:`22579`) -- Bug in :func:`DataFrame.to_html()` with ``index_names=False`` displaying index name (:issue:`22747`) -- Bug in :func:`DataFrame.to_html()` with ``header=False`` not displaying row index names (:issue:`23788`) -- Bug in :func:`DataFrame.to_html()` with ``sparsify=False`` that caused it to raise ``TypeError`` (:issue:`22887`) -- Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) -- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) +- Bug in :func:`DataFrame.to_html` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`) +- Bug in :func:`DataFrame.to_html` with ``index=False`` when both columns and row index are ``MultiIndex`` (:issue:`22579`) +- Bug in :func:`DataFrame.to_html` with ``index_names=False`` displaying index name (:issue:`22747`) +- Bug in :func:`DataFrame.to_html` with ``header=False`` not displaying row index names (:issue:`23788`) +- Bug in :func:`DataFrame.to_html` with ``sparsify=False`` that caused it to raise ``TypeError`` (:issue:`22887`) +- Bug in :func:`DataFrame.to_string` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`DataFrame.to_string` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
- :class:`HDFStore` will raise ``ValueError`` when the ``format`` kwarg is passed to the constructor (:issue:`13291`) - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) -- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) -- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) -- Bug in :func:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) -- Bug in :func:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) -- Bug in :func:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) -- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) -- Bug in :meth:`read_excel()` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) -- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) -- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) +- Bug in :func:`read_csv` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) +- Bug in :func:`read_csv` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) +- Bug in :func:`read_csv` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) +- Bug in :func:`read_csv` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) +- Bug in :func:`read_html` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) +- Bug in :meth:`read_excel` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) +- Bug in :meth:`read_excel` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) +- Bug in :meth:`read_excel` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) +- Bug in :meth:`read_excel` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) - Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) -- :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) +- :func:`DataFrame.to_string`, :func:`DataFrame.to_html`, :func:`DataFrame.to_latex` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) - Bug in 
:func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) - Bug in :func:`read_csv` that caused the C engine on Python 3.6+ on Windows to improperly read CSV filenames with accented or special characters (:issue:`15086`) - Bug in :func:`read_fwf` in which the compression type of a file was not being properly inferred (:issue:`22199`) @@ -1843,7 +1843,7 @@ IO - Bug in :meth:`DataFrame.to_stata`, :class:`pandas.io.stata.StataWriter` and :class:`pandas.io.stata.StataWriter117` where a exception would leave a partially written and invalid dta file (:issue:`23573`) - Bug in :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` that produced invalid files when using strLs with non-ASCII characters (:issue:`23573`) - Bug in :class:`HDFStore` that caused it to raise ``ValueError`` when reading a Dataframe in Python 3 from fixed format written in Python 2 (:issue:`24510`) -- Bug in :func:`DataFrame.to_string()` and more generally in the floating ``repr`` formatter. Zeros were not trimmed if ``inf`` was present in a columns while it was the case with NA values. Zeros are now trimmed as in the presence of NA (:issue:`24861`). +- Bug in :func:`DataFrame.to_string` and more generally in the floating ``repr`` formatter. Zeros were not trimmed if ``inf`` was present in a columns while it was the case with NA values. Zeros are now trimmed as in the presence of NA (:issue:`24861`). - Bug in the ``repr`` when truncating the number of columns and having a wide last column (:issue:`24849`). Plotting diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 9a8c2ee5d00fa..d8f2e17cb9e4f 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -22,7 +22,7 @@ Fixed regressions - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`) - Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`) -- Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) +- Fixed regression in :meth:`DataFrame.duplicated`, where empty dataframe was not returning a boolean dtyped Series. 
(:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ``Categorical`` data (:issue:`25299`) - Fixed regression in subtraction between :class:`Series` objects with ``datetime64[ns]`` dtype incorrectly raising ``OverflowError`` when the ``Series`` on the right contains null values (:issue:`25317`) - Fixed regression in :class:`TimedeltaIndex` where ``np.sum(index)`` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 50be28a912cf6..bddb47cd3f629 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -988,7 +988,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) -- :meth:`DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) +- :meth:`DataFrame.to_stata` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`.GroupBy.quantile` (:issue:`20405`) @@ -1088,7 +1088,7 @@ Numeric Conversion ^^^^^^^^^^ -- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. (:issue:`25905`) +- Bug in :func:`DataFrame.astype` when passing a dict of columns and types the ``errors`` parameter was ignored. 
(:issue:`25905`) - Strings @@ -1146,7 +1146,7 @@ MultiIndex IO ^^ -- Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) +- Bug in :func:`DataFrame.to_html` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) @@ -1159,7 +1159,7 @@ IO - Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`) - Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) -- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) +- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested workarounds (:issue:`25772`) - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) - Improved the ``col_space`` parameter in :meth:`DataFrame.to_html` to accept a string so CSS length values can be set correctly (:issue:`25941`) - Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`) @@ -1171,7 +1171,7 @@ IO - Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). -- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. ``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) +- Fixed bug in :func:`DataFrame.to_excel` where custom objects (i.e. ``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) - Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) - Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`) - Fixed bug in :meth:`DataFrame.to_parquet` which would raise a ``ValueError`` when the dataframe had no columns (:issue:`27339`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5dbf6f1c60598..98cb9c4ad7b45 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -900,7 +900,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Removed ``pandas.plotting._matplotlib.tsplot``, use :meth:`Series.plot` instead (:issue:`19980`) - ``pandas.tseries.converter.register`` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`) - :meth:`Series.plot` no longer accepts positional arguments, pass keyword arguments instead (:issue:`30003`) -- :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allows ``figsize="default"``, specify figure size by passinig a tuple instead (:issue:`30003`) +- :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allows ``figsize="default"``, specify figure size by passing a tuple instead (:issue:`30003`) - Floordiv of integer-dtyped array by :class:`Timedelta` now raises ``TypeError`` (:issue:`21036`) - :class:`TimedeltaIndex` and :class:`DatetimeIndex` no longer accept non-nanosecond dtype strings like "timedelta64" or "datetime64", use "timedelta64[ns]" and "datetime64[ns]" instead (:issue:`24806`) - Changed the default "skipna" argument in :func:`pandas.api.types.infer_dtype` from ``False`` to ``True`` (:issue:`24050`) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 37d021efddf0b..b199b113d26f2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1083,7 +1083,7 @@ IO timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :func:`read_csv` was raising ``TypeError`` when ``sep=None`` was used in combination with ``comment`` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a :class:`DataFrame` in Python 3 from fixed format written in Python 2 (:issue:`31750`) -- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) +- :func:`read_sas` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) - :func:`read_csv` will raise a ``ValueError`` when the column names passed in ``parse_dates`` are missing in the :class:`Dataframe` (:issue:`31251`) - Bug in :func:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) @@ -1174,13 +1174,13 @@ Reshaping - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`) - Bug in :meth:`DataFrame.unstack` when :class:`MultiIndex` columns and :class:`MultiIndex` rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) - Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a :class:`Series` if ignore_index=True or if the :class:`Series` has a name`` (:issue:`30871`) -- Bug in :meth:`DataFrame.corrwith()`, :meth:`DataFrame.memory_usage()`, :meth:`DataFrame.dot()`, - :meth:`DataFrame.idxmin()`, :meth:`DataFrame.idxmax()`, :meth:`DataFrame.duplicated()`, :meth:`DataFrame.isin()`, - :meth:`DataFrame.count()`, :meth:`Series.explode()`, :meth:`Series.asof()` and :meth:`DataFrame.asof()` not +- Bug in :meth:`DataFrame.corrwith`, :meth:`DataFrame.memory_usage`, :meth:`DataFrame.dot`, + :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.duplicated`, 
:meth:`DataFrame.isin`, + :meth:`DataFrame.count`, :meth:`Series.explode`, :meth:`Series.asof` and :meth:`DataFrame.asof` not returning subclassed types. (:issue:`31331`) - Bug in :func:`concat` was not allowing for concatenation of :class:`DataFrame` and :class:`Series` with duplicate keys (:issue:`33654`) - Bug in :func:`cut` raised an error when the argument ``labels`` contains duplicates (:issue:`33141`) -- Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) +- Ensure only named functions can be used in :func:`eval` (:issue:`32460`) - Bug in :meth:`Dataframe.aggregate` and :meth:`Series.aggregate` was causing a recursive loop in some cases (:issue:`34224`) - Fixed bug in :func:`melt` where melting :class:`MultiIndex` columns with ``col_level > 0`` would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) - Bug in :meth:`Series.where` with an empty :class:`Series` and empty ``cond`` having non-bool dtype (:issue:`34592`) @@ -1203,7 +1203,7 @@ ExtensionArray - Fixed bug in :func:`concat` when concatenating :class:`DataFrame` objects with non-overlapping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) - Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`) -- Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). +- Fixed bug that caused :meth:`Series.__repr__` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). - Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`) - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable Boolean dtypes (:issue:`34051`) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 25e616dcdf37f..cb344cc728566 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -22,7 +22,7 @@ Fixed regressions - Fixed regression in ``DataFrame.__setitem__`` raising ``ValueError`` when expanding :class:`DataFrame` and new column is from type ``"0 - name"`` (:issue:`39010`) - Fixed regression in setting with :meth:`DataFrame.loc` raising ``ValueError`` when :class:`DataFrame` has unsorted :class:`MultiIndex` columns and indexer is a scalar (:issue:`38601`) - Fixed regression in setting with :meth:`DataFrame.loc` raising ``KeyError`` with :class:`MultiIndex` and list-like columns indexer enlarging :class:`DataFrame` (:issue:`39147`) -- Fixed regression in :meth:`~DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) +- Fixed regression in :meth:`~DataFrame.groupby` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) - Fixed regression in :meth:`.DataFrameGroupBy.sem` and :meth:`.SeriesGroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`) - Fixed regression in :meth:`.DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`) - Fixed regression in 
:meth:`DataFrame.groupby` when aggregating an ``ExtensionDType`` that could fail for non-numeric values (:issue:`38980`) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 17aab87b93f8e..0e2d487a89ff5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -975,7 +975,7 @@ Numeric - Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`) - Bug in :meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`) - Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`) -- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) +- Bug in :meth:`DataFrame.agg` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) - Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) Conversion diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 91953f693190c..7b1aef07e5f00 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -1045,7 +1045,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :func:`merge` raising ``KeyError`` when joining over differently named indexes with on keywords (:issue:`45094`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) -- Bug in :meth:`MultiIndex.join()` with overlapping ``IntervalIndex`` levels (:issue:`44096`) +- Bug in :meth:`MultiIndex.join` with overlapping ``IntervalIndex`` levels (:issue:`44096`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` results is different ``dtype`` based on ``regex`` parameter (:issue:`44864`) - Bug in :meth:`DataFrame.pivot` with ``index=None`` when the :class:`DataFrame` index was a :class:`MultiIndex` (:issue:`23955`) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index cacbf8452ba32..ddcd69c3fd962 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1130,7 +1130,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Performance improvement in :meth:`Series.median` for nullable dtypes (:issue:`50838`) -- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) +- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`) - Performance improvement in :func:`isna` and :func:`isnull` (:issue:`50658`) - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst new file mode 100644 
index 0000000000000..03355f655eb28 --- /dev/null +++ b/doc/source/whatsnew/v2.3.0.rst @@ -0,0 +1,178 @@ +.. _whatsnew_230: + +What's new in 2.3.0 (Month XX, 2024) +------------------------------------ + +These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_230.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. _whatsnew_230.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_230.enhancements.enhancement1: + +enhancement1 +^^^^^^^^^^^^ + + +.. _whatsnew_230.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1: + +notable_bug_fix1 +^^^^^^^^^^^^^^^^ + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.deprecations: + +Deprecations +~~~~~~~~~~~~ +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ +- +- + +Timezones +^^^^^^^^^ +- +- + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + + +Interval +^^^^^^^^ +- +- + +Indexing +^^^^^^^^ +- +- + +Missing +^^^^^^^ +- +- + +MultiIndex +^^^^^^^^^^ +- +- + +I/O +^^^ +- +- + +Period +^^^^^^ +- +- + +Plotting +^^^^^^^^ +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- +- + +Reshaping +^^^^^^^^^ +- +- + +Sparse +^^^^^^ +- +- + +ExtensionArray +^^^^^^^^^^^^^^ +- +- + +Styler +^^^^^^ +- +- + +Other +^^^^^ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ee9d18d0c7ce2..cd353b60d1a6e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1,4 +1,4 @@ -.. _whatsnew_230: +.. 
_whatsnew_300: What's new in 3.0.0 (Month XX, 2024) ------------------------------------ @@ -31,6 +31,8 @@ Other enhancements - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) +- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) +- :func:`read_spss` now supports passing ``kwargs`` to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). - :meth:`Index.get_loc` now also accepts subclasses of ``tuple`` as keys (:issue:`57922`) @@ -42,6 +44,7 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) @@ -50,9 +53,11 @@ Other enhancements - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) +- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) @@ -221,6 +226,8 @@ Optional libraries below the lowest tested version may still work, but are not c +------------------------+---------------------+ | Package | New Minimum Version | 
+========================+=====================+ +| pytz | 2023.4 | ++------------------------+---------------------+ | fastparquet | 2023.10.0 | +------------------------+---------------------+ | adbc-driver-postgresql | 0.10.0 | @@ -230,6 +237,37 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +.. _whatsnew_300.api_breaking.pytz: + +``pytz`` now an optional dependency +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas now uses :py:mod:`zoneinfo` from the standard library as the default timezone implementation when passing a timezone +string to various methods. (:issue:`34916`) + +*Old behavior:* + +.. code-block:: ipython + + In [1]: ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific") In [2]: ts.tz + Out[2]: <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD> + +*New behavior:* + +.. ipython:: python + + ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific") + ts.tz + +``pytz`` timezone objects are still supported when passed directly, but they will no longer be returned by default +from string inputs. Moreover, ``pytz`` is no longer a required dependency of pandas, but can be installed +with the pip extra ``pip install pandas[timezone]``. + + +Additionally, pandas no longer throws ``pytz`` exceptions for timezone operations leading to ambiguous or nonexistent +times. These cases will now raise a ``ValueError``. .. _whatsnew_300.api_breaking.other: Other API changes @@ -466,9 +504,11 @@ Performance improvements - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`) +- Performance improvement in :class:`MultiIndex` when setting :attr:`MultiIndex.names` doesn't invalidate all cached operations (:issue:`59578`) - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`) - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`) - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`) +- Performance improvement in :meth:`DataFrame.to_csv` when ``index=False`` (:issue:`59312`) - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`) - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) @@ -488,6 +528,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`to_hdf` avoiding unnecessary reopenings of the HDF5 file to speed up data addition to files with a very large number of groups. (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) @@ -509,7 +550,7 @@ Datetimelike - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) -- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) +- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) - Bug in :meth:`DataFrame.agg` with df with missing values resulting in IndexError (:issue:`58810`) @@ -518,6 +559,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) +- Bug in :meth:`to_datetime` reporting an incorrect index in case of any failure scenario. (:issue:`58298`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta @@ -598,6 +640,7 @@ Period Plotting ^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`) +- Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when setting both color and a ``dict`` style (:issue:`59461`) - Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`) - Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`) @@ -616,11 +659,14 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. 
(:issue:`58868`) +- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping ^^^^^^^^^ - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) +- Bug in :meth:`DataFrame.join` where a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) +- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`) @@ -633,6 +679,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) +- Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) +- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler ^^^^^^ - @@ -649,6 +696,7 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the ``tan`` function to be used. (:issue:`55091`) +- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F); see the sketch after this list. (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. 
(:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would raise a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
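An illustrative aside, not part of the change set above: a minimal sketch of the backtick quoting that the :meth:`DataFrame.query` fix in this list addresses. The frame and its column names are hypothetical.

.. code-block:: python

    import pandas as pd

    # Hypothetical frame whose column names need backtick quoting in query()
    df = pd.DataFrame({"total sales": [3, 9], "a#b": [1, 2]})

    # With the fix, names containing characters such as '#' can be
    # backtick-quoted just like names containing spaces
    print(df.query("`total sales` > 5 and `a#b` == 2"))

diff --git a/environment.yml b/environment.yml index e5646af07c45c..34bc0591ca8df 100644 --- a/environment.yml +++ b/environment.yml @@ -24,7 +24,6 @@ dependencies: # required dependencies - python-dateutil - numpy<2 - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -50,6 +49,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/pandas/__init__.py b/pandas/__init__.py index 3ee6f6abf97bf..6c97baa890777 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -3,7 +3,7 @@ __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies -_hard_dependencies = ("numpy", "pytz", "dateutil") +_hard_dependencies = ("numpy", "dateutil") _missing_dependencies = [] for _dependency in _hard_dependencies: @@ -28,8 +28,8 @@ raise ImportError( f"C extension: {_module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python -m pip install -ve . --no-build-isolation --config-settings " - "editable-verbose=true' to build the C extensions first." + "'python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true' " + "to build the C extensions first." ) from _err from pandas._config import ( diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 51794ec04b29e..4ed2d4c3be692 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -426,6 +426,11 @@ def option_context(*args) -> Generator[None, None, None]: None No return value. + Yields + ------ + None + No yield value. + See Also -------- get_option : Retrieve the value of the specified option. diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..2932f3ff56396 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -67,6 +67,10 @@ cdef class NDArrayBacked: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. + The returned array has the same dtype as self. + + Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`. 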
+ This should round-trip: self == self._from_backing_data(self._ndarray) """ diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 97fae1d6480ce..b5ae5a3440f39 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -30,7 +30,10 @@ from pandas._libs.khash cimport ( kh_python_hash_func, khiter_t, ) -from pandas._libs.missing cimport checknull +from pandas._libs.missing cimport ( + checknull, + is_matching_na, +) def get_hashtable_trace_domain(): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5c6254c6a1ec7..210df09f07db6 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1171,11 +1171,13 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + bint non_null_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) # assign pointers and pre-filter out missing (if ignore_na) vecs = <const char **>malloc(n * sizeof(char *)) @@ -1186,7 +1188,12 @@ if (ignore_na and (not isinstance(val, str) - or (use_na_value and val == na_value))): + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value))) + ) + ) + ): # if missing values do not count as unique values (i.e. if # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly @@ -1452,10 +1459,11 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k bint use_na_value - + bint non_null_na_value if return_inverse: labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) for i in range(n): val = values[i] @@ -1463,7 +1471,11 @@ if ignore_na and ( checknull(val) - or (use_na_value and val == na_value) + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value)) + ) + ) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, and
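An illustrative aside, not part of the patch: the hashtables above stop comparing a null ``na_value`` with ``==`` because NaN-like sentinels never compare equal to themselves; that is exactly the case ``is_matching_na`` handles. A minimal sketch of the pitfall:

.. code-block:: python

    import numpy as np

    nan = float("nan")
    print(nan == nan)        # False: '==' can never match a NaN sentinel
    print(np.nan == np.nan)  # False as well, hence the is_matching_na branch

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 05c4e7bd5e9dc..99737776ff59f 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,9 +1,13 @@ from collections import defaultdict -import weakref cimport cython +from cpython.object cimport PyObject from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.slice cimport PySlice_GetIndicesEx +from cpython.weakref cimport ( + PyWeakref_GetObject, + PyWeakref_NewRef, +) from cython cimport Py_ssize_t import numpy as np @@ -26,6 +30,10 @@ from pandas._libs.util cimport ( ) +cdef extern from "Python.h": + PyObject* Py_None + + @cython.final @cython.freelist(32) cdef class BlockPlacement: @@ -746,7 +754,7 @@ cdef class BlockManager: # ------------------------------------------------------------------- # Block Placement - def _rebuild_blknos_and_blklocs(self) -> None: + cpdef _rebuild_blknos_and_blklocs(self): """ Update mgr._blknos / mgr._blklocs. 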
""" @@ -890,19 +898,20 @@ cdef class BlockValuesRefs: def __cinit__(self, blk: Block | None = None) -> None: if blk is not None: - self.referenced_blocks = [weakref.ref(blk)] + self.referenced_blocks = [PyWeakref_NewRef(blk, None)] else: self.referenced_blocks = [] self.clear_counter = 500 # set reasonably high - def _clear_dead_references(self, force=False) -> None: + cdef _clear_dead_references(self, bint force=False): # Use exponential backoff to decide when we want to clear references # if force=False. Clearing for every insertion causes slowdowns if # all these objects stay alive, e.g. df.items() for wide DataFrames # see GH#55245 and GH#55008 if force or len(self.referenced_blocks) > self.clear_counter: self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None + ref for ref in self.referenced_blocks + if PyWeakref_GetObject(ref) != Py_None ] nr_of_refs = len(self.referenced_blocks) if nr_of_refs < self.clear_counter // 2: @@ -910,7 +919,7 @@ cdef class BlockValuesRefs: elif nr_of_refs > self.clear_counter: self.clear_counter = max(self.clear_counter * 2, nr_of_refs) - def add_reference(self, blk: Block) -> None: + cpdef add_reference(self, Block blk): """Adds a new reference to our reference collection. Parameters @@ -919,7 +928,7 @@ cdef class BlockValuesRefs: The block that the new references should point to. """ self._clear_dead_references() - self.referenced_blocks.append(weakref.ref(blk)) + self.referenced_blocks.append(PyWeakref_NewRef(blk, None)) def add_index_reference(self, index: object) -> None: """Adds a new reference to our reference collection when creating an index. @@ -930,7 +939,7 @@ cdef class BlockValuesRefs: The index that the new reference should point to. """ self._clear_dead_references() - self.referenced_blocks.append(weakref.ref(index)) + self.referenced_blocks.append(PyWeakref_NewRef(index, None)) def has_reference(self) -> bool: """Checks if block has foreign references. diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 489d4fa111d40..e1a2a0142c52e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(na_value=np.nan) + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index f854f7b9210d8..cc65f34d6b6fe 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -20,14 +20,12 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API -#include - #include "pandas/vendored/numpy/datetime/np_datetime.h" - #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY #include #include +#include #if defined(_WIN32) #ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS @@ -58,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler"); #endif #endif +#define XSTR(a) STR(a) +#define STR(a) #a + #define PD_CHECK_OVERFLOW(FUNC) \ do { \ if ((FUNC) != 0) { \ PyGILState_STATE gstate = PyGILState_Ensure(); \ PyErr_SetString(PyExc_OverflowError, \ - "Overflow occurred in npy_datetimestruct_to_datetime"); \ + "Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \ PyGILState_Release(gstate); \ return -1; \ } \ @@ -139,8 +140,8 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { npy_int64 year, days = 0; const int *month_lengths; - year = dts->year - 1970; - days = year * 365; + PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year)); + PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days)); /* Adjust for leap years */ if (days >= 0) { @@ -148,32 +149,32 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { * 1968 is the closest leap year before 1970. * Exclude the current year, so add 1. */ - year += 1; + PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year)); /* Add one day for each 4 years */ - days += year / 4; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); /* 1900 is the closest previous year divisible by 100 */ - year += 68; + PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year)); /* Subtract one day for each 100 years */ - days -= year / 100; + PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); /* 1600 is the closest previous year divisible by 400 */ - year += 300; + PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year)); /* Add one day for each 400 years */ - days += year / 400; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); } else { /* * 1972 is the closest later year after 1970. * Include the current year, so subtract 2. */ - year -= 2; + PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year)); /* Subtract one day for each 4 years */ - days += year / 4; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); /* 2000 is the closest later year divisible by 100 */ - year -= 28; + PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year)); /* Add one day for each 100 years */ - days -= year / 100; + PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); /* 2000 is also the closest later year divisible by 400 */ /* Subtract one day for each 400 years */ - days += year / 400; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); } month_lengths = days_per_month_table[is_leapyear(dts->year)]; @@ -181,11 +182,11 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { /* Add the months */ for (i = 0; i < month; ++i) { - days += month_lengths[i]; + PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days)); } /* Add the days */ - days += dts->day - 1; + PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days)); return days; } @@ -430,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, } const int64_t days = get_datetimestruct_days(dts); + if (days == -1) { + PyGILState_STATE gstate = PyGILState_Ensure(); + bool did_error = PyErr_Occurred() == NULL ? 
false : true; + PyGILState_Release(gstate); + if (did_error) { + return -1; + } + } + if (base == NPY_FR_D) { return days; }
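An illustrative aside, not part of the patch: the ``checked_int64_*`` helpers used in the hunks above report failure through a nonzero return code, which ``PD_CHECK_OVERFLOW`` turns into an ``OverflowError``. A rough Python rendering of that contract, for illustration only:

.. code-block:: python

    INT64_MAX = 2**63 - 1
    INT64_MIN = -(2**63)

    def checked_int64_add(a: int, b: int) -> int:
        # fail loudly instead of silently wrapping around, mirroring the
        # C helpers' nonzero-on-overflow return codes
        result = a + b
        if not INT64_MIN <= result <= INT64_MAX:
            raise OverflowError("int64 addition overflowed")
        return result

    try:
        checked_int64_add(2**62, 2**62)
    except OverflowError as exc:
        print(exc)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 928d253bf3169..3c5854602df53 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -439,7 +439,7 @@ cpdef array_to_datetime( raise TypeError(f"{type(val)} is not convertible to datetime") except (TypeError, OverflowError, ValueError) as ex: - ex.args = (f"{ex}, at position {i}",) + ex.args = (f"{ex}",) if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 0fadbbbed2c72..a635dd33f8420 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -69,6 +69,7 @@ from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timezones cimport ( get_utcoffset, is_utc, + treat_tz_as_pytz, ) from pandas._libs.tslibs.tzconversion cimport ( Localizer, @@ -747,11 +748,17 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz): identically, i.e. discards nanos from Timestamps. It also assumes that the `tz` input is not None. """ - try: + if treat_tz_as_pytz(tz): + import pytz + # datetime.replace with pytz may be incorrect result # TODO: try to respect `fold` attribute - return tz.localize(dt, is_dst=None) - except AttributeError: + try: + return tz.localize(dt, is_dst=None) + except (pytz.AmbiguousTimeError, pytz.NonExistentTimeError) as err: + # As of pandas 3.0, we raise ValueErrors instead of pytz exceptions + raise ValueError(str(err)) from err + else: return dt.replace(tzinfo=tz) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 130e41e5104a2..60afc1acdc297 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -229,7 +229,17 @@ cdef class _NaT(datetime): def to_datetime64(self) -> np.datetime64: """ - Return a numpy.datetime64 object with same precision. + Return a NumPy datetime64 object with same precision. + + This method returns a numpy.datetime64 object with the same + date and time information and precision as the pd.Timestamp object. + + See Also + -------- + numpy.datetime64 : Class to represent dates and times with high precision. + Timestamp.to_numpy : Alias for this method. + Timestamp.asm8 : Alias for this method. + pd.to_datetime : Convert argument to datetime. Examples -------- @@ -244,16 +254,24 @@ def to_numpy(self, dtype=None, copy=False) -> np.datetime64 | np.timedelta64: """ - Convert the Timestamp to a NumPy datetime64 or timedelta64. + Convert the Timestamp to a NumPy datetime64. - With the default 'dtype', this is an alias method for `NaT.to_datetime64()`. - - The copy parameter is available here only for compatibility. Its value + This is an alias method for `Timestamp.to_datetime64()`. The dtype and + copy parameters are available here only for compatibility. Their values will not affect the return value. + Parameters + ---------- + dtype : dtype, optional + Data type of the output, ignored in this method as the return type + is always `numpy.datetime64`. + copy : bool, default False + Whether to ensure that the returned value is a new object. This + parameter is also ignored as the method does not support copying. 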
+ Returns ------- - numpy.datetime64 or numpy.timedelta64 + numpy.datetime64 See Also -------- @@ -269,9 +287,6 @@ cdef class _NaT(datetime): >>> pd.NaT.to_numpy() numpy.datetime64('NaT') - - >>> pd.NaT.to_numpy("m8[ns]") - numpy.timedelta64('NaT','ns') """ if dtype is not None: # GH#44460 @@ -333,6 +348,22 @@ class NaTType(_NaT): """ (N)ot-(A)-(T)ime, the time equivalent of NaN. + NaT is used to denote missing or null values in datetime and timedelta objects + in pandas. It functions similarly to how NaN is used for numerical data. + Operations with NaT will generally propagate the NaT value, similar to NaN. + NaT can be used in pandas data structures like Series and DataFrame + to represent missing datetime values. It is useful in data analysis + and time series analysis when working with incomplete or sparse + time-based data. Pandas provides robust handling of NaT to ensure + consistency and reliability in computations involving datetime objects. + + See Also + -------- + NA : NA ("not available") missing value indicator. + isna : Detect missing values (NaN or NaT) in an array-like object. + notna : Detect non-missing values. + numpy.nan : Floating point representation of Not a Number (NaN) for numerical data. + Examples -------- >>> pd.DataFrame([pd.Timestamp("2023"), np.nan], columns=["col_1"]) @@ -476,6 +507,11 @@ class NaTType(_NaT): """ Return the month name of the Timestamp with specified locale. + This method returns the full name of the month corresponding to the + `Timestamp`, such as 'January', 'February', etc. The month name can + be returned in a specified locale if provided; otherwise, it defaults + to the English locale. + Parameters ---------- locale : str, default None (English locale) @@ -484,9 +520,18 @@ class NaTType(_NaT): Returns ------- str + The full month name as a string. + + See Also + -------- + Timestamp.day_name : Returns the name of the day of the week. + Timestamp.strftime : Returns a formatted string of the Timestamp. + datetime.datetime.strftime : Returns a string representing the date and time. Examples -------- + Get the month name in English (default): + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> ts.month_name() 'March' @@ -581,10 +626,25 @@ class NaTType(_NaT): date = _make_nat_func( "date", """ - Return date object with same year, month and day. + Returns `datetime.date` with the same year, month, and day. + + This method extracts the date component from the `Timestamp` and returns + it as a `datetime.date` object, discarding the time information. + + Returns + ------- + datetime.date + The date part of the `Timestamp`. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.date : Extract the date component from a `datetime` object. Examples -------- + Extract the date from a Timestamp: + >>> ts = pd.Timestamp('2023-01-01 10:00:00.00') >>> ts Timestamp('2023-01-01 10:00:00') @@ -595,7 +655,24 @@ class NaTType(_NaT): utctimetuple = _make_error_func( "utctimetuple", """ - Return UTC time tuple, compatible with time.localtime(). + Return UTC time tuple, compatible with `time.localtime()`. + + This method converts the Timestamp to UTC and returns a time tuple + containing 9 components: year, month, day, hour, minute, second, + weekday, day of year, and DST flag. This is particularly useful for + converting a Timestamp to a format compatible with time module functions. + + Returns + ------- + time.struct_time + A time.struct_time object representing the UTC time. 
+ + See Also + -------- + datetime.datetime.utctimetuple : + Return UTC time tuple, compatible with time.localtime(). + Timestamp.timetuple : Return time tuple of local time. + time.struct_time : Time tuple structure used by time functions. Examples -------- @@ -612,6 +689,22 @@ class NaTType(_NaT): """ Return utc offset. + This method returns the difference between UTC and the local time + as a `timedelta` object. It is useful for understanding the time + difference between the current timezone and UTC. + + Returns + -------- + timedelta + The difference between UTC and the local time as a `timedelta` object. + + See Also + -------- + datetime.datetime.utcoffset : + Standard library method to get the UTC offset of a datetime object. + Timestamp.tzname : Return the name of the timezone. + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -626,6 +719,13 @@ class NaTType(_NaT): """ Return time zone name. + This method returns the name of the Timestamp's time zone as a string. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -664,6 +764,17 @@ class NaTType(_NaT): """ Return time tuple, compatible with time.localtime(). + This method converts the `Timestamp` into a time tuple, which is compatible + with functions like `time.localtime()`. The time tuple is a named tuple with + attributes such as year, month, day, hour, minute, second, weekday, + day of the year, and daylight savings indicator. + + See Also + -------- + time.localtime : Converts a POSIX timestamp into a time tuple. + Timestamp : The `Timestamp` that represents a specific point in time. + datetime.datetime.timetuple : Equivalent method in the `datetime` module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -679,6 +790,19 @@ class NaTType(_NaT): """ Return time object with same time and tzinfo. + This method returns a datetime.time object with + the time and tzinfo corresponding to the pd.Timestamp + object, ignoring any information about the day/date. + + See Also + -------- + datetime.datetime.timetz : Return datetime.time object with the + same time attributes as the datetime object. + datetime.time : Class to represent the time of day, independent + of any particular day. + datetime.datetime.tzinfo : Attribute of datetime.datetime objects + representing the timezone of the datetime object. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -693,6 +817,17 @@ class NaTType(_NaT): """ Return proleptic Gregorian ordinal. January 1 of year 1 is day 1. + The proleptic Gregorian ordinal is a continuous count of days since + January 1 of year 1, which is considered day 1. This method converts + the `Timestamp` to its equivalent ordinal number, useful for date arithmetic + and comparison operations. + + See Also + -------- + datetime.datetime.toordinal : Equivalent method in the `datetime` module. + Timestamp : The `Timestamp` that represents a specific point in time. + Timestamp.fromordinal : Create a `Timestamp` from an ordinal. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:50') @@ -705,7 +840,25 @@ class NaTType(_NaT): ctime = _make_error_func( "ctime", """ - Return ctime() style string. 
+ Return a ctime() style string representing the Timestamp. + + This method returns a string representing the date and time + in the format returned by the standard library's `time.ctime()` + function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'. + + If the `Timestamp` is outside the range supported by Python's + standard library, a `NotImplementedError` is raised. + + Returns + ------- + str + A string representing the Timestamp in ctime format. + + See Also + -------- + time.ctime : Return a string representing time in ctime format. + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.ctime : Return a ctime style string from a datetime object. Examples -------- @@ -746,9 +899,27 @@ class NaTType(_NaT): strptime = _make_error_func( "strptime", """ - Timestamp.strptime(string, format) + Convert string argument to datetime. + + This method is not implemented; calling it will raise NotImplementedError. + Use pd.to_datetime() instead. - Function is not implemented. Use pd.to_datetime(). + Parameters + ---------- + date_string : str + String to convert to a datetime. + format : str, default None + The format string to parse time, e.g. "%d/%m/%Y". + + See Also + -------- + pd.to_datetime : Convert argument to datetime. + datetime.datetime.strptime : Return a datetime corresponding to a string + representing a date and time, parsed according to a separate + format string. + datetime.datetime.strftime : Return a string representing the date and + time, controlled by an explicit format string. + Timestamp.isoformat : Return the time formatted according to ISO 8601. Examples -------- @@ -765,6 +936,21 @@ class NaTType(_NaT): Construct a timezone-aware UTC datetime from a POSIX timestamp. + This method creates a datetime object from a POSIX timestamp, keeping the + Timestamp object's timezone. + + Parameters + ---------- + ts : float + POSIX timestamp. + + See Also + -------- + Timezone.tzname : Return time zone name. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local + time from POSIX timestamp. + Notes ----- Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp @@ -779,16 +965,43 @@ class NaTType(_NaT): fromtimestamp = _make_error_func( "fromtimestamp", """ - Timestamp.fromtimestamp(ts) + Create a `Timestamp` object from a POSIX timestamp. + + This method converts a POSIX timestamp (the number of seconds since + January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting + `Timestamp` can be localized to a specific time zone if provided. - Transform timestamp[, tz] to tz's local time from POSIX timestamp. + Parameters + ---------- + ts : float + The POSIX timestamp to convert, representing seconds since + the epoch (1970-01-01 00:00:00 UTC). + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional + Time zone for the `Timestamp`. If not provided, the `Timestamp` will + be timezone-naive (i.e., without time zone information). + + Returns + ------- + Timestamp + A `Timestamp` object representing the given POSIX timestamp. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp. 
Examples -------- + Convert a POSIX timestamp to a `Timestamp`: + >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP Timestamp('2020-03-14 15:32:52') - Note that the output may change depending on your local time. + Note that the output may change depending on your local time and time zone: + + >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP + Timestamp('2020-03-14 15:32:52+0000', tz='UTC') """, ) combine = _make_error_func( "combine", """ Timestamp.combine(date, time) - Combine date, time into datetime with same date and time fields. + Combine a date and time into a single Timestamp object. + + This method takes a `date` object and a `time` object + and combines them into a single `Timestamp` + that has the same date and time fields. + + Parameters + ---------- + date : datetime.date + The date part of the Timestamp. + time : datetime.time + The time part of the Timestamp. + + Returns + ------- + Timestamp + A new `Timestamp` object representing the combined date and time. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. Examples -------- @@ -836,6 +1070,23 @@ """ Return POSIX timestamp as float. + This method converts the `Timestamp` object to a POSIX timestamp, which is + the number of seconds since the Unix epoch (January 1, 1970). The returned + value is a floating-point number, where the integer part represents the + seconds, and the fractional part represents the microseconds. + + Returns + ------- + float + The POSIX timestamp representation of the `Timestamp` object. + + See Also + -------- + Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp. + datetime.datetime.timestamp : Equivalent method from the `datetime` module. + Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object. + Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') @@ -907,6 +1158,11 @@ """ Construct a timestamp from a proleptic Gregorian ordinal. + This method creates a `Timestamp` object corresponding to the given + proleptic Gregorian ordinal, which is a count of days from January 1, + 0001 (using the proleptic Gregorian calendar). The time part of the + `Timestamp` is set to midnight (00:00:00) by default. + Parameters ---------- ordinal : int tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Returns + ------- + Timestamp + A `Timestamp` object representing the specified ordinal date. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + Notes ----- By definition there cannot be any tz info on the ordinal itself. Examples -------- + Convert an ordinal to a `Timestamp`: + >>> pd.Timestamp.fromordinal(737425) Timestamp('2020-01-01 00:00:00') + + Create a `Timestamp` from an ordinal with timezone information: + + >>> pd.Timestamp.fromordinal(737425, tz='UTC') + Timestamp('2020-01-01 00:00:00+0000', tz='UTC') """, ) @@ -1013,6 +1286,12 @@ tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + datetime.datetime.today : Returns the current local date. + Timestamp.now : Returns current time with optional timezone. 
+ Timestamp : A class representing a specific timestamp. + Examples -------- >>> pd.Timestamp.today() # doctest: +SKIP @@ -1045,9 +1324,9 @@ class NaTType(_NaT): * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1058,7 +1337,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1146,9 +1425,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1159,7 +1438,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -1241,9 +1520,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1254,7 +1533,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -1405,9 +1684,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. 
- nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1420,7 +1699,7 @@ default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1432,6 +1711,13 @@ default 'raise' TypeError If the Timestamp is tz-aware and tz is not None. + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a naive timestamp object: @@ -1456,22 +1742,48 @@ default 'raise' """ Implements datetime.replace, handles nanoseconds. + This method creates a new `Timestamp` object by replacing the specified + fields with new values. The new `Timestamp` retains the original fields + that are not explicitly replaced. This method handles nanoseconds, and + the `tzinfo` parameter allows for timezone replacement without conversion. + Parameters ---------- year : int, optional + The year to replace. If `None`, the year is not changed. month : int, optional + The month to replace. If `None`, the month is not changed. day : int, optional + The day to replace. If `None`, the day is not changed. hour : int, optional + The hour to replace. If `None`, the hour is not changed. minute : int, optional + The minute to replace. If `None`, the minute is not changed. second : int, optional + The second to replace. If `None`, the second is not changed. microsecond : int, optional + The microsecond to replace. If `None`, the microsecond is not changed. nanosecond : int, optional + The nanosecond to replace. If `None`, the nanosecond is not changed. tzinfo : tz-convertible, optional + The timezone information to replace. If `None`, the timezone is not changed. fold : int, optional + The fold information to replace. If `None`, the fold is not changed. Returns ------- - Timestamp with fields replaced + Timestamp + A new `Timestamp` object with the specified fields replaced. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + + Notes + ----- + The `replace` method does not perform timezone conversions. If you need + to convert the timezone, use the `tz_convert` method instead. Examples -------- @@ -1509,7 +1821,7 @@ default 'raise' def as_unit(self, str unit, bint round_ok=True) -> "NaTType": """ - Convert the underlying int64 representaton to the given unit. + Convert the underlying int64 representation to the given unit. 
Parameters ---------- diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index cb2658d343772..43240046c6500 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -34,7 +34,7 @@ cdef extern from "numpy/ndarraytypes.h": NPY_FR_as NPY_FR_GENERIC - int64_t NPY_DATETIME_NAT # elswhere we call this NPY_NAT + int64_t NPY_DATETIME_NAT # elsewhere we call this NPY_NAT cdef extern from "pandas/datetime/pd_datetime.h": diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 554c4f109f1c5..043c029ec900c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -595,6 +595,24 @@ cdef class BaseOffset: @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Hour().rule_code + 'h' + + >>> pd.offsets.Week(5).rule_code + 'W' + """ return self._prefix @cache_readonly @@ -602,6 +620,17 @@ cdef class BaseOffset: """ Return a string representing the frequency. + See Also + -------- + tseries.offsets.BusinessDay.freqstr : + Return a string representing an offset frequency in Business Days. + tseries.offsets.BusinessHour.freqstr : + Return a string representing an offset frequency in Business Hours. + tseries.offsets.Week.freqstr : + Return a string representing an offset frequency in Weeks. + tseries.offsets.Hour.freqstr : + Return a string representing an offset frequency in Hours. + Examples -------- >>> pd.DateOffset(5).freqstr @@ -763,7 +792,7 @@ cdef class BaseOffset: def __getstate__(self): """ - Return a pickleable state + Return a picklable state """ state = {} state["n"] = self.n @@ -779,6 +808,26 @@ cdef class BaseOffset: @property def nanos(self): + """ + Returns an integer of the total number of nanoseconds for fixed frequencies. + + Raises + ------ + ValueError + If the frequency is non-fixed. + + See Also + -------- + tseries.offsets.Hour.nanos : + Returns an integer of the total number of nanoseconds. + tseries.offsets.Day.nanos : + Returns an integer of the total number of nanoseconds. + + Examples + -------- + >>> pd.offsets.Week(n=1).nanos + ValueError: Week: weekday=None is a non-fixed frequency + """ raise ValueError(f"{self} is a non-fixed frequency") # ------------------------------------------------------------------ @@ -986,12 +1035,14 @@ cdef class Tick(SingleConstructorOffset): @property def nanos(self) -> int64_t: """ - Return an integer of the total number of nanoseconds. + Returns an integer of the total number of nanoseconds. - Raises - ------ - ValueError - If the frequency is non-fixed. + See Also + -------- + tseries.offsets.Hour.nanos : + Returns an integer of the total number of nanoseconds. + tseries.offsets.Day.nanos : + Returns an integer of the total number of nanoseconds. Examples -------- @@ -1147,7 +1198,7 @@ cdef class Hour(Tick): """ Offset ``n`` hours. - Parameters + Attributes ---------- n : int, default 1 The number of hours represented. @@ -1183,7 +1234,7 @@ cdef class Minute(Tick): """ Offset ``n`` minutes. - Parameters + Attributes ---------- n : int, default 1 The number of minutes represented. @@ -1219,7 +1270,7 @@ cdef class Second(Tick): """ Offset ``n`` seconds. 
- Parameters + Attributes ---------- n : int, default 1 The number of seconds represented. @@ -1255,7 +1306,7 @@ cdef class Milli(Tick): """ Offset ``n`` milliseconds. - Parameters + Attributes ---------- n : int, default 1 The number of milliseconds represented. @@ -1292,7 +1343,7 @@ cdef class Micro(Tick): """ Offset ``n`` microseconds. - Parameters + Attributes ---------- n : int, default 1 The number of microseconds represented. @@ -1329,7 +1380,7 @@ cdef class Nano(Tick): """ Offset ``n`` nanoseconds. - Parameters + Attributes ---------- n : int, default 1 The number of nanoseconds represented. @@ -1405,7 +1456,7 @@ cdef class RelativeDeltaOffset(BaseOffset): def __getstate__(self): """ - Return a pickleable state + Return a picklable state """ # RelativeDeltaOffset (technically DateOffset) is the only non-cdef # class, so the only one with __dict__ @@ -1616,7 +1667,7 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): Besides, adding a DateOffsets specified by the singular form of the date component can be used to replace certain component of the timestamp. - Parameters + Attributes ---------- n : int, default 1 The number of time periods the offset represents. @@ -2426,6 +2477,24 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Week(5).rule_code + 'W' + + >>> pd.offsets.WeekOfMonth(n=1, week=0, weekday=0).rule_code + 'WOM-1MON' + """ weekday = int_to_weekday.get(self.weekday, "") if self.week == -1: # LastWeekOfMonth @@ -2472,6 +2541,24 @@ cdef class YearOffset(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.tseries.offsets.YearBegin(n=1, month=2).rule_code + 'YS-FEB' + + >>> pd.tseries.offsets.YearEnd(n=1, month=6).rule_code + 'YE-JUN' + """ month = MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" @@ -2506,7 +2593,7 @@ cdef class BYearEnd(YearOffset): """ DateOffset increments between the last business day of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2804,7 +2891,7 @@ cdef class BQuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2886,7 +2973,7 @@ cdef class QuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2984,7 +3071,7 @@ cdef class MonthBegin(MonthOffset): MonthBegin goes to the next date which is a start of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. 
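Since the tick classes above are thin wrappers around a count ``n``, adding one to a `Timestamp` is the quickest way to see the semantics (a sketch, not part of the patch itself):

>>> import pandas as pd
>>> ts = pd.Timestamp(2022, 12, 9, 15)
>>> ts + pd.offsets.Second(5)
Timestamp('2022-12-09 15:00:05')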
@@ -3272,7 +3359,7 @@ cdef class SemiMonthBegin(SemiMonthOffset):
     """
     Two DateOffset's per month repeating on the first day of the month & day_of_month.

-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of months represented.
@@ -3304,7 +3391,7 @@ cdef class Week(SingleConstructorOffset):
     """
     Weekly offset.

-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of weeks represented.
@@ -3458,6 +3545,24 @@ cdef class Week(SingleConstructorOffset):

     @property
     def rule_code(self) -> str:
+        """
+        Return a string representing the base frequency.
+
+        See Also
+        --------
+        tseries.offsets.Hour.rule_code :
+            Returns a string representing the base frequency of 'h'.
+        tseries.offsets.Day.rule_code :
+            Returns a string representing the base frequency of 'D'.
+
+        Examples
+        --------
+        >>> pd.offsets.Hour().rule_code
+        'h'
+
+        >>> pd.offsets.Week(5).rule_code
+        'W'
+        """
         suffix = ""
         if self.weekday is not None:
             weekday = int_to_weekday[self.weekday]
@@ -3477,7 +3582,7 @@ cdef class WeekOfMonth(WeekOfMonthMixin):
     """
     Describes monthly dates like "the Tuesday of the 2nd week of each month".

-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of months represented.
@@ -3554,7 +3659,7 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin):

     For example "the last Tuesday of each month".

-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of months represented.
@@ -3694,7 +3799,7 @@ cdef class FY5253(FY5253Mixin):

     X is a specific day of the week.
     Y is a certain month of the year

-    Parameters
+    Attributes
     ----------
     n : int
         The number of fiscal years represented.
@@ -3897,7 +4002,7 @@ cdef class FY5253Quarter(FY5253Mixin):

     startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
     startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...

-    Parameters
+    Attributes
     ----------
     n : int
         The number of business quarters represented.
@@ -4132,7 +4237,7 @@ cdef class Easter(SingleConstructorOffset):

     Right now uses the revised method which is valid in years 1583-4099.

-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of years represented.
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index c6ba97fe9f1a2..c563ab91c4142 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1913,20 +1913,58 @@ cdef class _Period(PeriodMixin):
         Parameters
         ----------
         freq : str, BaseOffset
-            The desired frequency. If passing a `str`, it needs to be a
-            valid :ref:`period alias <timeseries.period_aliases>`.
+            The target frequency to convert the Period object to.
+            If a string is provided,
+            it must be a valid :ref:`period alias <timeseries.period_aliases>`.
+
         how : {'E', 'S', 'end', 'start'}, default 'end'
-            Start or end of the timespan.
+            Specifies whether to align the period to the start or end of the interval:
+            - 'E' or 'end': Align to the end of the interval.
+            - 'S' or 'start': Align to the start of the interval.

         Returns
         -------
-        resampled : Period
+        Period : Period object with the specified frequency, aligned to the start
+            or end of the interval as specified by `how`.
+
+        See Also
+        --------
+        Period.end_time : Return the end Timestamp.
+        Period.start_time : Return the start Timestamp.
+        Period.dayofyear : Return the day of the year.
+        Period.dayofweek : Return the day of the week.
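The anchored variant of `Week.rule_code` documented above appends a weekday suffix; a minimal sketch (assuming the current weekday aliases):

>>> import pandas as pd
>>> pd.offsets.Week(weekday=0).rule_code
'W-MON'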
         Examples
         --------
-        >>> period = pd.Period('2023-1-1', freq='D')
+        Convert a daily period to an hourly period, aligning to the end of the day:
+
+        >>> period = pd.Period('2023-01-01', freq='D')
         >>> period.asfreq('h')
         Period('2023-01-01 23:00', 'h')
+
+        Convert a monthly period to a daily period, aligning to the start of the month:
+
+        >>> period = pd.Period('2023-01', freq='M')
+        >>> period.asfreq('D', how='start')
+        Period('2023-01-01', 'D')
+
+        Convert a yearly period to a monthly period, aligning to the last month:
+
+        >>> period = pd.Period('2023', freq='Y')
+        >>> period.asfreq('M', how='end')
+        Period('2023-12', 'M')
+
+        Convert a monthly period to an hourly period,
+        aligning to the first day of the month:
+
+        >>> period = pd.Period('2023-01', freq='M')
+        >>> period.asfreq('h', how='start')
+        Period('2023-01-01 00:00', 'h')
+
+        Convert a weekly period to a daily period, aligning to the last day of the week:
+
+        >>> period = pd.Period('2023-08-01', freq='W')
+        >>> period.asfreq('D', how='end')
+        Period('2023-08-06', 'D')
         """
         freq = self._maybe_convert_freq(freq)
         how = validate_end_alias(how)
@@ -1963,6 +2001,12 @@ cdef class _Period(PeriodMixin):
         -------
         Timestamp

+        See Also
+        --------
+        Timestamp : A class representing a single point in time.
+        Period : Represents a span of time with a fixed frequency.
+        PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`.
+
         Examples
         --------
         >>> period = pd.Period('2023-1-1', freq='D')
@@ -2000,11 +2044,44 @@ cdef class _Period(PeriodMixin):
         """
         Return the year this Period falls on.

+        Returns
+        -------
+        int
+
+        See Also
+        --------
+        Period.month : Get the month of the year for the given Period.
+        Period.day : Return the day of the month the Period falls on.
+
+        Notes
+        -----
+        The year is based on the `ordinal` and `base` attributes of the Period.
+
         Examples
         --------
-        >>> period = pd.Period('2022-01', 'M')
+        Create a Period object for January 2023 and get the year:
+
+        >>> period = pd.Period('2023-01', 'M')
+        >>> period.year
+        2023
+
+        Create a Period object for January 1, 2023 and get the year:
+
+        >>> period = pd.Period('2023', 'D')
+        >>> period.year
+        2023
+
+        Get the year for a period representing a quarter:
+
+        >>> period = pd.Period('2023Q2', 'Q')
         >>> period.year
-        2022
+        2023
+
+        Handle the case where the Period is missing (NaT), which results in `NaN`:
+
+        >>> period = pd.Period('nan', 'M')
+        >>> period.year
+        nan
         """
         base = self._dtype._dtype_code
         return pyear(self.ordinal, base)
@@ -2014,11 +2091,45 @@ cdef class _Period(PeriodMixin):
         """
         Return the month this Period falls on.

+        Returns
+        -------
+        int
+
+        See Also
+        --------
+        Period.week : Get the week of the year for the given Period.
+        Period.year : Return the year this Period falls on.
+        Period.day : Return the day of the month this Period falls on.
+
+        Notes
+        -----
+        The month is based on the `ordinal` and `base` attributes of the Period.
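The note above about `ordinal` and `base` can be made concrete: for monthly frequency, the ordinal counts months elapsed since January 1970 (a sketch; `ordinal` is an implementation detail, not a public contract):

>>> import pandas as pd
>>> pd.Period('2023-06', 'M').ordinal  # (2023 - 1970) * 12 + 5
641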
+
         Examples
         --------
+        Create a Period object for January 2022 and get the month:
+
         >>> period = pd.Period('2022-01', 'M')
         >>> period.month
         1
+
+        For a yearly period, the month is the month the period ends on (December):
+
+        >>> period = pd.Period('2022', 'Y')
+        >>> period.month
+        12
+
+        Create a Period object from an incomplete date string; the month defaults
+        to January:
+
+        >>> period = pd.Period('2022', 'M')
+        >>> period.month
+        1
+
+        Handle the case where the Period is missing (NaT), which results in `NaN`:
+
+        >>> period = pd.Period('nan', 'M')
+        >>> period.month
+        nan
         """
         base = self._dtype._dtype_code
         return pmonth(self.ordinal, base)
@@ -2502,6 +2613,17 @@ cdef class _Period(PeriodMixin):
         """
         Return a string representation of the frequency.

+        This property provides the frequency string associated with the `Period`
+        object. The frequency string describes the granularity of the time span
+        represented by the `Period`. Common frequency strings include 'D' for
+        daily, 'M' for monthly, 'Y' for yearly, etc.
+
+        See Also
+        --------
+        Period.asfreq : Convert Period to desired frequency, at the start or end
+            of the interval.
+        period_range : Return a fixed frequency PeriodIndex.
+
         Examples
         --------
         >>> pd.Period('2020-01', 'D').freqstr
@@ -2639,6 +2761,27 @@ cdef class _Period(PeriodMixin):
         | ``%%``    | A literal ``'%'`` character.   |       |
         +-----------+--------------------------------+-------+

+        The `strftime` method provides a way to represent a :class:`Period`
+        object as a string in a specified format. This is particularly useful
+        when displaying date and time data in different locales or customized
+        formats, suitable for reports or user interfaces. It extends the standard
+        Python string formatting capabilities with additional directives specific
+        to `pandas`, accommodating features like fiscal years and precise
+        sub-second components.
+
+        Parameters
+        ----------
+        fmt : str or None
+            String containing the desired format directives. If ``None``, the
+            format is determined based on the Period's frequency.
+
+        See Also
+        --------
+        Timestamp.strftime : Return a formatted string of the Timestamp.
+        to_datetime : Convert argument to datetime.
+        time.strftime : Format a time object as a string according to a
+            specified format string in the standard Python library.
+
         Notes
         -----
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index 43279051e2a30..ed784b6f5ab22 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -16,6 +16,7 @@ FUNCTIONS:
     strptime -- Calculates the time struct represented by the passed-in string
 """
 from datetime import timezone
+import zoneinfo

 from cpython.datetime cimport (
     PyDate_Check,
@@ -38,7 +39,6 @@ from _thread import allocate_lock as _thread_allocate_lock
 import re

 import numpy as np
-import pytz

 cimport numpy as cnp
 from numpy cimport (
@@ -536,7 +536,7 @@ def array_strptime(

             except ValueError as ex:
                 ex.args = (
-                    f"{str(ex)}, at position {i}. You might want to try:\n"
+                    f"{str(ex)}. You might want to try:\n"
                     "    - passing `format` if your strings have a consistent format;\n"
                     "    - passing `format='ISO8601'` if your strings are "
                     "all ISO8601 but not necessarily in exactly the same format;\n"
@@ -747,7 +747,7 @@ cdef tzinfo _parse_with_format(
             week_of_year_start = 0
         elif parse_code == 17:
             # e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z'
-            tz = pytz.timezone(found_dict["Z"])
+            tz = zoneinfo.ZoneInfo(found_dict["Z"])
         elif parse_code == 19:
             # e.g.
val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z' tz = parse_timezone_directive(found_dict["z"]) @@ -837,7 +837,7 @@ class TimeRE(_TimeRE): if key == "Z": # lazy computation if self._Z is None: - self._Z = self.__seqToRE(pytz.all_timezones, "Z") + self._Z = self.__seqToRE(zoneinfo.available_timezones(), "Z") # Note: handling Z is the key difference vs using the stdlib # _strptime.TimeRE. test_to_datetime_parse_tzname_or_tzoffset with # fmt='%Y-%m-%d %H:%M:%S %Z' fails with the stdlib version. diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d5348311f19e2..4f90f26cf31ab 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1421,9 +1421,16 @@ cdef class _Timedelta(timedelta): """ Convert the Timedelta to a NumPy timedelta64. - This is an alias method for `Timedelta.to_timedelta64()`. The dtype and - copy parameters are available here only for compatibility. Their values - will not affect the return value. + This is an alias method for `Timedelta.to_timedelta64()`. + + Parameters + ---------- + dtype : NoneType + It is available here only for compatibility. Its value will not + affect the return value. + copy : bool, default False + It is available here only for compatibility. Its value will not + affect the return value. Returns ------- @@ -1451,11 +1458,26 @@ cdef class _Timedelta(timedelta): """ Array view compatibility. + This method allows you to reinterpret the underlying data of a Timedelta + object as a different dtype. The `view` method provides a way to reinterpret + the internal representation of the `Timedelta` object without modifying its + data. This is particularly useful when you need to work with the underlying + data directly, such as for performance optimizations or interfacing with + low-level APIs. The returned value is typically the number of nanoseconds + since the epoch, represented as an integer or another specified dtype. + Parameters ---------- dtype : str or dtype The dtype to view the underlying data as. + See Also + -------- + numpy.ndarray.view : Returns a view of an array with the same data. + Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. + Timedelta.total_seconds : Returns the total duration of the Timedelta + object in seconds. + Examples -------- >>> td = pd.Timedelta('3D') @@ -1498,6 +1520,12 @@ cdef class _Timedelta(timedelta): numpy timedelta64 array scalar view Array scalar view of the timedelta in nanoseconds. + See Also + -------- + Timedelta.total_seconds : Return the total seconds in the duration. + Timedelta.components : Return a namedtuple of the Timedelta's components. + Timedelta.to_timedelta64 : Convert the Timedelta to a numpy.timedelta64. + Examples -------- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') @@ -2061,6 +2089,12 @@ class Timedelta(_Timedelta): ------ ValueError if the freq cannot be converted + See Also + -------- + Timedelta.floor : Floor the Timedelta to the specified resolution. + Timedelta.round : Round the Timedelta to the nearest specified resolution. + Timestamp.ceil : Similar method for Timestamp objects. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -2081,6 +2115,16 @@ class Timedelta(_Timedelta): Frequency string indicating the flooring resolution. It uses the same units as class constructor :class:`~pandas.Timedelta`. + Returns + ------- + Timedelta + A new Timedelta object floored to the specified resolution. 
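To contrast `floor` with `ceil` as documented above, using the same value as the surrounding examples (a sketch):

>>> import pandas as pd
>>> td = pd.Timedelta('1001ms')
>>> td.floor('s')
Timedelta('0 days 00:00:01')
>>> td.ceil('s')
Timedelta('0 days 00:00:02')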
+
+        See Also
+        --------
+        Timedelta.ceil : Round the Timedelta up to the nearest specified resolution.
+        Timedelta.round : Round the Timedelta to the nearest specified resolution.
+
         Examples
         --------
         >>> td = pd.Timedelta('1001ms')
@@ -2101,6 +2145,16 @@ class Timedelta(_Timedelta):
             Frequency string indicating the ceiling resolution.
             It uses the same units as class constructor :class:`~pandas.Timedelta`.

+        Returns
+        -------
+        Timedelta
+            A new Timedelta object rounded up to the specified resolution.
+
+        See Also
+        --------
+        Timedelta.floor : Floor the Timedelta to the specified resolution.
+        Timedelta.round : Round the Timedelta to the nearest specified resolution.
+
         Examples
         --------
         >>> td = pd.Timedelta('1001ms')
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 369184d9df40c..34c84d396ad64 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -240,6 +240,27 @@ cdef class _Timestamp(ABCTimestamp):

     @property
     def value(self) -> int:
+        """
+        Return the value of the Timestamp.
+
+        Returns
+        -------
+        int
+            The integer representation of the Timestamp object in nanoseconds
+            since the Unix epoch (1970-01-01 00:00:00 UTC).
+
+        See Also
+        --------
+        Timestamp.second : Return the second of the Timestamp.
+        Timestamp.minute : Return the minute of the Timestamp.
+
+        Examples
+        --------
+        >>> ts = pd.Timestamp("2024-08-31 16:16:30")
+        >>> ts.value
+        1725120990000000000
+        """
+
         try:
             return convert_reso(self._value, self._creso, NPY_FR_ns, False)
         except OverflowError:
@@ -254,6 +275,28 @@ cdef class _Timestamp(ABCTimestamp):
         """
         The abbreviation associated with self._creso.

+        This property returns a string representing the time unit of the Timestamp's
+        resolution. It corresponds to the smallest time unit that can be represented
+        by this Timestamp object. The possible values are:
+        - 's' (second)
+        - 'ms' (millisecond)
+        - 'us' (microsecond)
+        - 'ns' (nanosecond)
+
+        Returns
+        -------
+        str
+            A string abbreviation of the Timestamp's resolution unit:
+            - 's' for second
+            - 'ms' for millisecond
+            - 'us' for microsecond
+            - 'ns' for nanosecond
+
+        See Also
+        --------
+        Timestamp.resolution : Return resolution of the Timestamp.
+        Timedelta : A duration expressing the difference between two dates or times.
+
         Examples
         --------
         >>> pd.Timestamp("2020-01-01 12:34:56").unit
@@ -299,7 +342,7 @@ cdef class _Timestamp(ABCTimestamp):
     def _from_dt64(cls, dt64: np.datetime64):
         # construct a Timestamp from a np.datetime64 object, keeping the
         # resolution of the input.
-        # This is herely mainly so we can incrementally implement non-nano
+        # This is here mainly so we can incrementally implement non-nano
         # (e.g. only tznaive at first)
         cdef:
             int64_t value
@@ -793,6 +836,11 @@ cdef class _Timestamp(ABCTimestamp):
         """
         Return the month name of the Timestamp with specified locale.

+        This method returns the full name of the month corresponding to the
+        `Timestamp`, such as 'January', 'February', etc. The month name can
+        be returned in a specified locale if provided; otherwise, it defaults
+        to the English locale.
+
         Parameters
         ----------
         locale : str, default None (English locale)
@@ -801,9 +849,18 @@ cdef class _Timestamp(ABCTimestamp):
         Returns
         -------
         str
+            The full month name as a string.
+
+        See Also
+        --------
+        Timestamp.day_name : Returns the name of the day of the week.
+        Timestamp.strftime : Returns a formatted string of the Timestamp.
+        datetime.datetime.strftime : Returns a string representing the date and time.
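The relationship between `value` and `unit` documented above, in one short sketch (integer input is interpreted as nanoseconds since the epoch):

>>> import pandas as pd
>>> ts = pd.Timestamp(1_000_000_000)
>>> ts
Timestamp('1970-01-01 00:00:01')
>>> ts.value
1000000000
>>> ts.unit
'ns'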
Examples -------- + Get the month name in English (default): + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> ts.month_name() 'March' @@ -890,20 +947,203 @@ cdef class _Timestamp(ABCTimestamp): @property def quarter(self) -> int: """ - Return the quarter of the year. + Return the quarter of the year for the `Timestamp`. + + This property returns an integer representing the quarter of the year in + which the `Timestamp` falls. The quarters are defined as follows: + - Q1: January 1 to March 31 + - Q2: April 1 to June 30 + - Q3: July 1 to September 30 + - Q4: October 1 to December 31 Returns ------- int + The quarter of the year (1 through 4). + + See Also + -------- + Timestamp.month : Returns the month of the `Timestamp`. + Timestamp.year : Returns the year of the `Timestamp`. Examples -------- + Get the quarter for a `Timestamp`: + >>> ts = pd.Timestamp(2020, 3, 14) >>> ts.quarter 1 + + For a `Timestamp` in the fourth quarter: + + >>> ts = pd.Timestamp(2020, 10, 14) + >>> ts.quarter + 4 """ return ((self.month - 1) // 3) + 1 + @property + def day(self) -> int: + """ + Return the day of the Timestamp. + + Returns + ------- + int + The day of the Timestamp. + + See Also + -------- + Timestamp.week : Return the week number of the year. + Timestamp.weekday : Return the day of the week. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.day + 31 + """ + return super().day + + @property + def fold(self) -> int: + """ + Return the fold value of the Timestamp. + + Returns + ------- + int + The fold value of the Timestamp, where 0 indicates the first occurrence + of the ambiguous time, and 1 indicates the second. + + See Also + -------- + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Timestamp.tzinfo : Return the timezone information associated. + + Examples + -------- + >>> ts = pd.Timestamp("2024-11-03 01:30:00") + >>> ts.fold + 0 + """ + return super().fold + + @property + def month(self) -> int: + """ + Return the month of the Timestamp. + + Returns + ------- + int + The month of the Timestamp. + + See Also + -------- + Timestamp.day : Return the day of the Timestamp. + Timestamp.year : Return the year of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.month + 8 + """ + return super().month + + @property + def hour(self) -> int: + """ + Return the hour of the Timestamp. + + Returns + ------- + int + The hour of the Timestamp. + + See Also + -------- + Timestamp.minute : Return the minute of the Timestamp. + Timestamp.second : Return the second of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.hour + 16 + """ + return super().hour + + @property + def minute(self) -> int: + """ + Return the minute of the Timestamp. + + Returns + ------- + int + The minute of the Timestamp. + + See Also + -------- + Timestamp.hour : Return the hour of the Timestamp. + Timestamp.second : Return the second of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.minute + 16 + """ + return super().minute + + @property + def second(self) -> int: + """ + Return the second of the Timestamp. + + Returns + ------- + int + The second of the Timestamp. + + See Also + -------- + Timestamp.microsecond : Return the microsecond of the Timestamp. + Timestamp.minute : Return the minute of the Timestamp. 
+
+        Examples
+        --------
+        >>> ts = pd.Timestamp("2024-08-31 16:16:30")
+        >>> ts.second
+        30
+        """
+        return super().second
+
+    @property
+    def microsecond(self) -> int:
+        """
+        Return the microsecond of the Timestamp.
+
+        Returns
+        -------
+        int
+            The microsecond of the Timestamp.
+
+        See Also
+        --------
+        Timestamp.second : Return the second of the Timestamp.
+        Timestamp.minute : Return the minute of the Timestamp.
+
+        Examples
+        --------
+        >>> ts = pd.Timestamp("2024-08-31 16:16:30.2304")
+        >>> ts.microsecond
+        230400
+        """
+        return super().microsecond
+
     @property
     def week(self) -> int:
         """
@@ -955,6 +1195,21 @@ cdef class _Timestamp(ABCTimestamp):
         """
         Normalize Timestamp to midnight, preserving tz information.

+        This method sets the time component of the `Timestamp` to midnight (00:00:00),
+        while preserving the date and time zone information. It is useful when you
+        need to standardize the time across different `Timestamp` objects without
+        altering the time zone or the date.
+
+        Returns
+        -------
+        Timestamp
+
+        See Also
+        --------
+        Timestamp.floor : Rounds `Timestamp` down to the nearest frequency.
+        Timestamp.ceil : Rounds `Timestamp` up to the nearest frequency.
+        Timestamp.round : Rounds `Timestamp` to the nearest frequency.
+
         Examples
         --------
         >>> ts = pd.Timestamp(2020, 3, 14, 15, 30)
@@ -1125,7 +1380,7 @@ cdef class _Timestamp(ABCTimestamp):

     def as_unit(self, str unit, bint round_ok=True):
         """
-        Convert the underlying int64 representaton to the given unit.
+        Convert the underlying int64 representation to the given unit.

         Parameters
         ----------
@@ -1190,6 +1445,23 @@ cdef class _Timestamp(ABCTimestamp):
         """
         Return POSIX timestamp as float.

+        This method converts the `Timestamp` object to a POSIX timestamp, which is
+        the number of seconds since the Unix epoch (January 1, 1970). The returned
+        value is a floating-point number, where the integer part represents the
+        seconds, and the fractional part represents the microseconds.
+
+        Returns
+        -------
+        float
+            The POSIX timestamp representation of the `Timestamp` object.
+
+        See Also
+        --------
+        Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp.
+        datetime.datetime.timestamp : Equivalent method from the `datetime` module.
+        Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object.
+        Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`.
+
         Examples
         --------
         >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
@@ -1253,7 +1525,17 @@ cdef class _Timestamp(ABCTimestamp):

     cpdef to_datetime64(self):
         """
-        Return a numpy.datetime64 object with same precision.
+        Return a NumPy datetime64 object with the same precision.
+
+        This method returns a numpy.datetime64 object with the same
+        date and time information and precision as the pd.Timestamp object.
+
+        See Also
+        --------
+        numpy.datetime64 : Class to represent dates and times with high precision.
+        Timestamp.to_numpy : Alias for this method.
+        Timestamp.asm8 : Alias for this method.
+        pd.to_datetime : Convert argument to datetime.

         Examples
         --------
@@ -1276,6 +1558,15 @@ cdef class _Timestamp(ABCTimestamp):
         copy parameters are available here only for compatibility. Their values
         will not affect the return value.

+        Parameters
+        ----------
+        dtype : dtype, optional
+            Data type of the output, ignored in this method as the return type
+            is always `numpy.datetime64`.
+        copy : bool, default False
+            Whether to ensure that the returned value is a new object. This
+            parameter is also ignored as the method does not support copying.
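A small sketch of the alias relationship described above; `str()` is used here to keep the output stable across NumPy versions:

>>> import pandas as pd
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
>>> str(ts.to_datetime64())
'2020-03-14T15:32:52.192548651'
>>> bool(ts.to_datetime64() == ts.asm8)
True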
+
         Returns
         -------
         numpy.datetime64
@@ -1305,6 +1596,21 @@ cdef class _Timestamp(ABCTimestamp):
         """
         Return an period of which this timestamp is an observation.

+        This method converts the given Timestamp to a Period object,
+        which represents a span of time, such as a year, month, etc.,
+        based on the specified frequency.
+
+        Parameters
+        ----------
+        freq : str, optional
+            Frequency string for the period (e.g., 'Y', 'M', 'W'). Defaults to `None`.
+
+        See Also
+        --------
+        Timestamp : Represents a specific timestamp.
+        Period : Represents a span of time.
+        to_period : Converts an object to a Period.
+
         Examples
         --------
         >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
@@ -1442,6 +1748,11 @@ class Timestamp(_Timestamp):
         """
         Construct a timestamp from a a proleptic Gregorian ordinal.

+        This method creates a `Timestamp` object corresponding to the given
+        proleptic Gregorian ordinal, which is a count of days from January 1,
+        0001 (using the proleptic Gregorian calendar). The time part of the
+        `Timestamp` is set to midnight (00:00:00) by default.
+
         Parameters
         ----------
         ordinal : int
@@ -1449,14 +1760,31 @@ class Timestamp(_Timestamp):
         tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None
             Time zone for the Timestamp.

+        Returns
+        -------
+        Timestamp
+            A `Timestamp` object representing the specified ordinal date.
+
+        See Also
+        --------
+        Timestamp : Represents a single timestamp, similar to `datetime`.
+        to_datetime : Converts various types of data to datetime.
+
         Notes
         -----
         By definition there cannot be any tz info on the ordinal itself.

         Examples
         --------
+        Convert an ordinal to a `Timestamp`:
+
         >>> pd.Timestamp.fromordinal(737425)
         Timestamp('2020-01-01 00:00:00')
+
+        Create a `Timestamp` from an ordinal with timezone information:
+
+        >>> pd.Timestamp.fromordinal(737425, tz='UTC')
+        Timestamp('2020-01-01 00:00:00+0000', tz='UTC')
         """
         return cls(datetime.fromordinal(ordinal), tz=tz)

@@ -1507,6 +1835,12 @@ class Timestamp(_Timestamp):
         tz : str or timezone object, default None
             Timezone to localize to.

+        See Also
+        --------
+        datetime.datetime.today : Returns the current local date.
+        Timestamp.now : Returns current time with optional timezone.
+        Timestamp : A class representing a specific timestamp.
+
         Examples
         --------
         >>> pd.Timestamp.today()  # doctest: +SKIP
@@ -1560,6 +1894,21 @@ class Timestamp(_Timestamp):

         Construct a timezone-aware UTC datetime from a POSIX timestamp.

+        This method creates a datetime object from a POSIX timestamp, keeping the
+        Timestamp object's timezone.
+
+        Parameters
+        ----------
+        ts : float
+            POSIX timestamp.
+
+        See Also
+        --------
+        Timestamp.tzname : Return time zone name.
+        Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
+        Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local
+            time from POSIX timestamp.
+
         Notes
         -----
         Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
@@ -1584,16 +1933,43 @@ class Timestamp(_Timestamp):
     @classmethod
     def fromtimestamp(cls, ts, tz=None):
         """
-        Timestamp.fromtimestamp(ts)
+        Create a `Timestamp` object from a POSIX timestamp.
+
+        This method converts a POSIX timestamp (the number of seconds since
+        January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting
+        `Timestamp` can be localized to a specific time zone if provided.
+
+        Parameters
+        ----------
+        ts : float
+            The POSIX timestamp to convert, representing seconds since
+            the epoch (1970-01-01 00:00:00 UTC).
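The ordinal round trip implied by the docstring above, using the same value as its example (a sketch):

>>> import pandas as pd
>>> pd.Timestamp('2020-01-01').toordinal()
737425
>>> pd.Timestamp.fromordinal(737425)
Timestamp('2020-01-01 00:00:00')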
+        tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional
+            Time zone for the `Timestamp`. If not provided, the `Timestamp` will
+            be timezone-naive (i.e., without time zone information).

-        Transform timestamp[, tz] to tz's local time from POSIX timestamp.
+        Returns
+        -------
+        Timestamp
+            A `Timestamp` object representing the given POSIX timestamp.
+
+        See Also
+        --------
+        Timestamp : Represents a single timestamp, similar to `datetime`.
+        to_datetime : Converts various types of data to datetime.
+        datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp.

         Examples
         --------
+        Convert a POSIX timestamp to a `Timestamp`:
+
         >>> pd.Timestamp.fromtimestamp(1584199972)  # doctest: +SKIP
         Timestamp('2020-03-14 15:32:52')

-        Note that the output may change depending on your local time.
+        Note that the output may change depending on your local time and time zone:
+
+        >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC')  # doctest: +SKIP
+        Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
         """
         tz = maybe_get_tz(tz)
         return cls(datetime.fromtimestamp(ts, tz))
@@ -1636,7 +2012,25 @@ class Timestamp(_Timestamp):

     def ctime(self):
         """
-        Return ctime() style string.
+        Return a ctime() style string representing the Timestamp.
+
+        This method returns a string representing the date and time
+        in the format returned by the standard library's `time.ctime()`
+        function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'.
+
+        If the `Timestamp` is outside the range supported by Python's
+        standard library, a `NotImplementedError` is raised.
+
+        Returns
+        -------
+        str
+            A string representing the Timestamp in ctime format.
+
+        See Also
+        --------
+        time.ctime : Return a string representing time in ctime format.
+        Timestamp : Represents a single timestamp, similar to `datetime`.
+        datetime.datetime.ctime : Return a ctime style string from a datetime object.

         Examples
         --------
@@ -1661,10 +2055,25 @@ class Timestamp(_Timestamp):

     def date(self):
         """
-        Return date object with same year, month and day.
+        Returns `datetime.date` with the same year, month, and day.
+
+        This method extracts the date component from the `Timestamp` and returns
+        it as a `datetime.date` object, discarding the time information.
+
+        Returns
+        -------
+        datetime.date
+            The date part of the `Timestamp`.
+
+        See Also
+        --------
+        Timestamp : Represents a single timestamp, similar to `datetime`.
+        datetime.datetime.date : Extract the date component from a `datetime` object.

         Examples
         --------
+        Extract the date from a Timestamp:
+
         >>> ts = pd.Timestamp('2023-01-01 10:00:00.00')
         >>> ts
         Timestamp('2023-01-01 10:00:00')
@@ -1735,6 +2144,13 @@ class Timestamp(_Timestamp):
         """
         Return time zone name.

+        This method returns the name of the Timestamp's time zone as a string.
+
+        See Also
+        --------
+        Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+        Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+
         Examples
         --------
         >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1749,6 +2165,22 @@ class Timestamp(_Timestamp):
         """
         Return utc offset.

+        This method returns the difference between UTC and the local time
+        as a `timedelta` object. It is useful for understanding the time
+        difference between the current timezone and UTC.
+
+        Returns
+        -------
+        timedelta
+            The difference between UTC and the local time as a `timedelta` object.
+
+        See Also
+        --------
+        datetime.datetime.utcoffset :
+            Standard library method to get the UTC offset of a datetime object.
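A short sketch tying `utcoffset` to `tzname` for a winter date (CET, UTC+1), matching the Brussels examples used throughout:

>>> import pandas as pd
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
>>> ts.utcoffset()
datetime.timedelta(seconds=3600)
>>> ts.tzname()
'CET'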
+ Timestamp.tzname : Return the name of the timezone. + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -1761,7 +2193,24 @@ class Timestamp(_Timestamp): def utctimetuple(self): """ - Return UTC time tuple, compatible with time.localtime(). + Return UTC time tuple, compatible with `time.localtime()`. + + This method converts the Timestamp to UTC and returns a time tuple + containing 9 components: year, month, day, hour, minute, second, + weekday, day of year, and DST flag. This is particularly useful for + converting a Timestamp to a format compatible with time module functions. + + Returns + ------- + time.struct_time + A time.struct_time object representing the UTC time. + + See Also + -------- + datetime.datetime.utctimetuple : + Return UTC time tuple, compatible with time.localtime(). + Timestamp.timetuple : Return time tuple of local time. + time.struct_time : Time tuple structure used by time functions. Examples -------- @@ -1802,6 +2251,17 @@ class Timestamp(_Timestamp): """ Return time tuple, compatible with time.localtime(). + This method converts the `Timestamp` into a time tuple, which is compatible + with functions like `time.localtime()`. The time tuple is a named tuple with + attributes such as year, month, day, hour, minute, second, weekday, + day of the year, and daylight savings indicator. + + See Also + -------- + time.localtime : Converts a POSIX timestamp into a time tuple. + Timestamp : The `Timestamp` that represents a specific point in time. + datetime.datetime.timetuple : Equivalent method in the `datetime` module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -1826,6 +2286,19 @@ class Timestamp(_Timestamp): """ Return time object with same time and tzinfo. + This method returns a datetime.time object with + the time and tzinfo corresponding to the pd.Timestamp + object, ignoring any information about the day/date. + + See Also + -------- + datetime.datetime.timetz : Return datetime.time object with the + same time attributes as the datetime object. + datetime.time : Class to represent the time of day, independent + of any particular day. + datetime.datetime.tzinfo : Attribute of datetime.datetime objects + representing the timezone of the datetime object. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -1840,6 +2313,17 @@ class Timestamp(_Timestamp): """ Return proleptic Gregorian ordinal. January 1 of year 1 is day 1. + The proleptic Gregorian ordinal is a continuous count of days since + January 1 of year 1, which is considered day 1. This method converts + the `Timestamp` to its equivalent ordinal number, useful for date arithmetic + and comparison operations. + + See Also + -------- + datetime.datetime.toordinal : Equivalent method in the `datetime` module. + Timestamp : The `Timestamp` that represents a specific point in time. + Timestamp.fromordinal : Create a `Timestamp` from an ordinal. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:50') @@ -1863,9 +2347,27 @@ class Timestamp(_Timestamp): @classmethod def strptime(cls, date_string, format): """ - Timestamp.strptime(string, format) + Convert string argument to datetime. - Function is not implemented. Use pd.to_datetime(). + This method is not implemented; calling it will raise NotImplementedError. + Use pd.to_datetime() instead. + + Parameters + ---------- + date_string : str + String to convert to a datetime. 
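Because `Timestamp.strptime` raises `NotImplementedError` as documented above, the supported path is `pd.to_datetime` with an explicit format (a sketch):

>>> import pandas as pd
>>> pd.to_datetime('14/03/2020', format='%d/%m/%Y')
Timestamp('2020-03-14 00:00:00')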
+ format : str, default None + The format string to parse time, e.g. "%d/%m/%Y". + + See Also + -------- + pd.to_datetime : Convert argument to datetime. + datetime.datetime.strptime : Return a datetime corresponding to a string + representing a date and time, parsed according to a separate + format string. + datetime.datetime.strftime : Return a string representing the date and + time, controlled by an explicit format string. + Timestamp.isoformat : Return the time formatted according to ISO 8601. Examples -------- @@ -1883,7 +2385,28 @@ class Timestamp(_Timestamp): """ Timestamp.combine(date, time) - Combine date, time into datetime with same date and time fields. + Combine a date and time into a single Timestamp object. + + This method takes a `date` object and a `time` object + and combines them into a single `Timestamp` + that has the same date and time fields. + + Parameters + ---------- + date : datetime.date + The date part of the Timestamp. + time : datetime.time + The time part of the Timestamp. + + Returns + ------- + Timestamp + A new `Timestamp` object representing the combined date and time. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. Examples -------- @@ -2104,9 +2627,9 @@ class Timestamp(_Timestamp): * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2117,7 +2640,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -2207,9 +2730,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2220,7 +2743,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -2302,9 +2825,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. 
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2315,7 +2838,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -2441,9 +2964,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2456,7 +2979,7 @@ default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -2468,6 +2991,13 @@ default 'raise' TypeError If the Timestamp is tz-aware and tz is not None. + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a naive timestamp object: @@ -2603,22 +3133,48 @@ default 'raise' """ Implements datetime.replace, handles nanoseconds. + This method creates a new `Timestamp` object by replacing the specified + fields with new values. The new `Timestamp` retains the original fields + that are not explicitly replaced. This method handles nanoseconds, and + the `tzinfo` parameter allows for timezone replacement without conversion. + Parameters ---------- year : int, optional + The year to replace. If `None`, the year is not changed. month : int, optional + The month to replace. If `None`, the month is not changed. day : int, optional + The day to replace. If `None`, the day is not changed. hour : int, optional + The hour to replace. If `None`, the hour is not changed. minute : int, optional + The minute to replace. If `None`, the minute is not changed. second : int, optional + The second to replace. If `None`, the second is not changed. microsecond : int, optional + The microsecond to replace. If `None`, the microsecond is not changed. nanosecond : int, optional + The nanosecond to replace. If `None`, the nanosecond is not changed. tzinfo : tz-convertible, optional + The timezone information to replace. If `None`, the timezone is not changed. fold : int, optional + The fold information to replace. If `None`, the fold is not changed. Returns ------- - Timestamp with fields replaced + Timestamp + A new `Timestamp` object with the specified fields replaced. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. 
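The `nonexistent` options above, sketched on the spring-forward gap (these examples mirror the existing pandas documentation; with `'raise'`, the new behavior is a ValueError rather than a pytz exception):

>>> import pandas as pd
>>> ts = pd.Timestamp('2015-03-29 02:30:00')
>>> ts.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
Timestamp('2015-03-29 03:00:00+0200', tz='Europe/Warsaw')
>>> ts.tz_localize('Europe/Warsaw', nonexistent='NaT')
NaT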
+ to_datetime : Converts various types of data to datetime. + + Notes + ----- + The `replace` method does not perform timezone conversions. If you need + to convert the timezone, use the `tz_convert` method instead. Examples -------- @@ -2741,7 +3297,14 @@ default 'raise' """ Convert TimeStamp to a Julian Date. - 0 Julian date is noon January 1, 4713 BC. + This method returns the number of days as a float since + 0 Julian date, which is noon January 1, 4713 BC. + + See Also + -------- + Timestamp.toordinal : Return proleptic Gregorian ordinal. + Timestamp.timestamp : Return POSIX timestamp as float. + Timestamp : Represents a single timestamp. Examples -------- diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 6292b6ce0fd1d..36b644ffc826d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -2,17 +2,10 @@ from datetime import ( timedelta, timezone, ) +import zoneinfo from pandas.compat._optional import import_optional_dependency -try: - # py39+ - import zoneinfo - from zoneinfo import ZoneInfo -except ImportError: - zoneinfo = None - ZoneInfo = None - from cpython.datetime cimport ( datetime, timedelta, @@ -28,8 +21,8 @@ from dateutil.tz import ( tzutc as _dateutil_tzutc, ) import numpy as np -import pytz -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + +pytz = import_optional_dependency("pytz", errors="ignore") cimport numpy as cnp from numpy cimport int64_t @@ -45,10 +38,11 @@ from pandas._libs.tslibs.util cimport ( cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc -cdef tzinfo utc_pytz = pytz.utc +cdef tzinfo utc_pytz = pytz.UTC if pytz else None cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc() cdef tzinfo utc_zoneinfo = None +cdef type ZoneInfo = zoneinfo.ZoneInfo # ---------------------------------------------------------------------- @@ -56,13 +50,13 @@ cdef tzinfo utc_zoneinfo = None cdef bint is_utc_zoneinfo(tzinfo tz): # Workaround for cases with missing tzdata # https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025 - if tz is None or zoneinfo is None: + if tz is None: return False global utc_zoneinfo if utc_zoneinfo is None: try: - utc_zoneinfo = ZoneInfo("UTC") + utc_zoneinfo = zoneinfo.ZoneInfo("UTC") except zoneinfo.ZoneInfoNotFoundError: return False # Warn if tzdata is too old, even if there is a system tzdata to alert @@ -74,17 +68,15 @@ cdef bint is_utc_zoneinfo(tzinfo tz): cpdef inline bint is_utc(tzinfo tz): return ( - tz is utc_pytz - or tz is utc_stdlib + tz is utc_stdlib or isinstance(tz, _dateutil_tzutc) or tz is utc_dateutil_str or is_utc_zoneinfo(tz) + or (utc_pytz is not None and tz is utc_pytz) ) cdef bint is_zoneinfo(tzinfo tz): - if ZoneInfo is None: - return False return isinstance(tz, ZoneInfo) @@ -166,7 +158,7 @@ cpdef inline tzinfo maybe_get_tz(object tz): elif tz == "UTC" or tz == "utc": tz = utc_stdlib else: - tz = pytz.timezone(tz) + tz = zoneinfo.ZoneInfo(tz) elif is_integer_object(tz): tz = timezone(timedelta(seconds=tz)) elif isinstance(tz, tzinfo): @@ -205,7 +197,7 @@ cdef object tz_cache_key(tzinfo tz): the same tz file). Also, pytz objects are not always hashable so we use str(tz) instead. 
""" - if isinstance(tz, _pytz_BaseTzInfo): + if pytz is not None and isinstance(tz, pytz.tzinfo.BaseTzInfo): return tz.zone elif isinstance(tz, _dateutil_tzfile): if ".tar.gz" in tz._filename: @@ -239,7 +231,7 @@ cpdef inline bint is_fixed_offset(tzinfo tz): return 1 else: return 0 - elif treat_tz_as_pytz(tz): + elif treat_tz_as_pytz(tz) and pytz is not None: if (len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0): return 1 diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e3facd3d9599b..c100f315e9a19 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -15,7 +15,6 @@ from cython cimport Py_ssize_t import_datetime() import numpy as np -import pytz cimport numpy as cnp from numpy cimport ( @@ -196,8 +195,8 @@ def tz_localize_to_utc( NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns, ): """ - Localize tzinfo-naive i8 to given time zone (using pytz). If - there are ambiguities in the values, raise AmbiguousTimeError. + Localize tzinfo-naive i8 to given time zone. If + there are ambiguities in the values, raise ValueError. Parameters ---------- @@ -368,7 +367,7 @@ timedelta-like} result[i] = NPY_NAT else: stamp = _render_tstamp(val, creso=creso) - raise pytz.AmbiguousTimeError( + raise ValueError( f"Cannot infer dst time from {stamp}, try using the " "'ambiguous' argument" ) @@ -428,7 +427,10 @@ timedelta-like} result[i] = NPY_NAT else: stamp = _render_tstamp(val, creso=creso) - raise pytz.NonExistentTimeError(stamp) + raise ValueError( + f"{stamp} is a nonexistent time due to daylight savings time. " + "Try using the 'nonexistent' argument." + ) return result.base # .base to get underlying ndarray @@ -631,7 +633,7 @@ cdef ndarray[int64_t] _get_dst_hours( if trans_idx.size == 1: # see test_tz_localize_to_utc_ambiguous_infer stamp = _render_tstamp(vals[trans_idx[0]], creso=creso) - raise pytz.AmbiguousTimeError( + raise ValueError( f"Cannot infer dst time from {stamp} as there " "are no repeated times" ) @@ -653,14 +655,16 @@ cdef ndarray[int64_t] _get_dst_hours( if grp.size == 1 or np.all(delta > 0): # see test_tz_localize_to_utc_ambiguous_infer stamp = _render_tstamp(vals[grp[0]], creso=creso) - raise pytz.AmbiguousTimeError(stamp) + raise ValueError( + f"{stamp} is an ambiguous time and cannot be inferred." + ) # Find the index for the switch and pull from a for dst and b # for standard switch_idxs = (delta <= 0).nonzero()[0] if switch_idxs.size > 1: # see test_tz_localize_to_utc_ambiguous_infer - raise pytz.AmbiguousTimeError( + raise ValueError( f"There are {switch_idxs.size} dst switches when " "there should only be 1." ) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 047e8f91df23e..bbd5e60a5a812 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -188,7 +188,7 @@ def assert_index_equal( check_order: bool = True, rtol: float = 1.0e-5, atol: float = 1.0e-8, - obj: str = "Index", + obj: str | None = None, ) -> None: """ Check that left and right Index are equal. @@ -217,7 +217,7 @@ def assert_index_equal( Relative tolerance. Only used when check_exact is False. atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. - obj : str, default 'Index' + obj : str, default 'Index' or 'MultiIndex' Specify object name being compared, internally used to show appropriate assertion message. 
@@ -235,6 +235,9 @@ def assert_index_equal( """ __tracebackhide__ = True + if obj is None: + obj = "MultiIndex" if isinstance(left, MultiIndex) else "Index" + def _check_types(left, right, obj: str = "Index") -> None: if not exact: return @@ -283,7 +286,7 @@ def _check_types(left, right, obj: str = "Index") -> None: right = cast(MultiIndex, right) for level in range(left.nlevels): - lobj = f"MultiIndex level [{level}]" + lobj = f"{obj} level [{level}]" try: # try comparison on levels/codes to avoid densifying MultiIndex assert_index_equal( @@ -314,7 +317,7 @@ def _check_types(left, right, obj: str = "Index") -> None: obj=lobj, ) # get_level_values may change dtype - _check_types(left.levels[level], right.levels[level], obj=obj) + _check_types(left.levels[level], right.levels[level], obj=lobj) # skip exact index checking when `check_categorical` is False elif check_exact and check_categorical: @@ -527,7 +530,7 @@ def assert_interval_array_equal( kwargs["check_freq"] = False assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs) - assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs) + assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs) assert_attr_equal("closed", left, right, obj=obj) diff --git a/pandas/_typing.py b/pandas/_typing.py index d43e6e900546d..c1769126a5776 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -429,7 +429,7 @@ def closed(self) -> bool: SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] NaPosition = Literal["first", "last"] -# Arguments for nsmalles and n_largest +# Arguments for nsmallest and nlargest NsmallestNlargestKeep = Literal["first", "last", "all"] # quantile interpolation @@ -524,7 +524,7 @@ def closed(self) -> bool: None, ] -# maintaine the sub-type of any hashable sequence +# maintain the sub-type of any hashable sequence SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable]) SliceType = Optional[Hashable] diff --git a/pandas/_version.py b/pandas/_version.py index b32c9e67fdbb6..c5e3c16d3f773 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -1,5 +1,5 @@ # This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag +# git-archive tarball (such as those provided by github's download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. 
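The compat changes in the next file add a `pa_version_under18p0` flag following the existing pattern; the usual guarded usage looks like this sketch:

>>> from pandas.compat import pa_version_under18p0
>>> isinstance(pa_version_under18p0, bool)  # defined whether or not pyarrow is installed
True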
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 288559d386a71..756c209661fbb 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -33,6 +33,7 @@ pa_version_under14p1, pa_version_under16p0, pa_version_under17p0, + pa_version_under18p0, ) if TYPE_CHECKING: @@ -157,6 +158,7 @@ def is_ci_environment() -> bool: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "pa_version_under18p0", "HAS_PYARROW", "IS64", "ISMUSL", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 06082e71af32a..6b90389a62056 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -43,6 +43,7 @@ "pyreadstat": "1.2.0", "pytest": "7.3.2", "python-calamine": "0.1.7", + "pytz": "2023.4", "pyxlsb": "1.0.10", "s3fs": "2022.11.0", "scipy": "1.10.0", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index ebfc0d69d9655..bd009b544f31e 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -28,4 +29,5 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + pa_version_under18p0 = True HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 7c485515f0784..d11213f1164bc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -32,7 +32,10 @@ import gc import operator import os -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import uuid from dateutil.tz import ( @@ -43,11 +46,8 @@ from hypothesis import strategies as st import numpy as np import pytest -from pytz import ( - FixedOffset, - utc, -) +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -92,12 +92,7 @@ del pa has_pyarrow = True -import zoneinfo - -try: - zoneinfo.ZoneInfo("UTC") -except zoneinfo.ZoneInfoNotFoundError: - zoneinfo = None # type: ignore[assignment] +pytz = import_optional_dependency("pytz", errors="ignore") # ---------------------------------------------------------------- @@ -1199,19 +1194,19 @@ def deco(*args): "UTC-02:15", tzutc(), tzlocal(), - FixedOffset(300), - FixedOffset(0), - FixedOffset(-300), timezone.utc, timezone(timedelta(hours=1)), timezone(timedelta(hours=-1), name="foo"), ] -if zoneinfo is not None: +if pytz is not None: TIMEZONES.extend( - [ - zoneinfo.ZoneInfo("US/Pacific"), # type: ignore[list-item] - zoneinfo.ZoneInfo("UTC"), # type: ignore[list-item] - ] + ( + pytz.FixedOffset(300), + pytz.FixedOffset(0), + pytz.FixedOffset(-300), + pytz.timezone("US/Pacific"), + pytz.timezone("UTC"), + ) ) TIMEZONE_IDS = [repr(i) for i in TIMEZONES] @@ -1234,9 +1229,10 @@ def tz_aware_fixture(request): return request.param -_UTCS = ["utc", "dateutil/UTC", utc, tzutc(), timezone.utc] -if zoneinfo is not None: - _UTCS.append(zoneinfo.ZoneInfo("UTC")) +_UTCS = ["utc", "dateutil/UTC", tzutc(), timezone.utc] + +if pytz is not None: + _UTCS.append(pytz.utc) @pytest.fixture(params=_UTCS) @@ -2046,12 +2042,12 @@ def using_infer_string() -> bool: return pd.options.future.infer_string is True -warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] -if zoneinfo is not None: - warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] +_warsaws: 
list[Any] = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] +if pytz is not None: + _warsaws.append(pytz.timezone("Europe/Warsaw")) -@pytest.fixture(params=warsaws) +@pytest.fixture(params=_warsaws) def warsaw(request) -> str: """ tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo. diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 948836bf6a51d..56f8adda93251 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1529,9 +1529,7 @@ def safe_sort( order2 = sorter.argsort() if verify: mask = (codes < -len(values)) | (codes >= len(values)) - codes[mask] = 0 - else: - mask = None + codes[mask] = -1 new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) @@ -1540,14 +1538,6 @@ def safe_sort( # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") - if use_na_sentinel: - mask = codes == -1 - if verify: - mask = mask | (codes < -len(values)) | (codes >= len(values)) - - if use_na_sentinel and mask is not None: - np.putmask(new_codes, mask, -1) - return ordered, ensure_platform_int(new_codes) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index b2f78182b9bf0..8a920d1849bb3 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -91,7 +91,7 @@ def quantile_with_mask( if is_empty: # create the array of na_values # 2d len(values) * len(qs) - flat = np.array([fill_value] * len(qs)) + flat = np.full(len(qs), fill_value) result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) else: result = _nanquantile( diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 03c73489bd3d8..f70bb0743aa0f 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -522,7 +522,7 @@ def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwar # so calls DataFrame.min (without ever getting here) with the np.min # default of axis=None, which DataFrame.min catches and changes to axis=0. 
# np.minimum.reduce(df) gets here bc axis is not in kwargs, - # so we set axis=0 to match the behaviorof np.minimum.reduce(df.values) + # so we set axis=0 to match the behavior of np.minimum.reduce(df.values) kwargs["axis"] = 0 # By default, numpy's reductions do not skip NaNs, so we have to diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 06c74290bd82e..950d4cd7cc92e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,28 +1,55 @@ from __future__ import annotations +from functools import partial from typing import ( TYPE_CHECKING, + Any, Literal, ) import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under13p0, + pa_version_under17p0, +) + +from pandas.core.dtypes.missing import isna if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc if TYPE_CHECKING: - from pandas._typing import Self + from collections.abc import ( + Callable, + Sized, + ) + + from pandas._typing import ( + Scalar, + Self, + ) class ArrowStringArrayMixin: - _pa_array = None + _pa_array: Sized def __init__(self, *args, **kwargs) -> None: raise NotImplementedError + def _convert_bool_result(self, result): + # Convert a bool-dtype result to the appropriate result type + raise NotImplementedError + + def _convert_int_result(self, result): + # Convert an integer-dtype result to the appropriate result type + raise NotImplementedError + + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + def _str_pad( self, width: int, @@ -34,7 +61,19 @@ def _str_pad( elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if pa_version_under17p0: + # GH#59624 fall back to object dtype + from pandas import array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] + else: + # GH#54792 + # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" @@ -89,3 +128,122 @@ def _str_removesuffix(self, suffix: str): removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. 
+ result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._pa_array) + return self._convert_bool_result(result) + + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdigit(self): + result = pc.utf8_is_digit(self._pa_array) + return self._convert_bool_result(result) + + def _str_islower(self): + result = pc.utf8_is_lower(self._pa_array) + return self._convert_bool_result(result) + + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._pa_array) + return self._convert_bool_result(result) + + def _str_isspace(self): + result = pc.utf8_is_space(self._pa_array) + return self._convert_bool_result(result) + + def _str_istitle(self): + result = pc.utf8_is_title(self._pa_array) + return self._convert_bool_result(result) + + def _str_isupper(self): + result = pc.utf8_is_upper(self._pa_array) + return self._convert_bool_result(result) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, 
type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return self._convert_int_result(result) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 7b941e7ea8338..4e6f20e6ad3dd 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -496,17 +496,14 @@ def _quantile( fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - res_values = self._cast_quantile_result(res_values) - return self._from_backing_data(res_values) - - # TODO: see if we can share this with other dispatch-wrapping methods - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - """ - Cast the result of quantile_with_mask to an appropriate dtype - to pass to _from_backing_data in _quantile. - """ - return res_values + if res_values.dtype == self._ndarray.dtype: + return self._from_backing_data(res_values) + else: + # e.g. test_quantile_empty we are empty integer dtype and res_values + # has floating dtype + # TODO: technically __init__ isn't defined here. + # Should we raise NotImplementedError and handle this on NumpyEA? + return type(self)(res_values) # type: ignore[call-arg] # ------------------------------------------------------------------------ # numpy-like methods diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 600ddc7f717a8..88f5ac4ebdea4 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -18,6 +18,8 @@ iNaT, ) +from pandas.core.construction import range_to_ndarray + if TYPE_CHECKING: from pandas._typing import npt @@ -82,17 +84,7 @@ def generate_regular_range( "at least 'start' or 'end' should be specified if a 'period' is given." ) - with np.errstate(over="raise"): - # If the range is sufficiently large, np.arange may overflow - # and incorrectly return an empty array if not caught. - try: - values = np.arange(b, e, stride, dtype=np.int64) - except FloatingPointError: - xdr = [b] - while xdr[-1] != e: - xdr.append(xdr[-1] + stride) - values = np.array(xdr[:-1], dtype=np.int64) - return values + return range_to_ndarray(range(b, e, stride)) def _generate_range_overflow_safe( diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index d8f948a37d206..d9a80b699b0bb 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -92,6 +92,12 @@ def len(self) -> Series: pandas.Series The length of each list. + See Also + -------- + str.len : Python built-in function returning the length of an object. + Series.size : Returns the length of the Series. + StringMethods.len : Compute the length of each element in the Series/Index. + Examples -------- >>> import pyarrow as pa @@ -128,6 +134,10 @@ def __getitem__(self, key: int | slice) -> Series: pandas.Series The list at requested index. + See Also + -------- + ListAccessor.flatten : Flatten list values. + Examples -------- >>> import pyarrow as pa @@ -187,6 +197,10 @@ def flatten(self) -> Series: pandas.Series The data from all lists in the series flattened. + See Also + -------- + ListAccessor.__getitem__ : Index or slice values in the Series. + Examples -------- >>> import pyarrow as pa @@ -244,6 +258,10 @@ def dtypes(self) -> Series: pandas.Series The data type of each child field. + See Also + -------- + Series.dtype: Return the dtype object of the underlying data. 
+ Examples -------- >>> import pyarrow as pa diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d07bfeda50e1d..15f9ba611a642 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -681,7 +681,12 @@ def __invert__(self) -> Self: return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: - return type(self)(pc.negate_checked(self._pa_array)) + try: + return type(self)(pc.negate_checked(self._pa_array)) + except pa.ArrowNotImplementedError as err: + raise TypeError( + f"unary '-' not supported for dtype '{self.dtype}'" + ) from err def __pos__(self) -> Self: return type(self)(self._pa_array) @@ -709,7 +714,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: if isinstance( other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - result = pc_func(self._pa_array, self._box_pa(other)) + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except pa.ArrowNotImplementedError: + # TODO: could this be wrong if other is object dtype? + # in which case we need to operate pointwise? + result = ops.invalid_comparison(self, other, op) + result = pa.array(result, type=pa.bool_()) elif is_scalar(other): try: result = pc_func(self._pa_array, self._box_pa(other)) @@ -730,8 +741,19 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: ) return ArrowExtensionArray(result) + def _op_method_error_message(self, other, op) -> str: + if hasattr(other, "dtype"): + other_type = f"dtype '{other.dtype}'" + else: + other_type = f"object of type {type(other)}" + return ( + f"operation '{op.__name__}' not supported for " + f"dtype '{self.dtype}' with {other_type}" + ) + def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type + other_original = other other = self._box_pa(other) if ( @@ -741,10 +763,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) + try: + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + except pa.ArrowNotImplementedError as err: + raise TypeError( + self._op_method_error_message(other_original, op) + ) from err return type(self)(result) elif op in [operator.mul, roperator.rmul]: binary = self._pa_array @@ -776,9 +803,14 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + raise TypeError(self._op_method_error_message(other_original, op)) raise NotImplementedError(f"{op.__name__} not implemented.") - result = pc_func(self._pa_array, other) + try: + result = pc_func(self._pa_array, other) + except pa.ArrowNotImplementedError as err: + raise TypeError(self._op_method_error_message(other_original, op)) from err return type(self)(result) def _logical_method(self, other, op) -> Self: @@ -2279,58 +2311,18 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _str_count(self, pat: str, flags: int = 0) -> Self: - if flags: - raise NotImplementedError(f"count not 
implemented with {flags=}") - return type(self)(pc.count_substring_regex(self._pa_array, pat)) - - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ) -> Self: - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): - result = result.fill_null(na) + def _convert_bool_result(self, result): return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _convert_int_result(self, result): return type(self)(result) - def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) + def _str_count(self, pat: str, flags: int = 0) -> Self: + if flags: + raise NotImplementedError(f"count not implemented with {flags=}") + return type(self)(pc.count_substring_regex(self._pa_array, pat)) - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _result_converter(self, result): return type(self)(result) def _str_replace( @@ -2381,29 +2373,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self: - if (start == 0 or start is None) and end is None: - result = pc.find_substring(self._pa_array, sub) - else: - if sub == "": - # GH 56792 - result = self._apply_elementwise(lambda val: val.find(sub, start, end)) - return type(self)(pa.chunked_array(result)) - if start is None: - start_offset = 0 - start = 0 - elif start < 0: - start_offset = pc.add(start, pc.utf8_length(self._pa_array)) - start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) - else: - start_offset = start - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - found = pc.not_equal(result, pa.scalar(-1, type=result.type)) - offset_result = pc.add(result, start_offset) - result = pc.if_else(found, offset_result, -1) - return type(self)(result) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2435,33 +2404,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_isalnum(self) -> Self: - return type(self)(pc.utf8_is_alnum(self._pa_array)) - - def _str_isalpha(self) -> Self: - return type(self)(pc.utf8_is_alpha(self._pa_array)) - - def _str_isdecimal(self) -> Self: - return type(self)(pc.utf8_is_decimal(self._pa_array)) - - def _str_isdigit(self) 
-> Self: - return type(self)(pc.utf8_is_digit(self._pa_array)) - - def _str_islower(self) -> Self: - return type(self)(pc.utf8_is_lower(self._pa_array)) - - def _str_isnumeric(self) -> Self: - return type(self)(pc.utf8_is_numeric(self._pa_array)) - - def _str_isspace(self) -> Self: - return type(self)(pc.utf8_is_space(self._pa_array)) - - def _str_istitle(self) -> Self: - return type(self)(pc.utf8_is_title(self._pa_array)) - - def _str_isupper(self) -> Self: - return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_len(self) -> Self: return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b429b7c1b1fc4..536c7303a2f92 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -608,6 +608,14 @@ def dtype(self) -> ExtensionDtype: """ An instance of ExtensionDtype. + See Also + -------- + api.extensions.ExtensionDtype : Base class for extension dtypes. + api.extensions.ExtensionArray : Base class for extension array types. + api.extensions.ExtensionArray.dtype : The dtype of an ExtensionArray. + Series.dtype : The dtype of a Series. + DataFrame.dtype : The dtype of a DataFrame. + Examples -------- >>> pd.array([1, 2, 3]).dtype @@ -649,6 +657,11 @@ def ndim(self) -> int: """ Extension Arrays are only allowed to be 1-dimensional. + See Also + -------- + ExtensionArray.shape: Return a tuple of the array dimensions. + ExtensionArray.size: The number of elements in the array. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -662,6 +675,11 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. + See Also + -------- + ExtensionArray.shape: Return a tuple of the array dimensions. + ExtensionArray.size: The number of elements in the array. + Examples -------- >>> pd.array([1, 2, 3]).nbytes @@ -703,6 +721,16 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: An ``ExtensionArray`` if ``dtype`` is ``ExtensionDtype``, otherwise a Numpy ndarray with ``dtype`` for its dtype. + See Also + -------- + Series.astype : Cast a Series to a different dtype. + DataFrame.astype : Cast a DataFrame to a different dtype. + api.extensions.ExtensionArray : Base class for ExtensionArray objects. + core.arrays.DatetimeArray._from_sequence : Create a DatetimeArray from a + sequence. + core.arrays.TimedeltaArray._from_sequence : Create a TimedeltaArray from + a sequence. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -767,6 +795,11 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: an ndarray would be expensive, an ExtensionArray may be returned. + See Also + -------- + ExtensionArray.dropna: Return ExtensionArray without NA values. + ExtensionArray.fillna: Fill NA/NaN values using the specified method. + Notes ----- If returning an ExtensionArray, then @@ -1017,6 +1050,12 @@ def _pad_or_backfill( maximum number of entries along the entire axis where NaNs will be filled. + limit_area : {'inside', 'outside'} or None, default None + Specifies which area to limit filling. + - 'inside': Limit the filling to the area within the gaps. + - 'outside': Limit the filling to the area outside the gaps. + If `None`, no limitation is applied. + copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. @@ -1028,6 +1067,16 @@ def _pad_or_backfill( Returns ------- Same type as self + The filled array with the same type as the original. 
+ + See Also + -------- + Series.ffill : Forward fill missing values. + Series.bfill : Backward fill missing values. + DataFrame.ffill : Forward fill missing values in DataFrame. + DataFrame.bfill : Backward fill missing values in DataFrame. + api.types.isna : Check for missing values. + api.types.isnull : Check for missing values. Examples -------- @@ -1088,6 +1137,13 @@ def fillna( ExtensionArray With NA/NaN filled. + See Also + -------- + api.extensions.ExtensionArray.dropna : Return ExtensionArray without + NA values. + api.extensions.ExtensionArray.isna : A 1-D array indicating if + each value is missing. + Examples -------- >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) @@ -1134,6 +1190,16 @@ def dropna(self) -> Self: Returns ------- + Self + An ExtensionArray of the same type as the original but with all + NA values removed. + + See Also + -------- + Series.dropna : Remove missing values from a Series. + DataFrame.dropna : Remove missing values from a DataFrame. + api.extensions.ExtensionArray.isna : Check for missing values in + an ExtensionArray. Examples -------- @@ -1161,6 +1227,15 @@ def duplicated( Returns ------- ndarray[bool] + With true in indices where elements are duplicated and false otherwise. + + See Also + -------- + DataFrame.duplicated : Return boolean Series denoting + duplicate rows. + Series.duplicated : Indicate duplicate Series values. + api.extensions.ExtensionArray.unique : Compute the ExtensionArray + of unique values. Examples -------- @@ -1244,6 +1319,13 @@ def unique(self) -> Self: Returns ------- pandas.api.extensions.ExtensionArray + With unique values from the input array. + + See Also + -------- + Index.unique: Return unique values in the index. + Series.unique: Return unique values of Series object. + unique: Return unique values based on a hash table. Examples -------- @@ -1377,10 +1459,18 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: Parameters ---------- values : np.ndarray or ExtensionArray + Values to compare every element in the array against. Returns ------- np.ndarray[bool] + With true at indices where value is in `values`. + + See Also + -------- + DataFrame.isin: Whether each element in the DataFrame is contained in values. + Index.isin: Return a boolean array where the index values are in values. + Series.isin: Whether elements in Series are contained in values. Examples -------- @@ -1408,6 +1498,10 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: `-1` and not included in `uniques`. By default, ``np.nan`` is used. + See Also + -------- + util.hash_pandas_object : Hash the pandas object. + Notes ----- The values returned by this method are also used in @@ -1580,6 +1674,7 @@ def take( Returns ------- ExtensionArray + An array formed with selected `indices`. Raises ------ @@ -1679,11 +1774,17 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: ExtensionArray or np.ndarray A view on the :class:`ExtensionArray`'s data. + See Also + -------- + api.extensions.ExtensionArray.ravel: Return a flattened view on input array. + Index.view: Equivalent function for Index. + ndarray.view: New view of array with the same data. + Examples -------- This gives view on the underlying data of an ``ExtensionArray`` and is not a copy. 
Modifications on either the view or the original ``ExtensionArray`` - will be reflectd on the underlying data: + will be reflected on the underlying data: >>> arr = pd.array([1, 2, 3]) >>> arr2 = arr.view() @@ -1832,6 +1933,11 @@ def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> Self: Returns ------- ExtensionArray + A flattened view on the array. + + See Also + -------- + ExtensionArray.tolist: Return a list of the values. Notes ----- @@ -1967,16 +2073,43 @@ def _reduce( Returns ------- - scalar + scalar or ndarray: + The result of the reduction operation. The type of the result + depends on `keepdims`: + - If `keepdims` is `False`, a scalar value is returned. + - If `keepdims` is `True`, the result is wrapped in a numpy array with + a single element. Raises ------ TypeError : subclass does not define operations + See Also + -------- + Series.min : Return the minimum value. + Series.max : Return the maximum value. + Series.sum : Return the sum of values. + Series.mean : Return the mean of values. + Series.median : Return the median of values. + Series.std : Return the standard deviation. + Series.var : Return the variance. + Series.prod : Return the product of values. + Series.sem : Return the standard error of the mean. + Series.kurt : Return the kurtosis. + Series.skew : Return the skewness. + Examples -------- >>> pd.array([1, 2, 3])._reduce("min") 1 + >>> pd.array([1, 2, 3])._reduce("max") + 3 + >>> pd.array([1, 2, 3])._reduce("sum") + 6 + >>> pd.array([1, 2, 3])._reduce("mean") + 2.0 + >>> pd.array([1, 2, 3])._reduce("median") + 2.0 """ meth = getattr(self, name, None) if meth is None: @@ -2105,6 +2238,12 @@ def tolist(self) -> list: Returns ------- list + Python list of values in array. + + See Also + -------- + Index.to_list: Return a list of the values in the Index. + Series.to_list: Return a list of the values in the Series. Examples -------- @@ -2127,11 +2266,18 @@ def insert(self, loc: int, item) -> Self: Parameters ---------- loc : int + Index where the `item` needs to be inserted. item : scalar-like + Value to be inserted. Returns ------- - same type as self + ExtensionArray + With `item` inserted at `loc`. + + See Also + -------- + Index.insert: Make new Index inserting new item at location. Notes ----- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 18b52f741370f..c613a345686cc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2495,11 +2495,6 @@ def unique(self) -> Self: """ return super().unique() - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # make sure we have correct itemsize for resulting codes - assert res_values.dtype == self._ndarray.dtype - return res_values - def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. 
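The string-method refactor above moves the shared pyarrow kernels into `ArrowStringArrayMixin` and lets each subclass choose its result type through the `_convert_bool_result` / `_convert_int_result` hooks. A minimal standalone sketch of that hook pattern (class names below are invented for illustration and are not pandas internals):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

class StringOpsMixin:
    _pa_array: pa.ChunkedArray

    def _convert_bool_result(self, result):
        # Subclasses decide how to box a pyarrow boolean result.
        raise NotImplementedError

    def str_isupper(self):
        # Shared kernel: one pyarrow compute call serves every subclass.
        return self._convert_bool_result(pc.utf8_is_upper(self._pa_array))

class NullableStrings(StringOpsMixin):
    def __init__(self, values):
        self._pa_array = pa.chunked_array([values])

    def _convert_bool_result(self, result):
        # NA-semantics: keep nulls in the arrow result.
        return result

class NumpyStrings(NullableStrings):
    def _convert_bool_result(self, result):
        # NaN-semantics: nulls become False, boxed as a numpy bool array.
        return np.asarray(pc.fill_null(result, False))

print(NullableStrings(["AB", "ab", None]).str_isupper())
print(NumpyStrings(["AB", "ab", None]).str_isupper())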
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ad0bde3abbdd4..fbe1677b95b33 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -19,6 +19,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.config import get_option from pandas._libs import ( @@ -1759,6 +1760,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) @@ -1781,7 +1786,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: a non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' @@ -1794,7 +1799,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. Returns diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index dddfc440109d3..201c449185057 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -15,6 +15,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.config import get_option from pandas._libs import ( @@ -158,15 +159,8 @@ def f(self): # these return a boolean by-definition return result - if field in self._object_ops: - result = fields.get_date_name_field(values, field, reso=self._creso) - result = self._maybe_mask_results(result, fill_value=None) - - else: - result = fields.get_date_field(values, field, reso=self._creso) - result = self._maybe_mask_results( - result, fill_value=None, convert="float64" - ) + result = fields.get_date_field(values, field, reso=self._creso) + result = self._maybe_mask_results(result, fill_value=None, convert="float64") return result @@ -243,7 +237,6 @@ def _scalar_type(self) -> type[Timestamp]: "is_year_end", "is_leap_year", ] - _object_ops: list[str] = ["freq", "tz"] _field_ops: list[str] = [ "year", "month", @@ -264,7 +257,7 @@ def _scalar_type(self) -> type[Timestamp]: ] _other_ops: list[str] = ["date", "time", "timetz"] _datetimelike_ops: list[str] = ( - _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"] + _field_ops + _bool_ops + _other_ops + ["unit", "freq", "tz"] ) _datetimelike_methods: list[str] = [ "to_period", @@ -972,7 +965,7 @@ def tz_localize( non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. 
nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ @@ -986,7 +979,7 @@ def tz_localize( closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1340,6 +1333,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1401,6 +1401,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 07eb91e0cb13b..aafcd82114b97 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -137,9 +137,6 @@ def _from_sequence( result = result.copy() return cls(result) - def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: - return type(self)(arr) - # ------------------------------------------------------------------------ # Data @@ -557,7 +554,3 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b3513dd083e41..aa8dacbd6aad5 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -432,6 +432,15 @@ def __arrow_array__(self, type=None): """ The year of the period. + See Also + -------- + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.is_leap_year : Logical indicating if the date belongs to a + leap year. + PeriodIndex.weekofyear : The week ordinal of the year. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") @@ -444,6 +453,11 @@ def __arrow_array__(self, type=None): """ The month as January=1, December=12. + See Also + -------- + PeriodIndex.days_in_month : The number of days in the month. + PeriodIndex.daysinmonth : The number of days in the month. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") @@ -456,6 +470,16 @@ def __arrow_array__(self, type=None): """ The days of the period. + See Also + -------- + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.days_in_month : The number of days in the month. + PeriodIndex.daysinmonth : The number of days in the month. 
+ PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + Examples -------- >>> idx = pd.PeriodIndex(['2020-01-31', '2020-02-28'], freq='D') @@ -468,6 +492,12 @@ def __arrow_array__(self, type=None): """ The hour of the period. + See Also + -------- + PeriodIndex.minute : The minute of the period. + PeriodIndex.second : The second of the period. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='h') @@ -480,6 +510,12 @@ def __arrow_array__(self, type=None): """ The minute of the period. + See Also + -------- + PeriodIndex.hour : The hour of the period. + PeriodIndex.second : The second of the period. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01 10:30:00", @@ -493,6 +529,12 @@ def __arrow_array__(self, type=None): """ The second of the period. + See Also + -------- + PeriodIndex.hour : The hour of the period. + PeriodIndex.minute : The minute of the period. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01 10:00:30", @@ -506,6 +548,14 @@ def __arrow_array__(self, type=None): """ The week ordinal of the year. + See Also + -------- + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.week : The week ordinal of the year. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") @@ -519,6 +569,17 @@ def __arrow_array__(self, type=None): """ The day of the week with Monday=0, Sunday=6. + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.week : The week ordinal of the year. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + PeriodIndex.weekofyear : The week ordinal of the year. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01", "2023-01-02", "2023-01-03"], freq="D") @@ -533,6 +594,17 @@ def __arrow_array__(self, type=None): """ The ordinal day of the year. + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + PeriodIndex.weekofyear : The week ordinal of the year. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-10", "2023-02-01", "2023-03-01"], freq="D") @@ -551,6 +623,11 @@ def __arrow_array__(self, type=None): """ The quarter of the date. + See Also + -------- + PeriodIndex.qyear : Fiscal year the Period lies in according to its + starting-quarter. 
+ Examples -------- >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") @@ -558,12 +635,62 @@ def __arrow_array__(self, type=None): Index([1, 1, 1], dtype='int64') """, ) - qyear = _field_accessor("qyear") + qyear = _field_accessor( + "qyear", + """ + Fiscal year the Period lies in according to its starting-quarter. + + The `year` and the `qyear` of the period will be the same if the fiscal + and calendar years are the same. When they are not, the fiscal year + can be different from the calendar year of the period. + + Returns + ------- + int + The fiscal year of the period. + + See Also + -------- + PeriodIndex.quarter : The quarter of the date. + PeriodIndex.year : The year of the period. + + Examples + -------- + If the natural and fiscal year are the same, `qyear` and `year` will + be the same. + + >>> per = pd.Period('2018Q1', freq='Q') + >>> per.qyear + 2018 + >>> per.year + 2018 + + If the fiscal year starts in April (`Q-MAR`), the first quarter of + 2018 will start in April 2017. `year` will then be 2017, but `qyear` + will be the fiscal year, 2018. + + >>> per = pd.Period('2018Q1', freq='Q-MAR') + >>> per.start_time + Timestamp('2017-04-01 00:00:00') + >>> per.qyear + 2018 + >>> per.year + 2017 + """, + ) + days_in_month = _field_accessor( "days_in_month", """ The number of days in the month. + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.days_in_month : The number of days in the month. + PeriodIndex.daysinmonth : The number of days in the month. + PeriodIndex.month : The month as January=1, December=12. + Examples -------- For Series: @@ -595,6 +722,12 @@ def is_leap_year(self) -> npt.NDArray[np.bool_]: """ Logical indicating if the date belongs to a leap year. + See Also + -------- + PeriodIndex.qyear : Fiscal year the Period lies in according to its + starting-quarter. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") @@ -618,6 +751,19 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: Returns ------- DatetimeArray/Index + Timestamp representation of given Period-like object. + + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.from_fields : Construct a PeriodIndex from fields + (year, month, day, etc.). + PeriodIndex.from_ordinals : Construct a PeriodIndex from ordinals. + PeriodIndex.hour : The hour of the period. + PeriodIndex.minute : The minute of the period. + PeriodIndex.month : The month as January=1, December=12. + PeriodIndex.second : The second of the period. + PeriodIndex.year : The year of the period. Examples -------- diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index b8245349a4e62..e610e018c5a74 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -47,6 +47,18 @@ class SparseAccessor(BaseAccessor, PandasDelegate): """ Accessor for SparseSparse from other sparse matrix data types. + Parameters + ---------- + data : Series or DataFrame + The Series or DataFrame to which the SparseAccessor is attached. + + See Also + -------- + Series.sparse.to_coo : Create a scipy.sparse.coo_matrix from a Series with + MultiIndex. + Series.sparse.from_coo : Create a Series with sparse values from a + scipy.sparse.coo_matrix. 
+ Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") @@ -135,7 +147,9 @@ def to_coo( Parameters ---------- row_levels : tuple/list + MultiIndex levels to use for row coordinates, specified by name or index. column_levels : tuple/list + MultiIndex levels to use for column coordinates, specified by name or index. sort_labels : bool, default False Sort the row and column labels before forming the sparse matrix. When `row_levels` and/or `column_levels` refer to a single level, @@ -144,8 +158,16 @@ def to_coo( Returns ------- y : scipy.sparse.coo_matrix + The sparse matrix in coordinate format. rows : list (row labels) + Labels corresponding to the row coordinates. columns : list (column labels) + Labels corresponding to the column coordinates. + + See Also + -------- + Series.sparse.from_coo : Create a Series with sparse values from a + scipy.sparse.coo_matrix. Examples -------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 3a08344369822..a09dc20af3b36 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -671,6 +671,11 @@ def density(self) -> float: """ The percent of non- ``fill_value`` points, as decimal. + See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a + scipy sparse matrix. + Examples -------- >>> from pandas.arrays import SparseArray diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2ba7c9fccbfce..88fd1481031f8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -140,12 +140,16 @@ def __init__( # infer defaults if storage is None: if na_value is not libmissing.NA: - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") + if storage == "auto": + storage = "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning @@ -167,9 +171,9 @@ def __init__( # a consistent NaN value (and we can use `dtype.na_value is np.nan`) na_value = np.nan elif na_value is not libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") - self.storage = storage + self.storage = cast(str, storage) self._na_value = na_value def __repr__(self) -> str: @@ -280,6 +284,34 @@ def construct_array_type( # type: ignore[override] else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: @@ -350,9 +382,7 @@ def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): 
if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) + return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype) from pandas.arrays import BooleanArray @@ -427,9 +457,7 @@ def _str_map_str_or_object( # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): + def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None): if dtype is None: dtype = self.dtype if na_value is None: @@ -659,11 +687,10 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self) -> tuple[np.ndarray, None]: - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None + def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] + arr = self._ndarray + + return arr, self.dtype.na_value def __setitem__(self, key, value) -> None: value = extract_array(value, extract_numpy=True) @@ -746,6 +773,12 @@ def _reduce( axis: AxisInt | None = 0, **kwargs, ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + if name in ["min", "max"]: result = getattr(self, name)(skipna=skipna, axis=axis) if keepdims: @@ -754,6 +787,12 @@ def _reduce( raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + if self.dtype.na_value is np.nan and result is libmissing.NA: + # the masked_reductions use pd.NA -> convert to np.nan + return np.nan + return super()._wrap_reduction_result(axis, result) + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( @@ -771,8 +810,11 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts - result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=False, dropna=dropna) result.index = result.index.astype(self.dtype) + + if self.dtype.na_value is libmissing.NA: + result = result.astype("Int64") return result def memory_usage(self, deep: bool = False) -> int: @@ -811,8 +853,11 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + # for array-likes, first filter out NAs before converting to numpy + if not is_array_like(other): + other = np.asarray(other) other = other[valid] + other = np.asarray(other) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") @@ -823,16 +868,16 @@ def _cmp_method(self, other, op): # logical result = np.zeros(len(self._ndarray), dtype="bool") result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) + res_arr = BooleanArray(result, mask) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return res_arr.to_numpy(np.bool_, na_value=True) + else: + return res_arr.to_numpy(np.bool_, na_value=False) + return res_arr _arith_method = _cmp_method - # 
------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "NumpyExtensionArray" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - class StringArrayNumpySemantics(StringArray): _storage = "python" @@ -858,43 +903,3 @@ def _from_sequence( if dtype is None: dtype = StringDtype(storage="python", na_value=np.nan) return super()._from_sequence(scalars, dtype=dtype, copy=copy) - - def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: - # need to override NumpyExtensionArray._from_backing_data to ensure - # we always preserve the dtype - return NDArrayBacked._from_backing_data(self, arr) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if name == "any": - return nanops.nanany(self._ndarray, skipna=skipna) - else: - return nanops.nanall(self._ndarray, skipna=skipna) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - - def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: - # the masked_reductions use pd.NA - if result is libmissing.NA: - return np.nan - return super()._wrap_reduction_result(axis, result) - - def _cmp_method(self, other, op): - result = super()._cmp_method(other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas.core.algorithms import value_counts_internal as value_counts - - result = value_counts(self._ndarray, sort=False, dropna=dropna) - result.index = result.index.astype(self.dtype) - return result - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cc37995969f0a..97381b82ceab9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,12 +1,12 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( TYPE_CHECKING, Union, ) +import warnings import numpy as np @@ -20,6 +20,7 @@ pa_version_under10p1, pa_version_under13p0, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_scalar, @@ -36,7 +37,6 @@ BaseStringArray, StringDtype, ) -from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under10p1: @@ -130,18 +130,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if 
not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" @@ -213,12 +217,15 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _convert_bool_result(self, values): + if self.dtype.na_value is np.nan: + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -250,7 +257,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: result = pc.is_in( self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type) ) - # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls + # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -272,11 +279,20 @@ def astype(self, dtype, copy: bool = True): # ------------------------------------------------------------------------ # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "ObjectStringArrayMixin" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] + _str_isalnum = ArrowStringArrayMixin._str_isalnum + _str_isalpha = ArrowStringArrayMixin._str_isalpha + _str_isdecimal = ArrowStringArrayMixin._str_isdecimal + _str_isdigit = ArrowStringArrayMixin._str_isdigit + _str_islower = ArrowStringArrayMixin._str_islower + _str_isnumeric = ArrowStringArrayMixin._str_isnumeric + _str_isspace = ArrowStringArrayMixin._str_isspace + _str_istitle = ArrowStringArrayMixin._str_istitle + _str_isupper = ArrowStringArrayMixin._str_isupper _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -286,52 +302,18 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result, na=na) if not isna(na): - result[isna(result)] = bool(na) - return result - - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) + if not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) + na = bool(na) - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = 
result.fill_null(na) - return self._result_converter(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, @@ -347,9 +329,7 @@ def _str_replace( fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) - return type(self)(result) + return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): @@ -384,45 +364,9 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_isalnum(self): - result = pc.utf8_is_alnum(self._pa_array) - return self._result_converter(result) - - def _str_isalpha(self): - result = pc.utf8_is_alpha(self._pa_array) - return self._result_converter(result) - - def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._pa_array) - return self._result_converter(result) - - def _str_isdigit(self): - result = pc.utf8_is_digit(self._pa_array) - return self._result_converter(result) - - def _str_islower(self): - result = pc.utf8_is_lower(self._pa_array) - return self._result_converter(result) - - def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._pa_array) - return self._result_converter(result) - - def _str_isspace(self): - result = pc.utf8_is_space(self._pa_array) - return self._result_converter(result) - - def _str_istitle(self): - result = pc.utf8_is_title(self._pa_array) - return self._result_converter(result) - - def _str_isupper(self): - result = pc.utf8_is_upper(self._pa_array) - return self._result_converter(result) - def _str_len(self): result = pc.utf8_length(self._pa_array) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_lower(self) -> Self: return type(self)(pc.utf8_lower(self._pa_array)) @@ -469,21 +413,17 @@ def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) result = pc.count_substring_regex(self._pa_array, pat) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_dtype(result) + 
return ArrowStringArrayMixin._str_find(self, sub, start, end) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) @@ -492,15 +432,34 @@ def _str_get_dummies(self, sep: str = "|"): dummies = np.vstack(dummies_pa.to_numpy()) return dummies.astype(np.int64, copy=False), labels - def _convert_int_dtype(self, result): + def _convert_int_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + return Int64Dtype().__from_arrow__(result) def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): - return self._convert_int_dtype(result) + return self._convert_int_result(result) elif isinstance(result, pa.Array): return type(self)(result) else: @@ -518,7 +477,7 @@ def _rank( """ See Series.rank.__doc__. """ - return self._convert_int_dtype( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, @@ -528,70 +487,30 @@ def _rank( ) ) - -class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow" - _na_value = np.nan - - @classmethod - def _result_converter(cls, values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", - ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) - - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False + ) return result def _cmp_method(self, other, op): - try: - result = super()._cmp_method(other, op) - except pa.ArrowNotImplementedError: - return invalid_comparison(self, other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series - - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna: - nas = pc.is_null(self._pa_array) - arr = pc.or_kleene(nas, 
pc.not_equal(self._pa_array, "")) + result = super()._cmp_method(other, op) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + return result.to_numpy(np.bool_, na_value=False) + return result - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _na_value = np.nan + _str_get = ArrowStringArrayMixin._str_get + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 83cc2871f5459..c8a86ffc187d0 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -152,9 +152,8 @@ def _scalar_type(self) -> type[Timedelta]: # define my properties & methods for delegation _other_ops: list[str] = [] _bool_ops: list[str] = [] - _object_ops: list[str] = ["freq"] _field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"] - _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"] + _datetimelike_ops: list[str] = _field_ops + _bool_ops + ["unit", "freq"] _datetimelike_methods: list[str] = [ "to_pytimedelta", "total_seconds", @@ -877,6 +876,12 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: microseconds_docstring = textwrap.dedent( """Number of microseconds (>= 0 and less than 1 second) for each element. + See Also + -------- + pd.Timedelta.microseconds : Number of microseconds (>= 0 and less than 1 second). + pd.Timedelta.to_pytimedelta.microseconds : Number of microseconds (>= 0 and less + than 1 second) of a datetime.timedelta. + Examples -------- For Series: @@ -956,6 +961,12 @@ def components(self) -> DataFrame: ------- DataFrame + See Also + -------- + TimedeltaIndex.total_seconds : Return total duration expressed in seconds. + Timedelta.components : Return a components namedtuple-like of a single + timedelta. + Examples -------- >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"]) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 8fbf8936d31ef..35a6d1c6ad269 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -4,6 +4,7 @@ from __future__ import annotations +from enum import Enum from io import StringIO from keyword import iskeyword import token @@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str: ------ SyntaxError If the returned name is not a Python valid identifier, raise an exception. - This can happen if there is a hashtag in the name, as the tokenizer will - than terminate and not find the backtick. - But also for characters that fall out of the range of (U+0001..U+007F). """ if name.isidentifier() and not iskeyword(name): return name + # Escape characters that fall outside the ASCII range (U+0001..U+007F). 
+ # GH 49633 + gen = ( + (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace"))) + for c in name + ) + name = "".join( + c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_") + for c, c_escaped in gen + ) + # Create a dict with the special characters and their replacement string. # EXACT_TOKEN_TYPES contains these special characters # token.tok_name contains a readable description of the replacement string. @@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str: "$": "_DOLLARSIGN_", "€": "_EUROSIGN_", "°": "_DEGREESIGN_", - # Including quotes works, but there are exceptions. "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", - # Currently not possible. Terminates parser and won't find backtick. - # "#": "_HASH_", + "#": "_HASH_", + "`": "_BACKTICK_", } ) @@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable: which is not caught and propagates to the user level. """ try: + # Escape backticks + name = name.replace("`", "``") if isinstance(name, str) else name + tokenized = tokenize_string(f"`{name}`") tokval = next(tokenized)[1] return create_valid_python_identifier(tokval) @@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string( return BACKTICK_QUOTED_STRING, source[string_start:string_end] +class ParseState(Enum): + DEFAULT = 0 + IN_BACKTICK = 1 + IN_SINGLE_QUOTE = 2 + IN_DOUBLE_QUOTE = 3 + + +def _split_by_backtick(s: str) -> list[tuple[bool, str]]: + """ + Splits a str into substrings along backtick characters (`). + + Disregards backticks inside quotes. + + Parameters + ---------- + s : str + The Python source code string. + + Returns + ------- + substrings: list[tuple[bool, str]] + List of tuples, where each tuple has two elements: + The first is a boolean indicating if the substring is backtick-quoted. + The second is the actual substring. + """ + substrings = [] + substr: list[str] = [] # Will join into a string before adding to `substrings` + i = 0 + parse_state = ParseState.DEFAULT + while i < len(s): + char = s[i] + + match char: + case "`": + # start of a backtick-quoted string + if parse_state == ParseState.DEFAULT: + if substr: + substrings.append((False, "".join(substr))) + + substr = [char] + i += 1 + parse_state = ParseState.IN_BACKTICK + continue + + elif parse_state == ParseState.IN_BACKTICK: + # escaped backtick inside a backtick-quoted string + next_char = s[i + 1] if (i != len(s) - 1) else None + if next_char == "`": + substr.append(char) + substr.append(next_char) + i += 2 + continue + + # end of the backtick-quoted string + else: + substr.append(char) + substrings.append((True, "".join(substr))) + + substr = [] + i += 1 + parse_state = ParseState.DEFAULT + continue + case "'": + # start of a single-quoted string + if parse_state == ParseState.DEFAULT: + parse_state = ParseState.IN_SINGLE_QUOTE + # end of a single-quoted string + elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"): + parse_state = ParseState.DEFAULT + case '"': + # start of a double-quoted string + if parse_state == ParseState.DEFAULT: + parse_state = ParseState.IN_DOUBLE_QUOTE + # end of a double-quoted string + elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"): + parse_state = ParseState.DEFAULT + substr.append(char) + i += 1 + + if substr: + substrings.append((False, "".join(substr))) + + return substrings + + def tokenize_string(source: str) -> Iterator[tuple[int, str]]: """ Tokenize a Python source code string. 
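As a rough usage sketch of what the `_split_by_backtick` helper and the new `_HASH_`/`_BACKTICK_` replacements above are meant to enable (the frame and column name here are invented for illustration; the behaviour assumes this patch is applied):

    import pandas as pd

    # A column name containing a literal backtick and a hash, which
    # previously could not be referenced in DataFrame.query at all.
    df = pd.DataFrame({"it`s #1": [1, 2, 3]})

    # Inside a backtick-quoted name, a literal backtick is escaped by
    # doubling it. clean_column_name applies the same doubling before
    # tokenizing, so the resolver name and the expression name agree.
    print(df.query("`it``s #1` > 1"))  # keeps the rows holding 2 and 3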
@@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: tok_generator : Iterator[Tuple[int, str]] An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). """ + # GH 59285 + # Escape characters, including backticks + source = "".join( + ( + create_valid_python_identifier(substring[1:-1]) + if is_backtick_quoted + else substring + ) + for is_backtick_quoted, substring in _split_by_backtick(source) + ) + line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted string - for toknum, tokval, start, _, _ in token_generator: - if tokval == "`": - try: - yield tokenize_backtick_quoted_string( - token_generator, source, string_start=start[1] + 1 - ) - except Exception as err: - raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err - else: - yield toknum, tokval + for toknum, tokval, _, _, _ in token_generator: + yield toknum, tokval diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index e62cda0dfe8d0..e4eefb570fd95 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -452,13 +452,12 @@ def is_terminal() -> bool: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ def is_valid_string_storage(value: Any) -> None: - legal_values = ["python", "pyarrow"] + legal_values = ["auto", "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -473,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 162f6a4d30f3f..6ba07b1761557 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1014,10 +1014,8 @@ def convert_dtypes( Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). + * ``"numpy_nullable"``: returns nullable-dtype * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. .. versionadded:: 2.0 @@ -1025,6 +1023,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1103,6 +1103,13 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype @@ -1371,7 +1378,7 @@ def common_dtype_categorical_compat( # TODO: more generally, could do `not can_hold_na(dtype)` if lib.is_np_dtype(dtype, "iu"): for obj in objs: - # We don't want to accientally allow e.g. "categorical" str here + # We don't want to accidentally allow e.g. 
"categorical" str here obj_dtype = getattr(obj, "dtype", None) if isinstance(obj_dtype, CategoricalDtype): if isinstance(obj, ABCIndex): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 64b5278424192..bcf1ade9b0320 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1274,6 +1274,10 @@ def is_bool_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a boolean dtype. + This function verifies whether a given object is a boolean data type. The input + can be an array or a dtype object. Accepted array types include instances + of ``np.array``, ``pd.Series``, ``pd.Index``, and similar array-like structures. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1284,6 +1288,10 @@ def is_bool_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a boolean dtype. + See Also + -------- + api.types.is_bool : Check if an object is a boolean. + Notes ----- An ExtensionArray is considered boolean when the ``_is_boolean`` diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 3aeab96e03163..68b4807961d19 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -18,9 +18,9 @@ cast, ) import warnings +import zoneinfo import numpy as np -import pytz from pandas._config.config import get_option @@ -455,7 +455,7 @@ def __eq__(self, other: object) -> bool: # Because left and right have the same length and are unique, # `indexer` not having any -1s implies that there is a # bijection between `left` and `right`. - return (indexer != -1).all() + return bool((indexer != -1).all()) # With object-dtype we need a comparison that identifies # e.g. int(2) as distinct from float(2) @@ -513,7 +513,7 @@ def _hash_categories(self) -> int: [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] ) else: - cat_array = np.array([cat_array]) + cat_array = cat_array.reshape(1, len(cat_array)) combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(combined_hashed) @@ -611,6 +611,13 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: dtype = cast(CategoricalDtype, dtype) # update categories/ordered unless they've been explicitly passed as None + if ( + isinstance(dtype, CategoricalDtype) + and dtype.categories is not None + and dtype.ordered is not None + ): + # Avoid re-validation in CategoricalDtype constructor + return dtype new_categories = ( dtype.categories if dtype.categories is not None else self.categories ) @@ -789,7 +796,7 @@ def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None: tz = timezones.maybe_get_tz(tz) tz = timezones.tz_standardize(tz) elif tz is not None: - raise pytz.UnknownTimeZoneError(tz) + raise zoneinfo.ZoneInfoNotFoundError(tz) if tz is None: raise TypeError("A 'tz' is required.") @@ -882,7 +889,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype: return cls(unit=d["unit"], tz=d["tz"]) except (KeyError, TypeError, ValueError) as err: # KeyError if maybe_get_tz tries and fails to get a - # pytz timezone (actually pytz.UnknownTimeZoneError). + # zoneinfo timezone (actually zoneinfo.ZoneInfoNotFoundError). # TypeError if we pass a nonsense tz; # ValueError if we pass a unit other than "ns" raise TypeError(msg) from err @@ -986,6 +993,14 @@ class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): ------- None + See Also + -------- + Period : Represents a single time period. 
+ PeriodIndex : Immutable index for period data. + date_range : Return a fixed frequency DatetimeIndex. + Series : One-dimensional array with axis labels. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- >>> pd.PeriodDtype(freq="D") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8039746d9952..c80e9dfd23ba2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2124,9 +2124,10 @@ def from_records( columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the - columns. Otherwise this argument indicates the order of the columns + columns. Otherwise, this argument indicates the order of the columns in the result (any names not found in the data will become all-NA - columns). + columns) and limits the data to these columns if not all column names + are provided. coerce_float : bool, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. @@ -2199,7 +2200,7 @@ def maybe_reorder( ) -> tuple[list[ArrayLike], Index, Index | None]: """ If our desired 'columns' do not match the data's pre-existing 'arr_columns', - we re-order our arrays. This is like a pre-emptive (cheap) reindex. + we re-order our arrays. This is like a preemptive (cheap) reindex. """ if len(arrays): length = len(arrays[0]) @@ -4484,7 +4485,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No You can refer to column names that are not valid Python variable names by surrounding them in backticks. Thus, column names containing spaces - or punctuations (besides underscores) or starting with digits must be + or punctuation (besides underscores) or starting with digits must be surrounded by backticks. (For example, a column named "Area (cm^2)" would be referenced as ```Area (cm^2)```). Column names which are Python keywords (like "if", "for", "import", etc) cannot be used. @@ -4556,17 +4557,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No quoted string are replaced by strings that are allowed as a Python identifier. These characters include all operators in Python, the space character, the question mark, the exclamation mark, the dollar sign, and the euro sign. - For other characters that fall outside the ASCII range (U+0001..U+007F) - and those that are not further specified in PEP 3131, - the query parser will raise an error. - This excludes whitespace different than the space character, - but also the hashtag (as it is used for comments) and the backtick - itself (backtick can also not be escaped). - - In a special case, quotes that make a pair around a backtick can - confuse the parser. - For example, ```it's` > `that's``` will raise an error, - as it forms a quoted string (``'s > `that'``) with a backtick inside. + + A backtick can be escaped by double backticks. See also the `Python documentation about lexical analysis `__ @@ -4620,6 +4612,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None + res = self.eval(expr, **kwargs) try: @@ -6406,7 +6399,7 @@ def dropna( thresh : int, optional Require that many non-NA values. Cannot be combined with how. 
- subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. inplace : bool, default False @@ -6536,7 +6529,7 @@ def dropna( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[True], @@ -6546,7 +6539,7 @@ def drop_duplicates( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[False] = ..., @@ -6556,7 +6549,7 @@ def drop_duplicates( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: bool = ..., @@ -6565,7 +6558,7 @@ def drop_duplicates( def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = None, + subset: Hashable | Iterable[Hashable] | None = None, *, keep: DropKeep = "first", inplace: bool = False, @@ -6579,7 +6572,7 @@ def drop_duplicates( Parameters ---------- - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', ``False``}, default 'first' @@ -6669,7 +6662,7 @@ def drop_duplicates( def duplicated( self, - subset: Hashable | Sequence[Hashable] | None = None, + subset: Hashable | Iterable[Hashable] | None = None, keep: DropKeep = "first", ) -> Series: """ @@ -6679,7 +6672,7 @@ def duplicated( Parameters ---------- - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', False}, default 'first' @@ -6771,10 +6764,7 @@ def f(vals) -> tuple[np.ndarray, int]: return labels.astype("i8"), len(shape) if subset is None: - # https://github.com/pandas-dev/pandas/issues/28770 - # Incompatible types in assignment (expression has type "Index", variable - # has type "Sequence[Any]") - subset = self.columns # type: ignore[assignment] + subset = self.columns elif ( not np.iterable(subset) or isinstance(subset, str) @@ -6795,7 +6785,7 @@ def f(vals) -> tuple[np.ndarray, int]: if len(subset) == 1 and self.columns.is_unique: # GH#45236 This is faster than get_group_index below - result = self[subset[0]].duplicated(keep) + result = self[next(iter(subset))].duplicated(keep) result.name = None else: vals = (col.values for name, col in self.items() if name in subset) @@ -12371,7 +12361,7 @@ def std( -------- Series.std : Return standard deviation over Series values. DataFrame.mean : Return the mean of the values over the requested axis. - DataFrame.mediam : Return the mediam of the values over the requested axis. + DataFrame.median : Return the median of the values over the requested axis. DataFrame.mode : Get the mode(s) of each element along the requested axis. DataFrame.sum : Return the sum of the values over the requested axis. 
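The ``subset`` annotation in the ``drop_duplicates``/``duplicated`` hunks above widens from ``Sequence[Hashable]`` to ``Iterable[Hashable]``, so any sized iterable of labels is accepted. A small sketch with invented data:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4], "c": [5, 6, 7]})

    # dict keys are an Iterable[Hashable] but not a Sequence; with this
    # patch they both type-check and work as a subset.
    cols = {"a": None, "b": None}.keys()
    print(df.duplicated(subset=cols))       # row 1 duplicates row 0 on a/b
    print(df.drop_duplicates(subset=cols))  # drops row 1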
@@ -12756,10 +12746,80 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
         """
         return self.apply(Series.nunique, axis=axis, dropna=dropna)
 
-    @doc(_shared_docs["idxmin"], numeric_only_default="False")
     def idxmin(
         self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
     ) -> Series:
+        """
+        Return index of first occurrence of minimum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of minima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmin : Return index of the minimum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmin``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51         37.20
+        Wheat Products       103.11         19.66
+        Beef                  55.48       1712.00
+
+        By default, it returns the index for the minimum value in each column.
+
+        >>> df.idxmin()
+        consumption                Pork
+        co2_emissions    Wheat Products
+        dtype: object
+
+        To return the index for the minimum value in each row, use ``axis="columns"``.
+
+        >>> df.idxmin(axis="columns")
+        Pork                consumption
+        Wheat Products    co2_emissions
+        Beef                consumption
+        dtype: object
+        """
         axis = self._get_axis_number(axis)
 
         if self.empty and len(self.axes[axis]):
@@ -12793,10 +12853,80 @@ def idxmin(
         final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
         return final_result.__finalize__(self, method="idxmin")
 
-    @doc(_shared_docs["idxmax"], numeric_only_default="False")
     def idxmax(
         self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
     ) -> Series:
+        """
+        Return index of first occurrence of maximum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of maxima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmax : Return index of the maximum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmax``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ...
) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. + + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object + """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): @@ -13499,26 +13629,29 @@ def isin_(x): ) columns = properties.AxisProperty( axis=0, - doc=dedent( - """ - The column labels of the DataFrame. - - See Also - -------- - DataFrame.index: The index (row labels) of the DataFrame. - DataFrame.axes: Return a list representing the axes of the DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> df - A B - 0 1 3 - 1 2 4 - >>> df.columns - Index(['A', 'B'], dtype='object') - """ - ), + doc=""" + The column labels of the DataFrame. + + Returns + ------- + pandas.Index + The column labels of the DataFrame. + + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.axes: Return a list representing the axes of the DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df + A B + 0 1 3 + 1 2 4 + >>> df.columns + Index(['A', 'B'], dtype='object') + """, ) # ---------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8a6fc69d47cc3..42516f0a85e07 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -600,9 +600,10 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: if isinstance(self, ABCSeries): return {clean_column_name(self.name): self} + dtypes = self.dtypes return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] + v, copy=False, index=self.index, name=k, dtype=dtypes[k] ).__finalize__(self) for k, v in zip(self.columns, self._iter_column_arrays()) if not isinstance(k, int) @@ -2122,11 +2123,13 @@ def _repr_data_resource_(self): klass="object", storage_options=_shared_docs["storage_options"], storage_options_versionadded="1.2.0", + encoding_parameter="", + verbose_parameter="", extra_parameters=textwrap.dedent( """\ engine_kwargs : dict, optional Arbitrary keyword arguments passed to excel engine. - """ + """ ), ) def to_excel( @@ -2195,9 +2198,11 @@ def to_excel( merge_cells : bool, default True Write MultiIndex and Hierarchical Rows as merged cells. + {encoding_parameter} inf_rep : str, default 'inf' Representation for infinity (there is no native representation for infinity in Excel). + {verbose_parameter} freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. @@ -2807,8 +2812,8 @@ def to_sql( `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Specify the number of rows in each batch to be written at a time. - By default, all rows will be written at once. + Specify the number of rows in each batch to be written to the database connection at a time. + By default, all rows will be written at once. Also see the method keyword. dtype : dict or scalar, optional Specifying the datatype for columns. 
If a dictionary is used, the keys should be the column names and the values should be the
@@ -6670,10 +6675,10 @@ def convert_dtypes(
         Back-end data type applied to the resultant :class:`DataFrame` or
         :class:`Series` (still experimental). Behaviour is as follows:
 
-        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-          or :class:`Series` (default).
+        * ``"numpy_nullable"``: returns nullable-dtype-backed
+          :class:`DataFrame` or :class:`Series`.
         * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-          DataFrame or Series.
+          :class:`DataFrame` or :class:`Series`.
 
         .. versionadded:: 2.0
@@ -6989,7 +6994,7 @@ def fillna(
                 f'you passed a "{type(value).__name__}"'
             )
 
-        # set the default here, so functions examining the signaure
+        # set the default here, so functions examining the signature
         # can detect if something was set (e.g. in groupby) (GH9221)
         if axis is None:
             axis = 0
@@ -7046,7 +7051,7 @@ def fillna(
                     # see test_fillna_dict_inplace_nonunique_columns
                     locs = result.columns.get_loc(k)
                     if isinstance(locs, slice):
-                        locs = np.arange(self.shape[1])[locs]
+                        locs = range(self.shape[1])[locs]
                     elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b":
                         locs = locs.nonzero()[0]
                     elif not (
@@ -7486,9 +7491,13 @@ def replace(
                 if inplace:
                     return None
                 return self.copy(deep=False)
-
         if is_dict_like(to_replace):
             if is_dict_like(value):  # {'A' : NA} -> {'A' : 0}
+                if isinstance(self, ABCSeries):
+                    raise ValueError(
+                        "to_replace and value cannot be dict-like for "
+                        "Series.replace"
+                    )
                 # Note: Checking below for `in foo.keys()` instead of
                 # `in foo` is needed for when we have a Series and not dict
                 mapping = {
@@ -8447,8 +8456,8 @@ def asfreq(
         will map one-to-one to the new index).
 
         Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
-        freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
-        last entries in the original index (see :func:`pandas.date_range`). The
+        freq=freq)`` where ``start`` and ``end`` are, respectively, the min and
+        max entries in the original index (see :func:`pandas.date_range`). The
         values corresponding to any timesteps in the new index which were not
         present in the original index will be null (``NaN``), unless a method
         for filling such unknowns is provided (see the ``method`` parameter below).
@@ -8466,7 +8475,7 @@ does not fill NaNs that already were present):
 
         * 'pad' / 'ffill': propagate last valid observation forward to next
-          valid
+          valid based on the order of the index
         * 'backfill' / 'bfill': use NEXT valid observation to fill.
         how : {{'start', 'end'}}, default end
             For PeriodIndex only (see PeriodIndex.asfreq).
@@ -10570,7 +10579,7 @@ def tz_localize(
             a non-DST time (note that this flag is only applicable for
             ambiguous times)
           - 'NaT' will return NaT where there are ambiguous times
-          - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+          - 'raise' will raise a ValueError if there are ambiguous
             times.
         nonexistent : str, default 'raise'
             A nonexistent time does not exist in a particular timezone
@@ -10582,7 +10591,7 @@ closest existing time
           - 'NaT' will return NaT where there are nonexistent times
           - timedelta objects will shift nonexistent times by the timedelta
-          - 'raise' will raise an NonExistentTimeError if there are
+          - 'raise' will raise a ValueError if there are
             nonexistent times.
Returns @@ -11810,6 +11819,8 @@ def last_valid_index(self) -> Hashable: Returns ------- {name1} or scalar\ + + Value containing the calculation referenced in the description.\ {see_also}\ {examples} """ @@ -11836,14 +11847,44 @@ def last_valid_index(self) -> Hashable: where N represents the number of elements. numeric_only : bool, default False Include only float, int, boolean columns. Not implemented for Series. +**kwargs : + Additional keywords have no effect but might be accepted + for compatibility with NumPy. Returns ------- -{name1} or {name2} (if level specified) \ +{name1} or {name2} (if level specified) + {return_desc} + +See Also +-------- +{see_also}\ {notes}\ {examples} """ +_sem_see_also = """\ +scipy.stats.sem : Compute standard error of the mean. +{name2}.std : Return sample standard deviation over requested axis. +{name2}.var : Return unbiased variance over requested axis. +{name2}.mean : Return the mean of the values over the requested axis. +{name2}.median : Return the median of the values over the requested axis. +{name2}.mode : Return the mode(s) of the Series.""" + +_sem_return_desc = """\ +Unbiased standard error of the mean over requested axis.""" + +_std_see_also = """\ +numpy.std : Compute the standard deviation along the specified axis. +{name2}.var : Return unbiased variance over requested axis. +{name2}.sem : Return unbiased standard error of the mean over requested axis. +{name2}.mean : Return the mean of the values over the requested axis. +{name2}.median : Return the median of the values over the requested axis. +{name2}.mode : Return the mode(s) of the Series.""" + +_std_return_desc = """\ +Standard deviation over requested axis.""" + _std_notes = """ Notes @@ -12695,8 +12736,8 @@ def make_doc(name: str, ndim: int) -> str: "ddof argument." 
) examples = _std_examples - see_also = "" - kwargs = {"notes": _std_notes} + see_also = _std_see_also.format(name2=name2) + kwargs = {"notes": "", "return_desc": _std_return_desc} elif name == "sem": base_doc = _num_ddof_doc @@ -12740,8 +12781,8 @@ def make_doc(name: str, ndim: int) -> str: >>> df.sem(numeric_only=True) a 0.5 dtype: float64""" - see_also = "" - kwargs = {"notes": ""} + see_also = _sem_see_also.format(name2=name2) + kwargs = {"notes": "", "return_desc": _sem_return_desc} elif name == "skew": base_doc = _num_doc diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 49130d91a0126..90cd8e3ffa1c7 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -46,9 +46,8 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica # In cases with c.ordered, this is equivalent to # return c.remove_unused_categories(), c - unique_codes = unique1d(c.codes) # type: ignore[no-untyped-call] + take_codes = unique1d(c.codes[c.codes != -1]) # type: ignore[no-untyped-call] - take_codes = unique_codes[unique_codes != -1] if sort: take_codes = np.sort(take_codes) @@ -67,17 +66,18 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica # sort=False should order groups in as-encountered order (GH-8868) - # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories - all_codes = np.arange(c.categories.nunique()) + # GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories # GH 38140: exclude nan from indexer for categories unique_notnan_codes = unique1d(c.codes[c.codes != -1]) # type: ignore[no-untyped-call] if sort: unique_notnan_codes = np.sort(unique_notnan_codes) - if len(all_codes) > len(unique_notnan_codes): + if (num_cat := len(c.categories)) > len(unique_notnan_codes): # GH 13179: All categories need to be present, even if missing from the data - missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) + missing_codes = np.setdiff1d( + np.arange(num_cat), unique_notnan_codes, assume_unique=True + ) take_codes = np.concatenate((unique_notnan_codes, missing_codes)) else: take_codes = unique_notnan_codes - return Categorical(c, c.unique().categories.take(take_codes)) + return Categorical(c, c.categories.take(take_codes)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b288dad63179f..79fe78b7e5405 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -164,32 +164,6 @@ class providing the base-class of operations. to each row or column of a DataFrame. """ -_groupby_agg_method_template = """ -Compute {fname} of group values. - -Parameters ----------- -numeric_only : bool, default {no} - Include only float, int, boolean columns. - - .. versionchanged:: 2.0.0 - - numeric_only no longer accepts ``None``. - -min_count : int, default {mc} - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - -Returns -------- -Series or DataFrame - Computed {fname} of values within each group. - -Examples --------- -{example} -""" - _groupby_agg_method_engine_template = """ Compute {fname} of group values. @@ -440,9 +414,9 @@ class providing the base-class of operations. See Also -------- -{klass}.groupby.apply : Apply function func group-wise +{klass}GroupBy.apply : Apply function func group-wise and combine the results together. 
-{klass}.groupby.transform : Transforms the Series on each group +{klass}GroupBy.transform : Transforms the Series on each group based on the given function. {klass}.aggregate : Aggregate using one or more operations. @@ -3029,16 +3003,38 @@ def sum( return result @final - @doc( - _groupby_agg_method_template, - fname="prod", - no=False, - mc=0, - example=dedent( - """\ + def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + """ + Compute prod of group values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns + ------- + Series or DataFrame + Computed prod of values within each group. + + See Also + -------- + Series.prod : Return the product of the values over the requested axis. + DataFrame.prod : Return the product of the values over the requested axis. + + Examples + -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -3054,8 +3050,11 @@ def sum( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"], + ... ) >>> df a b c tiger 1 8 2 @@ -3066,10 +3065,8 @@ def sum( b c a 1 16 10 - 2 30 72""" - ), - ) - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + 2 30 72 + """ return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -4373,11 +4370,12 @@ def post_processor( return vals - qs = np.array(q, dtype=np.float64) - pass_qs: np.ndarray | None = qs if is_scalar(q): qs = np.array([q], dtype=np.float64) - pass_qs = None + pass_qs: None | np.ndarray = None + else: + qs = np.asarray(q, dtype=np.float64) + pass_qs = qs ids = self._grouper.ids ngroups = self._grouper.ngroups diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d39c337fbb4b2..582e1f96fa562 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2943,7 +2943,7 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index """ With mismatched timezones, cast both to UTC. """ - # Caller is responsibelf or checking + # Caller is responsible for checking # `self.dtype != other.dtype` if ( isinstance(self, ABCDatetimeIndex) @@ -4516,8 +4516,8 @@ def _join_multi(self, other: Index, how: JoinHow): from pandas.core.reshape.merge import restore_dropped_levels_multijoin # figure out join names - self_names_list = list(com.not_none(*self.names)) - other_names_list = list(com.not_none(*other.names)) + self_names_list = list(self.names) + other_names_list = list(other.names) self_names_order = self_names_list.index other_names_order = other_names_list.index self_names = set(self_names_list) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e1120466eaf83..8b316de30662c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -441,6 +441,10 @@ def as_unit(self, unit: str) -> Self: """ Convert to a dtype with the given unit resolution. 
+ This method is for converting the dtype of a ``DatetimeIndex`` or + ``TimedeltaIndex`` to a new dtype with the given unit + resolution/precision. + Parameters ---------- unit : {'s', 'ms', 'us', 'ns'} @@ -448,6 +452,14 @@ def as_unit(self, unit: str) -> Self: Returns ------- same type as self + Converted to the specified unit. + + See Also + -------- + Timestamp.as_unit : Convert to the given unit. + Timedelta.as_unit : Convert to the given unit. + DatetimeIndex.as_unit : Convert to the given unit. + TimedeltaIndex.as_unit : Convert to the given unit. Examples -------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 00a929724ed4c..3b3cda8f7cd33 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -6,7 +6,6 @@ import warnings import numpy as np -import pytz from pandas._libs import ( NaT, @@ -162,7 +161,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. + - 'raise' will raise a ValueError if there are ambiguous times. dayfirst : bool, default False If True, parse dates in `data` with the day first order. yearfirst : bool, default False @@ -264,7 +263,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: @doc(DatetimeArray.strftime) def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) - return Index(arr, name=self.name, dtype=object) + return Index(arr, name=self.name, dtype=arr.dtype) @doc(DatetimeArray.tz_convert) def tz_convert(self, tz) -> Self: @@ -591,7 +590,7 @@ def get_loc(self, key): elif isinstance(key, str): try: parsed, reso = self._parse_with_reso(key) - except (ValueError, pytz.NonExistentTimeError) as err: + except ValueError as err: raise KeyError(key) from err self._disallow_mismatched_indexing(parsed) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 48d5e59250f35..2eeacfb769be4 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -74,7 +74,7 @@ def fget(self): return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result def fset(self, value) -> None: @@ -101,7 +101,7 @@ def method(self, *args, **kwargs): # type: ignore[misc] return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result # error: "property" has no attribute "__name__" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0900121ab717f..9eccb7645fbe7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,7 +799,7 @@ def dtypes(self) -> Series: """ from pandas import Series - names = com.fill_missing_names([level.name for level in self.levels]) + names = com.fill_missing_names(self.names) return Series([level.dtype for level in self.levels], index=Index(names)) def __len__(self) -> int: @@ -1302,7 +1302,7 @@ def _view(self) -> MultiIndex: verify_integrity=False, ) result._cache = self._cache.copy() - result._cache.pop("levels", None) # GH32669 + result._reset_cache("levels") # GH32669 return result # 
-------------------------------------------------------------------- @@ -1384,7 +1384,7 @@ def copy( # type: ignore[override] verify_integrity=False, ) new_index._cache = self._cache.copy() - new_index._cache.pop("levels", None) # GH32669 + new_index._reset_cache("levels") # GH32669 if keep_id: new_index._id = self._id return new_index @@ -1572,7 +1572,7 @@ def _format_multi( def _get_names(self) -> FrozenList: return FrozenList(self._names) - def _set_names(self, names, *, level=None, validate: bool = True) -> None: + def _set_names(self, names, *, level=None) -> None: """ Set new names on index. Each name has to be a hashable type. @@ -1583,8 +1583,6 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: level : int, level name, or sequence of int/level names (default None) If the index is a MultiIndex (hierarchical), level(s) to set (None for all levels). Otherwise level must be None - validate : bool, default True - validate that the names match level lengths Raises ------ @@ -1603,13 +1601,12 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: raise ValueError("Names should be list-like for a MultiIndex") names = list(names) - if validate: - if level is not None and len(names) != len(level): - raise ValueError("Length of names must match length of level.") - if level is None and len(names) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." + ) if level is None: level = range(self.nlevels) @@ -1627,8 +1624,9 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: ) self._names[lev] = name - # If .levels has been accessed, the names in our cache will be stale. - self._reset_cache() + # If .levels has been accessed, the .name of each level in our cache + # will be stale. + self._reset_cache("levels") names = property( fset=_set_names, @@ -1636,6 +1634,17 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: doc=""" Names of levels in MultiIndex. + This attribute provides access to the names of the levels in a `MultiIndex`. + The names are stored as a `FrozenList`, which is an immutable list-like + container. Each name corresponds to a level in the `MultiIndex`, and can be + used to identify or manipulate the levels individually. + + See Also + -------- + MultiIndex.set_names : Set Index or MultiIndex name. + MultiIndex.rename : Rename specific levels in a MultiIndex. + Index.names : Get names on index. + Examples -------- >>> mi = pd.MultiIndex.from_arrays( @@ -1883,6 +1892,7 @@ def to_frame( Returns ------- DataFrame + DataFrame representation of the MultiIndex, with levels as columns. See Also -------- @@ -2592,6 +2602,13 @@ def reorder_levels(self, order) -> MultiIndex: """ Rearrange levels using input order. May not drop or duplicate levels. + `reorder_levels` is useful when you need to change the order of levels in + a MultiIndex, such as when reordering levels for hierarchical indexing. It + maintains the integrity of the MultiIndex, ensuring that all existing levels + are present and no levels are duplicated. This method is helpful for aligning + the index structure with other data structures or for optimizing the order + for specific data operations. 
+ Parameters ---------- order : list of int or list of str @@ -2601,6 +2618,13 @@ def reorder_levels(self, order) -> MultiIndex: Returns ------- MultiIndex + A new MultiIndex with levels rearranged according to the specified order. + + See Also + -------- + MultiIndex.swaplevel : Swap two levels of the MultiIndex. + MultiIndex.set_names : Set names for the MultiIndex levels. + DataFrame.reorder_levels : Reorder levels in a DataFrame with a MultiIndex. Examples -------- @@ -2660,9 +2684,9 @@ def _get_codes_for_sorting(self) -> list[Categorical]: a valid valid """ - def cats(level_codes): + def cats(level_codes: np.ndarray) -> np.ndarray: return np.arange( - np.array(level_codes).max() + 1 if len(level_codes) else 0, + level_codes.max() + 1 if len(level_codes) else 0, dtype=level_codes.dtype, ) @@ -2681,8 +2705,15 @@ def sortlevel( """ Sort MultiIndex at the requested level. - The result will respect the original ordering of the associated - factor at that level. + This method is useful when dealing with MultiIndex objects, allowing for + sorting at a specific level of the index. The function preserves the + relative ordering of data within the same level while sorting + the overall MultiIndex. The method provides flexibility with the `ascending` + parameter to define the sort order and with the `sort_remaining` parameter to + control whether the remaining levels should also be sorted. Sorting a + MultiIndex can be crucial when performing operations that require ordered + indices, such as grouping or merging datasets. The `na_position` argument is + important in handling missing values consistently across different levels. Parameters ---------- @@ -2692,7 +2723,9 @@ def sortlevel( ascending : bool, default True False to sort in descending order. Can also be a list to specify a directed ordering. - sort_remaining : sort by the remaining levels after level + sort_remaining : bool, default True + If True, sorts by the remaining levels after sorting by the specified + `level`. na_position : {'first' or 'last'}, default 'first' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. @@ -2706,6 +2739,13 @@ def sortlevel( indexer : np.ndarray[np.intp] Indices of output values in original index. + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Index.sort_values : Sort Index values. + DataFrame.sort_index : Sort DataFrame by the index. + Series.sort_index : Sort Series by the index. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index edd1fdd4da943..b5f05ef0ab78f 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -261,12 +261,19 @@ def from_fields( Parameters ---------- year : int, array, or Series, default None + Year for the PeriodIndex. quarter : int, array, or Series, default None + Quarter for the PeriodIndex. month : int, array, or Series, default None + Month for the PeriodIndex. day : int, array, or Series, default None + Day for the PeriodIndex. hour : int, array, or Series, default None + Hour for the PeriodIndex. minute : int, array, or Series, default None + Minute for the PeriodIndex. second : int, array, or Series, default None + Second for the PeriodIndex. freq : str or period object, optional One of pandas period strings or corresponding objects. 
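A short sketch of the ``sortlevel`` behaviour the rewritten docstring above describes, on an invented index (the returned indexer gives the positions of the sorted entries in the original index):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays(
        [["b", "a", "b"], [1, 2, 0]], names=["outer", "inner"]
    )

    # Sort on 'outer' only; entries that tie on 'outer' keep their
    # original relative order because sort_remaining=False.
    sorted_mi, indexer = mi.sortlevel(level="outer", sort_remaining=False)
    print(sorted_mi)  # ('a', 2), ('b', 1), ('b', 0)
    print(indexer)    # [1 0 2]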
@@ -274,6 +281,11 @@ def from_fields( ------- PeriodIndex + See Also + -------- + PeriodIndex.from_ordinals : Construct a PeriodIndex from ordinals. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) @@ -311,6 +323,12 @@ def from_ordinals(cls, ordinals, *, freq, name=None) -> Self: ------- PeriodIndex + See Also + -------- + PeriodIndex.from_fields : Construct a PeriodIndex from fields + (year, month, day, etc.). + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex.from_ordinals([-1, 0, 1], freq="Q") diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ce9e639656acb..b11ce6bd7b919 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -90,7 +90,9 @@ class RangeIndex(Index): start : int (default: 0), range, or other RangeIndex instance If int and "stop" is not given, interpreted as "stop" instead. stop : int (default: 0) + The end value of the range (exclusive). step : int (default: 1) + The step size of the range. dtype : np.int64 Unused, accepted for homogeneity with other index types. copy : bool, default False diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index debb5bdd4fc4b..08bd3cde60806 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2308,11 +2308,11 @@ def _ensure_iterable_column_indexer(self, column_indexer): """ Ensure that our column indexer is something that can be iterated over. """ - ilocs: Sequence[int | np.integer] | np.ndarray + ilocs: Sequence[int | np.integer] | np.ndarray | range if is_integer(column_indexer): ilocs = [column_indexer] elif isinstance(column_indexer, slice): - ilocs = np.arange(len(self.obj.columns))[column_indexer] + ilocs = range(len(self.obj.columns))[column_indexer] elif ( isinstance(column_indexer, np.ndarray) and column_indexer.dtype.kind == "b" ): diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 7f2647d64b190..0e5776ae8cdd9 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -60,6 +60,13 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: Returns ------- pd.DataFrame + A pandas DataFrame built from the provided interchange + protocol object. + + See Also + -------- + pd.DataFrame : DataFrame class which can be created from various input data + formats, including objects that support the interchange protocol. Examples -------- diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 149bef6258bfa..dced92ba04520 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -512,7 +512,11 @@ def convert(self) -> list[Block]: convert_non_numeric=True, ) refs = None - if res_values is values: + if ( + res_values is values + or isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) @@ -1474,7 +1478,7 @@ def round(self, decimals: int) -> Self: """ Rounds the values. If the block is not of an integer or float dtype, nothing happens. - This is consistent with DataFrame.round behavivor. + This is consistent with DataFrame.round behavior. 
         (Note: Series.round would raise)
 
         Parameters
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 535397871588c..07465e7b87fcd 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -621,7 +621,7 @@ def reorder_arrays(
     arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
 ) -> tuple[list[ArrayLike], Index]:
     """
-    Pre-emptively (cheaply) reindex arrays with new columns.
+    Preemptively (cheaply) reindex arrays with new columns.
     """
     # reorder according to the columns
     if columns is not None:
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index c42ea44b2fc89..aa4a785519051 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1362,7 +1362,7 @@ def _iset_split_block(
         """Removes columns from a block by splitting the block.
 
         Avoids copying the whole block through slicing and updates the manager
-        after determinint the new block structure. Optionally adds a new block,
+        after determining the new block structure. Optionally adds a new block,
         otherwise has to be done by the caller.
 
         Parameters
@@ -2154,7 +2154,7 @@ def setitem_inplace(self, indexer, value) -> None:
         """
         if not self._has_no_reference(0):
             self.blocks = (self._block.copy(),)
-            self._cache.clear()
+            self._reset_cache()
 
         arr = self.array
 
@@ -2179,7 +2179,7 @@ def idelete(self, indexer) -> SingleBlockManager:
         nb = self._block.delete(indexer)[0]
         self.blocks = (nb,)
         self.axes[0] = self.axes[0].delete(indexer)
-        self._cache.clear()
+        self._reset_cache()
         return self
 
     def fast_xs(self, loc):
diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py
index 0ad6db0aefe9c..5ce0a2da86f31 100644
--- a/pandas/core/ops/docstrings.py
+++ b/pandas/core/ops/docstrings.py
@@ -376,7 +376,7 @@ def make_flex_doc(op_name: str, typ: str) -> str:
     "ne": {
         "op": "!=",
         "desc": "Not equal to",
-        "reverse": None,
+        "reverse": "eq",
         "series_examples": _ne_example_SERIES,
         "series_returns": _returns_series,
     },
@@ -397,14 +397,14 @@ def make_flex_doc(op_name: str, typ: str) -> str:
     "gt": {
         "op": ">",
         "desc": "Greater than",
-        "reverse": None,
+        "reverse": "lt",
         "series_examples": _gt_example_SERIES,
         "series_returns": _returns_series,
     },
     "ge": {
         "op": ">=",
         "desc": "Greater than or equal to",
-        "reverse": None,
+        "reverse": "le",
         "series_examples": _ge_example_SERIES,
         "series_returns": _returns_series,
     },
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 8ee71ea2293e6..b621fcf9a6415 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -2466,7 +2466,7 @@ def _get_timestamp_range_edges(
         )
         if isinstance(freq, Day):
             first = first.tz_localize(index_tz)
-            last = last.tz_localize(index_tz)
+            last = last.tz_localize(index_tz, nonexistent="shift_forward")
     else:
         first = first.normalize()
         last = last.normalize()
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index c005a1ce26e4b..cfe83111b6e38 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -201,7 +201,7 @@ def concat(
         be very expensive relative to the actual data concatenation.
     sort : bool, default False
         Sort non-concatenation axis. One exception to this is when the
-        non-concatentation axis is a DatetimeIndex and join='outer' and the axis is
+        non-concatenation axis is a DatetimeIndex and join='outer' and the axis is
         not already aligned. In that case, the non-concatenation axis is always
         sorted lexicographically.
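For context on the resample.py hunk above: on the day a DST transition removes an hour, a naively localized range edge can land on a nonexistent wall-clock time, and ``nonexistent="shift_forward"`` moves it to the first valid instant instead of raising. A small illustration (time zone and timestamp chosen arbitrarily):

>>> import pandas as pd
>>> ts = pd.Timestamp("2021-03-28 02:30")  # 02:00-03:00 is skipped in Berlin
>>> ts.tz_localize("Europe/Berlin", nonexistent="shift_forward")
Timestamp('2021-03-28 03:00:00+0200', tz='Europe/Berlin')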
     copy : bool, default False
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 6364072fd215c..07e8fa4841c04 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -982,6 +982,14 @@ def __init__(
             )
             raise MergeError(msg)
 
+        # GH 59435: raise when "how" is not a valid merge type
+        merge_type = {"left", "right", "inner", "outer", "cross", "asof"}
+        if how not in merge_type:
+            raise ValueError(
+                f"'{how}' is not a valid merge type: "
+                "left, right, inner, outer, cross, asof"
+            )
+
         self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)
 
         (
diff --git a/pandas/core/series.py b/pandas/core/series.py
index a197886748bce..4f79e30f48f3c 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -34,6 +34,7 @@
 from pandas._libs.lib import is_range_indexer
 from pandas.compat import PYPY
 from pandas.compat._constants import REF_COUNT
+from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
 from pandas.errors import (
     ChainedAssignmentError,
@@ -558,6 +559,32 @@ def _init_dict(
 
     # ----------------------------------------------------------------------
 
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas Series as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas Series to the Arrow
+        format (and follows the default behaviour of ``pyarrow.Array.from_pandas``
+        in its handling of the index, i.e. to ignore it).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the Series should be cast, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="16.0.0")
+        ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)])
+        return ca.__arrow_c_stream__(requested_schema)
+
+    # ----------------------------------------------------------------------
+
     @property
     def _constructor(self) -> type[Series]:
         return Series
@@ -1617,6 +1644,11 @@ def to_markdown(
         str
             {klass} in Markdown-friendly format.
 
+        See Also
+        --------
+        Series.to_frame : Convert Series to DataFrame.
+        Series.to_latex : Render Series to LaTeX-formatted table.
+
         Notes
         -----
         Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
@@ -2619,6 +2651,13 @@ def corr(
         >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0])
         >>> s1.corr(s2)
         -1.0
+
+        If the input is a constant array, the correlation is not defined,
+        and ``np.nan`` is returned.
+
+        >>> s1 = pd.Series([0.45, 0.45])
+        >>> s1.corr(s1)
+        nan
         """  # noqa: E501
         this, other = self.align(other, join="inner")
         if len(this) == 0:
@@ -3211,6 +3250,13 @@ def update(self, other: Series | Sequence | Mapping) -> None:
         Parameters
         ----------
         other : Series, or object coercible into Series
+            Other Series that provides values to update the current Series.
+
+        See Also
+        --------
+        Series.combine : Perform element-wise operation on two Series
+            using a given function.
+        Series.transform : Modify a Series using a function.
 
         Examples
         --------
@@ -4086,7 +4132,13 @@ def reorder_levels(self, order: Sequence[Level]) -> Series:
 
         Returns
         -------
-        type of caller (new object)
+        Series
+            Type of caller with index as MultiIndex (new object).
+
+        See Also
+        --------
+        DataFrame.reorder_levels : Rearrange index or column levels using
+            input ``order``.
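With the merge.py guard above in place, a misspelled ``how`` should fail fast with the message built in that hunk rather than a confusing downstream error. A sketch of the intended behaviour (frames invented for illustration):

>>> import pandas as pd
>>> left = pd.DataFrame({"key": [1, 2], "a": [10, 20]})
>>> right = pd.DataFrame({"key": [1, 2], "b": [30, 40]})
>>> left.merge(right, on="key", how="full")  # typo for "outer"
Traceback (most recent call last):
    ...
ValueError: 'full' is not a valid merge type: left, right, inner, outer, cross, asof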
 
         Examples
         --------
@@ -5041,6 +5093,11 @@ def pop(self, item: Hashable) -> Any:
         scalar
             Value that is popped from series.
 
+        See Also
+        --------
+        Series.drop : Drop specified values from Series.
+        Series.drop_duplicates : Return Series with duplicate values removed.
+
         Examples
         --------
         >>> ser = pd.Series([1, 2, 3])
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index 5725b96f66cd4..cb0c3d241534c 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -649,133 +649,3 @@
 3   3  d     e
 4   4  e     e
 """
-
-_shared_docs["idxmin"] = """
-    Return index of first occurrence of minimum over requested axis.
-
-    NA/null values are excluded.
-
-    Parameters
-    ----------
-    axis : {{0 or 'index', 1 or 'columns'}}, default 0
-        The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
-    skipna : bool, default True
-        Exclude NA/null values. If the entire Series is NA, or if ``skipna=False``
-        and there is an NA value, this method will raise a ``ValueError``.
-    numeric_only : bool, default {numeric_only_default}
-        Include only `float`, `int` or `boolean` data.
-
-        .. versionadded:: 1.5.0
-
-    Returns
-    -------
-    Series
-        Indexes of minima along the specified axis.
-
-    Raises
-    ------
-    ValueError
-        * If the row/column is empty
-
-    See Also
-    --------
-    Series.idxmin : Return index of the minimum element.
-
-    Notes
-    -----
-    This method is the DataFrame version of ``ndarray.argmin``.
-
-    Examples
-    --------
-    Consider a dataset containing food consumption in Argentina.
-
-    >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48],
-    ...                     'co2_emissions': [37.2, 19.66, 1712]}},
-    ...                   index=['Pork', 'Wheat Products', 'Beef'])
-
-    >>> df
-                    consumption  co2_emissions
-    Pork                  10.51          37.20
-    Wheat Products       103.11          19.66
-    Beef                  55.48        1712.00
-
-    By default, it returns the index for the minimum value in each column.
-
-    >>> df.idxmin()
-    consumption                Pork
-    co2_emissions    Wheat Products
-    dtype: object
-
-    To return the index for the minimum value in each row, use ``axis="columns"``.
-
-    >>> df.idxmin(axis="columns")
-    Pork                consumption
-    Wheat Products    co2_emissions
-    Beef                consumption
-    dtype: object
-"""
-
-_shared_docs["idxmax"] = """
-    Return index of first occurrence of maximum over requested axis.
-
-    NA/null values are excluded.
-
-    Parameters
-    ----------
-    axis : {{0 or 'index', 1 or 'columns'}}, default 0
-        The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
-    skipna : bool, default True
-        Exclude NA/null values. If the entire Series is NA, or if ``skipna=False``
-        and there is an NA value, this method will raise a ``ValueError``.
-    numeric_only : bool, default {numeric_only_default}
-        Include only `float`, `int` or `boolean` data.
-
-        .. versionadded:: 1.5.0
-
-    Returns
-    -------
-    Series
-        Indexes of maxima along the specified axis.
-
-    Raises
-    ------
-    ValueError
-        * If the row/column is empty
-
-    See Also
-    --------
-    Series.idxmax : Return index of the maximum element.
-
-    Notes
-    -----
-    This method is the DataFrame version of ``ndarray.argmax``.
-
-    Examples
-    --------
-    Consider a dataset containing food consumption in Argentina.
-
-    >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48],
-    ...                     'co2_emissions': [37.2, 19.66, 1712]}},
-    ...                   index=['Pork', 'Wheat Products', 'Beef'])
-
-    >>> df
-                    consumption  co2_emissions
-    Pork                  10.51          37.20
-    Wheat Products       103.11          19.66
-    Beef                  55.48        1712.00
-
-    By default, it returns the index for the maximum value in each column.
-
-    >>> df.idxmax()
-    consumption     Wheat Products
-    co2_emissions             Beef
-    dtype: object
-
-    To return the index for the maximum value in each row, use ``axis="columns"``.
-
-    >>> df.idxmax(axis="columns")
-    Pork              co2_emissions
-    Wheat Products      consumption
-    Beef              co2_emissions
-    dtype: object
-"""
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 25fdafa9b8354..bdb88e981bcda 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -969,6 +969,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False):
         Returns
         -------
         DataFrame/MultiIndex or Series/Index of objects
+            Returns appropriate type based on `expand` parameter with strings
+            split based on the `pat` parameter.
 
         See Also
         --------
@@ -1377,6 +1379,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
         Returns
         -------
         Series/Index/array of boolean values
+            A Series, Index, or array of boolean values indicating whether the start
+            of each string matches the pattern. The result will be of the same type
+            as the input.
 
         See Also
         --------
@@ -1501,6 +1506,14 @@ def replace(
             * if `pat` is a compiled regex and `case` or `flags` is set
             * if `pat` is a dictionary and `repl` is not None.
 
+        See Also
+        --------
+        Series.str.replace : Method to replace occurrences of a substring with another
+            substring.
+        Series.str.extract : Extract substrings using a regular expression.
+        Series.str.findall : Find all occurrences of a pattern or regex in each string.
+        Series.str.split : Split each string by a specified delimiter or pattern.
+
         Notes
         -----
         When `pat` is a compiled regex, all flags should be included in the
@@ -1632,6 +1645,20 @@ def repeat(self, repeats):
         Series or Index of repeated string objects specified by input
         parameter repeats.
 
+        See Also
+        --------
+        Series.str.lower : Convert all characters in each string to lowercase.
+        Series.str.upper : Convert all characters in each string to uppercase.
+        Series.str.title : Convert each string to title case (capitalizing the first
+            letter of each word).
+        Series.str.strip : Remove leading and trailing whitespace from each string.
+        Series.str.replace : Replace occurrences of a substring with another substring
+            in each string.
+        Series.str.ljust : Left-justify each string in the Series/Index by padding with
+            a specified character.
+        Series.str.rjust : Right-justify each string in the Series/Index by padding with
+            a specified character.
+
         Examples
         --------
         >>> s = pd.Series(["a", "b", "c"])
@@ -1749,6 +1776,18 @@ def pad(
         Returns
         -------
         Series/Index of objects.
+            A Series or Index where the strings are modified by :meth:`str.%(method)s`.
+
+        See Also
+        --------
+        Series.str.rjust : Fills the left side of strings with an arbitrary
+            character.
+        Series.str.ljust : Fills the right side of strings with an arbitrary
+            character.
+        Series.str.center : Fills both sides of strings with an arbitrary
+            character.
+        Series.str.zfill : Pad strings in the Series/Index by prepending '0'
+            character.
 
         Examples
         --------
@@ -1814,6 +1853,7 @@ def zfill(self, width: int):
         Returns
         -------
         Series/Index of objects.
+            A Series or Index where the strings are prepended with '0' characters.
 
         See Also
         --------
@@ -2024,11 +2064,19 @@ def decode(self, encoding, errors: str = "strict"):
         Parameters
         ----------
         encoding : str
+            Specifies the encoding to be used.
         errors : str, optional
+            Specifies the error handling scheme.
+            Possible values are those supported by :meth:`bytes.decode`.
         Returns
         -------
         Series or Index
+            A Series or Index with decoded strings.
+
+        See Also
+        --------
+        Series.str.encode : Encodes strings into bytes in a Series/Index.
 
         Examples
         --------
@@ -2063,11 +2111,19 @@ def encode(self, encoding, errors: str = "strict"):
         Parameters
         ----------
         encoding : str
+            Specifies the encoding to be used.
         errors : str, optional
+            Specifies the error handling scheme.
+            Possible values are those supported by :meth:`str.encode`.
 
         Returns
         -------
         Series/Index of objects
+            A Series or Index with strings encoded into bytes.
+
+        See Also
+        --------
+        Series.str.decode : Decodes bytes into strings in a Series/Index.
 
         Examples
         --------
@@ -2099,6 +2155,7 @@ def encode(self, encoding, errors: str = "strict"):
         Returns
         -------
         Series or Index of object
+            Series or Index with the strings being stripped from the %(side)s.
 
         See Also
         --------
@@ -2329,6 +2386,13 @@ def wrap(
         Returns
         -------
         Series or Index
+            A Series or Index where the strings are wrapped at the specified line width.
+
+        See Also
+        --------
+        Series.str.strip : Remove leading and trailing characters in Series/Index.
+        Series.str.lstrip : Remove leading characters in Series/Index.
+        Series.str.rstrip : Remove trailing characters in Series/Index.
 
         Notes
         -----
@@ -3060,6 +3124,19 @@ def normalize(self, form):
         Returns
         -------
         Series/Index of objects
+            A Series or Index of strings in the same Unicode form specified by `form`.
+            The returned object retains the same type as the input (Series or Index),
+            and contains the normalized strings.
+
+        See Also
+        --------
+        Series.str.upper : Convert all characters in each string to uppercase.
+        Series.str.lower : Convert all characters in each string to lowercase.
+        Series.str.title : Convert each string to title case (capitalizing the
+            first letter of each word).
+        Series.str.strip : Remove leading and trailing whitespace from each string.
+        Series.str.replace : Replace occurrences of a substring with another substring
+            in each string.
 
         Examples
         --------
@@ -3209,7 +3286,8 @@ def len(self):
 
         Returns
         -------
-        Series or Index of object
+        Series or Index of objects
+            A Series or Index where the strings are modified by :meth:`str.%(method)s`.
 
         See Also
         --------
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 290a28ab60ae1..c6b18d7049c57 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -9,12 +9,14 @@
     cast,
 )
 import unicodedata
+import warnings
 
 import numpy as np
 
 from pandas._libs import lib
 import pandas._libs.missing as libmissing
 import pandas._libs.ops as libops
+from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.missing import isna
 
@@ -37,8 +39,6 @@ class ObjectStringArrayMixin(BaseStringArrayMethods):
     String Methods operating on object-dtype ndarrays.
     """
 
-    _str_na_value = np.nan
-
     def __len__(self) -> int:
         # For typing, _str_map relies on the object being sized.
         raise NotImplementedError
@@ -56,7 +56,7 @@ def _str_map(
         na_value : Scalar, optional
             The value to set for NA values. Might also be used for the fill
             value if the callable `f` raises an exception.
-            This defaults to ``self._str_na_value`` which is ``np.nan``
+            This defaults to ``self.dtype.na_value`` which is ``np.nan``
            for object-dtype and Categorical and ``pd.NA`` for StringArray.
        dtype : Dtype, optional
            The dtype of the result array.
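The decode/encode docstring additions above describe the two halves of a round trip; assuming a lossless codec such as UTF-8, roughly:

>>> import pandas as pd
>>> s = pd.Series(["naïve", "café"])
>>> s.str.encode("utf-8").str.decode("utf-8").equals(s)
True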
@@ -66,7 +66,7 @@ def _str_map(
         if dtype is None:
             dtype = np.dtype("object")
         if na_value is None:
-            na_value = self._str_na_value
+            na_value = self.dtype.na_value  # type: ignore[attr-defined]
 
         if not len(self):
             return np.array([], dtype=dtype)
@@ -144,14 +144,38 @@ def _str_contains(
         else:
             upper_pat = pat.upper()
             f = lambda x: upper_pat in x.upper()
+        if not isna(na) and not isinstance(na, bool):
+            # GH#59561
+            warnings.warn(
+                "Allowing a non-bool 'na' in obj.str.contains is deprecated "
+                "and will raise in a future version.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
         return self._str_map(f, na, dtype=np.dtype("bool"))
 
     def _str_startswith(self, pat, na=None):
         f = lambda x: x.startswith(pat)
+        if not isna(na) and not isinstance(na, bool):
+            # GH#59561
+            warnings.warn(
+                "Allowing a non-bool 'na' in obj.str.startswith is deprecated "
+                "and will raise in a future version.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
         return self._str_map(f, na_value=na, dtype=np.dtype(bool))
 
     def _str_endswith(self, pat, na=None):
         f = lambda x: x.endswith(pat)
+        if not isna(na) and not isinstance(na, bool):
+            # GH#59561
+            warnings.warn(
+                "Allowing a non-bool 'na' in obj.str.endswith is deprecated "
+                "and will raise in a future version.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
         return self._str_map(f, na_value=na, dtype=np.dtype(bool))
 
     def _str_replace(
@@ -272,7 +296,7 @@ def f(x):
                     return x.get(i)
                 elif len(x) > i >= -len(x):
                     return x[i]
-                return self._str_na_value
+                return self.dtype.na_value  # type: ignore[attr-defined]
 
         return self._str_map(f)
 
@@ -466,7 +490,7 @@ def _str_removesuffix(self, suffix: str):
 
     def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
         regex = re.compile(pat, flags=flags)
-        na_value = self._str_na_value
+        na_value = self.dtype.na_value  # type: ignore[attr-defined]
 
         if not expand:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 0e91bfa99e887..86c7316320f44 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -418,7 +418,7 @@ def _convert_listlike_datetimes(
         arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
     except TypeError:
         if errors == "coerce":
-            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
+            npvalues = np.full(len(arg), np.datetime64("NaT", "ns"))
             return DatetimeIndex(npvalues, name=name)
         raise
 
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 26e73794af298..982851d0557c3 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -99,8 +99,8 @@ def to_numeric(
         is to not use nullable data types. If specified, the behavior
         is as follows:
 
-        * ``"numpy_nullable"``: returns with nullable-dtype-backed
-        * ``"pyarrow"``: returns with pyarrow-backed nullable :class:`ArrowDtype`
+        * ``"numpy_nullable"``: returns nullable-dtype-backed object
+        * ``"pyarrow"``: returns pyarrow-backed nullable object
 
         .. versionadded:: 2.0
 
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index d0c8a2e67b6ca..4bf77b3d38689 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -174,6 +174,8 @@ def aggregate(self, func, *args, **kwargs):
     @doc(
         template_header,
+        create_section_header("Parameters"),
+        kwargs_numeric_only,
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
@@ -865,6 +867,9 @@ def cov(
             output will be a MultiIndexed DataFrame in the case of DataFrame inputs.
         In the case of missing elements, only complete pairwise observations
             will be used.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.\n
         """
         ).replace("\n", "", 1),
         kwargs_numeric_only,
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 16aa6d7e56a1c..9ea825ad4e44d 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -1186,7 +1186,7 @@ def homogeneous_func(values: np.ndarray):
                 return values.copy()
 
             def calc(x):
-                additional_nans = np.array([np.nan] * offset)
+                additional_nans = np.full(offset, np.nan)
                 x = np.concatenate((x, additional_nans))
                 return func(
                     x,
@@ -1350,6 +1350,13 @@ def mean(self, numeric_only: bool = False, **kwargs):
     @doc(
         template_header,
         create_section_header("Parameters"),
+        dedent(
+            """
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.
+        """
+        ).replace("\n", "", 1),
         kwargs_numeric_only,
         kwargs_scipy,
         create_section_header("Returns"),
@@ -1392,6 +1399,13 @@ def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs):
     @doc(
         template_header,
         create_section_header("Parameters"),
+        dedent(
+            """
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.
+        """
+        ).replace("\n", "", 1),
         kwargs_numeric_only,
         kwargs_scipy,
         create_section_header("Returns"),
@@ -2099,7 +2113,19 @@ def sum(
         template_header,
         create_section_header("Parameters"),
         kwargs_numeric_only,
+        dedent(
+            """
+        *args : iterable, optional
+            Positional arguments passed into ``func``.\n
+        """
+        ).replace("\n", "", 1),
         window_agg_numba_parameters(),
+        dedent(
+            """
+        **kwargs : mapping, optional
+            A dictionary of keyword arguments passed into ``func``.\n
+        """
+        ).replace("\n", "", 1),
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
index c8863e1b39c94..2f625090e0492 100644
--- a/pandas/errors/__init__.py
+++ b/pandas/errors/__init__.py
@@ -285,6 +285,30 @@ class AbstractMethodError(NotImplementedError):
     """
     Raise this error instead of NotImplementedError for abstract methods.
 
+    The `AbstractMethodError` is designed for use in classes that follow an abstract
+    base class pattern. Raising this error in a method ensures that a subclass
+    must implement it to provide specific functionality. This is useful in a
+    framework or library where certain methods must be implemented by the user to
+    ensure correct behavior.
+
+    Parameters
+    ----------
+    class_instance : object
+        The instance of the class where the abstract method is being called.
+    methodtype : str, default "method"
+        A string indicating the type of method that is abstract.
+        Must be one of {"method", "classmethod", "staticmethod", "property"}.
+
+    See Also
+    --------
+    api.extensions.ExtensionArray
+        An example of a pandas extension mechanism that requires implementing
+        specific abstract methods.
+    NotImplementedError
+        A built-in exception that can also be used for abstract methods but lacks
+        the specificity of `AbstractMethodError` in indicating the need for subclass
+        implementation.
+
     Examples
     --------
     >>> class Foo:
@@ -378,7 +402,7 @@ class InvalidIndexError(Exception):
 
 class DataError(Exception):
     """
-    Exceptionn raised when performing an operation on non-numerical data.
+    Exception raised when performing an operation on non-numerical data.
 
     For example, calling ``ohlc`` on a non-numerical column or a function
     on a rolling window.
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index f502f827faa4e..a1c3318f04466 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -27,6 +27,7 @@ def _arrow_dtype_mapping() -> dict:
         pa.string(): pd.StringDtype(),
         pa.float32(): pd.Float32Dtype(),
         pa.float64(): pd.Float64Dtype(),
+        pa.large_string(): pd.StringDtype(),
     }
 
 
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
index 5a0a8c321e629..2ed241f0b9bca 100644
--- a/pandas/io/clipboards.py
+++ b/pandas/io/clipboards.py
@@ -38,14 +38,15 @@ def read_clipboard(
         A string or regex delimiter. The default of ``'\\s+'`` denotes
         one or more whitespace characters.
 
-    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
         Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). Behaviour is as follows:
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
 
         * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-          (default).
-        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-          DataFrame.
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
 
         .. versionadded:: 2.0
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index f83f9cb1c8d74..ef52107c283e9 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -267,14 +267,15 @@
     Rows at the end to skip (0-indexed).
 {storage_options}
 
-dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+dtype_backend : {{'numpy_nullable', 'pyarrow'}}
     Back-end data type applied to the resultant :class:`DataFrame`
-    (still experimental). Behaviour is as follows:
+    (still experimental). If not specified, the default behavior
+    is to not use nullable data types. If specified, the behavior
+    is as follows:
 
     * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-      (default).
-    * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-      DataFrame.
+    * ``"pyarrow"``: returns pyarrow-backed nullable
+      :class:`ArrowDtype` :class:`DataFrame`
 
     .. versionadded:: 2.0
 
@@ -1728,14 +1729,15 @@ def parse(
             comment string and the end of the current line is ignored.
         skipfooter : int, default 0
             Rows at the end to skip (0-indexed).
-        dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+        dtype_backend : {{'numpy_nullable', 'pyarrow'}}
             Back-end data type applied to the resultant :class:`DataFrame`
-            (still experimental). Behaviour is as follows:
+            (still experimental). If not specified, the default behavior
+            is to not use nullable data types. If specified, the behavior
+            is as follows:
 
             * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-              (default).
-            * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-              DataFrame.
+            * ``"pyarrow"``: returns pyarrow-backed nullable
+              :class:`ArrowDtype` :class:`DataFrame`
 
             .. versionadded:: 2.0
         **kwds : dict, optional
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 3df3e77a851a3..aaae9857b4fae 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -92,14 +92,15 @@ def read_feather(
         Whether to parallelize reading using multiple threads.
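The dtype_backend rewordings above all describe the same contract: omit the keyword to keep the default NumPy dtypes, or pass ``"numpy_nullable"`` (or ``"pyarrow"``) to opt in to nullable dtypes. Illustrated here with ``read_csv``, which shares the keyword; the feather, excel, and clipboard readers behave analogously:

>>> import pandas as pd
>>> from io import StringIO
>>> pd.read_csv(StringIO("a\n1\n2")).dtypes
a    int64
dtype: object
>>> pd.read_csv(StringIO("a\n1\n2"), dtype_backend="numpy_nullable").dtypes
a    Int64
dtype: object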
     {storage_options}
 
-    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}
         Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). Behaviour is as follows:
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
 
-        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-          (default).
-        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-          DataFrame.
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`.
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
 
         .. versionadded:: 2.0
 
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 50503e862ef43..75bcb51ef4be2 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -320,7 +320,11 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         res = df._get_values_for_csv(**self._number_format)
         data = list(res._iter_column_arrays())
 
-        ix = self.data_index[slicer]._get_values_for_csv(**self._number_format)
+        ix = (
+            self.data_index[slicer]._get_values_for_csv(**self._number_format)
+            if self.nlevels != 0
+            else np.empty(end_i - start_i)
+        )
         libwriters.write_csv_rows(
             data,
             ix,
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 6f4c2fa6c6eae..6e5ae09485951 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -7,6 +7,7 @@
 import copy
 from functools import partial
 import operator
+import textwrap
 from typing import (
     TYPE_CHECKING,
     overload,
@@ -306,6 +307,12 @@ def concat(self, other: Styler) -> Styler:
         Returns
         -------
         Styler
+            Instance of class with specified Styler appended.
+
+        See Also
+        --------
+        Styler.clear : Reset the ``Styler``, removing any previously applied styles.
+        Styler.export : Export the styles applied to the current Styler.
 
         Notes
         -----
@@ -335,7 +342,7 @@ def concat(self, other: Styler) -> Styler:
         keys ``data``, ``row_heading`` and ``row`` will be prepended with
         ``foot0_``. If more concats are chained, their styles will be prepended
         with ``foot1_``, ''foot_2'', etc., and if a concatenated style have
-        another concatanated style, the second style will be prepended with
+        another concatenated style, the second style will be prepended with
         ``foot{parent}_foot{child}_``.
 
         A common use case is to concatenate user defined functions with
@@ -447,6 +454,15 @@ def set_tooltips(
         Returns
         -------
         Styler
+            Instance of class with DataFrame set for strings on ``Styler``
+            generating ``:hover`` tooltips.
+
+        See Also
+        --------
+        Styler.set_table_attributes : Set the table attributes added to the
+            ``<table>`` HTML element.
+        Styler.set_table_styles : Set the table styles included within the ``