diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5a802c678e20..a62942c7cd948 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,7 @@ on: branches: - master - 1.2.x + - 1.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index b15889351386a..d2aa76a3e6110 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -7,6 +7,9 @@ on: branches: - master - 1.2.x + - 1.3.x + paths-ignore: + - "doc/**" env: PYTEST_WORKERS: "auto" diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index 3a4d3c106f851..fa5cf8ead57bd 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -7,6 +7,9 @@ on: branches: - master - 1.2.x + - 1.3.x + paths-ignore: + - "doc/**" env: PYTEST_WORKERS: "auto" diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 2643dc5ec656e..38b1aa9ae7047 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -7,6 +7,8 @@ on: pull_request: branches: - master + paths-ignore: + - "doc/**" jobs: build: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3078619ecac35..d580fcf4fc545 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: - id: absolufy-imports files: ^pandas/ - repo: https://github.com/python/black - rev: 20.8b1 + rev: 21.5b2 hooks: - id: black - repo: https://github.com/codespell-project/codespell diff --git a/MANIFEST.in b/MANIFEST.in index d0d93f2cdba8c..f616fad6b1557 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,18 +17,19 @@ global-exclude *.h5 global-exclude *.html global-exclude *.json global-exclude *.jsonl +global-exclude *.msgpack global-exclude *.pdf global-exclude *.pickle global-exclude *.png global-exclude *.pptx -global-exclude *.pyc -global-exclude *.pyd global-exclude *.ods global-exclude *.odt +global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so global-exclude *.xls +global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt @@ -39,6 +40,13 @@ global-exclude .DS_Store global-exclude .git* global-exclude \#* +global-exclude *.c +global-exclude *.cpp +global-exclude *.h + +global-exclude *.py[ocd] +global-exclude *.pxi + # GH 39321 # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist @@ -47,3 +55,6 @@ prune pandas/tests/io/parser/data include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl + +graft pandas/_libs/src +graft pandas/_libs/tslibs/src diff --git a/README.md b/README.md index d928195bf2a10..04b346c198e90 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,13 @@ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) [![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/master/LICENSE) -[![Travis Build Status](https://travis-ci.org/pandas-dev/pandas.svg?branch=master)](https://travis-ci.org/pandas-dev/pandas) [![Azure Build Status](https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master)](https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master) 
[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=master)](https://codecov.io/gh/pandas-dev/pandas) [![Downloads](https://anaconda.org/conda-forge/pandas/badges/downloads.svg)](https://pandas.pydata.org) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) ## What is it? @@ -101,8 +101,8 @@ pip install pandas ## Dependencies - [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org) -- [python-dateutil - Provides powerful extensions to the standard datetime module](https://labix.org/python-dateutil) -- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://pythonhosted.org/pytz) +- [python-dateutil - Provides powerful extensions to the standard datetime module](https://dateutil.readthedocs.io/en/stable/index.html) +- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://github.com/stub42/pytz) See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies. @@ -121,7 +121,7 @@ cloning the git repo), execute: python setup.py install ``` -or for installing in [development mode](https://pip.pypa.io/en/latest/reference/pip_install.html#editable-installs): +or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): ```sh diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index aecc609df574e..e48a2060a3b34 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -23,12 +23,12 @@ class Factorize: "int", "uint", "float", - "string", + "object", "datetime64[ns]", "datetime64[ns, tz]", "Int64", "boolean", - "string_arrow", + "string[pyarrow]", ], ] param_names = ["unique", "sort", "dtype"] @@ -36,28 +36,25 @@ class Factorize: def setup(self, unique, sort, dtype): N = 10 ** 5 string_index = tm.makeStringIndex(N) - try: - from pandas.core.arrays.string_arrow import ArrowStringDtype - - string_arrow = pd.array(string_index, dtype=ArrowStringDtype()) - except ImportError: - string_arrow = None - - if dtype == "string_arrow" and not string_arrow: - raise NotImplementedError + string_arrow = None + if dtype == "string[pyarrow]": + try: + string_arrow = pd.array(string_index, dtype="string[pyarrow]") + except ImportError: + raise NotImplementedError data = { "int": pd.Int64Index(np.arange(N)), "uint": pd.UInt64Index(np.arange(N)), "float": pd.Float64Index(np.random.randn(N)), - "string": string_index, + "object": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), "Int64": pd.array(np.arange(N), dtype="Int64"), "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string_arrow": string_arrow, + "string[pyarrow]": string_arrow, 
}[dtype] if not unique: data = data.repeat(5) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 44245295beafc..296101c9f9800 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -25,8 +25,8 @@ class IsIn: "category[object]", "category[int]", "str", - "string", - "arrow_string", + "string[python]", + "string[pyarrow]", ] param_names = ["dtype"] @@ -50,8 +50,6 @@ def setup(self, dtype): elif dtype in ["category[object]", "category[int]"]: # Note: sizes are different in this case than others - np.random.seed(1234) - n = 5 * 10 ** 5 sample_size = 100 @@ -62,9 +60,7 @@ def setup(self, dtype): self.values = np.random.choice(arr, sample_size) self.series = Series(arr).astype("category") - elif dtype in ["str", "string", "arrow_string"]: - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - + elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: self.series = Series(tm.makeStringIndex(N), dtype=dtype) except ImportError: @@ -101,7 +97,6 @@ class IsinAlmostFullWithRandomInt: def setup(self, dtype, exponent, title): M = 3 * 2 ** (exponent - 2) # 0.77-the maximal share of occupied buckets - np.random.seed(42) self.series = Series(np.random.randint(0, M, M)).astype(dtype) values = np.random.randint(0, M, M).astype(dtype) @@ -134,7 +129,6 @@ class IsinWithRandomFloat: param_names = ["dtype", "size", "title"] def setup(self, dtype, size, title): - np.random.seed(42) self.values = np.random.rand(size) self.series = Series(self.values).astype(dtype) np.random.shuffle(self.values) @@ -181,7 +175,6 @@ class IsinWithArange: def setup(self, dtype, M, offset_factor): offset = int(M * offset_factor) - np.random.seed(42) tmp = Series(np.random.randint(offset, M + offset, 10 ** 6)) self.series = tmp.astype(dtype) self.values = np.arange(M).astype(dtype) @@ -292,10 +285,8 @@ def setup(self, dtype, MaxNumber, series_type): raise NotImplementedError if series_type == "random_hits": - np.random.seed(42) array = np.random.randint(0, MaxNumber, N) if series_type == "random_misses": - np.random.seed(42) array = np.random.randint(0, MaxNumber, N) + MaxNumber if series_type == "monotone_hits": array = np.repeat(np.arange(MaxNumber), N // MaxNumber) @@ -324,7 +315,6 @@ def setup(self, dtype, series_type): raise NotImplementedError if series_type == "random": - np.random.seed(42) vals = np.random.randint(0, 10 * N, N) if series_type == "monotone": vals = np.arange(N) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 3367898101528..7fbe249788a98 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -67,7 +67,6 @@ class FromDictwithTimestamp: def setup(self, offset): N = 10 ** 3 - np.random.seed(1234) idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N) df = DataFrame(np.random.randn(N, 10), index=idx) self.d = df.to_dict() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 27761ccd0d917..1648985a56b91 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -393,7 +393,7 @@ class GroupByMethods: param_names = ["dtype", "method", "application"] params = [ - ["int", "float", "object", "datetime"], + ["int", "float", "object", "datetime", "uint"], [ "all", "any", @@ -442,6 +442,8 @@ def setup(self, dtype, method, application): values = rng.take(np.random.randint(0, ngroups, size=size)) if dtype == "int": key = np.random.randint(0, size, size=size) + elif dtype == 
"uint": + key = np.random.randint(0, size, size=size, dtype=dtype) elif dtype == "float": key = np.concatenate( [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0] @@ -505,11 +507,11 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) -class CumminMax: +class Cumulative: param_names = ["dtype", "method"] params = [ ["float64", "int64", "Float64", "Int64"], - ["cummin", "cummax"], + ["cummin", "cummax", "cumsum"], ] def setup(self, dtype, method): diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index 394433f7c8f99..6703cc791493a 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -67,7 +67,6 @@ class NumericSeriesIndexingShuffled: def setup(self, index, N): vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) - np.random.seed(42) np.random.shuffle(vals) indices = index(vals) self.data = pd.Series(np.arange(N), index=indices) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 86790063c5056..10fb926ee4d03 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -368,17 +368,14 @@ def setup(self): self.df = DataFrame(index=range(self.N)) def time_insert(self): - np.random.seed(1234) for i in range(100): self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True) def time_assign_with_setitem(self): - np.random.seed(1234) for i in range(100): self.df[i] = np.random.randn(self.N) def time_assign_list_like_with_setitem(self): - np.random.seed(1234) self.df[list(range(100))] = np.random.randn(self.N, 100) def time_assign_list_of_columns_concat(self): diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index d05a28e0873d0..7592ce54e3712 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -145,7 +145,6 @@ class Mode: param_names = ["N", "dtype"] def setup(self, N, dtype): - np.random.seed(42) self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) def time_mode(self, N, dtype): diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2e109e59c1c6d..32fbf4e6c7de3 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -12,12 +12,10 @@ class Dtypes: - params = ["str", "string", "arrow_string"] + params = ["str", "string[python]", "string[pyarrow]"] param_names = ["dtype"] def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - try: self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) except ImportError: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 56da4e87f2709..5ba4471c8d303 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,11 +1,18 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: -- master -- 1.2.x + branches: + include: + - master + - 1.2.x + - 1.3.x + paths: + exclude: + - 'doc/*' pr: - master - 1.2.x +- 1.3.x variables: PYTEST_WORKERS: auto diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7cc171330e01a..1844cb863c183 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -77,6 +77,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for unnecessary random seeds in asv benchmarks' ; echo $MSG + invgrep -R --exclude pandas_vb_common.py -E 
'np.random.seed' asv_bench/benchmarks/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### CODE ### diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index 63e858eac433f..43e1055347f17 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -22,7 +22,7 @@ dependencies: - numexpr - numpy=1.17.3 - openpyxl - - pyarrow=0.17.0 + - pyarrow=0.17 - pytables - python-dateutil==2.7.3 - pytz diff --git a/doc/source/_static/ci.png b/doc/source/_static/ci.png index 3a4225e3ce1eb..4754dc2945db5 100644 Binary files a/doc/source/_static/ci.png and b/doc/source/_static/ci.png differ diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index d6ff48ed5fd39..e812aaa760a8f 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -169,7 +169,7 @@ submitting code to run the check yourself:: to auto-format your code. Additionally, many editors have plugins that will apply ``black`` as you edit files. -You should use a ``black`` version 20.8b1 as previous versions are not compatible +You should use a ``black`` version 21.5b2 as previous versions are not compatible with the pandas codebase. One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this @@ -407,12 +407,12 @@ pandas uses `mypy `_ to statically analyze the code base a Testing with continuous integration ----------------------------------- -The pandas test suite will run automatically on `Travis-CI `__ and +The pandas test suite will run automatically on `GitHub Actions `__ and `Azure Pipelines `__ continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then the continuous integration services need to be hooked to your GitHub repository. Instructions are here -for `Travis-CI `__ and +for `GitHub Actions `__ and `Azure Pipelines `__. A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing, @@ -421,12 +421,6 @@ This is an example of a green build. .. image:: ../_static/ci.png -.. note:: - - Each time you push to *your* fork, a *new* run of the tests will be triggered on the CI. - You can enable the auto-cancel feature, which removes any non-currently-running tests for that same pull-request, for - `Travis-CI here `__. - .. _contributing.tdd: diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 9a8a95bec66ad..d5b45f5953453 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -106,8 +106,6 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``. See the `extension dtype source`_ for interface definition. -.. versionadded:: 0.24.0 - :class:`pandas.api.extension.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. This allows one to instantiate ``Series`` and ``.astype()`` with a registered string name, for example ``'category'`` is a registered string accessor for the ``CategoricalDtype``. @@ -141,8 +139,6 @@ and comments contain guidance for properly implementing the interface. :class:`~pandas.api.extensions.ExtensionArray` operator support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.24.0 - By default, there are no operators defined for the class :class:`~pandas.api.extensions.ExtensionArray`. 
There are two approaches for providing operator support for your ExtensionArray: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index be9c0da34f8a9..88e54421daa11 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -221,7 +221,6 @@ Dependencies ================================================================ ========================== Package Minimum supported version ================================================================ ========================== -`setuptools `__ 38.6.0 `NumPy `__ 1.17.3 `python-dateutil `__ 2.7.3 `pytz `__ 2017.3 @@ -263,6 +262,7 @@ Visualization ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= +setuptools 38.6.0 Utils for entry points of plotting backend matplotlib 2.2.3 Plotting library Jinja2 2.10 Conditional formatting with DataFrame.style tabulate 0.8.7 Printing in Markdown-friendly format (see `tabulate`_) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 43e2509469488..c6fda85b0486d 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -480,6 +480,7 @@ we recommend using :class:`StringDtype` (with the alias ``"string"``). :template: autosummary/class_without_autosummary.rst arrays.StringArray + arrays.ArrowStringArray .. autosummary:: :toctree: api/ diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index bd9463c50ab1f..3b33ebe701037 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -40,11 +40,6 @@ analysis. See the :ref:`cookbook` for some advanced strategies. -.. versionchanged:: 0.24.0 - - :attr:`MultiIndex.labels` has been renamed to :attr:`MultiIndex.codes` - and :attr:`MultiIndex.set_labels` to :attr:`MultiIndex.set_codes`. - Creating a MultiIndex (hierarchical index) object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -87,8 +82,6 @@ You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using the method :meth:`MultiIndex.from_frame`. This is a complementary method to :meth:`MultiIndex.to_frame`. -.. versionadded:: 0.24.0 - .. ipython:: python df = pd.DataFrame( diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 70cfa3500f6b4..82c8a27bec3a5 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1490,8 +1490,6 @@ for altering the ``Series.name`` attribute. .. _basics.rename_axis: -.. versionadded:: 0.24.0 - The methods :meth:`DataFrame.rename_axis` and :meth:`Series.rename_axis` allow specific names of a ``MultiIndex`` to be changed (as opposed to the labels). diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 17d1809638d61..6007129e96ba0 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -141,8 +141,6 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: frame.corr(min_periods=12) -.. versionadded:: 0.24.0 - The ``method`` argument can also be a callable for a generic correlation calculation. In this case, it should be a single function that produces a single value from two ndarray inputs. 
Suppose we wanted to diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 7a55acbd3031d..870ec6763c72f 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -125,8 +125,6 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: grouped = df.groupby("A") grouped = df.groupby(["A", "B"]) -.. versionadded:: 0.24 - If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all but the specified columns diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 2d5673fe53be3..2ce8bf23de824 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -8,8 +8,6 @@ Nullable integer data type ************************** -.. versionadded:: 0.24.0 - .. note:: IntegerArray is currently experimental. Its API or implementation may diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b4e35d1f22840..c2b030d732ba9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -31,7 +31,6 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` binary;`ORC Format `__;:ref:`read_orc`; - binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`SPSS `__;:ref:`read_spss`; @@ -297,7 +296,6 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None`` create a reproducible gzip archive: ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. - .. versionchanged:: 0.24.0 'infer' option added and set to default. .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to ``gzip.open``. thousands : str, default ``None`` @@ -351,14 +349,14 @@ error_bad_lines : boolean, default ``None`` ``DataFrame`` that is returned. See :ref:`bad lines ` below. - .. deprecated:: 1.3 + .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. warn_bad_lines : boolean, default ``None`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. - .. deprecated:: 1.3 + .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' @@ -369,7 +367,7 @@ on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' - 'warn', print a warning when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. - .. versionadded:: 1.3 + .. versionadded:: 1.3.0 .. _io.dtypes: @@ -2714,8 +2712,6 @@ table CSS classes. Note that these classes are *appended* to the existing The ``render_links`` argument provides the ability to add hyperlinks to cells that contain URLs. -.. versionadded:: 0.24 - .. ipython:: python url_df = pd.DataFrame( @@ -3590,8 +3586,6 @@ indices to be parsed. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. -.. 
versionadded:: 0.24 - If ``usecols`` is a list of strings, it is assumed that each string corresponds to a column name provided either by the user in ``names`` or inferred from the document header row(s). Those strings define which columns will be parsed: @@ -3602,8 +3596,6 @@ document header row(s). Those strings define which columns will be parsed: Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. -.. versionadded:: 0.24 - If ``usecols`` is callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to ``True``. @@ -4016,21 +4008,13 @@ Passing options to the compression protocol in order to speed up compression: msgpack ------- -pandas support for ``msgpack`` has been removed in version 1.0.0. It is recommended to use pyarrow for on-the-wire transmission of pandas objects. - -Example pyarrow usage: - -.. code-block:: python - - import pandas as pd - import pyarrow as pa - - df = pd.DataFrame({"A": [1, 2, 3]}) +pandas support for ``msgpack`` has been removed in version 1.0.0. It is +recommended to use :ref:`pickle ` instead. - context = pa.default_serialization_context() - df_bytestring = context.serialize(df).to_buffer().to_pybytes() +Alternatively, you can also use the Arrow IPC serialization format for on-the-wire +transmission of pandas objects. For documentation on pyarrow, see +`here `__. -For documentation on pyarrow, see `here `__. .. _io.hdf5: @@ -4260,9 +4244,6 @@ everything in the sub-store and **below**, so be *careful*. You can walk through the group hierarchy using the ``walk`` method which will yield a tuple for each group key along with the relative keys of its contents. -.. versionadded:: 0.24.0 - - .. ipython:: python for (path, subgroups, subkeys) in store.walk(): @@ -5439,8 +5420,6 @@ underlying engine's default behavior. Partitioning Parquet files '''''''''''''''''''''''''' -.. versionadded:: 0.24.0 - Parquet supports partitioning of data based on the values of one or more columns. .. ipython:: python @@ -5668,8 +5647,6 @@ will convert the data to UTC. Insertion method ++++++++++++++++ -.. versionadded:: 0.24.0 - The parameter ``method`` controls the SQL insertion clause used. Possible values are: diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 982a5b0a70b55..52d99533c1f60 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -114,8 +114,6 @@ in many places Sparse accessor --------------- -.. versionadded:: 0.24.0 - pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` for categorical data, and ``.dt`` for datetime-like data. This namespace provides attributes and methods that are specific to sparse data. diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index c8687f808a802..0d6dcaa3726e6 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -37,14 +37,14 @@ pandas supports 4 types of windowing operations: #. Expanding window: Accumulating window over the values. #. Exponentially Weighted window: Accumulating and exponentially weighted window over the values.
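A minimal sketch of the four windowing concepts listed above (illustrative only; assumes pandas and NumPy are installed, and SciPy for the weighted-window variant):

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(5, dtype="float64"))

    # 1. Rolling window: fixed-size window sliding over the values
    s.rolling(window=2).sum()

    # 2. Weighted window: rolling with a win_type (requires SciPy)
    # s.rolling(window=3, win_type="triang").mean()

    # 3. Expanding window: accumulates everything seen so far
    s.expanding(min_periods=1).sum()

    # 4. Exponentially weighted window: exponentially decaying weights
    s.ewm(com=0.5).mean()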
-============================= ================= =========================== =========================== ======================== =================================== -Concept Method Returned Object Supports time-based windows Supports chained groupby Supports table method -============================= ================= =========================== =========================== ======================== =================================== -Rolling window ``rolling`` ``Rolling`` Yes Yes Yes (as of version 1.3) -Weighted window ``rolling`` ``Window`` No No No -Expanding window ``expanding`` ``Expanding`` No Yes Yes (as of version 1.3) -Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) No -============================= ================= =========================== =========================== ======================== =================================== +============================= ================= =========================== =========================== ======================== =================================== =========================== +Concept Method Returned Object Supports time-based windows Supports chained groupby Supports table method Supports online operations +============================= ================= =========================== =========================== ======================== =================================== =========================== +Rolling window ``rolling`` ``Rolling`` Yes Yes Yes (as of version 1.3) No +Weighted window ``rolling`` ``Window`` No No No No +Expanding window ``expanding`` ``Expanding`` No Yes Yes (as of version 1.3) No +Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) No Yes (as of version 1.3) +============================= ================= =========================== =========================== ======================== =================================== =========================== As noted above, some operations support specifying a window based on a time offset: @@ -76,7 +76,7 @@ which will first group the data by the specified keys and then perform a windowi to compute the rolling sums to preserve accuracy as much as possible. -.. versionadded:: 1.3 +.. versionadded:: 1.3.0 Some windowing operations also support the ``method='table'`` option in the constructor which performs the windowing operation over an entire :class:`DataFrame` instead of a single column or row at a time. @@ -98,6 +98,26 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) df.rolling(2, method="table", min_periods=0).apply(weighted_mean, raw=True, engine="numba") # noqa:E501 +.. versionadded:: 1.3 + +Some windowing operations also support an ``online`` method after constructing a windowing object +which returns a new object that supports passing in new :class:`DataFrame` or :class:`Series` objects +to continue the windowing calculation with the new values (i.e. online calculations). + +The methods on this new windowing objects must call the aggregation method first to "prime" the initial +state of the online calculation. Then, new :class:`DataFrame` or :class:`Series` objects can be passed in +the ``update`` argument to continue the windowing calculation. + +.. ipython:: python + + df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) + df.ewm(0.5).mean() + +.. 
ipython:: python + + online_ewm = df.head(2).ewm(0.5).online() + online_ewm.mean() + online_ewm.mean(update=df.tail(1)) All windowing operations support a ``min_periods`` argument that dictates the minimum amount of non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``. @@ -159,7 +179,7 @@ By default the labels are set to the right edge of the window, but a This can also be applied to datetime-like indices. -.. versionadded:: 1.3 +.. versionadded:: 1.3.0 .. ipython:: python @@ -299,6 +319,24 @@ forward-looking rolling window, and we can use it as follows: indexer = FixedForwardWindowIndexer(window_size=2) df.rolling(indexer, min_periods=1).sum() +We can also achieve this by using slicing, applying rolling aggregation, and then flipping the result as shown in example below: + +.. ipython:: python + + df = pd.DataFrame( + data=[ + [pd.Timestamp("2018-01-01 00:00:00"), 100], + [pd.Timestamp("2018-01-01 00:00:01"), 101], + [pd.Timestamp("2018-01-01 00:00:03"), 103], + [pd.Timestamp("2018-01-01 00:00:04"), 111], + ], + columns=["time", "value"], + ).set_index("time") + df + + reversed_df = df[::-1].rolling("2s").sum()[::-1] + reversed_df + .. _window.rolling_apply: Rolling apply @@ -332,7 +370,7 @@ Numba will be applied in potentially two routines: #. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. #. The engine will JIT the for loop where the apply function is applied to each window. -.. versionadded:: 1.3 +.. versionadded:: 1.3.0 ``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments. diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 986cf43b80494..0b27bb4d58d5e 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 1.4 +----------- + +.. toctree:: + :maxdepth: 2 + + v1.4.0 + Version 1.3 ----------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8b413808503ad..dd95f9088e3da 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -20,15 +20,17 @@ including other versions of pandas. .. --------------------------------------------------------------------------- +.. _whatsnew_130.enhancements: + Enhancements ~~~~~~~~~~~~ -.. _whatsnew_130.read_csv_json_http_headers: +.. _whatsnew_130.enhancements.read_csv_json_http_headers: Custom HTTP(s) headers when reading csv or json files ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When reading from a remote URL that is not handled by fsspec (ie. HTTP and +When reading from a remote URL that is not handled by fsspec (e.g. HTTP and HTTPS) the dictionary passed to ``storage_options`` will be used to create the headers included in the request. This can be used to control the User-Agent header or send other custom headers (:issue:`36688`). @@ -43,7 +45,7 @@ For example: storage_options=headers ) -.. _whatsnew_130.read_to_xml: +.. _whatsnew_130.enhancements.read_to_xml: Read and write XML documents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -110,50 +112,40 @@ both XPath 1.0 and XSLT 1.0 are available. (:issue:`27554`) For more, see :ref:`io.xml` in the user guide on IO tools. 
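A minimal sketch of the new XML round trip described above (illustrative only, with made-up data; assumes pandas >= 1.3 and lxml, the default parser, or pass ``parser="etree"`` to use only the standard library):

.. code-block:: python

    import pandas as pd

    xml = """<?xml version='1.0' encoding='utf-8'?>
    <data>
      <row><shape>square</shape><sides>4</sides></row>
      <row><shape>circle</shape><sides>0</sides></row>
    </data>"""

    # read_xml parses the row nodes selected by the default XPath "./*"
    df = pd.read_xml(xml)
    df

    # to_xml writes the DataFrame back out as an XML string
    print(df.to_xml(index=False))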
-Styler Upgrades -^^^^^^^^^^^^^^^ - -We provided some focused development on :class:`.Styler`, including altering methods -to accept more universal CSS language for arguments, such as ``'color:red;'`` instead of -``[('color', 'red')]`` (:issue:`39564`). This is also added to the built-in methods -to allow custom CSS highlighting instead of default background coloring (:issue:`40242`). -Enhancements to other built-in methods include extending the :meth:`.Styler.background_gradient` -method to shade elements based on a given gradient map and not be restricted only to -values in the DataFrame (:issue:`39930` :issue:`22727` :issue:`28901`). Additional -built-in methods such as :meth:`.Styler.highlight_between`, :meth:`.Styler.highlight_quantile` -and :math:`.Styler.text_gradient` have been added (:issue:`39821`, :issue:`40926`, :issue:`41098`). - -The :meth:`.Styler.apply` now consistently allows functions with ``ndarray`` output to -allow more flexible development of UDFs when ``axis`` is ``None`` ``0`` or ``1`` (:issue:`39393`). - -:meth:`.Styler.set_tooltips` is a new method that allows adding on hover tooltips to -enhance interactive displays (:issue:`35643`). :meth:`.Styler.set_td_classes`, which was recently -introduced in v1.2.0 (:issue:`36159`) to allow adding specific CSS classes to data cells, has -been made as performant as :meth:`.Styler.apply` and :meth:`.Styler.applymap` (:issue:`40453`), -if not more performant in some cases. The overall performance of HTML -render times has been considerably improved to -match :meth:`DataFrame.to_html` (:issue:`39952` :issue:`37792` :issue:`40425`). - -The :meth:`.Styler.format` has had upgrades to easily format missing data, -precision, and perform HTML escaping (:issue:`40437` :issue:`40134`). There have been numerous other bug fixes to -properly format HTML and eliminate some inconsistencies (:issue:`39942` :issue:`40356` :issue:`39807` :issue:`39889` :issue:`39627`) - -:class:`.Styler` has also been compatible with non-unique index or columns, at least for as many features as are fully compatible, others made only partially compatible (:issue:`41269`). -One also has greater control of the display through separate sparsification of the index or columns, using the new 'styler' options context (:issue:`41142`). -Render trimming has also been added for large numbers of data elements to avoid browser overload (:issue:`40712`). - -We have added an extension to allow LaTeX styling as an alternative to CSS styling and a method :meth:`.Styler.to_latex` -which renders the necessary LaTeX format including built-up styles (:issue:`21673`, :issue:`41659`). An additional file io function :meth:`Styler.to_html` has been added for convenience (:issue:`40312`). - -Documentation has also seen major revisions in light of new features (:issue:`39720` :issue:`39317` :issue:`40493`) - -.. _whatsnew_130.dataframe_honors_copy_with_dict: +.. _whatsnew_130.enhancements.styler: + +Styler enhancements +^^^^^^^^^^^^^^^^^^^ + +We provided some focused development on :class:`.Styler`. See also the `Styler documentation <../user_guide/style.ipynb>`_ +which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`40493`). 
+ + - The method :meth:`.Styler.set_table_styles` can now accept more natural CSS language for arguments, such as ``'color:red;'`` instead of ``[('color', 'red')]`` (:issue:`39563`) + - The methods :meth:`.Styler.highlight_null`, :meth:`.Styler.highlight_min`, and :meth:`.Styler.highlight_max` now allow custom CSS highlighting instead of the default background coloring (:issue:`40242`) + - :meth:`.Styler.apply` now accepts functions that return an ``ndarray`` when ``axis=None``, making it consistent with the ``axis=0`` and ``axis=1`` behavior (:issue:`39359`) + - When incorrectly formatted CSS is given via :meth:`.Styler.apply` or :meth:`.Styler.applymap`, an error is now raised upon rendering (:issue:`39660`) + - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTeX escaping (:issue:`40388`, :issue:`41619`) + - :meth:`.Styler.background_gradient` has gained the argument ``gmap`` to supply a specific gradient map for shading (:issue:`22727`) + - :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`) + - Added the method :meth:`.Styler.highlight_between` (:issue:`39821`) + - Added the method :meth:`.Styler.highlight_quantile` (:issue:`40926`) + - Added the method :meth:`.Styler.text_gradient` (:issue:`41098`) + - Added the method :meth:`.Styler.set_tooltips` to allow hover tooltips; this can be used to enhance interactive displays (:issue:`21266`, :issue:`40284`) + - Added the parameter ``precision`` to the method :meth:`.Styler.format` to control the display of floating point numbers (:issue:`40134`) + - :class:`.Styler` rendered HTML output now follows the `w3 HTML Style Guide `_ (:issue:`39626`) + - Many features of the :class:`.Styler` class are now either partially or fully usable on a DataFrame with non-unique indexes or columns (:issue:`41143`) + - One has greater control of the display through separate sparsification of the index or columns using the :ref:`new styler options `, which are also usable via :func:`option_context` (:issue:`41142`) + - Added the option ``styler.render.max_elements`` to avoid browser overload when styling large DataFrames (:issue:`40712`) + - Added the method :meth:`.Styler.to_latex` (:issue:`21673`) + - Added the method :meth:`.Styler.to_html` (:issue:`13379`) + +.. _whatsnew_130.enhancements.dataframe_honors_copy_with_dict: DataFrame constructor honors ``copy=False`` with dict ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When passing a dictionary to :class:`DataFrame` with ``copy=False``, -a copy will no longer be made (:issue:`32960`) +a copy will no longer be made (:issue:`32960`). .. ipython:: python @@ -171,10 +163,64 @@ a copy will no longer be made (:issue:`32960`) The default behavior when not passing ``copy`` will remain unchanged, i.e. a copy will be made. -Centered Datetime-Like Rolling Windows +.. _whatsnew_130.enhancements.arrow_string: + +PyArrow backed string data type +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've enhanced the :class:`StringDtype`, an extension type dedicated to string data. +(:issue:`39908`) + +It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`. Use +pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the +StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects. + +The PyArrow backed StringArray requires pyarrow 1.0.0 or greater to be installed. + +.. warning:: + + ``string[pyarrow]`` is currently considered experimental.
The implementation + and parts of the API may change without warning. + +.. ipython:: python + + pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow")) + +You can use the alias ``"string[pyarrow]"`` as well. + +.. ipython:: python + + s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]") + s + +You can also create a PyArrow backed string array using pandas options. + +.. ipython:: python + + with pd.option_context("string_storage", "pyarrow"): + s = pd.Series(['abc', None, 'def'], dtype="string") + s + +The usual string accessor methods work. Where appropriate, the return type of the Series +or columns of a DataFrame will also have string dtype. + +.. ipython:: python + + s.str.upper() + s.str.split('b', expand=True).dtypes + +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + +.. _whatsnew_130.enhancements.centered_datetimelike_rolling_window: + +Centered datetime-like rolling windows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When performing rolling calculations on :class:`DataFrame` and :class:`Series` +When performing rolling calculations on DataFrame and Series objects with a datetime-like index, a centered datetime-like window can now be used (:issue:`38780`). For example: @@ -194,36 +240,28 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, and :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`15095`, :issue:`38995`) +- :class:`.ExponentialMovingWindow` now supports an ``online`` method that can perform ``mean`` calculations in an online fashion.
See :ref:`Window Overview ` (:issue:`41673`) - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) - Added ``end`` and ``end_day`` options for the ``origin`` argument in :meth:`DataFrame.resample` (:issue:`37804`) -- Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) -- Improved consistency of error messages when passing an invalid ``win_type`` argument in :class:`Window` (:issue:`15969`) +- Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) +- Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods ` (:issue:`15969`) - :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) -- Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) -- :func:`read_excel` can now auto detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) +- Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) +- :func:`read_excel` can now auto-detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) - :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) -- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) +- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) -- :meth:`DataFrame.applymap` can now accept kwargs to pass on to func (:issue:`39987`) +- :meth:`DataFrame.applymap` can now accept kwargs to pass on to the user-provided ``func`` (:issue:`39987`) - Passing a :class:`DataFrame` indexer to ``iloc`` is now disallowed for :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` (:issue:`39004`) - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. 
``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`) - :meth:`DataFrame.plot.scatter` can now accept a categorical column for the argument ``c`` (:issue:`12380`, :issue:`31357`) -- :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes (:issue:`35643`, :issue:`21266`, :issue:`39317`, :issue:`39708`, :issue:`40284`) -- :meth:`.Styler.set_table_styles` amended to optionally allow certain css-string input arguments (:issue:`39564`) -- :meth:`.Styler.apply` now more consistently accepts ndarray function returns, i.e. in all cases for ``axis`` is ``0, 1 or None`` (:issue:`39359`) -- :meth:`.Styler.apply` and :meth:`.Styler.applymap` now raise errors if incorrectly formatted CSS is passed on render(:issue:`39660`) -- :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTeX escaping (:issue:`40437`) -- :meth:`.Styler.background_gradient` now allows the ability to supply a specific gradient map (:issue:`22727`) -- :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`) -- Builtin highlighting methods in :class:`.Styler` have a more consistent signature and css customisability (:issue:`40242`) -- :meth:`.Styler.highlight_between` added to list of builtin styling methods (:issue:`39821`) - :meth:`Series.loc` now raises a helpful error message when the Series has a :class:`MultiIndex` and the indexer has too many dimensions (:issue:`35349`) - :func:`read_stata` now supports reading data from compressed files (:issue:`26599`) -- Add support for parsing ``ISO 8601``-like timestamps with negative signs to :class:`Timedelta` (:issue:`37172`) -- Add support for unary operators in :class:`FloatingArray` (:issue:`38749`) +- Added support for parsing ``ISO 8601``-like timestamps with negative signs to :class:`Timedelta` (:issue:`37172`) +- Added support for unary operators in :class:`FloatingArray` (:issue:`38749`) - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`) - :meth:`Series.round` and :meth:`DataFrame.round` now work with nullable integer and floating dtypes (:issue:`38844`) - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) @@ -245,12 +283,14 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. +.. _whatsnew_130.notable_bug_fixes.categorical_unique_maintains_dtype: + ``Categorical.unique`` now always maintains same dtype as original ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, when calling :meth:`Categorical.unique` with categorical data, unused categories in the new array -would be removed, meaning that the dtype of the new array would be different than the -original, if some categories are not present in the unique array (:issue:`18291`) +would be removed, making the dtype of the new array different than the +original (:issue:`18291`) As an example of this, given: @@ -278,6 +318,8 @@ As an example of this, given: unique original.dtype == unique.dtype +.. 
_whatsnew_130.notable_bug_fixes.combine_first_preserves_dtype: + Preserve dtypes in :meth:`DataFrame.combine_first` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -308,8 +350,10 @@ Preserve dtypes in :meth:`DataFrame.combine_first` combined.dtypes -Group by methods agg and transform no longer changes return dtype for callables -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. _whatsnew_130.notable_bug_fixes.groupby_preserves_dtype: + +Groupby methods agg and transform no longer changes return dtype for callables +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously the methods :meth:`.DataFrameGroupBy.aggregate`, :meth:`.SeriesGroupBy.aggregate`, :meth:`.DataFrameGroupBy.transform`, and @@ -339,6 +383,8 @@ values as measured by ``np.allclose``. Now no such casting occurs. df.groupby('key').agg(lambda x: x.sum()) +.. _whatsnew_130.notable_bug_fixes.groupby_reductions_float_result: + ``float`` result for :meth:`.GroupBy.mean`, :meth:`.GroupBy.median`, and :meth:`.GroupBy.var` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -364,6 +410,8 @@ Now, these methods will always return a float dtype. (:issue:`41137`) df.groupby(df.index).mean() +.. _whatsnew_130.notable_bug_fixes.setitem_column_try_inplace: + Try operating inplace when setting values with ``loc`` and ``iloc`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -406,7 +454,7 @@ In pandas 1.3.0, ``df`` continues to share data with ``values`` .. _whatsnew_130.notable_bug_fixes.setitem_never_inplace: -Never Operate Inplace When Setting ``frame[keys] = values`` +Never operate inplace when setting ``frame[keys] = values`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When setting multiple columns using ``frame[keys] = values`` new arrays will @@ -441,7 +489,7 @@ In the new behavior, we get a new array, and retain an integer-dtyped ``5``: .. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: -Consistent Casting With Setting Into Boolean Series +Consistent casting with setting into Boolean Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Setting non-boolean values into a :class:`Series` with ``dtype=bool`` now consistently @@ -560,6 +608,13 @@ with a :class:`MultiIndex` in the result. This can lead to a perceived duplicati df.groupby('label1').rolling(1).sum() +.. --------------------------------------------------------------------------- + +.. _whatsnew_130.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. _whatsnew_130.api_breaking.deps: Increased minimum versions for dependencies @@ -637,18 +692,20 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -.. _whatsnew_130.api.other: +.. _whatsnew_130.api_breaking.other: Other API changes ^^^^^^^^^^^^^^^^^ - Partially initialized :class:`CategoricalDtype` objects (i.e. those with ``categories=None``) will no longer compare as equal to fully initialized dtype objects (:issue:`38516`) - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) -- Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". 
Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) +- Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as `turbodbc `_ (:issue:`36893`) - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) +- :meth:`ExtensionDtype.construct_array_type` is now a required method instead of an optional one for :class:`ExtensionDtype` subclasses (:issue:`24860`) -Build -===== +.. _whatsnew_130.api_breaking.build: +Build +^^^^^ - Documentation in ``.pptx`` and ``.pdf`` formats are no longer included in wheels or source distributions. (:issue:`30741`) .. --------------------------------------------------------------------------- @@ -683,42 +740,43 @@ Deprecations - Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) - Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) - Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`) -- Deprecated passing arguments as positional (except for ``"codes"``) in :meth:`MultiIndex.codes` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) (:issue:`41485`) -- Deprecated passing arguments (apart from ``cond`` and ``other``) as positional in :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`Resampler.interpolate` (other than ``"method"``) (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``"upper"`` and ``"lower"``) (:issue:`41485`) - Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) - Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`) -- Deprecated passing arguments as positional (except for ``"method"``) in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (:issue:`41485`) - Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. 
``freq.is_month_start(ts)`` (:issue:`15146`) -- Deprecated passing arguments as positional in :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.sort_values` (other than ``"by"``) and :meth:`Series.sort_values` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.dropna` and :meth:`Series.dropna` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.set_index` (other than ``"keys"``) (:issue:`41485`) -- Deprecated passing arguments as positional (except for ``"levels"``) in :meth:`MultiIndex.set_levels` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` (:issue:`41485`) -- Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`) - Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) - Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) - Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) - In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). 
To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) -- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) -- Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`) -- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_csv` (:issue:`41485`) - Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`) -- Deprecated passing arguments as positional in :meth:`DataFrame.drop` (other than ``"labels"``) and :meth:`Series.drop` (:issue:`41485`) -- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_table` (:issue:`41485`) -- Deprecated passing arguments as positional (other than ``objs``) in :func:`concat` (:issue:`41485`) +- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`): + - :func:`concat` (other than ``objs``) + - :func:`read_csv` (other than ``filepath_or_buffer``) + - :func:`read_table` (other than ``filepath_or_buffer``) + - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``) + - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` + - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop` + - :meth:`DataFrame.dropna` and :meth:`Series.dropna` + - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` + - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``) + - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``) + - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``) + - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index` + - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``) + - :meth:`DataFrame.set_index` (other than ``keys``) + - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` + - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values` + - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``) + - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) + - :meth:`MultiIndex.codes` (except for ``codes``) + - :meth:`MultiIndex.set_levels` (except for ``levels``) + - :meth:`Resampler.interpolate` (other than ``method``) .. _whatsnew_130.deprecations.nuisance_columns: -Deprecated Dropping Nuisance Columns in DataFrame Reductions and DataFrameGroupBy Operations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Deprecated dropping nuisance columns in DataFrame reductions and DataFrameGroupBy operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling a reduction (e.g. ``.min``, ``.max``, ``.sum``) on a :class:`DataFrame` with ``numeric_only=None`` (the default), columns where the reduction raises a ``TypeError`` are silently ignored and dropped from the result. 
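As a rough sketch of the deprecated behaviour described above (the frame and column names here are illustrative only, and the example assumes the deprecation surfaces as a ``FutureWarning`` before becoming an error in a later release):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame(
        {"A": [1, 2, 3], "B": pd.date_range("2021-01-01", periods=3)}
    )

    # With the default numeric_only=None, summing the datetime column "B"
    # raises a TypeError internally, so "B" is silently dropped and only
    # "A" appears in the result; this silent dropping is what is being
    # deprecated.
    df.sum()

    # To get the same numeric result without relying on the deprecated
    # behaviour, select the columns to reduce explicitly.
    df[["A"]].sum()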
@@ -804,14 +862,16 @@ Performance improvements - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) -- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`) +- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad"`` or ``method="backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`) - Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`, :issue:`41885`) - Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`) - Performance improvement in :func:`unique` for object data type (:issue:`37615`) - Performance improvement in :func:`json_normalize` for basic cases (including separators) (:issue:`40035` :issue:`15621`) - Performance improvement in :class:`.ExpandingGroupby` aggregation methods (:issue:`39664`) -- Performance improvement in :class:`.Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`) +- Performance improvement in :class:`.Styler` where render times are more than 50% reduced and now matches :meth:`DataFrame.to_html` (:issue:`39972` :issue:`39952`, :issue:`40425`) +- The method :meth:`.Styler.set_td_classes` is now as performant as :meth:`.Styler.apply` and :meth:`.Styler.applymap`, and even more so in some cases (:issue:`40453`) - Performance improvement in :meth:`.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) - Performance improvement in the conversion of a PyArrow Boolean array to a pandas nullable Boolean array (:issue:`41051`) @@ -836,6 +896,7 @@ Categorical - Bug in constructing a :class:`DataFrame` from an ``ndarray`` and a :class:`CategoricalDtype` (:issue:`38857`) - Bug in setting categorical values into an object-dtype column in a :class:`DataFrame` (:issue:`39136`) - Bug in :meth:`DataFrame.reindex` was raising an ``IndexError`` when the new index contained duplicates and the old index was a :class:`CategoricalIndex` (:issue:`38906`) +- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``NotImplementedError`` instead of ``ValueError`` when filling with a non-category tuple (:issue:`41914`) Datetimelike ^^^^^^^^^^^^ @@ -863,7 +924,6 @@ Timezones ^^^^^^^^^ - Bug in different ``tzinfo`` objects representing UTC not being treated as equivalent (:issue:`39216`) - Bug in ``dateutil.tz.gettz("UTC")`` not being recognized as equivalent to other UTC-representing tzinfos (:issue:`39276`) -- Numeric ^^^^^^^ @@ -899,7 +959,6 @@ Conversion Strings ^^^^^^^ - - Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` ignoring replacements with ``regex=True`` for ``StringDType`` data (:issue:`41333`, :issue:`35977`) - Bug in :meth:`Series.str.extract` with :class:`~arrays.StringArray` returning object 
dtype for an empty :class:`DataFrame` (:issue:`41441`) @@ -911,10 +970,10 @@ Interval - Bug in :meth:`IntervalIndex.intersection` returning duplicates when at least one of the :class:`Index` objects have duplicates which are present in the other (:issue:`38743`) - :meth:`IntervalIndex.union`, :meth:`IntervalIndex.intersection`, :meth:`IntervalIndex.difference`, and :meth:`IntervalIndex.symmetric_difference` now cast to the appropriate dtype instead of raising a ``TypeError`` when operating with another :class:`IntervalIndex` with incompatible dtype (:issue:`39267`) - :meth:`PeriodIndex.union`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference`, :meth:`PeriodIndex.difference` now cast to object dtype instead of raising ``IncompatibleFrequency`` when operating with another :class:`PeriodIndex` with incompatible dtype (:issue:`39306`) +- Bug in :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.get_loc`, :meth:`IntervalIndex.get_indexer_for`, and :meth:`IntervalIndex.__contains__` when NA values are present (:issue:`41831`) Indexing ^^^^^^^^ - - Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) - Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`) @@ -924,7 +983,7 @@ Indexing - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` with timezone aware indexes raising a ``TypeError`` for ``method="ffill"`` and ``method="bfill"`` and specified ``tolerance`` (:issue:`38566`) - Bug in :meth:`DataFrame.reindex` with ``datetime64[ns]`` or ``timedelta64[ns]`` incorrectly casting to integers when the ``fill_value`` requires casting to object dtype (:issue:`39755`) - Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting on an empty :class:`DataFrame` using specified columns and a nonempty :class:`DataFrame` value (:issue:`38831`) -- Bug in :meth:`DataFrame.loc.__setitem__` raising ValueError when expanding unique column for :class:`DataFrame` with duplicate columns (:issue:`38521`) +- Bug in :meth:`DataFrame.loc.__setitem__` raising a ``ValueError`` when operating on a unique column when the :class:`DataFrame` has duplicate columns (:issue:`38521`) - Bug in :meth:`DataFrame.iloc.__setitem__` and :meth:`DataFrame.loc.__setitem__` with mixed dtypes when setting with a dictionary value (:issue:`38335`) - Bug in :meth:`Series.loc.__setitem__` and :meth:`DataFrame.loc.__setitem__` raising ``KeyError`` when provided a Boolean generator (:issue:`39614`) - Bug in :meth:`Series.iloc` and :meth:`DataFrame.iloc` raising a ``KeyError`` when provided a generator (:issue:`39614`) @@ -953,10 +1012,10 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` raising a ``TypeError`` when using a ``str`` subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) - Bug in :meth:`PeriodIndex.get_loc` failing to raise a ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) - Bug ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`) +- Bug in :meth:`Index.get_indexer` failing to raise ``ValueError`` in 
some cases with invalid ``method``, ``limit``, or ``tolerance`` arguments (:issue:`41918`) Missing ^^^^^^^ - - Bug in :class:`Grouper` did not correctly propagate the ``dropna`` argument; :meth:`.DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`) - Bug in :func:`isna`, :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna`, and the corresponding ``notna`` functions not recognizing ``Decimal("NaN")`` objects (:issue:`39409`) - Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`) @@ -965,7 +1024,6 @@ Missing MultiIndex ^^^^^^^^^^ - - Bug in :meth:`DataFrame.drop` raising a ``TypeError`` when the :class:`MultiIndex` is non-unique and ``level`` is not provided (:issue:`36293`) - Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in the result (:issue:`38623`) - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` contained ``NaN`` even when they are differently ordered (:issue:`38439`) @@ -975,7 +1033,6 @@ MultiIndex I/O ^^^ - - Bug in :meth:`Index.__repr__` when ``display.max_seq_items=1`` (:issue:`38415`) - Bug in :func:`read_csv` not recognizing scientific notation if the argument ``decimal`` is set and ``engine="python"`` (:issue:`31920`) - Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`) @@ -987,7 +1044,7 @@ I/O - Allow custom error values for the ``parse_dates`` argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`) - Bug in :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` raising a ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`) - Bug in :meth:`.HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`) -- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) +- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned DataFrame (:issue:`35923`) - Bug in :func:`read_csv` applying the thousands separator to date columns when the column should be parsed for dates and ``usecols`` is specified for ``engine="python"`` (:issue:`39365`) - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names when multiple header and index columns are specified (:issue:`34673`) - Bug in :func:`read_excel` not respecting :func:`set_option` (:issue:`34252`) @@ -1008,32 +1065,28 @@ I/O - Bug in :func:`read_csv` silently ignoring ``sep`` if ``delimiter`` and ``sep`` are defined, now raising a ``ValueError`` (:issue:`39823`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from PyArrow to pandas (e.g. 
for reading Parquet) with nullable dtypes and a PyArrow array whose data buffer size is not a multiple of the dtype size (:issue:`40896`) -- Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) +- Bug in :func:`read_excel` would raise an error when pandas could not determine the file type even though the user specified the ``engine`` argument (:issue:`41225`) - Bug in :func:`read_clipboard` copying from an excel file shifts values into the wrong column if there are null values in first column (:issue:`41108`) Period ^^^^^^ - Comparisons of :class:`Period` objects or :class:`Index`, :class:`Series`, or :class:`DataFrame` with mismatched ``PeriodDtype`` now behave like other mismatched-type comparisons, returning ``False`` for equals, ``True`` for not-equal, and raising ``TypeError`` for inequality checks (:issue:`39274`) -- -- Plotting ^^^^^^^^ - - Bug in :func:`plotting.scatter_matrix` raising when 2d ``ax`` argument passed (:issue:`16253`) - Prevent warnings when Matplotlib's ``constrained_layout`` is enabled (:issue:`25261`) - Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``yerr`` while others didn't (:issue:`39522`) - Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``secondary_y`` and others use ``legend=False`` (:issue:`40044`) - Bug in :meth:`DataFrame.plot.box` when ``dark_background`` theme was selected, caps or min/max markers for the plot were not visible (:issue:`40769`) - Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.GroupBy.agg` with :class:`PeriodDtype` columns incorrectly casting results too aggressively (:issue:`38254`) - Bug in :meth:`.SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical Series were not tallied (:issue:`38672`) - Bug in :meth:`.SeriesGroupBy.value_counts` where an error was raised on an empty Series (:issue:`39172`) - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) -- Fixed bug in :meth:`.GroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) +- Fixed bug in :meth:`.GroupBy.sum` causing a loss of precision by now using Kahan summation (:issue:`38778`) - Fixed bug in :meth:`.GroupBy.cumsum` and :meth:`.GroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`) - Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising a ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`) - Bug in :meth:`.DataFrameGroupBy.idxmin` and :meth:`.DataFrameGroupBy.idxmax` with ``ExtensionDtype`` columns (:issue:`38733`) @@ -1098,31 +1151,27 @@ Reshaping Sparse ^^^^^^ - - Bug in :meth:`DataFrame.sparse.to_coo` raising a ``KeyError`` with columns that are a numeric :class:`Index` without a ``0`` (:issue:`18414`) - Bug in :meth:`SparseArray.astype` with ``copy=False`` producing incorrect results when going from integer dtype to floating dtype (:issue:`34456`) - Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` would always return an empty result (:issue:`40921`) ExtensionArray ^^^^^^^^^^^^^^ - -- Bug in :meth:`DataFrame.where` when ``other`` is a :class:`Series` with :class:`ExtensionArray` dtype (:issue:`38729`) +- Bug in :meth:`DataFrame.where` when 
``other`` is a Series with an :class:`ExtensionDtype` (:issue:`38729`) - Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Series.argmax`, and :meth:`Series.argmin` would fail when the underlying data is an :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`) - Fixed bug where some properties of subclasses of :class:`PandasExtensionDtype` where improperly cached (:issue:`40329`) -- Bug in :meth:`DataFrame.mask` where masking a :class:`Dataframe` with an :class:`ExtensionArray` dtype raises ``ValueError`` (:issue:`40941`) +- Bug in :meth:`DataFrame.mask` where masking a DataFrame with an :class:`ExtensionDtype` raises a ``ValueError`` (:issue:`40941`) Styler ^^^^^^ - - Bug in :class:`.Styler` where the ``subset`` argument in methods raised an error for some valid MultiIndex slices (:issue:`33562`) - :class:`.Styler` rendered HTML output has seen minor alterations to support w3 good code standards (:issue:`39626`) - Bug in :class:`.Styler` where rendered HTML was missing a column class identifier for certain header cells (:issue:`39716`) - Bug in :meth:`.Styler.background_gradient` where text-color was not determined correctly (:issue:`39888`) -- Bug in :class:`.Styler` where multiple elements in CSS-selectors were not correctly added to ``table_styles`` (:issue:`39942`) +- Bug in :meth:`.Styler.set_table_styles` where multiple elements in CSS-selectors of the ``table_styles`` argument were not correctly added (:issue:`34061`) - Bug in :class:`.Styler` where copying from Jupyter dropped the top left cell and misaligned headers (:issue:`12147`) - Bug in :class:`Styler.where` where ``kwargs`` were not passed to the applicable callable (:issue:`40845`) -- Bug in :class:`.Styler` caused CSS to duplicate on multiple renders (:issue:`39395`, :issue:`40334`) - +- Bug in :class:`.Styler` causing CSS to duplicate on multiple renders (:issue:`39395`, :issue:`40334`) Other ^^^^^ @@ -1151,3 +1200,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v1.2.4..v1.3.0|HEAD diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst new file mode 100644 index 0000000000000..166ea2f0d4164 --- /dev/null +++ b/doc/source/whatsnew/v1.4.0.rst @@ -0,0 +1,226 @@ +.. _whatsnew_140: + +What's new in 1.4.0 (??) +------------------------ + +These are the changes in pandas 1.4.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_140.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_140.enhancements.enhancement1: + +enhancement1 +^^^^^^^^^^^^ + +.. _whatsnew_140.enhancements.enhancement2: + +enhancement2 +^^^^^^^^^^^^ + +.. _whatsnew_140.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_140.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_140.notable_bug_fixes.notable_bug_fix1: + +notable_bug_fix1 +^^^^^^^^^^^^^^^^ + +.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2: + +notable_bug_fix2 +^^^^^^^^^^^^^^^^ + +.. --------------------------------------------------------------------------- + +.. _whatsnew_140.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
_whatsnew_140.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Some minimum supported versions of dependencies were updated. +If installed, we now require: + ++-----------------+-----------------+----------+---------+ +| Package | Minimum Version | Required | Changed | ++=================+=================+==========+=========+ +| | | X | X | ++-----------------+-----------------+----------+---------+ + +For `optional libraries `_ the general recommendation is to use the latest version. +The following table lists the lowest version per library that is currently being tested throughout the development of pandas. +Optional libraries below the lowest tested version may still work, but are not considered supported. + ++-----------------+-----------------+---------+ +| Package | Minimum Version | Changed | ++=================+=================+=========+ +| | | X | ++-----------------+-----------------+---------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + +.. _whatsnew_140.api_breaking.other: + +Other API changes +^^^^^^^^^^^^^^^^^ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_140.deprecations: + +Deprecations +~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_140.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_140.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ +- +- + +Timezones +^^^^^^^^^ +- +- + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ +- +- + +Interval +^^^^^^^^ +- +- + +Indexing +^^^^^^^^ +- +- + +Missing +^^^^^^^ +- +- + +MultiIndex +^^^^^^^^^^ +- +- + +I/O +^^^ +- +- + +Period +^^^^^^ +- +- + +Plotting +^^^^^^^^ +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- +- + +Reshaping +^^^^^^^^^ +- +- + +Sparse +^^^^^^ +- +- + +ExtensionArray +^^^^^^^^^^^^^^ +- +- + +Styler +^^^^^^ +- +- + +Other +^^^^^ + +.. ***DO NOT USE THIS SECTION*** + +- +- + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_140.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/environment.yml b/environment.yml index bb96235123af3..788b88ef16ad6 100644 --- a/environment.yml +++ b/environment.yml @@ -18,7 +18,7 @@ dependencies: - cython>=0.29.21 # code checks - - black=20.8b1 + - black=21.5b2 - cpplint - flake8=3.9.2 - flake8-bugbear=21.3.2 # used by flake8, find likely bugs @@ -79,7 +79,7 @@ dependencies: - bottleneck>=1.2.1 - ipykernel - ipython>=7.11.1 - - jinja2<3.0.0 # pandas.Styler + - jinja2 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.7.0 - scipy>=1.2 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 37f5a5730439d..be3498dc0829b 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -189,7 +189,7 @@ def get_default_val(pat: str): class DictWrapper: - """ provide attribute-style access to a nested dict""" + """provide attribute-style access to a nested dict""" def __init__(self, d: dict[str, Any], prefix: str = ""): object.__setattr__(self, "d", d) @@ -571,7 +571,7 @@ def _get_root(key: str) -> tuple[dict[str, Any], str]: def _is_deprecated(key: str) -> bool: - """ Returns True if the given option has been deprecated """ + """Returns True if the given option has been deprecated""" key = key.lower() return key in _deprecated_options @@ -643,7 +643,7 @@ def _warn_if_deprecated(key: str) -> bool: def _build_option_description(k: str) -> str: - """ Builds a formatted description of a registered option and prints it """ + """Builds a formatted description of a registered option and prints it""" o = _get_registered_option(k) d = _get_deprecated_option(k) @@ -667,7 +667,7 @@ def _build_option_description(k: str) -> str: def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): - """ Builds a concise listing of available options, grouped by prefix """ + """Builds a concise listing of available options, grouped by prefix""" from itertools import groupby from textwrap import wrap diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 30a31d17fc947..d0f664c323a89 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -7,6 +7,7 @@ class Infinity: """ Provide a positive Infinity comparison method for ranking. """ + def __eq__(self, other) -> bool: ... def __ne__(self, other) -> bool: ... def __lt__(self, other) -> bool: ... @@ -14,11 +15,11 @@ class Infinity: def __gt__(self, other) -> bool: ... def __ge__(self, other) -> bool: ... - class NegInfinity: """ Provide a negative Infinity comparison method for ranking. """ + def __eq__(self, other) -> bool: ... def __ne__(self, other) -> bool: ... def __lt__(self, other) -> bool: ... @@ -26,56 +27,38 @@ class NegInfinity: def __gt__(self, other) -> bool: ... def __ge__(self, other) -> bool: ... - def unique_deltas( arr: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] - - def is_lexsorted(list_of_arrays: list[np.ndarray]) -> bool: ... - - def groupsort_indexer( index: np.ndarray, # const int64_t[:] ngroups: int, ) -> tuple[ np.ndarray, # ndarray[int64_t, ndim=1] np.ndarray, # ndarray[int64_t, ndim=1] -]: - ... - - +]: ... def kth_smallest( a: np.ndarray, # numeric[:] k: int, -) -> Any: ... # numeric - +) -> Any: ... # numeric # ---------------------------------------------------------------------- # Pairwise correlation/covariance - - def nancorr( mat: np.ndarray, # const float64_t[:, :] cov: bool = False, minp=None, -) -> np.ndarray: # np.ndarray[float64_t, ndim=2] - ... 
- - +) -> np.ndarray: ... # ndarray[float64_t, ndim=2] def nancorr_spearman( mat: np.ndarray, # ndarray[float64_t, ndim=2] minp: int = 1, -) -> np.ndarray: # np.ndarray[np.float64, ndim=2] - ... - - +) -> np.ndarray: ... # ndarray[float64_t, ndim=2] def nancorr_kendall( mat: np.ndarray, # ndarray[float64_t, ndim=2] minp: int = 1, -) -> np.ndarray: # np.ndarray[float64, ndim=2] - ... +) -> np.ndarray: ... # ndarray[float64_t, ndim=2] # ---------------------------------------------------------------------- @@ -92,58 +75,41 @@ def nancorr_kendall( # uint16_t # uint8_t - def validate_limit(nobs: int | None, limit=None) -> int: ... - - def pad( - old: np.ndarray, # ndarray[algos_t] - new: np.ndarray, # ndarray[algos_t] + old: np.ndarray, # ndarray[algos_t] + new: np.ndarray, # ndarray[algos_t] limit=None, ) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] - - def pad_inplace( values: np.ndarray, # algos_t[:] - mask: np.ndarray, # uint8_t[:] + mask: np.ndarray, # uint8_t[:] limit=None, ) -> None: ... - - def pad_2d_inplace( values: np.ndarray, # algos_t[:, :] - mask: np.ndarray, # const uint8_t[:, :] + mask: np.ndarray, # const uint8_t[:, :] limit=None, -) -> None: - ... - - +) -> None: ... def backfill( old: np.ndarray, # ndarray[algos_t] new: np.ndarray, # ndarray[algos_t] limit=None, -) -> np.ndarray: # np.ndarray[np.intp, ndim=1] - ... - +) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def backfill_inplace( values: np.ndarray, # algos_t[:] - mask: np.ndarray, # uint8_t[:] + mask: np.ndarray, # uint8_t[:] limit=None, ) -> None: ... - - def backfill_2d_inplace( values: np.ndarray, # algos_t[:, :] - mask: np.ndarray, # const uint8_t[:, :] + mask: np.ndarray, # const uint8_t[:, :] limit=None, ) -> None: ... - - def is_monotonic( arr: np.ndarray, # ndarray[algos_t, ndim=1] - timelike: bool -) -> tuple[bool, bool, bool]: - ... + timelike: bool, +) -> tuple[bool, bool, bool]: ... # ---------------------------------------------------------------------- # rank_1d, rank_2d @@ -155,7 +121,6 @@ def is_monotonic( # uint64_t # int64_t - def rank_1d( values: np.ndarray, # ndarray[rank_t, ndim=1] labels: np.ndarray, # const int64_t[:] @@ -165,8 +130,6 @@ def rank_1d( pct: bool = ..., na_option=..., ) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1] - - def rank_2d( in_arr: np.ndarray, # ndarray[rank_t, ndim=2] axis: int = ..., @@ -176,8 +139,6 @@ def rank_2d( na_option=..., pct: bool = ..., ) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1] - - def diff_2d( arr: np.ndarray, # ndarray[diff_t, ndim=2] out: np.ndarray, # ndarray[out_t, ndim=2] @@ -185,109 +146,243 @@ def diff_2d( axis: int, datetimelike: bool = ..., ) -> None: ... - - def ensure_platform_int(arr: object) -> np.ndarray: ... - def ensure_object(arr: object) -> np.ndarray: ... - def ensure_float64(arr: object, copy=True) -> np.ndarray: ... - def ensure_float32(arr: object, copy=True) -> np.ndarray: ... - def ensure_int8(arr: object, copy=True) -> np.ndarray: ... - def ensure_int16(arr: object, copy=True) -> np.ndarray: ... - def ensure_int32(arr: object, copy=True) -> np.ndarray: ... - def ensure_int64(arr: object, copy=True) -> np.ndarray: ... - def ensure_uint8(arr: object, copy=True) -> np.ndarray: ... - def ensure_uint16(arr: object, copy=True) -> np.ndarray: ... - def ensure_uint32(arr: object, copy=True) -> np.ndarray: ... - def ensure_uint64(arr: object, copy=True) -> np.ndarray: ... - - -def take_1d_int8_int8(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... 
-def take_1d_int8_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int8_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int8_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int16_int16(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int16_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int16_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int16_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int32_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int32_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int32_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int64_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_int64_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_float32_float32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_float32_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_float64_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_object_object(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_bool_bool(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_1d_bool_object(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... - -def take_2d_axis0_int8_int8(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int8_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int8_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int8_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int16_int16(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int16_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int16_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int16_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int32_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int32_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int32_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_int64_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... 
-def take_2d_axis0_int64_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_float32_float32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_float32_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_float64_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_object_object(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_bool_bool(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis0_bool_object(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... - -def take_2d_axis1_int8_int8(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int8_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int8_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int8_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int16_int16(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int16_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int16_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int16_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int32_int32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int32_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int32_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int64_int64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_int64_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_float32_float32(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_float32_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_float64_float64(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_object_object(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_bool_bool(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_axis1_bool_object(values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=...) -> None: ... - -def take_2d_multi_int8_int8(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int8_int32(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int8_int64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int8_float64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... 
-def take_2d_multi_int16_int16(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int16_int32(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int16_int64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int16_float64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int32_int32(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int32_int64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int32_float64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int64_float64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_float32_float32(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_float32_float64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_float64_float64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_object_object(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_bool_bool(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_bool_object(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... -def take_2d_multi_int64_int64(values: np.ndarray, indexer, out: np.ndarray, fill_value=...) -> None: ... +def take_1d_int8_int8( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int8_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int8_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int8_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_int16( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int32_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int32_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int64_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_float32_float32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_float32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_float64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... 
+def take_1d_object_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_bool_bool( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_bool_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_int8( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_int16( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int32_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int32_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int64_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_float32_float32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_float32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_float64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_object_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_bool_bool( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_bool_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_int8( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_int16( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... 
+) -> None: ... +def take_2d_axis1_int16_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int32_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int32_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int64_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_float32_float32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_float32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_float64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_object_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_bool_bool( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_bool_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int8_int8( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int8_int32( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int8_int64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int8_float64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int16_int16( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int16_int32( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int16_int64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int16_float64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int32_int32( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int32_int64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int32_float64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int64_float64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_float32_float32( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_float32_float64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_float64_float64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_object_object( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_bool_bool( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_bool_object( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... 
+def take_2d_multi_int64_int64( + values: np.ndarray, indexer, out: np.ndarray, fill_value=... +) -> None: ... diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6cc55648b9cf4..03f4ce273de6e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -383,10 +383,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=2] ranked_mat - ndarray[float64_t, ndim=1] maskedx - ndarray[float64_t, ndim=1] maskedy + ndarray[float64_t, ndim=1] rankedx, rankedy + float64_t[::1] maskedx, maskedy ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 + bint no_nans float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor const int64_t[:] labels_n, labels_nobs @@ -394,54 +395,81 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr # For compatibility when calling rank_1d labels_n = np.zeros(N, dtype=np.int64) + # Handle the edge case where we know all results will be nan + # to keep conditional logic inside loop simpler + if N < minp: + result = np.full((K, K), np.nan, dtype=np.float64) + return result + result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + no_nans = mask.all() ranked_mat = np.empty((N, K), dtype=np.float64) + # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe + # here since N >= nobs and values are stored contiguously + maskedx = np.empty(N, dtype=np.float64) + maskedy = np.empty(N, dtype=np.float64) for i in range(K): ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) - for xi in range(K): - for yi in range(xi + 1): - nobs = 0 - # Keep track of whether we need to recompute ranks - all_ranks = True - for i in range(N): - all_ranks &= not (mask[i, xi] ^ mask[i, yi]) - if mask[i, xi] and mask[i, yi]: - nobs += 1 - - if nobs < minp: - result[xi, yi] = result[yi, xi] = NaN - else: - maskedx = np.empty(nobs, dtype=np.float64) - maskedy = np.empty(nobs, dtype=np.float64) - j = 0 - - for i in range(N): - if mask[i, xi] and mask[i, yi]: - maskedx[j] = ranked_mat[i, xi] - maskedy[j] = ranked_mat[i, yi] - j += 1 - - if not all_ranks: - labels_nobs = np.zeros(nobs, dtype=np.int64) - maskedx = rank_1d(maskedx, labels=labels_nobs) - maskedy = rank_1d(maskedy, labels=labels_nobs) - - mean = (nobs + 1) / 2. - - # now the cov numerator + with nogil: + for xi in range(K): + for yi in range(xi + 1): sumx = sumxx = sumyy = 0 - for i in range(nobs): - vx = maskedx[i] - mean - vy = maskedy[i] - mean + # Fastpath for data with no nans/infs, allows avoiding mask checks + # and array reassignments + if no_nans: + mean = (N + 1) / 2. 
- sumx += vx * vy - sumxx += vx * vx - sumyy += vy * vy + # now the cov numerator + for i in range(N): + vx = ranked_mat[i, xi] - mean + vy = ranked_mat[i, yi] - mean + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + else: + nobs = 0 + # Keep track of whether we need to recompute ranks + all_ranks = True + for i in range(N): + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) + if mask[i, xi] and mask[i, yi]: + maskedx[nobs] = ranked_mat[i, xi] + maskedy[nobs] = ranked_mat[i, yi] + nobs += 1 + + if nobs < minp: + result[xi, yi] = result[yi, xi] = NaN + continue + else: + if not all_ranks: + with gil: + # We need to slice back to nobs because rank_1d will + # require arrays of nobs length + labels_nobs = np.zeros(nobs, dtype=np.int64) + rankedx = rank_1d(np.array(maskedx)[:nobs], + labels=labels_nobs) + rankedy = rank_1d(np.array(maskedy)[:nobs], + labels=labels_nobs) + for i in range(nobs): + maskedx[i] = rankedx[i] + maskedy[i] = rankedy[i] + + mean = (nobs + 1) / 2. + + # now the cov numerator + for i in range(nobs): + vx = maskedx[i] - mean + vy = maskedy[i] - mean + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy divisor = sqrt(sumxx * sumyy) @@ -945,16 +973,15 @@ def rank_1d( """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0 - ndarray[int64_t, ndim=1] grp_sizes - ndarray[intp_t, ndim=1] lexsort_indexer - ndarray[float64_t, ndim=1] out + Py_ssize_t N + int64_t[::1] grp_sizes + intp_t[:] lexsort_indexer + float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals - ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, group_changed + rank_t[:] masked_vals_memview + uint8_t[:] mask + bint keep_na, check_labels, check_mask rank_t nan_fill_val - int64_t grp_size tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -973,6 +1000,9 @@ def rank_1d( # comparisons check_labels = np.any(labels) + # For cases where a mask is not possible, we can avoid mask checks + check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1016,9 +1046,11 @@ def rank_1d( else: nan_fill_val = -np.inf - order = (masked_vals, ~mask, labels) + order = (masked_vals, ~(np.array(mask, copy=False)), labels) np.putmask(masked_vals, mask, nan_fill_val) + # putmask doesn't accept a memoryview, so we assign as a separate step + masked_vals_memview = masked_vals # lexsort using labels, then mask, then actual values # each label corresponds to a different group value, @@ -1029,6 +1061,80 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] + with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + N, + ) + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + + return np.array(out) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void rank_sorted_1d( + float64_t[::1] out, + int64_t[::1] grp_sizes, + const intp_t[:] labels, + const intp_t[:] sort_indexer, + # Can make const with cython3 (https://github.com/cython/cython/issues/3222) + rank_t[:] masked_vals, + const uint8_t[:] mask, + TiebreakEnumType tiebreak, + bint check_mask, + bint check_labels, + bint keep_na, + Py_ssize_t N, +) nogil: + """ + See rank_1d.__doc__. 
Handles only actual ranking, so sorting and masking should + be handled in the caller. Note that `out` and `grp_sizes` are modified inplace. + + Parameters + ---------- + out : float64_t[::1] + Array to store computed ranks + grp_sizes : int64_t[::1] + Array to store group counts. + labels : See rank_1d.__doc__ + sort_indexer : intp_t[:] + Array of indices which sorts masked_vals + masked_vals : rank_t[:] + The values input to rank_1d, with missing values replaced by fill values + mask : uint8_t[:] + Array where entries are True if the value is missing, False otherwise + tiebreak : TiebreakEnumType + See rank_1d.__doc__ for the different modes + check_mask : bint + If False, assumes the mask is all False to skip mask indexing + check_labels : bint + If False, assumes all labels are the same to skip group handling logic + keep_na : bint + Whether or not to keep nulls + N : Py_ssize_t + The number of elements to rank. Note: it is not always true that + N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) + """ + + cdef: + Py_ssize_t i, j, dups=0, sum_ranks=0, + Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 + bint at_end, next_val_diff, group_changed + int64_t grp_size + # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer # array that we sorted previously, which gives us the location of @@ -1036,105 +1142,7 @@ def rank_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - for i in range(N): - at_end = i == N - 1 - - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change. Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) - - # We'll need this check later anyway to determine group size, so just - # compute it here since shortcircuiting won't help - group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - - # If keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - grp_na_count = dups - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - - # With n as the previous rank in the group and m as the number - # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, - # then rankings should be n + 1, n + 2 ... n + m - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = j + 1 - grp_start - - # If TIEBREAK_FIRST and descending, the ranking should be - # n + m, n + (m - 1) ... 
n + 1. This is equivalent to - # (i - dups + 1) + (i - j + 1) - grp_start - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen - - # Look forward to the next value (using the sorting in - # lexsort_indexer). If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration. If group - # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or - (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. Fill in the size of each - # group encountered (used by pct calculations later). Also be - # sure to reset any of the items helping to calculate dups - if group_changed: - - # If not dense tiebreak, group size used to compute - # percentile will be # of non-null elements in group - if tiebreak != TIEBREAK_DENSE: - grp_size = i - grp_start + 1 - grp_na_count - - # Otherwise, it will be the number of distinct values - # in the group, subtracting 1 if NaNs are present - # since that is a distinct value we shouldn't count - else: - grp_size = grp_vals_seen - (grp_na_count > 0) - - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = grp_size - - dups = sum_ranks = 0 - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - else: - with nogil: + with gil: for i in range(N): at_end = i == N - 1 @@ -1144,55 +1152,56 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) # Update out only when there is a transition of values or labels. 
# When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): # If keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: + if keep_na and check_mask and mask[sort_indexer[i]]: grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN + out[sort_indexer[j]] = NaN elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + out[sort_indexer[j]] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + out[sort_indexer[j]] = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 + out[sort_indexer[j]] = i - grp_start + 1 # With n as the previous rank in the group and m as the number # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, # then rankings should be n + 1, n + 2 ... n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = j + 1 - grp_start + out[sort_indexer[j]] = j + 1 - grp_start # If TIEBREAK_FIRST and descending, the ranking should be # n + m, n + (m - 1) ... n + 1. This is equivalent to # (i - dups + 1) + (i - j + 1) - grp_start elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[sort_indexer[j]] = grp_vals_seen # Look forward to the next value (using the sorting in # lexsort_indexer). If the value does not equal the current @@ -1200,9 +1209,9 @@ def rank_1d( # that a new value is coming up. The conditional also needs # to handle nan equality and the end of iteration. If group # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or - (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]])): + if not group_changed and (next_val_diff or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1226,19 +1235,112 @@ def rank_1d( grp_size = grp_vals_seen - (grp_na_count > 0) for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = grp_size + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 grp_start = i + 1 grp_vals_seen = 1 - - if pct: + else: for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + at_end = i == N - 1 - return out + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change. 
Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + next_val_diff = at_end or (masked_vals[sort_indexer[i]] + != masked_vals[sort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) + + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if (next_val_diff or group_changed + or (check_mask and + (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))): + + # If keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and check_mask and mask[sort_indexer[i]]: + grp_na_count = dups + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... n + m + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff + or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + dups = sum_ranks = 0 + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). 
Also be + # sure to reset any of the items helping to calculate dups + if group_changed: + + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group + if tiebreak != TIEBREAK_DENSE: + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count + else: + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size + + dups = sum_ranks = 0 + grp_na_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 def rank_2d( diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 0ca501c5b712c..67af9653fc75a 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -10,36 +10,25 @@ from pandas._typing import ( class NDArrayBacked: _dtype: DtypeObj _ndarray: np.ndarray - def __init__(self, values: np.ndarray, dtype: DtypeObj): ... - @classmethod def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ... - def _from_backing_data(self, values: np.ndarray): ... - def __setstate__(self, state): ... - def __len__(self) -> int: ... - @property def shape(self) -> Shape: ... - @property def ndim(self) -> int: ... - @property def size(self) -> int: ... - @property def nbytes(self) -> int: ... - def copy(self): ... def delete(self, loc, axis=0): ... def swapaxes(self, axis1, axis2): ... def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... def reshape(self, *args, **kwargs): ... def ravel(self, order="C"): ... - @property def T(self): ... diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 8721624e9881c..7b1dcbe562123 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -3,128 +3,111 @@ from typing import Literal import numpy as np def group_median_float64( - out: np.ndarray, # ndarray[float64_t, ndim=2] - counts: np.ndarray, # ndarray[int64_t] - values: np.ndarray, # ndarray[float64_t, ndim=2] - labels: np.ndarray, # ndarray[int64_t] + out: np.ndarray, # ndarray[float64_t, ndim=2] + counts: np.ndarray, # ndarray[int64_t] + values: np.ndarray, # ndarray[float64_t, ndim=2] + labels: np.ndarray, # ndarray[int64_t] min_count: int = ..., # Py_ssize_t ) -> None: ... - def group_cumprod_float64( - out: np.ndarray, # float64_t[:, ::1] + out: np.ndarray, # float64_t[:, ::1] values: np.ndarray, # const float64_t[:, :] labels: np.ndarray, # const int64_t[:] ngroups: int, is_datetimelike: bool, skipna: bool = ..., ) -> None: ... - def group_cumsum( - out: np.ndarray, # numeric[:, ::1] + out: np.ndarray, # numeric[:, ::1] values: np.ndarray, # ndarray[numeric, ndim=2] labels: np.ndarray, # const int64_t[:] ngroups: int, is_datetimelike: bool, skipna: bool = ..., ) -> None: ... - - def group_shift_indexer( - out: np.ndarray, # int64_t[::1] + out: np.ndarray, # int64_t[::1] labels: np.ndarray, # const int64_t[:] ngroups: int, periods: int, ) -> None: ... - - def group_fillna_indexer( - out: np.ndarray, # ndarray[int64_t] + out: np.ndarray, # ndarray[int64_t] labels: np.ndarray, # ndarray[int64_t] - mask: np.ndarray, # ndarray[uint8_t] + mask: np.ndarray, # ndarray[uint8_t] direction: Literal["ffill", "bfill"], - limit: int, # int64_t + limit: int, # int64_t dropna: bool, ) -> None: ... 
- - def group_any_all( - out: np.ndarray, # uint8_t[::1] + out: np.ndarray, # uint8_t[::1] values: np.ndarray, # const uint8_t[::1] labels: np.ndarray, # const int64_t[:] - mask: np.ndarray, # const uint8_t[::1] + mask: np.ndarray, # const uint8_t[::1] val_test: Literal["any", "all"], skipna: bool, ) -> None: ... - def group_add( - out: np.ndarray, # complexfloating_t[:, ::1] + out: np.ndarray, # complexfloating_t[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[complexfloating_t, ndim=2] labels: np.ndarray, # const intp_t[:] - min_count: int = ... + min_count: int = ..., ) -> None: ... - def group_prod( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[floating, ndim=2] labels: np.ndarray, # const intp_t[:] - min_count: int = ... + min_count: int = ..., ) -> None: ... - def group_var( - out: np.ndarray, # floating[:, ::1] - counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[floating, ndim=2] - labels: np.ndarray, # const intp_t[:] + out: np.ndarray, # floating[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[floating, ndim=2] + labels: np.ndarray, # const intp_t[:] min_count: int = ..., # Py_ssize_t - ddof: int = ..., # int64_t + ddof: int = ..., # int64_t ) -> None: ... - def group_mean( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[floating, ndim=2] labels: np.ndarray, # const intp_t[:] - min_count: int = ... + min_count: int = ..., ) -> None: ... - def group_ohlc( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[floating, ndim=2] labels: np.ndarray, # const intp_t[:] - min_count: int = ... + min_count: int = ..., ) -> None: ... - def group_quantile( - out: np.ndarray, # ndarray[float64_t] + out: np.ndarray, # ndarray[float64_t] values: np.ndarray, # ndarray[numeric, ndim=1] labels: np.ndarray, # ndarray[int64_t] - mask: np.ndarray, # ndarray[uint8_t] - q: float, # float64_t + mask: np.ndarray, # ndarray[uint8_t] + q: float, # float64_t interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], ) -> None: ... - def group_last( - out: np.ndarray, # rank_t[:, ::1] - counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[rank_t, ndim=2] - labels: np.ndarray, # const int64_t[:] + out: np.ndarray, # rank_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[rank_t, ndim=2] + labels: np.ndarray, # const int64_t[:] min_count: int = ..., # Py_ssize_t ) -> None: ... - def group_nth( - out: np.ndarray, # rank_t[:, ::1] - counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[rank_t, ndim=2] - labels: np.ndarray, # const int64_t[:] - min_count: int = ..., # int64_t - rank: int = ..., # int64_t + out: np.ndarray, # rank_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[rank_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + min_count: int = ..., # int64_t + rank: int = ..., # int64_t ) -> None: ... - def group_rank( - out: np.ndarray, # float64_t[:, ::1] + out: np.ndarray, # float64_t[:, ::1] values: np.ndarray, # ndarray[rank_t, ndim=2] labels: np.ndarray, # const int64_t[:] ngroups: int, @@ -134,35 +117,31 @@ def group_rank( pct: bool = ..., na_option: Literal["keep", "top", "bottom"] = ..., ) -> None: ... 
- def group_max( - out: np.ndarray, # groupby_t[:, ::1] + out: np.ndarray, # groupby_t[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] min_count: int = ..., ) -> None: ... - def group_min( - out: np.ndarray, # groupby_t[:, ::1] + out: np.ndarray, # groupby_t[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] min_count: int = ..., ) -> None: ... - def group_cummin( - out: np.ndarray, # groupby_t[:, ::1] - values: np.ndarray, # ndarray[groupby_t, ndim=2] - labels: np.ndarray, # const int64_t[:] + out: np.ndarray, # groupby_t[:, ::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] ngroups: int, is_datetimelike: bool, ) -> None: ... - def group_cummax( - out: np.ndarray, # groupby_t[:, ::1] - values: np.ndarray, # ndarray[groupby_t, ndim=2] - labels: np.ndarray, # const int64_t[:] + out: np.ndarray, # groupby_t[:, ::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] ngroups: int, is_datetimelike: bool, ) -> None: ... diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b72b927b3c2a8..354b87e03e6c4 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -247,24 +247,24 @@ def group_cumsum(numeric[:, ::1] out, for j in range(K): val = values[i, j] + # For floats, use Kahan summation to reduce floating-point + # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) if numeric == float32_t or numeric == float64_t: if val == val: y = val - compensation[lab, j] t = accum[lab, j] + y compensation[lab, j] = t - accum[lab, j] - y accum[lab, j] = t - out[i, j] = accum[lab, j] + out[i, j] = t else: out[i, j] = NaN if not skipna: accum[lab, j] = NaN break else: - y = val - compensation[lab, j] - t = accum[lab, j] + y - compensation[lab, j] = t - accum[lab, j] - y + t = val + accum[lab, j] accum[lab, j] = t - out[i, j] = accum[lab, j] + out[i, j] = t @cython.boundscheck(False) @@ -1345,16 +1345,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval groupby_t[:, ::1] accum - intp_t lab - bint val_is_nan, use_mask - - use_mask = mask is not None - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=values.dtype) + accum = np.empty((ngroups, (values).shape[1]), dtype=values.dtype) if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: @@ -1362,36 +1355,76 @@ cdef group_cummin_max(groupby_t[:, ::1] out, else: accum[:] = -np.inf if compute_max else np.inf + if mask is not None: + masked_cummin_max(out, values, mask, labels, accum, compute_max) + else: + cummin_max(out, values, labels, accum, is_datetimelike, compute_max) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef cummin_max(groupby_t[:, ::1] out, + ndarray[groupby_t, ndim=2] values, + const intp_t[:] labels, + groupby_t[:, ::1] accum, + bint is_datetimelike, + bint compute_max): + """ + Compute the cumulative minimum/maximum of columns of `values`, in row groups + `labels`. 
+ """ + cdef: + Py_ssize_t i, j, N, K + groupby_t val, mval + intp_t lab + + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] - if lab < 0: continue for j in range(K): - val_is_nan = False - - if use_mask: - if mask[i, j]: - - # `out` does not need to be set since it - # will be masked anyway - val_is_nan = True + val = values[i, j] + if not _treat_as_na(val, is_datetimelike): + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + else: + out[i, j] = val - # If using the mask, we can avoid grabbing the - # value unless necessary - val = values[i, j] - # Otherwise, `out` must be set accordingly if the - # value is missing - else: - val = values[i, j] - if _treat_as_na(val, is_datetimelike): - val_is_nan = True - out[i, j] = val +@cython.boundscheck(False) +@cython.wraparound(False) +cdef masked_cummin_max(groupby_t[:, ::1] out, + ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, + const intp_t[:] labels, + groupby_t[:, ::1] accum, + bint compute_max): + """ + Compute the cumulative minimum/maximum of columns of `values`, in row groups + `labels` with a masked algorithm. + """ + cdef: + Py_ssize_t i, j, N, K + groupby_t val, mval + intp_t lab - if not val_is_nan: + N, K = (values).shape + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + for j in range(K): + if not mask[i, j]: + val = values[i, j] mval = accum[lab, j] if compute_max: if val > mval: diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 0612acd25a5d5..5a1b98b190dbc 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -10,18 +10,14 @@ def unique_label_indices( labels: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... - class Factorizer: count: int - def __init__(self, size_hint: int): ... def get_count(self) -> int: ... - class ObjectFactorizer(Factorizer): table: PyObjectHashTable uniques: ObjectVector - def factorize( self, values: np.ndarray, # ndarray[object] @@ -30,11 +26,9 @@ class ObjectFactorizer(Factorizer): na_value=..., ) -> np.ndarray: ... # np.ndarray[intp] - class Int64Factorizer(Factorizer): table: Int64HashTable uniques: Int64Vector - def factorize( self, values: np.ndarray, # const int64_t[:] @@ -43,7 +37,6 @@ class Int64Factorizer(Factorizer): na_value=..., ) -> np.ndarray: ... # np.ndarray[intp] - class Int64Vector: def __init__(self): ... def __len__(self) -> int: ... @@ -114,7 +107,6 @@ class ObjectVector: def __len__(self) -> int: ... def to_array(self) -> np.ndarray: ... # np.ndarray[object] - class HashTable: # NB: The base HashTable class does _not_ actually have these methods; # we are putting the here for the sake of mypy to avoid @@ -124,37 +116,31 @@ class HashTable: def __contains__(self, key: Hashable) -> bool: ... def sizeof(self, deep: bool = ...) -> int: ... def get_state(self) -> dict[str, int]: ... - # TODO: `item` type is subclass-specific def get_item(self, item): ... # TODO: return type? def set_item(self, item) -> None: ... - # FIXME: we don't actually have this for StringHashTable or ObjectHashTable? def map( self, - keys: np.ndarray, # np.ndarray[subclass-specific] - values: np.ndarray, # const int64_t[:] values + keys: np.ndarray, # np.ndarray[subclass-specific] + values: np.ndarray, # const int64_t[:] ) -> None: ... - def map_locations( self, values: np.ndarray, # np.ndarray[subclass-specific] ) -> None: ... 
- def lookup( self, values: np.ndarray, # np.ndarray[subclass-specific] - ) -> np.ndarray: ... # np.ndarray[np.intp] - + ) -> np.ndarray: ... # np.ndarray[np.intp] def get_labels( self, values: np.ndarray, # np.ndarray[subclass-specific] - uniques, # SubclassTypeVector + uniques, # SubclassTypeVector count_prior: int = ..., na_sentinel: int = ..., na_value: object = ..., - ) -> np.ndarray: ... # np.ndarray[intp_t] - + ) -> np.ndarray: ... # np.ndarray[intp_t] def unique( self, values: np.ndarray, # np.ndarray[subclass-specific] @@ -163,11 +149,10 @@ class HashTable: np.ndarray, # np.ndarray[subclass-specific] np.ndarray, # np.ndarray[np.intp], ] | np.ndarray: ... # np.ndarray[subclass-specific] - def _unique( self, values: np.ndarray, # np.ndarray[subclass-specific] - uniques, # FooVector + uniques, # FooVector count_prior: int = ..., na_sentinel: int = ..., na_value: object = ..., @@ -177,7 +162,6 @@ class HashTable: np.ndarray, # np.ndarray[subclass-specific] np.ndarray, # np.ndarray[np.intp], ] | np.ndarray: ... # np.ndarray[subclass-specific] - def factorize( self, values: np.ndarray, # np.ndarray[subclass-specific] @@ -185,9 +169,9 @@ class HashTable: na_value: object = ..., mask=..., ) -> tuple[ - np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], - ]: ... + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp], + ]: ... class Complex128HashTable(HashTable): ... class Complex64HashTable(HashTable): ... @@ -211,46 +195,33 @@ class UInt64HashTable(HashTable): ... class UInt32HashTable(HashTable): ... class UInt16HashTable(HashTable): ... class UInt8HashTable(HashTable): ... - class StringHashTable(HashTable): ... class PyObjectHashTable(HashTable): ... - def duplicated_int64( values: np.ndarray, # const int64_t[:] values keep: Literal["last", "first", False] = ..., ) -> np.ndarray: ... # np.ndarray[bool] + # TODO: Is it actually bool or is it uint8? def mode_int64( values: np.ndarray, # const int64_t[:] values dropna: bool, ) -> np.ndarray: ... # np.ndarray[np.int64] - def value_count_int64( values: np.ndarray, # const int64_t[:] dropna: bool, -) -> tuple[ - np.ndarray, # np.ndarray[np.int64] - np.ndarray, # np.ndarray[np.int64] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # np.ndarray[np.int64] def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., ) -> np.ndarray: ... # np.ndarray[bool] - def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ... - def value_count( values: np.ndarray, dropna: bool, -) -> tuple[ - np.ndarray, - np.ndarray, # np.ndarray[np.int64] -]: ... - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # arr and values should have same dtype def ismember( diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 979619c3d14c4..6bb332435be63 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -2,32 +2,26 @@ import numpy as np class IndexEngine: over_size_threshold: bool - def __init__(self, vgetter, n: int): ... - def __contains__(self, val: object) -> bool: ... - # -> int | slice | np.ndarray[bool] def get_loc(self, val: object) -> int | slice | np.ndarray: ... - def sizeof(self, deep: bool = False) -> int: ... def __sizeof__(self) -> int: ... - @property def is_unique(self) -> bool: ... - @property def is_monotonic_increasing(self) -> bool: ... - @property def is_monotonic_decreasing(self) -> bool: ... - - def get_backfill_indexer(self, other: np.ndarray, limit: int | None =...) -> np.ndarray: ... 
- def get_pad_indexer(self, other: np.ndarray, limit: int | None =...) -> np.ndarray: ... - + def get_backfill_indexer( + self, other: np.ndarray, limit: int | None = ... + ) -> np.ndarray: ... + def get_pad_indexer( + self, other: np.ndarray, limit: int | None = ... + ) -> np.ndarray: ... @property def is_mapping_populated(self) -> bool: ... - def clear_mapping(self): ... def get_indexer(self, values: np.ndarray) -> np.ndarray: ... # np.ndarray[np.intp] def get_indexer_non_unique( @@ -38,45 +32,35 @@ class IndexEngine: np.ndarray, # np.ndarray[np.intp] ]: ... - class Float64Engine(IndexEngine): ... class Float32Engine(IndexEngine): ... - class Int64Engine(IndexEngine): ... class Int32Engine(IndexEngine): ... class Int16Engine(IndexEngine): ... class Int8Engine(IndexEngine): ... - class UInt64Engine(IndexEngine): ... class UInt32Engine(IndexEngine): ... class UInt16Engine(IndexEngine): ... class UInt8Engine(IndexEngine): ... - class ObjectEngine(IndexEngine): ... - class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... - class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # ndarray[uint64_t, ndim=1] - def __init__( self, levels: list[np.ndarray], # all entries hashable labels: list[np.ndarray], # all entries integer-dtyped offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] ): ... - def get_indexer( self, target: np.ndarray, # np.ndarray[object] - ) -> np.ndarray: ... # np.ndarray[np.intp] - + ) -> np.ndarray: ... # np.ndarray[np.intp] def _extract_level_codes(self, target: object): ... - def get_indexer_with_fill( self, target: np.ndarray, # np.ndarray[object] of tuples diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 74ca311b35ed7..d6fac14d3ee6e 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -16,52 +16,36 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.internals.blocks import Block as B def slice_len(slc: slice, objlen: int = ...) -> int: ... - - def get_blkno_indexers( blknos: np.ndarray, # int64_t[:] group: bool = ..., ) -> list[tuple[int, slice | np.ndarray]]: ... - - def get_blkno_placements( blknos: np.ndarray, group: bool = ..., ) -> Iterator[tuple[int, BlockPlacement]]: ... - class BlockPlacement: def __init__(self, val: int | slice | np.ndarray): ... - @property def indexer(self) -> np.ndarray | slice: ... - @property def as_array(self) -> np.ndarray: ... - @property def is_slice_like(self) -> bool: ... - @overload def __getitem__(self, loc: slice | Sequence[int]) -> BlockPlacement: ... - @overload def __getitem__(self, loc: int) -> int: ... - def __iter__(self) -> Iterator[int]: ... - def __len__(self) -> int: ... - def delete(self, loc) -> BlockPlacement: ... - def append(self, others: list[BlockPlacement]) -> BlockPlacement: ... - class SharedBlock: _mgr_locs: BlockPlacement ndim: int values: ArrayLike - def __init__(self, values: ArrayLike, placement: BlockPlacement, ndim: int): ... class NumpyBlock(SharedBlock): @@ -72,8 +56,7 @@ class NDArrayBackedBlock(SharedBlock): values: NDArrayBackedExtensionArray def getitem_block_index(self: T, slicer: slice) -> T: ... -class Block(SharedBlock): - ... +class Block(SharedBlock): ... class BlockManager: blocks: tuple[B, ...] @@ -82,7 +65,7 @@ class BlockManager: _is_consolidated: bool _blknos: np.ndarray _blklocs: np.ndarray - - def __init__(self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=True): ... 
- - def get_slice(self: T, slobj: slice, axis: int=...) -> T: ... + def __init__( + self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=True + ): ... + def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ... diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 9ed8b71c2ce17..9d5922f8a50bd 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -449,8 +449,6 @@ cdef class Interval(IntervalMixin): endpoints. Intervals that only have an open endpoint in common do not overlap. - .. versionadded:: 0.24.0 - Parameters ---------- other : Interval diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 55d67f000f93a..547fcc0b8aa07 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -36,6 +36,7 @@ cdef class IntervalTree(IntervalMixin): object dtype str closed object _is_overlapping, _left_sorter, _right_sorter + Py_ssize_t _na_count def __init__(self, left, right, closed='right', leaf_size=100): """ @@ -67,6 +68,7 @@ cdef class IntervalTree(IntervalMixin): # GH 23352: ensure no nan in nodes mask = ~np.isnan(self.left) + self._na_count = len(mask) - mask.sum() self.left = self.left[mask] self.right = self.right[mask] indices = indices[mask] @@ -116,6 +118,8 @@ cdef class IntervalTree(IntervalMixin): Return True if the IntervalTree is monotonic increasing (only equal or increasing values), else False """ + if self._na_count > 0: + return False values = [self.right, self.left] sort_order = np.lexsort(values) diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index 4ae3ef0781dde..f73f495cf4d4f 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -1,144 +1,91 @@ import numpy as np def inner_join( - left: np.ndarray, # const intp_t[:] + left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def left_outer_join( - left: np.ndarray, # const intp_t[:] + left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, sort: bool = True, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def full_outer_join( - left: np.ndarray, # const intp_t[:] + left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def ffill_indexer( - indexer: np.ndarray # const intp_t[:] + indexer: np.ndarray, # const intp_t[:] ) -> np.ndarray: ... # np.ndarray[np.intp] - - def left_join_indexer_unique( - left: np.ndarray, # ndarray[join_t] + left: np.ndarray, # ndarray[join_t] right: np.ndarray, # ndarray[join_t] ) -> np.ndarray: ... # np.ndarray[np.intp] - - def left_join_indexer( - left: np.ndarray, # ndarray[join_t] + left: np.ndarray, # ndarray[join_t] right: np.ndarray, # ndarray[join_t] ) -> tuple[ np.ndarray, # np.ndarray[join_t] np.ndarray, # np.ndarray[np.intp] np.ndarray, # np.ndarray[np.intp] ]: ... 
- - def inner_join_indexer( - left: np.ndarray, # ndarray[join_t] + left: np.ndarray, # ndarray[join_t] right: np.ndarray, # ndarray[join_t] ) -> tuple[ np.ndarray, # np.ndarray[join_t] np.ndarray, # np.ndarray[np.intp] np.ndarray, # np.ndarray[np.intp] ]: ... - - def outer_join_indexer( - left: np.ndarray, # ndarray[join_t] + left: np.ndarray, # ndarray[join_t] right: np.ndarray, # ndarray[join_t] ) -> tuple[ np.ndarray, # np.ndarray[join_t] np.ndarray, # np.ndarray[np.intp] np.ndarray, # np.ndarray[np.intp] ]: ... - - def asof_join_backward_on_X_by_Y( - left_values: np.ndarray, # asof_t[:] - right_values: np.ndarray, # asof_t[:] - left_by_values: np.ndarray, # by_t[:] + left_values: np.ndarray, # asof_t[:] + right_values: np.ndarray, # asof_t[:] + left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] allow_exact_matches: bool = True, tolerance=None, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def asof_join_forward_on_X_by_Y( - left_values: np.ndarray, # asof_t[:] - right_values: np.ndarray, # asof_t[:] - left_by_values: np.ndarray, # by_t[:] + left_values: np.ndarray, # asof_t[:] + right_values: np.ndarray, # asof_t[:] + left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] allow_exact_matches: bool = True, tolerance=None, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def asof_join_nearest_on_X_by_Y( - left_values: np.ndarray, # asof_t[:] - right_values: np.ndarray, # asof_t[:] - left_by_values: np.ndarray, # by_t[:] + left_values: np.ndarray, # asof_t[:] + right_values: np.ndarray, # asof_t[:] + left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] allow_exact_matches: bool = True, tolerance=None, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def asof_join_backward( - left_values: np.ndarray, # asof_t[:] + left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] allow_exact_matches: bool = True, tolerance=None, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def asof_join_forward( - left_values: np.ndarray, # asof_t[:] + left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] allow_exact_matches: bool = True, tolerance=None, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... - - +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] def asof_join_nearest( - left_values: np.ndarray, # asof_t[:] + left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] allow_exact_matches: bool = True, tolerance=None, -) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] -]: ... +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.intp] # np.ndarray[np.intp] diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index c890bfbfe3b7d..3f4623638c70e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -21,19 +21,15 @@ ndarray_obj_2d = np.ndarray from enum import Enum -class NoDefault(Enum): - ... +class NoDefault(Enum): ... 
no_default: NoDefault - def item_from_zerodim(val: object) -> object: ... def infer_dtype(value: object, skipna: bool = True) -> str: ... - def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = True) -> bool: ... - def is_period(val: object) -> bool: ... def is_interval(val: object) -> bool: ... def is_decimal(val: object) -> bool: ... @@ -41,12 +37,10 @@ def is_complex(val: object) -> bool: ... def is_bool(val: object) -> bool: ... def is_integer(val: object) -> bool: ... def is_float(val: object) -> bool: ... - def is_interval_array(values: np.ndarray) -> bool: ... def is_datetime64_array(values: np.ndarray) -> bool: ... def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... - def is_time_array(values: np.ndarray, skipna: bool = False): ... def is_date_array(values: np.ndarray, skipna: bool = False): ... def is_datetime_array(values: np.ndarray, skipna: bool = False): ... @@ -54,16 +48,15 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ... def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... - def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... - def map_infer( - arr: np.ndarray, f: Callable[[Any], Any], convert: bool = True, ignore_na: bool = False + arr: np.ndarray, + f: Callable[[Any], Any], + convert: bool = True, + ignore_na: bool = False, ) -> np.ndarray: ... - - @overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] @@ -77,7 +70,6 @@ def maybe_convert_objects( convert_to_nullable_integer: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> np.ndarray: ... - @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] @@ -91,7 +83,6 @@ def maybe_convert_objects( convert_to_nullable_integer: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... - @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] @@ -105,7 +96,6 @@ def maybe_convert_objects( convert_to_nullable_integer: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... - @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] @@ -119,7 +109,6 @@ def maybe_convert_objects( convert_to_nullable_integer: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... - @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] @@ -133,7 +122,6 @@ def maybe_convert_objects( convert_to_nullable_integer: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... - @overload def maybe_convert_numeric( values: np.ndarray, # np.ndarray[object] @@ -142,7 +130,6 @@ def maybe_convert_numeric( coerce_numeric: bool = False, convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... - @overload def maybe_convert_numeric( values: np.ndarray, # np.ndarray[object] @@ -161,50 +148,37 @@ def ensure_string_array( copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... 
# np.ndarray[object] - def infer_datetimelike_array( - arr: np.ndarray # np.ndarray[object] + arr: np.ndarray, # np.ndarray[object] ) -> tuple[str, bool]: ... - def astype_intsafe( arr: np.ndarray, # np.ndarray[object] new_dtype: np.dtype, ) -> np.ndarray: ... - def fast_zip(ndarrays: list) -> np.ndarray: ... # np.ndarray[object] # TODO: can we be more specific about rows? def to_object_array_tuples(rows: object) -> ndarray_obj_2d: ... - def tuples_to_object_array( - tuples: np.ndarray # np.ndarray[object] + tuples: np.ndarray, # np.ndarray[object] ) -> ndarray_obj_2d: ... # TODO: can we be more specific about rows? def to_object_array(rows: object, min_width: int = 0) -> ndarray_obj_2d: ... - def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ... - - def maybe_booleans_to_slice( - mask: np.ndarray # ndarray[uint8_t] + mask: np.ndarray, # ndarray[uint8_t] ) -> slice | np.ndarray: ... # np.ndarray[np.uint8] - def maybe_indices_to_slice( indices: np.ndarray, # np.ndarray[np.intp] max_len: int, ) -> slice | np.ndarray: ... # np.ndarray[np.uint8] - def is_all_arraylike(obj: list) -> bool: ... # ----------------------------------------------------------------- # Functions which in reality take memoryviews -def memory_usage_of_objects( - arr: np.ndarray # object[:] -) -> int: ... # np.int64 - - +def memory_usage_of_objects(arr: np.ndarray) -> int: ... # object[:] # np.int64 def map_infer_mask( arr: np.ndarray, f: Callable[[Any], Any], @@ -213,57 +187,39 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., ) -> np.ndarray: ... - def indices_fast( - index: np.ndarray, # ndarray[intp_t] + index: np.ndarray, # ndarray[intp_t] labels: np.ndarray, # const int64_t[:] keys: list, sorted_labels: list[np.ndarray], # list[ndarray[np.int64]] ) -> dict: ... - def generate_slices( - labels: np.ndarray, # const intp_t[:] - ngroups: int -) -> tuple[ - np.ndarray, # np.ndarray[np.int64] - np.ndarray, # np.ndarray[np.int64] -]: ... - + labels: np.ndarray, ngroups: int # const intp_t[:] +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # np.ndarray[np.int64] def count_level_2d( - mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True], + mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True], labels: np.ndarray, # const intp_t[:] max_bin: int, - axis: int -) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] - + axis: int, +) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] def get_level_sorter( - label: np.ndarray, # const int64_t[:] + label: np.ndarray, # const int64_t[:] starts: np.ndarray, # const intp_t[:] -) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] - - +) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def generate_bins_dt64( values: np.ndarray, # np.ndarray[np.int64] binner: np.ndarray, # const int64_t[:] closed: object = "left", hasnans: bool = False, -) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] - - +) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] def array_equivalent_object( - left: np.ndarray, # object[:] + left: np.ndarray, # object[:] right: np.ndarray, # object[:] ) -> bool: ... - -def has_infs_f8( - arr: np.ndarray # const float64_t[:] -) -> bool: ... - -def has_infs_f4( - arr: np.ndarray # const float32_t[:] -) -> bool: ... - +def has_infs_f8(arr: np.ndarray) -> bool: ... # const float64_t[:] +def has_infs_f4(arr: np.ndarray) -> bool: ... # const float32_t[:] def get_reverse_indexer( indexer: np.ndarray, # const intp_t[:] length: int, -) -> np.ndarray: ... # np.ndarray[np.intp] +) -> np.ndarray: ... 
# np.ndarray[np.intp] +def is_bool_list(obj: list) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 341cfe4d6fac6..0aec7e5e5a363 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1063,8 +1063,6 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: allow_sets : bool, default True If this parameter is False, sets will not be considered list-like. - .. versionadded:: 0.24.0 - Returns ------- bool @@ -1956,6 +1954,21 @@ cpdef bint is_datetime64_array(ndarray values): return validator.validate(values) +@cython.internal +cdef class AnyDatetimeValidator(DatetimeValidator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_datetime64_object(value) or ( + PyDateTime_Check(value) and value.tzinfo is None + ) + + +cdef bint is_datetime_or_datetime64_array(ndarray values): + cdef: + AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + skipna=True) + return validator.validate(values) + + # Note: only python-exposed for tests def is_datetime_with_singletz_array(values: ndarray) -> bool: """ @@ -1968,10 +1981,11 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: if n == 0: return False + # Get a reference timezone to compare with the rest of the tzs in the array for i in range(n): base_val = values[i] - if base_val is not NaT: + if base_val is not NaT and base_val is not None and not util.is_nan(base_val): base_tz = getattr(base_val, 'tzinfo', None) break @@ -1979,11 +1993,13 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: # Compare val's timezone with the reference timezone # NaT can coexist with tz-aware datetimes, so skip if encountered val = values[j] - if val is not NaT: + if val is not NaT and val is not None and not util.is_nan(val): tz = getattr(val, 'tzinfo', None) if not tz_compare(base_tz, tz): return False + # Note: we should only be called if a tzaware datetime has been seen, + # so base_tz should always be set at this point. return True @@ -2466,6 +2482,7 @@ def maybe_convert_objects(ndarray[object] objects, except OutOfBoundsTimedelta: seen.object_ = True break + break else: seen.object_ = True break @@ -2548,6 +2565,32 @@ def maybe_convert_objects(ndarray[object] objects, return dti._data seen.object_ = True + elif seen.datetime_: + if is_datetime_or_datetime64_array(objects): + from pandas import DatetimeIndex + + try: + dti = DatetimeIndex(objects) + except OutOfBoundsDatetime: + pass + else: + # unbox to ndarray[datetime64[ns]] + return dti._data._ndarray + seen.object_ = True + + elif seen.timedelta_: + if is_timedelta_or_timedelta64_array(objects): + from pandas import TimedeltaIndex + + try: + tdi = TimedeltaIndex(objects) + except OutOfBoundsTimedelta: + pass + else: + # unbox to ndarray[timedelta64[ns]] + return tdi._data._ndarray + seen.object_ = True + if seen.period_: if is_period_array(objects): from pandas import PeriodIndex @@ -2920,3 +2963,41 @@ def to_object_array_tuples(rows: object) -> np.ndarray: result[i, j] = row[j] return result + + +def is_bool_list(obj: list) -> bool: + """ + Check if this list contains only bool or np.bool_ objects. 
+ + This is appreciably faster than checking `np.array(obj).dtype == bool` + + obj1 = [True, False] * 100 + obj2 = obj1 * 100 + obj3 = obj2 * 100 + obj4 = [True, None] + obj1 + + for obj in [obj1, obj2, obj3, obj4]: + %timeit is_bool_list(obj) + %timeit np.array(obj).dtype.kind == "b" + + 340 ns ± 8.22 ns + 8.78 µs ± 253 ns + + 28.8 µs ± 704 ns + 813 µs ± 17.8 µs + + 3.4 ms ± 168 µs + 78.4 ms ± 1.05 ms + + 48.1 ns ± 1.26 ns + 8.1 µs ± 198 ns + """ + cdef: + object item + + for item in obj: + if not util.is_bool_object(item): + return False + + # Note: we return True for empty list + return True diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 11d67dfb93d5f..d84b0dee20e7d 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -10,33 +10,26 @@ import numpy as np _BinOp = Callable[[Any, Any], Any] _BoolOp = Callable[[Any, Any], bool] - def scalar_compare( values: np.ndarray, # object[:] val: object, - op: _BoolOp, # {operator.eq, operator.ne, ...} -) -> np.ndarray: ... # np.ndarray[bool] - + op: _BoolOp, # {operator.eq, operator.ne, ...} +) -> np.ndarray: ... # np.ndarray[bool] def vec_compare( - left: np.ndarray, # np.ndarray[object] + left: np.ndarray, # np.ndarray[object] right: np.ndarray, # np.ndarray[object] - op: _BoolOp, # {operator.eq, operator.ne, ...} -) -> np.ndarray: ... # np.ndarray[bool] - - + op: _BoolOp, # {operator.eq, operator.ne, ...} +) -> np.ndarray: ... # np.ndarray[bool] def scalar_binop( - values: np.ndarray, # object[:] + values: np.ndarray, # object[:] val: object, - op: _BinOp, # binary operator + op: _BinOp, # binary operator ) -> np.ndarray: ... - - def vec_binop( - left: np.ndarray, # object[:] + left: np.ndarray, # object[:] right: np.ndarray, # object[:] - op: _BinOp, # binary operator + op: _BinOp, # binary operator ) -> np.ndarray: ... - @overload def maybe_convert_bool( arr: np.ndarray, # np.ndarray[object] @@ -44,7 +37,6 @@ def maybe_convert_bool( false_values=..., convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... - @overload def maybe_convert_bool( arr: np.ndarray, # np.ndarray[object] diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 92b970d47467e..9ff05adceb2b4 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -12,20 +12,17 @@ from pandas._typing import ( STR_NA_VALUES: set[str] - def sanitize_objects( values: np.ndarray, # ndarray[object] na_values: set, convert_empty: bool = ..., ) -> int: ... - class TextReader: unnamed_cols: set[str] - table_width: int # int64_t + table_width: int # int64_t leading_cols: int # int64_t header: list[list[int]] # non-negative integers - def __init__( self, source, @@ -64,14 +61,11 @@ class TextReader: mangle_dupe_cols: bool = ..., float_precision: Literal["round_trip", "legacy", "high"] | None = ..., skip_blank_lines: bool = ..., - encoding_errors: bytes | str = ... + encoding_errors: bytes | str = ..., ): ... - def set_error_bad_lines(self, status: int) -> None: ... def set_noconvert(self, i: int) -> None: ... def remove_noconvert(self, i: int) -> None: ... - def close(self) -> None: ... - def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ... def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ... 
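The `is_bool_list` helper added to pandas/_libs/lib.pyx above wins in the timings quoted in its docstring because it bails out on the first non-boolean element instead of materializing a NumPy array just to inspect its dtype. A rough pure-Python sketch of the same idea follows; the name `is_bool_list_sketch` is hypothetical and this is only an illustration of the approach, not the Cython implementation itself:

import numpy as np

def is_bool_list_sketch(obj: list) -> bool:
    # Bail out on the first element that is neither a Python bool nor a
    # NumPy np.bool_, instead of paying for np.array(obj) just to look at
    # the resulting dtype (np.bool_ is not a subclass of bool, so both
    # types are checked explicitly).
    for item in obj:
        if not isinstance(item, (bool, np.bool_)):
            return False
    # As in the Cython helper above, an empty list counts as all-bool.
    return True

# The mixed list short-circuits on the None at index 1 rather than
# scanning (or copying) the whole list.
assert is_bool_list_sketch([True, False] * 100)
assert not is_bool_list_sketch([True, None] + [True, False] * 100)

The early exit is what makes the mixed-type case in the docstring benchmark (`obj4`) so much faster than the dtype check, since the array constructor still has to walk every element before a dtype can be reported.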
diff --git a/pandas/_libs/reshape.pyi b/pandas/_libs/reshape.pyi index 7aaa18a7feff2..0457ceb1e03e6 100644 --- a/pandas/_libs/reshape.pyi +++ b/pandas/_libs/reshape.pyi @@ -1,19 +1,14 @@ import numpy as np def unstack( - values: np.ndarray, # reshape_t[:, :] - mask: np.ndarray, # const uint8_t[:] + values: np.ndarray, # reshape_t[:, :] + mask: np.ndarray, # const uint8_t[:] stride: int, length: int, width: int, new_values: np.ndarray, # reshape_t[:, :] - new_mask: np.ndarray, # uint8_t[:, :] + new_mask: np.ndarray, # uint8_t[:, :] ) -> None: ... - - def explode( values: np.ndarray, # np.ndarray[object] -) -> tuple[ - np.ndarray, # np.ndarray[object] - np.ndarray, # np.ndarray[np.int64] -]: ... +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[object] # np.ndarray[np.int64] diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 31b43cdb28d9d..cf530c8c07440 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -35,11 +35,13 @@ Numeric decoder derived from from TCL library * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. */ -#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #define PY_SSIZE_T_CLEAN #include #include + +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #include #include #include @@ -178,8 +180,6 @@ void *initObjToJSON(void) { Py_DECREF(mod_natype); } - /* Initialise numpy API */ - import_array(); // GH 31463 return NULL; } diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index 32011cb9cb92c..a8fdb4f55bfca 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -38,6 +38,8 @@ Numeric decoder derived from from TCL library #include "version.h" #define PY_SSIZE_T_CLEAN #include +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#include "numpy/arrayobject.h" /* objToJSON */ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs); @@ -73,6 +75,7 @@ static PyModuleDef moduledef = { PyMODINIT_FUNC PyInit_json(void) { + import_array() initObjToJSON(); // TODO(username): clean up, maybe via tp_free? return PyModuleDef_Init(&moduledef); } diff --git a/pandas/_libs/testing.pyi b/pandas/_libs/testing.pyi index ac0c772780c5c..01da496975f51 100644 --- a/pandas/_libs/testing.pyi +++ b/pandas/_libs/testing.pyi @@ -1,8 +1,12 @@ - - def assert_dict_equal(a, b, compare_keys: bool = ...): ... - -def assert_almost_equal(a, b, - rtol: float = ..., atol: float = ..., - check_dtype: bool = ..., - obj=..., lobj=..., robj=..., index_values=...): ... +def assert_almost_equal( + a, + b, + rtol: float = ..., + atol: float = ..., + check_dtype: bool = ..., + obj=..., + lobj=..., + robj=..., + index_values=..., +): ... diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 641e62e7c8973..f43a81f20700a 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -3,20 +3,16 @@ from datetime import tzinfo import numpy as np def format_array_from_datetime( - values: np.ndarray, # np.ndarray[np.int64] + values: np.ndarray, # np.ndarray[np.int64] tz: tzinfo | None = ..., format: str | None = ..., - na_rep: object = ... + na_rep: object = ..., ) -> np.ndarray: ... # np.ndarray[object] - - def array_with_unit_to_datetime( values: np.ndarray, unit: str, errors: str = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... 
- - def array_to_datetime( values: np.ndarray, # np.ndarray[object] errors: str = ..., @@ -26,4 +22,5 @@ def array_to_datetime( require_iso8601: bool = ..., allow_mixed: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... + # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslibs/ccalendar.pyi b/pandas/_libs/tslibs/ccalendar.pyi index 500a0423bc9cf..993f18a61d74a 100644 --- a/pandas/_libs/tslibs/ccalendar.pyi +++ b/pandas/_libs/tslibs/ccalendar.pyi @@ -1,4 +1,3 @@ - DAYS: list[str] MONTH_ALIASES: dict[int, str] MONTH_NUMBERS: dict[str, int] diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index 6470361542597..e74a56a519c5a 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -12,30 +12,16 @@ class OutOfBoundsTimedelta(ValueError): ... def precision_from_unit( unit: str, -) -> tuple[ - int, # int64_t - int, -]: ... - - +) -> tuple[int, int,]: ... # (int64_t, _) def ensure_datetime64ns( arr: np.ndarray, # np.ndarray[datetime64[ANY]] copy: bool = ..., ) -> np.ndarray: ... # np.ndarray[datetime64ns] - - def ensure_timedelta64ns( arr: np.ndarray, # np.ndarray[timedelta64[ANY]] copy: bool = ..., ) -> np.ndarray: ... # np.ndarray[timedelta64ns] - - def datetime_to_datetime64( values: np.ndarray, # np.ndarray[object] -) -> tuple[ - np.ndarray, # np.ndarray[dt64ns] - tzinfo | None, -]: ... - - +) -> tuple[np.ndarray, tzinfo | None,]: ... # (np.ndarray[dt64ns], _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index d3aea5b0be796..f6a8d7887ced1 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -5,20 +5,16 @@ from pandas._libs.tslibs.offsets import BaseOffset _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] - class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode # actually __cinit__ def __new__(self, code: int): ... - def freq_group_code(self) -> int: ... def date_offset(self) -> BaseOffset: ... - @classmethod def from_date_offset(cls, offset: BaseOffset) -> PeriodDtypeBase: ... - class FreqGroup(Enum): FR_ANN: int = ... FR_QTR: int = ... @@ -33,11 +29,9 @@ class FreqGroup(Enum): FR_US: int = ... FR_NS: int = ... FR_UND: int = ... - @staticmethod def get_freq_group(code: int) -> FreqGroup: ... - class Resolution(Enum): RESO_NS: int = ... RESO_US: int = ... @@ -49,19 +43,13 @@ class Resolution(Enum): RESO_MTH: int = ... RESO_QTR: int = ... RESO_YR: int = ... - def __lt__(self, other: Resolution) -> bool: ... - def __ge__(self, other: Resolution) -> bool: ... - @property def freq_group(self) -> FreqGroup: ... - @property def attrname(self) -> str: ... - @classmethod def from_attrname(cls, attrname: str) -> Resolution: ... - @classmethod def get_reso_from_freq(cls, freq: str) -> Resolution: ... diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index 22ae156d78b7d..244af38e25da0 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -3,67 +3,48 @@ import numpy as np def build_field_sarray( dtindex: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... - def month_position_check(fields, weekdays) -> str | None: ... - def get_date_name_field( dtindex: np.ndarray, # const int64_t[:] field: str, locale=..., ) -> np.ndarray: ... # np.ndarray[object] - def get_start_end_field( dtindex: np.ndarray, # const int64_t[:] field: str, freqstr: str | None = ..., - month_kw: int = ... 
+ month_kw: int = ..., ) -> np.ndarray: ... # np.ndarray[bool] - - def get_date_field( dtindex: np.ndarray, # const int64_t[:] - field: str, ) -> np.ndarray: ... # np.ndarray[in32] - - def get_timedelta_field( tdindex: np.ndarray, # const int64_t[:] field: str, ) -> np.ndarray: ... # np.ndarray[int32] - - def isleapyear_arr( years: np.ndarray, ) -> np.ndarray: ... # np.ndarray[bool] - def build_isocalendar_sarray( dtindex: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... - - def get_locale_names(name_type: str, locale: object = None): ... - class RoundTo: @property def MINUS_INFTY(self) -> int: ... - @property def PLUS_INFTY(self) -> int: ... - @property def NEAREST_HALF_EVEN(self) -> int: ... - @property def NEAREST_HALF_PLUS_INFTY(self) -> int: ... - @property def NEAREST_HALF_MINUS_INFTY(self) -> int: ... - def round_nsint64( - values: np.ndarray, # np.ndarray[np.int64] + values: np.ndarray, # np.ndarray[np.int64] mode: RoundTo, nanos: int, ) -> np.ndarray: ... # np.ndarray[np.int64] diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index 5a2985d0e815b..22e6395a1fe99 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,4 +1,3 @@ - from datetime import ( datetime, timedelta, @@ -17,11 +16,9 @@ def is_null_datetimelike(val: object, inat_is_null: bool = ...) -> bool: ... class NaTType(datetime): value: np.int64 - def asm8(self) -> np.datetime64: ... def to_datetime64(self) -> np.datetime64: ... def to_numpy(self, dtype=..., copy: bool = ...) -> np.datetime64: ... - @property def is_leap_year(self) -> bool: ... @property @@ -36,7 +33,6 @@ class NaTType(datetime): def is_quarter_end(self) -> bool: ... @property def is_year_end(self) -> bool: ... - @property def day_of_year(self) -> float: ... @property @@ -53,81 +49,61 @@ class NaTType(datetime): def week(self) -> float: ... @property def weekofyear(self) -> float: ... - def day_name(self) -> float: ... def month_name(self) -> float: ... - # error: Return type "float" of "weekday" incompatible with return # type "int" in supertype "date" def weekday(self) -> float: ... # type: ignore[override] - # error: Return type "float" of "isoweekday" incompatible with return # type "int" in supertype "date" def isoweekday(self) -> float: ... # type: ignore[override] - def total_seconds(self) -> float: ... - # error: Signature of "today" incompatible with supertype "datetime" def today(self, *args, **kwargs) -> NaTType: ... # type: ignore[override] # error: Signature of "today" incompatible with supertype "datetime" def now(self, *args, **kwargs) -> NaTType: ... # type: ignore[override] - def to_pydatetime(self) -> NaTType: ... def date(self) -> NaTType: ... - def round(self) -> NaTType: ... def floor(self) -> NaTType: ... def ceil(self) -> NaTType: ... - def tz_convert(self) -> NaTType: ... def tz_localize(self) -> NaTType: ... - def replace(self, *args, **kwargs) -> NaTType: ... - # error: Return type "float" of "year" incompatible with return # type "int" in supertype "date" @property def year(self) -> float: ... # type: ignore[override] - @property def quarter(self) -> float: ... - # error: Return type "float" of "month" incompatible with return # type "int" in supertype "date" @property def month(self) -> float: ... # type: ignore[override] - # error: Return type "float" of "day" incompatible with return # type "int" in supertype "date" @property def day(self) -> float: ... 
# type: ignore[override] - # error: Return type "float" of "hour" incompatible with return # type "int" in supertype "date" @property def hour(self) -> float: ... # type: ignore[override] - # error: Return type "float" of "minute" incompatible with return # type "int" in supertype "date" @property def minute(self) -> float: ... # type: ignore[override] - # error: Return type "float" of "second" incompatible with return # type "int" in supertype "date" @property def second(self) -> float: ... # type: ignore[override] - @property def millisecond(self) -> float: ... - # error: Return type "float" of "microsecond" incompatible with return # type "int" in supertype "date" @property def microsecond(self) -> float: ... # type: ignore[override] - @property def nanosecond(self) -> float: ... - # inject Timedelta properties @property def days(self) -> float: ... @@ -135,35 +111,29 @@ class NaTType(datetime): def microseconds(self) -> float: ... @property def nanoseconds(self) -> float: ... - # inject Period properties @property def qyear(self) -> float: ... - def __eq__(self, other: Any) -> bool: ... def __ne__(self, other: Any) -> bool: ... # https://github.com/python/mypy/issues/9015 # error: Argument 1 of "__lt__" is incompatible with supertype "date"; # supertype defines the argument type as "date" def __lt__( # type: ignore[override] - self, - other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + self, other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 ) -> bool: ... # error: Argument 1 of "__le__" is incompatible with supertype "date"; # supertype defines the argument type as "date" def __le__( # type: ignore[override] - self, - other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + self, other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 ) -> bool: ... # error: Argument 1 of "__gt__" is incompatible with supertype "date"; # supertype defines the argument type as "date" def __gt__( # type: ignore[override] - self, - other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + self, other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 ) -> bool: ... # error: Argument 1 of "__ge__" is incompatible with supertype "date"; # supertype defines the argument type as "date" def __ge__( # type: ignore[override] - self, - other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + self, other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 ) -> bool: ... diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 60bfaa38b495f..bac82b158589d 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -744,7 +744,6 @@ class NaTType(_NaT): * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -759,8 +758,6 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- a new Timestamp rounded to the given resolution of `freq` @@ -822,7 +819,6 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. 
versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -837,8 +833,6 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Raises ------ ValueError if the freq cannot be converted. @@ -896,7 +890,6 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -911,8 +904,6 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Raises ------ ValueError if the freq cannot be converted. @@ -1041,8 +1032,6 @@ default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- localized : Timestamp diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 1b1a497df4ca7..ac7447420596a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,7 +1,6 @@ import operator import re import time -from typing import Any import warnings import cython @@ -364,7 +363,7 @@ cdef class BaseOffset: self.normalize = normalize self._cache = {} - def __eq__(self, other: Any) -> bool: + def __eq__(self, other) -> bool: if isinstance(other, str): try: # GH#23524 if to_offset fails, we are dealing with an diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index f346204d69d25..fc08a48cee343 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -6,35 +6,26 @@ from pandas._libs.tslibs.offsets import BaseOffset class DateParseError(ValueError): ... - def parse_datetime_string( date_string: str, dayfirst: bool = ..., yearfirst: bool = ..., **kwargs, ) -> datetime: ... - - def parse_time_string( arg: str, freq: BaseOffset | str | None = ..., dayfirst: bool | None = ..., yearfirst: bool | None = ..., ) -> tuple[datetime, str]: ... - - def _does_string_look_like_datetime(py_string: str) -> bool: ... - def quarter_to_myear(year: int, quarter: int, freq: str) -> tuple[int, int]: ... - - def try_parse_dates( values: np.ndarray, # object[:] parser=..., dayfirst: bool = ..., default: datetime | None = ..., ) -> np.ndarray: ... # np.ndarray[object] - def try_parse_date_and_time( dates: np.ndarray, # object[:] times: np.ndarray, # object[:] @@ -42,40 +33,29 @@ def try_parse_date_and_time( time_parser=..., dayfirst: bool = ..., default: datetime | None = ..., -) -> np.ndarray: ... # np.ndarray[object] - +) -> np.ndarray: ... # np.ndarray[object] def try_parse_year_month_day( years: np.ndarray, # object[:] - months: np.ndarray, # object[:] - days: np.ndarray, # object[:] -) -> np.ndarray: ... # np.ndarray[object] - - + months: np.ndarray, # object[:] + days: np.ndarray, # object[:] +) -> np.ndarray: ... # np.ndarray[object] def try_parse_datetime_components( - years: np.ndarray, # object[:] + years: np.ndarray, # object[:] months: np.ndarray, # object[:] - days: np.ndarray, # object[:] - hours: np.ndarray, # object[:] - minutes: np.ndarray, # object[:] - seconds: np.ndarray, # object[:] -) -> np.ndarray: ... 
# np.ndarray[object] - - + days: np.ndarray, # object[:] + hours: np.ndarray, # object[:] + minutes: np.ndarray, # object[:] + seconds: np.ndarray, # object[:] +) -> np.ndarray: ... # np.ndarray[object] def format_is_iso(f: str) -> bool: ... - - def guess_datetime_format( dt_str, dayfirst: bool = ..., dt_str_parse=..., dt_str_split=..., ) -> str | None: ... - - def concat_date_cols( date_cols: tuple, keep_trivial_numbers: bool = ..., ) -> np.ndarray: ... # np.ndarray[object] - - def get_rule_month(source: str) -> str: ... diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 49e630d605310..97738d51b5a0e 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -19,41 +19,34 @@ def periodarr_to_dt64arr( periodarr: np.ndarray, # const int64_t[:] freq: int, ) -> np.ndarray: ... # np.ndarray[np.int64] - def period_asfreq_arr( arr: np.ndarray, # ndarray[int64_t] arr, freq1: int, freq2: int, end: bool, -) -> np.ndarray: ... # np.ndarray[np.int64] - +) -> np.ndarray: ... # np.ndarray[np.int64] def get_period_field_arr( field: str, arr: np.ndarray, # const int64_t[:] freq: int, -) -> np.ndarray: ... # np.ndarray[np.int64] - +) -> np.ndarray: ... # np.ndarray[np.int64] def from_ordinals( values: np.ndarray, # const int64_t[:] freq: Frequency, ) -> np.ndarray: ... # np.ndarray[np.int64] - def extract_ordinals( values: np.ndarray, # np.ndarray[object] freq: Frequency | int, ) -> np.ndarray: ... # np.ndarray[np.int64] - def extract_freq( values: np.ndarray, # np.ndarray[object] ) -> BaseOffset: ... # exposed for tests def period_asfreq(ordinal: int, freq1: int, freq2: int, end: bool) -> int: ... - def period_ordinal( y: int, m: int, d: int, h: int, min: int, s: int, us: int, ps: int, freq: int ) -> int: ... - def freq_to_dtype_code(freq: BaseOffset) -> int: ... def validate_end_alias(how: str) -> Literal["E", "S"]: ... @@ -75,84 +68,57 @@ class Period: minute=None, second=None, ) -> Period | NaTType: ... - @classmethod def _maybe_convert_freq(cls, freq) -> BaseOffset: ... - @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... - @classmethod def now(cls, freq=...) -> Period: ... - def strftime(self, fmt: str) -> str: ... - def to_timestamp( self, - freq: str | BaseOffset | None =..., + freq: str | BaseOffset | None = ..., how: str = ..., tz: Timezone | None = ..., ) -> Timestamp: ... - def asfreq(self, freq, how=...) -> Period: ... - @property def freqstr(self) -> str: ... - @property def is_leap_year(self) -> bool: ... - @property def daysinmonth(self) -> int: ... - @property def days_in_month(self) -> int: ... - @property def qyear(self) -> int: ... - @property def quarter(self) -> int: ... - @property def day_of_year(self) -> int: ... - @property def weekday(self) -> int: ... - @property def day_of_week(self) -> int: ... - @property def week(self) -> int: ... - @property def weekofyear(self) -> int: ... - @property def second(self) -> int: ... - @property def minute(self) -> int: ... - @property def hour(self) -> int: ... - @property def day(self) -> int: ... - @property def month(self) -> int: ... - @property def year(self) -> int: ... - @property def end_time(self) -> Timestamp: ... - @property def start_time(self) -> Timestamp: ... - def __sub__(self, other) -> Period | BaseOffset: ... - def __add__(self, other) -> Period: ... 
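Note (not part of the patch): the period.pyi stub above mainly compresses blank lines while keeping the declared Period API intact. As a purely illustrative sketch of what those annotations promise at runtime, assuming only an installed pandas:

import pandas as pd

# Period members annotated in period.pyi: start_time/end_time are Timestamps,
# asfreq returns another Period, to_timestamp returns a Timestamp, and the
# calendar field accessors are plain ints.
p = pd.Period("2021-06", freq="M")
assert isinstance(p.start_time, pd.Timestamp)              # start_time -> Timestamp
assert isinstance(p.end_time, pd.Timestamp)                # end_time -> Timestamp
assert isinstance(p.asfreq("D"), pd.Period)                # asfreq -> Period
assert isinstance(p.to_timestamp(how="S"), pd.Timestamp)   # to_timestamp -> Timestamp
assert p.year == 2021 and p.month == 6                     # int-valued fields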
diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index 3748c169bb1c6..891e257bcbcb4 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -1,11 +1,10 @@ -from typing import Optional - import numpy as np def array_strptime( values: np.ndarray, # np.ndarray[object] - fmt: Optional[str], + fmt: str | None, exact: bool = True, - errors: str = "raise" + errors: str = "raise", ) -> tuple[np.ndarray, np.ndarray]: ... -# first ndarray is M8[ns], second is object ndarray of Optional[tzinfo] + +# first ndarray is M8[ns], second is object ndarray of tzinfo | None diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 9ccc3a8ed5fa4..31a836b2c2079 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -15,26 +15,18 @@ from pandas._libs.tslibs import ( _S = TypeVar("_S") - def ints_to_pytimedelta( arr: np.ndarray, # const int64_t[:] box: bool = ..., ) -> np.ndarray: ... # np.ndarray[object] - - def array_to_timedelta64( values: np.ndarray, # ndarray[object] unit: str | None = ..., errors: str = ..., ) -> np.ndarray: ... # np.ndarray[m8ns] - - def parse_timedelta_unit(unit: str | None) -> str: ... - - def delta_to_nanoseconds(delta: Tick | np.timedelta64 | timedelta | int) -> int: ... - class Timedelta(timedelta): min: ClassVar[Timedelta] max: ClassVar[Timedelta] @@ -43,12 +35,8 @@ class Timedelta(timedelta): # error: "__new__" must return a class instance (got "Union[Timedelta, NaTType]") def __new__( # type: ignore[misc] - cls: Type[_S], - value=..., - unit=..., - **kwargs + cls: Type[_S], value=..., unit=..., **kwargs ) -> _S | NaTType: ... - @property def days(self) -> int: ... @property @@ -56,21 +44,16 @@ class Timedelta(timedelta): @property def microseconds(self) -> int: ... def total_seconds(self) -> float: ... - def to_pytimedelta(self) -> timedelta: ... def to_timedelta64(self) -> np.timedelta64: ... - @property def asm8(self) -> np.timedelta64: ... - # TODO: round/floor/ceil could return NaT? def round(self: _S, freq) -> _S: ... def floor(self: _S, freq) -> _S: ... def ceil(self: _S, freq) -> _S: ... - @property def resolution_string(self) -> str: ... - def __add__(self, other: timedelta) -> timedelta: ... def __radd__(self, other: timedelta) -> timedelta: ... def __sub__(self, other: timedelta) -> timedelta: ... @@ -80,19 +63,16 @@ class Timedelta(timedelta): def __abs__(self) -> timedelta: ... def __mul__(self, other: float) -> timedelta: ... def __rmul__(self, other: float) -> timedelta: ... - @overload def __floordiv__(self, other: timedelta) -> int: ... @overload def __floordiv__(self, other: int) -> timedelta: ... - @overload def __truediv__(self, other: timedelta) -> float: ... @overload def __truediv__(self, other: float) -> timedelta: ... def __mod__(self, other: timedelta) -> timedelta: ... def __divmod__(self, other: timedelta) -> tuple[int, timedelta]: ... - def __le__(self, other: timedelta) -> bool: ... def __lt__(self, other: timedelta) -> bool: ... def __ge__(self, other: timedelta) -> bool: ... diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 1c06538c7399e..ff6b18835322e 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -9,7 +9,6 @@ import sys from time import struct_time from typing import ( ClassVar, - Optional, Type, TypeVar, overload, @@ -27,10 +26,8 @@ from pandas._libs.tslibs import ( _S = TypeVar("_S") - def integer_op_not_supported(obj) -> None: ... 
- class Timestamp(datetime): min: ClassVar[Timestamp] max: ClassVar[Timestamp] @@ -41,9 +38,15 @@ class Timestamp(datetime): # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") def __new__( # type: ignore[misc] cls: Type[_S], - ts_input: int | np.integer | float | str | _date | datetime | np.datetime64 = ..., + ts_input: int + | np.integer + | float + | str + | _date + | datetime + | np.datetime64 = ..., freq=..., - tz: str | _tzinfo | None | int= ..., + tz: str | _tzinfo | None | int = ..., unit=..., year: int | None = ..., month: int | None = ..., @@ -55,11 +58,9 @@ class Timestamp(datetime): nanosecond: int | None = ..., tzinfo: _tzinfo | None = ..., *, - fold: int | None= ..., + fold: int | None = ..., ) -> _S | NaTType: ... - def _set_freq(self, freq: BaseOffset | None) -> None: ... - @property def year(self) -> int: ... @property @@ -75,22 +76,19 @@ class Timestamp(datetime): @property def microsecond(self) -> int: ... @property - def tzinfo(self) -> Optional[_tzinfo]: ... + def tzinfo(self) -> _tzinfo | None: ... @property - def tz(self) -> Optional[_tzinfo]: ... - + def tz(self) -> _tzinfo | None: ... @property def fold(self) -> int: ... - @classmethod - def fromtimestamp(cls: Type[_S], t: float, tz: Optional[_tzinfo] = ...) -> _S: ... + def fromtimestamp(cls: Type[_S], t: float, tz: _tzinfo | None = ...) -> _S: ... @classmethod def utcfromtimestamp(cls: Type[_S], t: float) -> _S: ... @classmethod def today(cls: Type[_S]) -> _S: ... @classmethod def fromordinal(cls: Type[_S], n: int) -> _S: ... - if sys.version_info >= (3, 8): @classmethod def now(cls: Type[_S], tz: _tzinfo | str | None = ...) -> _S: ... @@ -101,28 +99,23 @@ class Timestamp(datetime): @overload @classmethod def now(cls, tz: _tzinfo) -> datetime: ... - @classmethod def utcnow(cls: Type[_S]) -> _S: ... @classmethod - def combine(cls, date: _date, time: _time, tzinfo: Optional[_tzinfo] = ...) -> datetime: ... - + def combine( + cls, date: _date, time: _time, tzinfo: _tzinfo | None = ... + ) -> datetime: ... @classmethod def fromisoformat(cls: Type[_S], date_string: str) -> _S: ... - def strftime(self, fmt: str) -> str: ... def __format__(self, fmt: str) -> str: ... - def toordinal(self) -> int: ... def timetuple(self) -> struct_time: ... - def timestamp(self) -> float: ... - def utctimetuple(self) -> struct_time: ... def date(self) -> _date: ... def time(self) -> _time: ... def timetz(self) -> _time: ... - def replace( self, year: int = ..., @@ -132,26 +125,21 @@ class Timestamp(datetime): minute: int = ..., second: int = ..., microsecond: int = ..., - tzinfo: Optional[_tzinfo] = ..., + tzinfo: _tzinfo | None = ..., *, fold: int = ..., ) -> datetime: ... - if sys.version_info >= (3, 8): - def astimezone(self: _S, tz: Optional[_tzinfo] = ...) -> _S: ... + def astimezone(self: _S, tz: _tzinfo | None = ...) -> _S: ... else: - def astimezone(self, tz: Optional[_tzinfo] = ...) -> datetime: ... - + def astimezone(self, tz: _tzinfo | None = ...) -> datetime: ... def ctime(self) -> str: ... def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... - @classmethod def strptime(cls, date_string: str, format: str) -> datetime: ... - - def utcoffset(self) -> Optional[timedelta]: ... - def tzname(self) -> Optional[str]: ... - def dst(self) -> Optional[timedelta]: ... - + def utcoffset(self) -> timedelta | None: ... + def tzname(self) -> str | None: ... + def dst(self) -> timedelta | None: ... def __le__(self, other: datetime) -> bool: ... 
# type: ignore def __lt__(self, other: datetime) -> bool: ... # type: ignore def __ge__(self, other: datetime) -> bool: ... # type: ignore @@ -166,12 +154,10 @@ class Timestamp(datetime): def __sub__(self, other: datetime) -> timedelta: ... @overload def __sub__(self, other: timedelta) -> datetime: ... - def __hash__(self) -> int: ... def weekday(self) -> int: ... def isoweekday(self) -> int: ... def isocalendar(self) -> tuple[int, int, int]: ... - @property def is_leap_year(self) -> bool: ... @property @@ -186,23 +172,25 @@ class Timestamp(datetime): def is_quarter_end(self) -> bool: ... @property def is_year_end(self) -> bool: ... - def to_pydatetime(self, warn: bool = ...) -> datetime: ... def to_datetime64(self) -> np.datetime64: ... def to_period(self, freq) -> Period: ... def to_julian_date(self) -> np.float64: ... - @property def asm8(self) -> np.datetime64: ... - def tz_convert(self: _S, tz) -> _S: ... - # TODO: could return NaT? - def tz_localize(self: _S, tz, ambiguous: str = ..., nonexistent: str = ...) -> _S: ... - + def tz_localize( + self: _S, tz, ambiguous: str = ..., nonexistent: str = ... + ) -> _S: ... def normalize(self: _S) -> _S: ... - # TODO: round/floor/ceil could return NaT? - def round(self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...) -> _S: ... - def floor(self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...) -> _S: ... - def ceil(self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...) -> _S: ... + def round( + self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ... + ) -> _S: ... + def floor( + self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ... + ) -> _S: ... + def ceil( + self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ... + ) -> _S: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 7b03522d56d76..edd3b58867e87 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1378,7 +1378,6 @@ class Timestamp(_Timestamp): * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -1393,8 +1392,6 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- a new Timestamp rounded to the given resolution of `freq` @@ -1458,7 +1455,6 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -1473,8 +1469,6 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Raises ------ ValueError if the freq cannot be converted. @@ -1532,7 +1526,6 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. 
versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -1547,8 +1540,6 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Raises ------ ValueError if the freq cannot be converted. @@ -1669,8 +1660,6 @@ default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- localized : Timestamp diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 8e82d8a180aa6..d28b851d0fbc1 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -110,8 +110,6 @@ def tz_localize_to_utc(ndarray[int64_t] vals, tzinfo tz, object ambiguous=None, timedelta-like} How to handle non-existent times when converting wall times to UTC - .. versionadded:: 0.24.0 - Returns ------- localized : ndarray[int64_t] diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index 3391edac84224..fe083fe415e4b 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -11,58 +11,50 @@ def roll_sum( start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t -) -> np.ndarray: ... # np.ndarray[float] - +) -> np.ndarray: ... # np.ndarray[float] def roll_mean( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t -) -> np.ndarray: ... # np.ndarray[float] - +) -> np.ndarray: ... # np.ndarray[float] def roll_var( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ddof: int = ..., -) -> np.ndarray: ... # np.ndarray[float] - +) -> np.ndarray: ... # np.ndarray[float] def roll_skew( values: np.ndarray, # np.ndarray[np.float64] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t -) -> np.ndarray: ... # np.ndarray[float] - +) -> np.ndarray: ... # np.ndarray[float] def roll_kurt( values: np.ndarray, # np.ndarray[np.float64] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t -) -> np.ndarray: ... # np.ndarray[float] - +) -> np.ndarray: ... # np.ndarray[float] def roll_median_c( values: np.ndarray, # np.ndarray[np.float64] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... # np.ndarray[float] - def roll_max( values: np.ndarray, # np.ndarray[np.float64] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... # np.ndarray[float] - def roll_min( values: np.ndarray, # np.ndarray[np.float64] start: np.ndarray, # np.ndarray[np.int64] end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... # np.ndarray[float] - def roll_quantile( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] @@ -71,7 +63,6 @@ def roll_quantile( quantile: float, # float64_t interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], ) -> np.ndarray: ... 
# np.ndarray[float] - def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] @@ -82,26 +73,22 @@ def roll_apply( args: tuple[Any, ...], kwargs: dict[str, Any], ) -> np.ndarray: ... # np.ndarray[float] # FIXME: could also be type(obj) if n==0 - def roll_weighted_sum( values: np.ndarray, # const float64_t[:] weights: np.ndarray, # const float64_t[:] minp: int, ) -> np.ndarray: ... # np.ndarray[np.float64] - def roll_weighted_mean( values: np.ndarray, # const float64_t[:] weights: np.ndarray, # const float64_t[:] minp: int, ) -> np.ndarray: ... # np.ndarray[np.float64] - def roll_weighted_var( values: np.ndarray, # const float64_t[:] weights: np.ndarray, # const float64_t[:] minp: int, # int64_t ddof: int, # unsigned int ) -> np.ndarray: ... # np.ndarray[np.float64] - def ewma( vals: np.ndarray, # const float64_t[:] start: np.ndarray, # const int64_t[:] @@ -112,7 +99,6 @@ def ewma( ignore_na: bool, deltas: np.ndarray, # const float64_t[:] ) -> np.ndarray: ... # np.ndarray[np.float64] - def ewmcov( input_x: np.ndarray, # const float64_t[:] start: np.ndarray, # const int64_t[:] diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi index a32fe2f0f8b03..2dea9362228e5 100644 --- a/pandas/_libs/window/indexers.pyi +++ b/pandas/_libs/window/indexers.pyi @@ -1,13 +1,10 @@ import numpy as np def calculate_variable_window_bounds( - num_values: int, # int64_t - window_size: int, # int64_t + num_values: int, # int64_t + window_size: int, # int64_t min_periods, center: bool, closed: str | None, index: np.ndarray, # const int64_t[:] -) -> tuple[ - np.ndarray, # np.ndarray[np.int64] - np.ndarray, # np.ndarray[np.int64] -]: ... +) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # np.ndarray[np.int64] diff --git a/pandas/_libs/writers.pyi b/pandas/_libs/writers.pyi index 67f6059c2a825..c188dc2bd9048 100644 --- a/pandas/_libs/writers.pyi +++ b/pandas/_libs/writers.pyi @@ -8,15 +8,11 @@ def write_csv_rows( cols: np.ndarray, writer: object, # _csv.writer ) -> None: ... - def convert_json_to_lines(arr: str) -> str: ... - def max_len_string_array( arr: np.ndarray, # pandas_string[:] ) -> int: ... - def word_len(val: object) -> int: ... 
- def string_array_replace_from_nan_rep( arr: np.ndarray, # np.ndarray[object, ndim=1] nan_rep: object, diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 40f23c25a1e99..fc6c7f4c17ea0 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -278,7 +278,7 @@ def makeUnicodeIndex(k=10, name=None): def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): - """ make a length k index or n categories """ + """make a length k index or n categories""" x = rands_array(nchars=4, size=n) return CategoricalIndex( Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs @@ -286,7 +286,7 @@ def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): def makeIntervalIndex(k=10, name=None, **kwargs): - """ make a length k IntervalIndex """ + """make a length k IntervalIndex""" x = np.linspace(0, 100, num=(k + 1)) return IntervalIndex.from_breaks(x, name=name, **kwargs) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 96d010b487a79..1942e07d1b562 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -48,6 +48,7 @@ TimedeltaArray, ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.arrays.string_ import StringDtype from pandas.io.formats.printing import pprint_thing @@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None) if isinstance(left, np.ndarray): left = pprint_thing(left) - elif is_categorical_dtype(left) or isinstance(left, PandasDtype): + elif ( + is_categorical_dtype(left) + or isinstance(left, PandasDtype) + or isinstance(left, StringDtype) + ): left = repr(left) if isinstance(right, np.ndarray): right = pprint_thing(right) - elif is_categorical_dtype(right) or isinstance(right, PandasDtype): + elif ( + is_categorical_dtype(right) + or isinstance(right, PandasDtype) + or isinstance(right, StringDtype) + ): right = repr(right) msg += f""" diff --git a/pandas/_typing.py b/pandas/_typing.py index 7763b0ceb610a..12d23786c3387 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -170,8 +170,8 @@ # filenames and file-like-objects Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] -FileOrBuffer = Union[str, Buffer[T]] -FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[T]] +FileOrBuffer = Union[str, Buffer[AnyStr]] +FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[AnyStr]] # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 0fa070b6e4fc4..89d362eb77e68 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. """ from pandas.core.arrays import ( + ArrowStringArray, BooleanArray, Categorical, DatetimeArray, @@ -18,6 +19,7 @@ ) __all__ = [ + "ArrowStringArray", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 69dc3ac417510..cea1b80d340c8 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -15,12 +15,9 @@ methods that are spread throughout the codebase. This module will make it easier to adjust to future upstream changes in the analogous numpy signatures. 
""" -from typing import ( - Any, - Dict, - Optional, - Union, -) +from __future__ import annotations + +from typing import Any from numpy import ndarray @@ -41,7 +38,7 @@ def __init__( self, defaults, fname=None, - method: Optional[str] = None, + method: str | None = None, max_fname_arg_count=None, ): self.fname = fname @@ -55,7 +52,7 @@ def __call__( kwargs, fname=None, max_fname_arg_count=None, - method: Optional[str] = None, + method: str | None = None, ) -> None: if args or kwargs: fname = self.fname if fname is None else fname @@ -119,7 +116,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} +ARGSORT_DEFAULTS: dict[str, int | str | None] = {} ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None @@ -132,7 +129,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): # two different signatures of argsort, this second validation for when the # `kind` param is supported -ARGSORT_DEFAULTS_KIND: Dict[str, Optional[int]] = {} +ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -155,7 +152,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs): return ascending -CLIP_DEFAULTS: Dict[str, Any] = {"out": None} +CLIP_DEFAULTS: dict[str, Any] = {"out": None} validate_clip = CompatValidator( CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 ) @@ -176,7 +173,7 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -CUM_FUNC_DEFAULTS: Dict[str, Any] = {} +CUM_FUNC_DEFAULTS: dict[str, Any] = {} CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -201,7 +198,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS: Dict[str, Optional[bool]] = {} +ALLANY_DEFAULTS: dict[str, bool | None] = {} ALLANY_DEFAULTS["dtype"] = None ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -224,28 +221,28 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS: Dict[str, str] = {"order": "C"} +RESHAPE_DEFAULTS: dict[str, str] = {"order": "C"} validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 ) -REPEAT_DEFAULTS: Dict[str, Any] = {"axis": None} +REPEAT_DEFAULTS: dict[str, Any] = {"axis": None} validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS: Dict[str, Any] = {"out": None} +ROUND_DEFAULTS: dict[str, Any] = {"out": None} validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} +SORT_DEFAULTS: dict[str, int | str | None] = {} SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS: Dict[str, Optional[Any]] = {} +STAT_FUNC_DEFAULTS: dict[str, Any | None] = {} STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -279,13 +276,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS: Dict[str, Optional[bool]] = {} +STAT_DDOF_FUNC_DEFAULTS: dict[str, 
bool | None] = {} STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS: Dict[str, Optional[str]] = {} +TAKE_DEFAULTS: dict[str, str | None] = {} TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") @@ -392,7 +389,7 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: +def validate_minmax_axis(axis: int | None, ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. diff --git a/pandas/conftest.py b/pandas/conftest.py index 329023ed7ba6a..218fae7ecd969 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -580,7 +580,7 @@ def datetime_series(): def _create_series(index): - """ Helper for the _series dict """ + """Helper for the _series dict""" size = len(index) data = np.random.randn(size) return Series(data, index=index, name="a") @@ -1120,9 +1120,9 @@ def string_dtype(request): @pytest.fixture( params=[ - "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -1130,14 +1130,32 @@ def nullable_string_dtype(request): """ Parametrized fixture for string dtypes. - * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' + """ + return request.param + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), + ] +) +def string_storage(request): """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + Parametrized fixture for pd.options.mode.string_storage. + * 'python' + * 'pyarrow' + """ return request.param +# Alias so we can test with cartesian product of string_storage +string_storage2 = string_storage + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ @@ -1163,9 +1181,9 @@ def object_dtype(request): @pytest.fixture( params=[ "object", - "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -1173,11 +1191,9 @@ def any_string_dtype(request): """ Parametrized fixture for string dtypes. * 'object' - * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - return request.param diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 30f42435ad177..f26cf113f7d5e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -938,8 +938,6 @@ def mode(values, dropna: bool = True) -> Series: dropna : bool, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.24.0 - Returns ------- mode : Series @@ -1532,13 +1530,13 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: Input array. If `sorter` is None, then it must be sorted in ascending order, otherwise `sorter` must be an array of indices that sort it. - value : array_like + value : array-like Values to insert into `arr`. 
side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional + sorter : 1-D array-like, optional Optional array of integer indices that sort array a into ascending order. They are typically the result of argsort. @@ -1840,7 +1838,7 @@ def safe_sort( def _sort_mixed(values) -> np.ndarray: - """ order ints before strings in 1d arrays, safe in py3 """ + """order ints before strings in 1d arrays, safe in py3""" str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 00b49c2f4f951..388c1881afed7 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -77,7 +77,7 @@ def frame_apply( args=None, kwargs=None, ) -> FrameApply: - """ construct and return a row or column based frame apply object """ + """construct and return a row or column based frame apply object""" axis = obj._get_axis_number(axis) klass: type[FrameApply] if axis == 0: @@ -639,7 +639,7 @@ def dtypes(self) -> Series: return self.obj.dtypes def apply(self) -> FrameOrSeriesUnion: - """ compute the results """ + """compute the results""" # dispatch to agg if is_list_like(self.f): return self.apply_multiple() @@ -733,7 +733,7 @@ def apply_empty_result(self): return self.obj.copy() def apply_raw(self): - """ apply to the values as a numpy array """ + """apply to the values as a numpy array""" def wrap_function(func): """ @@ -867,7 +867,7 @@ def result_columns(self) -> Index: def wrap_results_for_axis( self, results: ResType, res_index: Index ) -> FrameOrSeriesUnion: - """ return the results for the rows """ + """return the results for the rows""" if self.result_type == "reduce": # e.g. test_apply_dict GH#8735 @@ -950,7 +950,7 @@ def result_columns(self) -> Index: def wrap_results_for_axis( self, results: ResType, res_index: Index ) -> FrameOrSeriesUnion: - """ return the results for the columns """ + """return the results for the columns""" result: FrameOrSeriesUnion # we have requested to expand @@ -969,7 +969,7 @@ def wrap_results_for_axis( return result def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: - """ infer the results to the same shape as the input object """ + """infer the results to the same shape as the input object""" result = self.obj._constructor(data=results) result = result.T diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 3daf1b3ae3902..3a67f7d871f86 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -1,10 +1,9 @@ """ EA-compatible analogue to to np.putmask """ -from typing import ( - Any, - Tuple, -) +from __future__ import annotations + +from typing import Any import warnings import numpy as np @@ -171,7 +170,7 @@ def putmask_without_repeat(values: np.ndarray, mask: np.ndarray, new: Any) -> No np.putmask(values, mask, new) -def validate_putmask(values: ArrayLike, mask: np.ndarray) -> Tuple[np.ndarray, bool]: +def validate_putmask(values: ArrayLike, mask: np.ndarray) -> tuple[np.ndarray, bool]: """ Validate mask and check if this putmask operation is a no-op. 
""" diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 2d3a168a31e1e..df4407067b131 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -1,13 +1,13 @@ """ Methods used by Block.replace and related methods. """ +from __future__ import annotations + import operator import re from typing import ( Any, - Optional, Pattern, - Union, ) import numpy as np @@ -42,30 +42,30 @@ def should_use_regex(regex: bool, to_replace: Any) -> bool: def compare_or_regex_search( - a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: np.ndarray -) -> Union[ArrayLike, bool]: + a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: np.ndarray +) -> ArrayLike | bool: """ - Compare two array_like inputs of the same shape or two scalar values + Compare two array-like inputs of the same shape or two scalar values Calls operator.eq or re.search, depending on regex argument. If regex is True, perform an element-wise regex matching. Parameters ---------- - a : array_like + a : array-like b : scalar or regex pattern regex : bool mask : np.ndarray[bool] Returns ------- - mask : array_like of bool + mask : array-like of bool """ if isna(b): return ~mask def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] + result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern ): """ Raises an error if the two arrays (a,b) cannot be compared. @@ -115,7 +115,7 @@ def _check_comparison_types( return result -def replace_regex(values: ArrayLike, rx: re.Pattern, value, mask: Optional[np.ndarray]): +def replace_regex(values: ArrayLike, rx: re.Pattern, value, mask: np.ndarray | None): """ Parameters ---------- diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 22f15ca9650db..e301e82a0ee75 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,12 +17,14 @@ ) from pandas.core.arrays.sparse import SparseArray from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray __all__ = [ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", + "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index a537951786646..cac9fcd40fa52 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -2,8 +2,7 @@ Helper functions to generate range-like data for DatetimeArray (and possibly TimedeltaArray/PeriodArray) """ - -from typing import Union +from __future__ import annotations import numpy as np @@ -17,8 +16,8 @@ def generate_regular_range( - start: Union[Timestamp, Timedelta], - end: Union[Timestamp, Timedelta], + start: Timestamp | Timedelta, + end: Timestamp | Timedelta, periods: int, freq: BaseOffset, ): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7dddb9f3d6f25..888c7cbbffb59 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -35,6 +35,7 @@ from pandas.util._decorators import ( Appender, Substitution, + cache_readonly, ) from pandas.util._validators import ( validate_bool_kwarg, @@ -250,8 +251,6 @@ def _from_sequence_of_strings( """ Construct a new ExtensionArray from a sequence of strings. - .. 
versionadded:: 0.24.0 - Parameters ---------- strings : Sequence @@ -754,8 +753,6 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: Newly introduced missing values are filled with ``self.dtype.na_value``. - .. versionadded:: 0.24.0 - Parameters ---------- periods : int, default 1 @@ -766,8 +763,6 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: The scalar value to use for newly introduced missing values. The default is ``self.dtype.na_value``. - .. versionadded:: 0.24.0 - Returns ------- ExtensionArray @@ -816,8 +811,6 @@ def searchsorted(self, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. - .. versionadded:: 0.24.0 - Find the indices into a sorted array `self` (a) such that, if the corresponding elements in `value` were inserted before the indices, the order of `self` would be preserved. @@ -833,13 +826,13 @@ def searchsorted(self, value, side="left", sorter=None): Parameters ---------- - value : array_like + value : array-like Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional + sorter : 1-D array-like, optional Optional array of integer indices that sort array a into ascending order. They are typically the result of argsort. @@ -1273,7 +1266,9 @@ def _concat_same_type( # such as take(), reindex(), shift(), etc. In addition, those results # will then be of the ExtensionArray subclass rather than an array # of objects - _can_hold_na = True + @cache_readonly + def _can_hold_na(self) -> bool: + return self.dtype._can_hold_na def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 068f5703649fa..ecc45357db8c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -353,7 +353,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" - _can_hold_na = True _dtype: CategoricalDtype @@ -654,11 +653,6 @@ def from_codes( If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. - .. versionadded:: 0.24.0 - - When `dtype` is provided, neither `categories` nor `ordered` - should be provided. - Returns ------- Categorical @@ -1642,7 +1636,7 @@ def _internal_get_values(self): return np.array(self) def check_for_ordered(self, op): - """ assert that we are ordered """ + """assert that we are ordered""" if not self.ordered: raise TypeError( f"Categorical is not ordered for operation {op}\n" @@ -2036,7 +2030,8 @@ def _validate_setitem_value(self, value): from pandas import Index - to_add = Index(rvalue).difference(self.categories) + # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 + to_add = Index(rvalue, tupleize_cols=False).difference(self.categories) # no assignments of values not in categories, but it's always ok to set # something to np.nan @@ -2171,8 +2166,6 @@ def mode(self, dropna=True): dropna : bool, default True Don't consider counts of NaN/NaT. - .. 
versionadded:: 0.24.0 - Returns ------- modes : `Categorical` (sorted) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4b264eef4bada..08cb12a1373bb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -159,6 +159,10 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): _recognized_scalars: tuple[type, ...] _ndarray: np.ndarray + @cache_readonly + def _can_hold_na(self) -> bool: + return True + def __init__(self, data, dtype: Dtype | None = None, freq=None, copy=False): raise AbstractMethodError(self) @@ -1592,8 +1596,6 @@ def strftime(self, date_format: str) -> np.ndarray: - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1607,8 +1609,6 @@ def strftime(self, date_format: str) -> np.ndarray: - 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- DatetimeIndex, TimedeltaIndex, or Series diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7867471da6b94..92a906e9fd8b0 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -83,6 +83,7 @@ if TYPE_CHECKING: from typing import Literal + from pandas import DataFrame from pandas.core.arrays import ( PeriodArray, TimedeltaArray, @@ -151,8 +152,6 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. - .. versionadded:: 0.24.0 - .. warning:: DatetimeArray is currently experimental, and its API may change @@ -910,8 +909,6 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArr - 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- Same type as self @@ -1260,7 +1257,7 @@ def day_name(self, locale=None): return result @property - def time(self): + def time(self) -> np.ndarray: """ Returns numpy array of datetime.time. The time part of the Timestamps. """ @@ -1272,7 +1269,7 @@ def time(self): return ints_to_pydatetime(timestamps, box="time") @property - def timetz(self): + def timetz(self) -> np.ndarray: """ Returns numpy array of datetime.time also containing timezone information. The time part of the Timestamps. @@ -1280,7 +1277,7 @@ def timetz(self): return ints_to_pydatetime(self.asi8, self.tz, box="time") @property - def date(self): + def date(self) -> np.ndarray: """ Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). @@ -1292,7 +1289,7 @@ def date(self): return ints_to_pydatetime(timestamps, box="date") - def isocalendar(self): + def isocalendar(self) -> DataFrame: """ Returns a DataFrame with the year, week, and day calculated according to the ISO 8601 standard. @@ -1875,7 +1872,7 @@ def weekofyear(self): """, ) - def to_julian_date(self): + def to_julian_date(self) -> np.ndarray: """ Convert Datetime Array to float64 ndarray of Julian Dates. 0 Julian date is noon January 1, 4713 BC. 
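Note (not part of the patch): the datetimes.py hunk above adds explicit return annotations — np.ndarray for time/timetz/date/to_julian_date and DataFrame for isocalendar. A minimal, illustrative check of that runtime behavior, assuming a recent pandas install:

import numpy as np
import pandas as pd

dti = pd.date_range("2021-01-01", periods=3, freq="D", tz="UTC")
assert isinstance(dti.time, np.ndarray)    # object array of datetime.time
assert isinstance(dti.date, np.ndarray)    # object array of datetime.date
iso = dti.isocalendar()                    # DataFrame, one row per timestamp
assert isinstance(iso, pd.DataFrame)
assert list(iso.columns) == ["year", "week", "day"]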
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b533018cdfa6b..c9ba762a271bd 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -237,8 +237,6 @@ class IntegerArray(NumericArray): """ Array of integer (optional missing) values. - .. versionadded:: 0.24.0 - .. versionchanged:: 1.0.0 Now uses :attr:`pandas.NA` as the missing value rather diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 8836695efcbcb..2318cae004c5a 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1209,8 +1209,6 @@ def mid(self): endpoints. Intervals that only have an open endpoint in common do not overlap. - .. versionadded:: 0.24.0 - Parameters ---------- other : %(klass)s @@ -1290,8 +1288,6 @@ def closed(self): Return an %(klass)s identical to the current one, but closed on the specified side. - .. versionadded:: 0.24.0 - Parameters ---------- closed : {'left', 'right', 'both', 'neither'} diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 11f9f645920ec..d274501143916 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -78,7 +78,7 @@ class BaseMaskedDtype(ExtensionDtype): @cache_readonly def numpy_dtype(self) -> np.dtype: - """ Return an instance of our numpy dtype """ + """Return an instance of our numpy dtype""" return np.dtype(self.type) @cache_readonly @@ -87,7 +87,7 @@ def kind(self) -> str: @cache_readonly def itemsize(self) -> int: - """ Return the number of bytes in this dtype """ + """Return the number of bytes in this dtype""" return self.numpy_dtype.itemsize @classmethod diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index dc592f205b3ea..ec7bd132832d1 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -36,8 +36,6 @@ class PandasArray( """ A pandas ExtensionArray for NumPy data. - .. versionadded:: 0.24.0 - This is mostly for internal compatibility, and is not especially useful on its own. diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d8c1b9cef468a..04db06ee9fb66 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -1072,11 +1072,10 @@ def dt64arr_to_periodarr(data, freq, tz=None): elif isinstance(data, ABCSeries): data, freq = data._values, data.dt.freq - freq = Period._maybe_convert_freq(freq) - - if isinstance(data, (ABCIndex, ABCSeries)): + elif isinstance(data, (ABCIndex, ABCSeries)): data = data._values + freq = Period._maybe_convert_freq(freq) base = freq._period_dtype_code return c_dt64arr_to_periodarr(data.view("i8"), base, tz), freq @@ -1138,7 +1137,7 @@ def _range_from_fields( minute=None, second=None, freq=None, -): +) -> tuple[np.ndarray, BaseOffset]: if hour is None: hour = 0 if minute is None: @@ -1176,7 +1175,7 @@ def _range_from_fields( return np.array(ordinals, dtype=np.int64), freq -def _make_field_arrays(*fields): +def _make_field_arrays(*fields) -> list[np.ndarray]: length = None for x in fields: if isinstance(x, (list, np.ndarray, ABCSeries)): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 6ab296b314615..7d3917203d7b6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -224,10 +224,6 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ An ExtensionArray for storing sparse data. - .. versionchanged:: 0.24.0 - - Implements the ExtensionArray interface. 
- Parameters ---------- data : array-like diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 4b077c755a029..a8f8f10e8716d 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -45,8 +45,6 @@ class SparseDtype(ExtensionDtype): This dtype implements the pandas ExtensionDtype interface. - .. versionadded:: 0.24.0 - Parameters ---------- dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index ad2c5f75fc32c..7ebda1f17ba56 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -34,7 +34,7 @@ def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): nonnull_labels = ss.dropna() def get_indexers(levels): - """ Return sparse coords and dense labels for subset levels """ + """Return sparse coords and dense labels for subset levels""" # TODO: how to do this better? cleanly slice nonnull_labels given the # coord values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ab1dadf4d2dfa..8d150c8f6ad3d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np +from pandas._config import get_option + from pandas._libs import ( lib, missing as libmissing, @@ -14,6 +19,7 @@ Scalar, type_t, ) +from pandas.compat import pa_version_under1p0 from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ( @@ -37,6 +43,7 @@ IntegerArray, PandasArray, ) +from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -62,6 +69,11 @@ class StringDtype(ExtensionDtype): In particular, StringDtype.na_value may change to no longer be ``numpy.nan``. + Parameters + ---------- + storage : {"python", "pyarrow"}, optional + If not given, the value of ``pd.options.mode.string_storage``. + Attributes ---------- None @@ -73,20 +85,93 @@ class StringDtype(ExtensionDtype): Examples -------- >>> pd.StringDtype() - StringDtype + string[python] + + >>> pd.StringDtype(storage="pyarrow") + string[pyarrow] """ name = "string" #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + if storage == "pyarrow" and pa_version_under1p0: + raise ImportError( + "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + ) + + self.storage = storage @property def type(self) -> type[str]: return str @classmethod - def construct_array_type(cls) -> type_t[StringArray]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taking from `string`. 
+ Valid options and their storage types are + + ========================== ============================================== + string result storage + ========================== ============================================== + ``'string'`` pd.options.mode.string_storage, default python + ``'string[python]'`` python + ``'string[pyarrow]'`` pyarrow + ========================== ============================================== + + Returns + ------- + StringDtype + + Raise + ----- + TypeError + If the string is not a valid option. + + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + return cls() + elif string == "string[python]": + return cls(storage="python") + elif string == "string[pyarrow]": + return cls(storage="pyarrow") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str) and other == "string": + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + # https://github.com/pandas-dev/pandas/issues/36126 + # error: Signature of "construct_array_type" incompatible with supertype + # "ExtensionDtype" + def construct_array_type( # type: ignore[override] + self, + ) -> type_t[BaseStringArray]: """ Return the array type associated with this dtype. @@ -94,30 +179,44 @@ def construct_array_type(cls) -> type_t[StringArray]: ------- type """ - return StringArray + from pandas.core.arrays.string_arrow import ArrowStringArray + + if self.storage == "python": + return StringArray + else: + return ArrowStringArray + + def __repr__(self): + return f"string[{self.storage}]" - def __repr__(self) -> str: - return "StringDtype" + def __str__(self): + return self.name def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> StringArray: + ) -> BaseStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. """ - import pyarrow + if self.storage == "pyarrow": + from pandas.core.arrays.string_arrow import ArrowStringArray - if isinstance(array, pyarrow.Array): - chunks = [array] + return ArrowStringArray(array) else: - # pyarrow.ChunkedArray - chunks = array.chunks - results = [] - for arr in chunks: - # using _from_sequence to ensure None is converted to NA - str_arr = StringArray._from_sequence(np.array(arr)) - results.append(str_arr) + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # using _from_sequence to ensure None is converted to NA + str_arr = StringArray._from_sequence(np.array(arr)) + results.append(str_arr) if results: return StringArray._concat_same_type(results) @@ -125,7 +224,11 @@ def __from_arrow__( return StringArray(np.array([], dtype="object")) -class StringArray(PandasArray): +class BaseStringArray(ExtensionArray): + pass + + +class StringArray(BaseStringArray, PandasArray): """ Extension array for string data. 
@@ -210,7 +313,7 @@ def __init__(self, values, copy=False): super().__init__(values, copy=copy) # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") - NDArrayBacked.__init__(self, self._ndarray, StringDtype()) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) if not isinstance(values, type(self)): self._validate() @@ -226,8 +329,9 @@ def _validate(self): @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): - if dtype: - assert dtype == "string" + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" from pandas.core.arrays.masked import BaseMaskedArray @@ -247,7 +351,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype()) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) return new_string_array @@ -416,7 +520,7 @@ def _str_map( from pandas.arrays import BooleanArray if dtype is None: - dtype = StringDtype() + dtype = StringDtype(storage="python") if na_value is None: na_value = self.dtype.na_value diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3cf471e381da9..ab8599f0f05ba 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -11,16 +11,12 @@ import numpy as np -from pandas._libs import ( - lib, - missing as libmissing, -) +from pandas._libs import lib from pandas._typing import ( Dtype, NpDtype, PositionalIndexer, Scalar, - type_t, ) from pandas.compat import ( pa_version_under1p0, @@ -43,7 +39,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna from pandas.core import missing @@ -52,7 +47,10 @@ from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype -from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_ import ( + BaseStringArray, + StringDtype, +) from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -86,99 +84,12 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) -@register_extension_dtype -class ArrowStringDtype(StringDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.2.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> from pandas.core.arrays.string_arrow import ArrowStringDtype - >>> ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - def __init__(self): - _chk_pyarrow_available() - - @property - def type(self) -> type[str]: - return str - - @classmethod - def construct_array_type(cls) -> type_t[ArrowStringArray]: # type: ignore[override] - """ - Return the array type associated with this dtype. 
- - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( # type: ignore[override] - self, array: pa.Array | pa.ChunkedArray - ) -> ArrowStringArray: - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): +class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -216,14 +127,14 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") ['This is', 'some text', , 'data.'] - Length: 4, dtype: arrow_string + Length: 4, dtype: string """ def __init__(self, values): - self._dtype = ArrowStringDtype() + self._dtype = StringDtype(storage="pyarrow") if isinstance(values, pa.Array): self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -242,6 +153,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) _chk_pyarrow_available() + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and # numerical issues with Float32Dtype @@ -261,9 +176,9 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @property - def dtype(self) -> ArrowStringDtype: + def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. + An instance of 'string[pyarrow]'. """ return self._dtype @@ -761,7 +676,8 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------------ # String methods interface - _str_na_value = ArrowStringDtype.na_value + # error: Cannot determine type of 'na_value' + _str_na_value = StringDtype.na_value # type: ignore[has-type] def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index ea87ac64cfe22..a03a8a412872f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -70,6 +70,7 @@ from pandas.core.ops.common import unpack_zerodim_and_defer if TYPE_CHECKING: + from pandas import DataFrame from pandas.core.arrays import ( DatetimeArray, PeriodArray, @@ -96,8 +97,6 @@ class TimedeltaArray(dtl.TimelikeOps): """ Pandas ExtensionArray for timedelta data. - .. versionadded:: 0.24.0 - .. 
warning:: TimedeltaArray is currently experimental, and its API may change @@ -882,14 +881,14 @@ def to_pytimedelta(self) -> np.ndarray: ) @property - def components(self): + def components(self) -> DataFrame: """ Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. Returns ------- - a DataFrame + DataFrame """ from pandas import DataFrame diff --git a/pandas/core/base.py b/pandas/core/base.py index 55e776d2e6b73..104baa04d3459 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -349,8 +349,6 @@ def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. - .. versionadded:: 0.24.0 - Returns ------- ExtensionArray @@ -421,8 +419,6 @@ def to_numpy( """ A NumPy ndarray representing the values in this Series or Index. - .. versionadded:: 0.24.0 - Parameters ---------- dtype : str or numpy.dtype, optional @@ -1137,13 +1133,13 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1): Parameters ---------- - value : array_like + value : array-like Values to insert into `self`. side : {{'left', 'right'}}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional + sorter : 1-D array-like, optional Optional array of integer indices that sort `self` into ascending order. They are typically the result of ``np.argsort``. @@ -1153,11 +1149,6 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1): A scalar or array of insertion points with the same shape as `value`. - .. versionchanged:: 0.24.0 - If `value` is a scalar, an int is now always returned. - Previously, scalar inputs returned an 1-item array for - :class:`Series` and :class:`Categorical`. - See Also -------- sort_values : Sort by the values along either axis. diff --git a/pandas/core/common.py b/pandas/core/common.py index c0e44a437f59e..183607ebb489d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -142,8 +142,8 @@ def is_bool_indexer(key: Any) -> bool: elif is_bool_dtype(key.dtype): return True elif isinstance(key, list): - arr = np.asarray(key) - return arr.dtype == np.bool_ and len(arr) == len(key) + # check if np.array(key).dtype would be bool + return len(key) > 0 and lib.is_bool_list(key) return False diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 7452cf03d0038..62732402dbeea 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -1,12 +1,9 @@ """ Engine classes for :func:`~pandas.eval` """ +from __future__ import annotations import abc -from typing import ( - Dict, - Type, -) from pandas.core.computation.align import ( align_terms, @@ -140,7 +137,7 @@ def _evaluate(self) -> None: pass -ENGINES: Dict[str, Type[AbstractEngine]] = { +ENGINES: dict[str, type[AbstractEngine]] = { "numexpr": NumExprEngine, "python": PythonEngine, } diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 02660539f4981..d495f89970348 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -1,6 +1,7 @@ """ :func:`~pandas.eval` parsers. 
""" +from __future__ import annotations import ast from functools import ( @@ -11,10 +12,6 @@ import tokenize from typing import ( Callable, - Optional, - Set, - Tuple, - Type, TypeVar, ) @@ -50,7 +47,7 @@ import pandas.io.formats.printing as printing -def _rewrite_assign(tok: Tuple[int, str]) -> Tuple[int, str]: +def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: """ Rewrite the assignment operator for PyTables expressions that use ``=`` as a substitute for ``==``. @@ -69,7 +66,7 @@ def _rewrite_assign(tok: Tuple[int, str]) -> Tuple[int, str]: return toknum, "==" if tokval == "=" else tokval -def _replace_booleans(tok: Tuple[int, str]) -> Tuple[int, str]: +def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]: """ Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise precedence is changed to boolean precedence. @@ -94,7 +91,7 @@ def _replace_booleans(tok: Tuple[int, str]) -> Tuple[int, str]: return toknum, tokval -def _replace_locals(tok: Tuple[int, str]) -> Tuple[int, str]: +def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]: """ Replace local variables with a syntactically valid name. @@ -271,7 +268,7 @@ def f(self, *args, **kwargs): _T = TypeVar("_T", bound="BaseExprVisitor") -def disallow(nodes: Set[str]) -> Callable[[Type[_T]], Type[_T]]: +def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: """ Decorator to disallow certain nodes from parsing. Raises a NotImplementedError instead. @@ -281,7 +278,7 @@ def disallow(nodes: Set[str]) -> Callable[[Type[_T]], Type[_T]]: callable """ - def disallowed(cls: Type[_T]) -> Type[_T]: + def disallowed(cls: type[_T]) -> type[_T]: cls.unsupported_nodes = () for node in nodes: new_method = _node_not_implemented(node) @@ -352,7 +349,7 @@ class BaseExprVisitor(ast.NodeVisitor): preparser : callable """ - const_type: Type[Term] = Constant + const_type: type[Term] = Constant term_type = Term binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS @@ -390,7 +387,7 @@ class BaseExprVisitor(ast.NodeVisitor): ast.NotIn: ast.NotIn, } - unsupported_nodes: Tuple[str, ...] + unsupported_nodes: tuple[str, ...] 
def __init__(self, env, engine, parser, preparser=_preparse): self.env = env @@ -567,7 +564,7 @@ def visit_List(self, node, **kwargs): visit_Tuple = visit_List def visit_Index(self, node, **kwargs): - """ df.index[4] """ + """df.index[4]""" return self.visit(node.value) def visit_Subscript(self, node, **kwargs): @@ -591,7 +588,7 @@ def visit_Subscript(self, node, **kwargs): return self.term_type(name, env=self.env) def visit_Slice(self, node, **kwargs): - """ df.index[slice(4,6)] """ + """df.index[slice(4,6)]""" lower = node.lower if lower is not None: lower = self.visit(lower).value @@ -798,7 +795,7 @@ def __init__( expr, engine: str = "numexpr", parser: str = "pandas", - env: Optional[Scope] = None, + env: Scope | None = None, level: int = 0, ): self.expr = expr diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 2f87e0bcce70a..a62137bd63692 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -5,12 +5,9 @@ Offer fast expression evaluation through numexpr """ +from __future__ import annotations + import operator -from typing import ( - List, - Optional, - Set, -) import warnings import numpy as np @@ -25,11 +22,11 @@ if NUMEXPR_INSTALLED: import numexpr as ne -_TEST_MODE: Optional[bool] = None -_TEST_RESULT: List[bool] = [] +_TEST_MODE: bool | None = None +_TEST_RESULT: list[bool] = [] USE_NUMEXPR = NUMEXPR_INSTALLED -_evaluate: Optional[FuncType] = None -_where: Optional[FuncType] = None +_evaluate: FuncType | None = None +_where: FuncType | None = None # the set of dtypes that we will allow pass to numexpr _ALLOWED_DTYPES = { @@ -73,13 +70,13 @@ def _evaluate_standard(op, op_str, a, b): def _can_use_numexpr(op, op_str, a, b, dtype_check): - """ return a boolean if we WILL be using numexpr """ + """return a boolean if we WILL be using numexpr""" if op_str is not None: # required min elements (otherwise we are adding overhead) if a.size > _MIN_ELEMENTS: # check for dtype compatibility - dtypes: Set[str] = set() + dtypes: set[str] = set() for o in [a, b]: # ndarray and Series Case if hasattr(o, "dtype"): @@ -277,7 +274,7 @@ def _store_test_result(used_numexpr: bool) -> None: _TEST_RESULT.append(used_numexpr) -def get_test_result() -> List[bool]: +def get_test_result() -> list[bool]: """ Get test result and reset test_results. """ diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index f3321fc55ad80..b0f817d2c1ff3 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -1,6 +1,7 @@ """ :func:`~pandas.eval` source string parsing functions """ +from __future__ import annotations from io import StringIO from keyword import iskeyword @@ -9,7 +10,6 @@ from typing import ( Hashable, Iterator, - Tuple, ) # A token value Python's tokenizer probably will never use. @@ -66,7 +66,7 @@ def create_valid_python_identifier(name: str) -> str: return name -def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: +def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]: """ Clean up a column name if surrounded by backticks. @@ -131,7 +131,7 @@ def clean_column_name(name: Hashable) -> Hashable: def tokenize_backtick_quoted_string( token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int -) -> Tuple[int, str]: +) -> tuple[int, str]: """ Creates a token from a backtick quoted string. 
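# --- Background sketch (editor's addition, not part of the diff): the pattern
# --- behind these typing hunks. With "from __future__ import annotations"
# --- (PEP 563) annotations are stored as strings and never evaluated, so the
# --- builtin generics replacing typing.Dict/Tuple/Optional remain importable on
# --- Python 3.7/3.8, where dict[str, type] would otherwise raise TypeError.
from __future__ import annotations

REGISTRY: dict[str, type] = {}  # annotation kept as text, not evaluated at import

def lookup(names: tuple[str, ...]) -> type | None:
    # "type | None" (PEP 604 syntax) is likewise only ever read as a string here.
    return REGISTRY.get(names[0]) if names else None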
@@ -163,7 +163,7 @@ def tokenize_backtick_quoted_string( return BACKTICK_QUOTED_STRING, source[string_start:string_end] -def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: +def tokenize_string(source: str) -> Iterator[tuple[int, str]]: """ Tokenize a Python source code string. diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 0e6a7551ab399..f733a5c43dfb3 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -113,7 +113,7 @@ def _disallow_scalar_only_bool_ops(self): def prune(self, klass): def pr(left, right): - """ create and return a new specialized BinOp from myself """ + """create and return a new specialized BinOp from myself""" if left is None: return right elif right is None: @@ -154,7 +154,7 @@ def pr(left, right): return res def conform(self, rhs): - """ inplace conform rhs """ + """inplace conform rhs""" if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): @@ -163,7 +163,7 @@ def conform(self, rhs): @property def is_valid(self) -> bool: - """ return True if this is a valid field """ + """return True if this is a valid field""" return self.lhs in self.queryables @property @@ -176,21 +176,21 @@ def is_in_table(self) -> bool: @property def kind(self): - """ the kind of my field """ + """the kind of my field""" return getattr(self.queryables.get(self.lhs), "kind", None) @property def meta(self): - """ the meta of my field """ + """the meta of my field""" return getattr(self.queryables.get(self.lhs), "meta", None) @property def metadata(self): - """ the metadata of my field """ + """the metadata of my field""" return getattr(self.queryables.get(self.lhs), "metadata", None) def generate(self, v) -> str: - """ create and return the op string for this TermValue """ + """create and return the op string for this TermValue""" val = v.tostring(self.encoding) return f"({self.lhs} {self.op} {val})" @@ -273,7 +273,7 @@ def __repr__(self) -> str: return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]") def invert(self): - """ invert the filter """ + """invert the filter""" if self.filter is not None: self.filter = ( self.filter[0], @@ -283,7 +283,7 @@ def invert(self): return self def format(self): - """ return the actual filter format """ + """return the actual filter format""" return [self.filter] def evaluate(self): @@ -338,7 +338,7 @@ def __repr__(self) -> str: return pprint_thing(f"[Condition : [{self.condition}]]") def invert(self): - """ invert the condition """ + """invert the condition""" # if self.condition is not None: # self.condition = "~(%s)" % self.condition # return self @@ -347,7 +347,7 @@ def invert(self): ) def format(self): - """ return the actual ne format """ + """return the actual ne format""" return self.condition def evaluate(self): @@ -604,7 +604,7 @@ def __repr__(self) -> str: return pprint_thing(self.expr) def evaluate(self): - """ create and return the numexpr condition and filter """ + """create and return the numexpr condition and filter""" try: self.condition = self.terms.prune(ConditionBinOp) except AttributeError as err: @@ -624,7 +624,7 @@ def evaluate(self): class TermValue: - """ hold a term value the we use to construct a condition/filter """ + """hold a term value the we use to construct a condition/filter""" def __init__(self, value, converted, kind: str): assert isinstance(kind, str), kind @@ -633,7 +633,7 @@ def __init__(self, value, converted, kind: str): self.kind = kind def tostring(self, encoding) -> str: - """ quote the 
string if not encoded else encode and return """ + """quote the string if not encoded else encode and return""" if self.kind == "string": if encoding is not None: return str(self.converted) @@ -646,7 +646,7 @@ def tostring(self, encoding) -> str: def maybe_expression(s) -> bool: - """ loose checking if s is a pytables-acceptable expression """ + """loose checking if s is a pytables-acceptable expression""" if not isinstance(s, str): return False ops = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0db0c5a57207d..27b898782fbef 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -526,6 +526,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 330902b402324..7e7205d1351b3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -77,8 +77,6 @@ def array( """ Create an array. - .. versionadded:: 0.24.0 - Parameters ---------- data : Sequence of objects @@ -109,18 +107,22 @@ def array( Currently, pandas will infer an extension dtype for sequences of - ============================== ===================================== + ============================== ======================================= Scalar Type Array Type - ============================== ===================================== + ============================== ======================================= :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`float` :class:`pandas.arrays.FloatingArray` - :class:`str` :class:`pandas.arrays.StringArray` + :class:`str` :class:`pandas.arrays.StringArray` or + :class:`pandas.arrays.ArrowStringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` - ============================== ===================================== + ============================== ======================================= + + The ExtensionArray created when the scalar type is :class:`str` is determined by + ``pd.options.mode.string_storage`` if the dtype is not explicitly given. For all other cases, NumPy's usual inference rules will be used. @@ -236,6 +238,14 @@ def array( ['a', , 'c'] Length: 3, dtype: string + >>> with pd.option_context("string_storage", "pyarrow"): + ... arr = pd.array(["a", None, "c"]) + ... + >>> arr + + ['a', , 'c'] + Length: 3, dtype: string + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) ['2000-01-01', '2000-01-01'] @@ -289,9 +299,9 @@ def array( IntervalArray, PandasArray, PeriodArray, - StringArray, TimedeltaArray, ) + from pandas.core.arrays.string_ import StringDtype if lib.is_scalar(data): msg = f"Cannot pass scalar '{data}' to 'pandas.array'." 
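# --- Illustrative sketch (editor's addition, not part of the diff): the new
# --- mode.string_storage option registered above decides which string array
# --- pd.array builds when no dtype is given; the pyarrow branch assumes
# --- pyarrow>=1.0.0 is installed.
import pandas as pd

print(pd.get_option("mode.string_storage"))          # "python" by default

with pd.option_context("mode.string_storage", "pyarrow"):
    arr = pd.array(["a", None, "c"])                 # inferred as string data
    print(type(arr).__name__, repr(arr.dtype))       # ArrowStringArray string[pyarrow]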
@@ -332,7 +342,8 @@ def array( return TimedeltaArray._from_sequence(data, copy=copy) elif inferred_dtype == "string": - return StringArray._from_sequence(data, copy=copy) + # StringArray/ArrowStringArray depending on pd.options.mode.string_storage + return StringDtype().construct_array_type()._from_sequence(data, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 9671c340a0a92..e52b318c0b4f7 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -48,6 +48,7 @@ class ExtensionDtype: * type * name + * construct_array_type The following attributes and methods influence the behavior of the dtype in pandas operations @@ -56,12 +57,6 @@ class ExtensionDtype: * _is_boolean * _get_common_dtype - Optionally one can override construct_array_type for construction - with the name of this dtype via the Registry. See - :meth:`extensions.register_extension_dtype`. - - * construct_array_type - The `na_value` class attribute can be used to set the default NA value for this type. :attr:`numpy.nan` is used by default. @@ -80,11 +75,6 @@ class property**. ``__eq__`` or ``__hash__``, the default implementations here will not work. - .. versionchanged:: 0.24.0 - - Added ``_metadata``, ``__hash__``, and changed the default definition - of ``__eq__``. - For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method can be implemented: this method receives a pyarrow Array or ChunkedArray as only argument and is expected to return the appropriate pandas @@ -210,7 +200,7 @@ def construct_array_type(cls) -> type_t[ExtensionArray]: ------- type """ - raise NotImplementedError + raise AbstractMethodError(cls) @classmethod def construct_from_string(cls, string: str): @@ -367,13 +357,18 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: else: return None + @property + def _can_hold_na(self) -> bool: + """ + Can arrays of this dtype hold NA values? + """ + return True + def register_extension_dtype(cls: type[E]) -> type[E]: """ Register an ExtensionType with pandas as class decorator. - .. versionadded:: 0.24.0 - This enables operations like ``.astype(name)`` for the name of the ExtensionDtype. 
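# --- Illustrative sketch (editor's addition, not part of the diff): with
# --- construct_array_type now raising AbstractMethodError, a third-party dtype
# --- is expected to implement it. IPv4Dtype/IPv4Array are hypothetical names.
from pandas.api.extensions import (
    ExtensionArray,
    ExtensionDtype,
    register_extension_dtype,
)

class IPv4Array(ExtensionArray):
    """Placeholder; a real array fills in the full ExtensionArray interface."""

@register_extension_dtype
class IPv4Dtype(ExtensionDtype):
    name = "ipv4"
    type = int  # scalar type held by the array

    @classmethod
    def construct_array_type(cls):
        # Required by the ExtensionDtype interface listed above.
        return IPv4Array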
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4abb5d98202f6..433d45d94167d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -111,7 +111,7 @@ def maybe_convert_platform( values: list | tuple | range | np.ndarray | ExtensionArray, ) -> ArrayLike: - """ try to do platform conversion, allow ndarray or list here """ + """try to do platform conversion, allow ndarray or list here""" arr: ArrayLike if isinstance(values, (list, tuple, range)): @@ -419,18 +419,14 @@ def maybe_cast_to_extension_array( ------- ExtensionArray or obj """ - from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_ import BaseStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg # Everything can be converted to StringArrays, but we may not want to convert - if ( - issubclass(cls, (StringArray, ArrowStringArray)) - and lib.infer_dtype(obj) != "string" - ): + if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string": return obj try: @@ -917,7 +913,7 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]): def coerce_indexer_dtype(indexer, categories): - """ coerce the indexer input array to the smallest dtype possible """ + """coerce the indexer input array to the smallest dtype possible""" length = len(categories) if length < _int8_max: return ensure_int8(indexer) @@ -1977,8 +1973,6 @@ def maybe_cast_to_integer_array( Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. - .. versionadded:: 0.24.0 - Parameters ---------- arr : np.ndarray or list @@ -2170,6 +2164,8 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: if dtype.kind in ["i", "u"]: if tipo is not None: if tipo.kind not in ["i", "u"]: + if is_float(element) and element.is_integer(): + return True # Anything other than integer we cannot hold return False elif dtype.itemsize < tipo.itemsize: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3f43681687945..34b9a3f1f14ad 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1,11 +1,11 @@ """ Common type operations. """ +from __future__ import annotations from typing import ( Any, Callable, - Union, ) import warnings @@ -102,7 +102,7 @@ def ensure_float(arr): ensure_object = algos.ensure_object -def ensure_str(value: Union[bytes, Any]) -> str: +def ensure_str(value: bytes | Any) -> str: """ Ensure that bytes and non-strings get converted into ``str`` objects. """ @@ -113,7 +113,7 @@ def ensure_str(value: Union[bytes, Any]) -> str: return value -def ensure_python_int(value: Union[int, np.integer]) -> int: +def ensure_python_int(value: int | np.integer) -> int: """ Ensure that a value is a python int. @@ -142,7 +142,7 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: def classes(*klasses) -> Callable: - """ evaluate if the tipo is a subclass of the klasses """ + """evaluate if the tipo is a subclass of the klasses""" return lambda tipo: issubclass(tipo, klasses) @@ -631,10 +631,8 @@ def is_any_int_dtype(arr_or_dtype) -> bool: This function is internal and should not be exposed in the public API. - .. versionchanged:: 0.24.0 - - The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered - as integer by this function. + The nullable Integer dtypes (e.g. 
pandas.Int64Dtype) are also considered + as integer by this function. Parameters ---------- @@ -678,10 +676,8 @@ def is_integer_dtype(arr_or_dtype) -> bool: Unlike in `in_any_int_dtype`, timedelta64 instances will return False. - .. versionchanged:: 0.24.0 - - The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered - as integer by this function. + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. Parameters ---------- @@ -732,10 +728,8 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: Unlike in `in_any_int_dtype`, timedelta64 instances will return False. - .. versionchanged:: 0.24.0 - - The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered - as integer by this function. + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. Parameters ---------- @@ -786,10 +780,8 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an unsigned integer dtype. - .. versionchanged:: 0.24.0 - - The nullable Integer dtypes (e.g. pandas.UInt64Dtype) are also - considered as integer by this function. + The nullable Integer dtypes (e.g. pandas.UInt64Dtype) are also + considered as integer by this function. Parameters ---------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a75e4faf03d80..51b0b746cadf9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -104,7 +104,7 @@ def __getstate__(self) -> dict[str_type, Any]: @classmethod def reset_cache(cls) -> None: - """ clear the cache """ + """clear the cache""" cls._cache_dtypes = {} @@ -1268,8 +1268,6 @@ class PandasDtype(ExtensionDtype): """ A Pandas ExtensionDtype for NumPy dtypes. - .. versionadded:: 0.24.0 - This is mostly for internal compatibility, and is not especially useful on its own. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7545ea9a0733c..91b9bdd564676 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1600,8 +1600,6 @@ def to_numpy( """ Convert the DataFrame to a NumPy array. - .. versionadded:: 0.24.0 - By default, the dtype of the returned array will be the common NumPy dtype of all types in the DataFrame. For example, if the dtypes are ``float16`` and ``float32``, the results dtype will be ``float32``. @@ -1920,8 +1918,6 @@ def to_gbq( *New in version 0.8.0 of pandas-gbq*. - .. versionadded:: 0.24.0 - See Also -------- pandas_gbq.to_gbq : This function in the pandas-gbq library. @@ -2141,14 +2137,10 @@ def to_records( Include index in resulting record array, stored in 'index' field or using the index label, if set. column_dtypes : str, type, dict, default None - .. versionadded:: 0.24.0 - If a string or type, the data type to store all columns. If a dictionary, a mapping of column names and indices (zero-indexed) to specific data types. index_dtypes : str, type, dict, default None - .. versionadded:: 0.24.0 - If a string or type, the data type to store all index levels. If a dictionary, a mapping of index level names and indices (zero-indexed) to specific data types. @@ -2632,16 +2624,10 @@ def to_parquet( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - - .. versionadded:: 0.24.0 - partition_cols : list, optional, default None Column names by which to partition the dataset. Columns are partitioned in the order they are given. 
Must be None if path is not a string. - - .. versionadded:: 0.24.0 - {storage_options} .. versionadded:: 1.2.0 @@ -2759,8 +2745,6 @@ def to_html( A css id is included in the opening `` tag if specified. render_links : bool, default False Convert URLs to HTML links. - - .. versionadded:: 0.24.0 %(returns)s See Also -------- @@ -6574,8 +6558,6 @@ def nlargest(self, n, columns, keep: str = "first") -> DataFrame: - ``all`` : do not drop any duplicates, even it means selecting more than `n` items. - .. versionadded:: 0.24.0 - Returns ------- DataFrame @@ -6683,8 +6665,6 @@ def nsmallest(self, n, columns, keep: str = "first") -> DataFrame: - ``all`` : do not drop any duplicates, even it means selecting more than `n` items. - .. versionadded:: 0.24.0 - Returns ------- DataFrame @@ -7425,10 +7405,6 @@ def update( If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. - .. versionchanged:: 0.24.0 - Changed from `raise_conflict=False|True` - to `errors='ignore'|'raise'`. - Returns ------- None : method directly changes calling object @@ -8744,7 +8720,7 @@ def applymap( Additional keyword arguments to pass as keywords arguments to `func`. - .. versionadded:: 1.3 + .. versionadded:: 1.3.0 Returns ------- @@ -9325,9 +9301,6 @@ def corr( and returning a float. Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable's behavior. - - .. versionadded:: 0.24.0 - min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. @@ -9542,8 +9515,6 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie * callable: callable with input two 1d ndarrays and returning a float. - .. versionadded:: 0.24.0 - Returns ------- Series @@ -10133,8 +10104,6 @@ def mode( dropna : bool, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.24.0 - Returns ------- DataFrame diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4b24b3f9dee19..5bd845534fc96 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Any, + AnyStr, Callable, Hashable, Mapping, @@ -252,7 +253,7 @@ def _init_mgr( dtype: Dtype | None = None, copy: bool_t = False, ) -> Manager: - """ passed a manager and a axes dict """ + """passed a manager and a axes dict""" for a, axe in axes.items(): if axe is not None: axe = ensure_index(axe) @@ -433,7 +434,7 @@ def set_flags( @final @classmethod def _validate_dtype(cls, dtype) -> DtypeObj | None: - """ validate the passed dtype """ + """validate the passed dtype""" if dtype is not None: dtype = pandas_dtype(dtype) @@ -805,8 +806,6 @@ def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ Return {klass} with requested index / column level(s) removed. - .. versionadded:: 0.24.0 - Parameters ---------- level : int, str, or list-like @@ -1180,9 +1179,6 @@ def rename_axis(self, mapper=lib.no_default, **kwargs): Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and/or ``columns``. - - .. versionchanged:: 0.24.0 - axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to rename. copy : bool, default True @@ -2229,8 +2225,8 @@ def to_excel( For compatibility with :meth:`~DataFrame.to_csv`, to_excel serializes lists and dicts to strings before writing. - Once a workbook has been saved it is not possible write further data - without rewriting the whole workbook. 
+ Once a workbook has been saved it is not possible to write further + data without rewriting the whole workbook. Examples -------- @@ -2366,18 +2362,15 @@ def to_json( suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. lines : bool, default False - If 'orient' is 'records' write out line delimited json format. Will - throw ValueError if incorrect 'orient' since others are not list - like. + If 'orient' is 'records' write out line-delimited json format. Will + throw ValueError if incorrect 'orient' since others are not + list-like. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}} A string representing the compression to use in the output file, only used when the first argument is a filename. By default, the compression is inferred from the filename. - - .. versionchanged:: 0.24.0 - 'infer' option added and set to default index : bool, default True Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when @@ -2669,7 +2662,7 @@ def to_hdf( -------- read_hdf : Read from HDF file. DataFrame.to_parquet : Write a DataFrame to the binary parquet format. - DataFrame.to_sql : Write to a sql table. + DataFrame.to_sql : Write to a SQL table. DataFrame.to_feather : Write out feather-format for DataFrames. DataFrame.to_csv : Write out to a csv file. @@ -2788,8 +2781,6 @@ def to_sql( Details and a sample callable implementation can be found in the section :ref:`insert method `. - .. versionadded:: 0.24.0 - Raises ------ ValueError @@ -2807,8 +2798,6 @@ def to_sql( database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. - .. versionadded:: 0.24.0 - References ---------- .. [1] https://docs.sqlalchemy.org @@ -3307,7 +3296,7 @@ def to_latex( @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, - path_or_buf: FilePathOrBuffer | None = None, + path_or_buf: FilePathOrBuffer[AnyStr] | None = None, sep: str = ",", na_rep: str = "", float_format: str | None = None, @@ -3332,9 +3321,6 @@ def to_csv( r""" Write object to a comma-separated values (csv) file. - .. versionchanged:: 0.24.0 - The order of arguments for Series was changed. - Parameters ---------- path_or_buf : str or file handle, default None @@ -3343,10 +3329,6 @@ def to_csv( with `newline=''`, disabling universal newlines. If a binary file object is passed, `mode` might need to contain a `'b'`. - .. versionchanged:: 0.24.0 - - Was previously named "path" for Series. - .. versionchanged:: 1.2.0 Support for binary file objects was introduced. @@ -3362,11 +3344,6 @@ def to_csv( header : bool or list of str, default True Write out the column names. If a list of strings is given it is assumed to be aliases for the column names. - - .. versionchanged:: 0.24.0 - - Previously defaulted to False for Series. - index : bool, default True Write row names (index). index_label : str or sequence, or False, default None @@ -3424,8 +3401,6 @@ def to_csv( The newline character or character sequence to use in the output file. Defaults to `os.linesep`, which depends on the OS in which this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.). - - .. versionchanged:: 0.24.0 chunksize : int or None Rows to write at a time. 
date_format : str, default None @@ -4026,7 +4001,7 @@ def get(self, key, default=None): @final @property def _is_view(self) -> bool_t: - """Return boolean indicating if self is view of another array """ + """Return boolean indicating if self is view of another array""" return self._mgr.is_view @final @@ -4882,7 +4857,7 @@ def _reindex_with_indexers( copy: bool_t = False, allow_dups: bool_t = False, ) -> FrameOrSeries: - """allow_dups indicates an internal call here """ + """allow_dups indicates an internal call here""" # reindex doing multiple operations on different axes if indicated new_data = self._mgr for axis in sorted(reindexers.keys()): @@ -5601,7 +5576,7 @@ def _is_mixed_type(self) -> bool_t: @final def _check_inplace_setting(self, value) -> bool_t: - """ check whether we allow in-place setting with this type of value """ + """check whether we allow in-place setting with this type of value""" if self._is_mixed_type and not self._mgr.is_numeric_mixed_type: # allow an actual np.nan thru @@ -7244,10 +7219,10 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): with np.errstate(all="ignore"): if upper is not None: - subset = (self <= upper).to_numpy() + subset = self <= upper result = result.where(subset, upper, axis=None, inplace=False) if lower is not None: - subset = (self >= lower).to_numpy() + subset = self >= lower result = result.where(subset, lower, axis=None, inplace=False) if np.any(mask): @@ -7310,11 +7285,11 @@ def clip( Parameters ---------- - lower : float or array_like, default None + lower : float or array-like, default None Minimum threshold value. All values below this threshold will be set to it. A missing threshold (e.g `NA`) will not clip the value. - upper : float or array_like, default None + upper : float or array-like, default None Maximum threshold value. All values above this threshold will be set to it. A missing threshold (e.g `NA`) will not clip the value. @@ -7588,8 +7563,6 @@ def at_time( time : datetime.time or str axis : {0 or 'index', 1 or 'columns'}, default 0 - .. versionadded:: 0.24.0 - Returns ------- Series or DataFrame @@ -7663,8 +7636,6 @@ def between_time( axis : {0 or 'index', 1 or 'columns'}, default 0 Determine range time on index or columns value. - .. versionadded:: 0.24.0 - Returns ------- Series or DataFrame @@ -7918,8 +7889,8 @@ def resample( Pass a custom function via ``apply`` - >>> def custom_resampler(array_like): - ... return np.sum(array_like) + 5 + >>> def custom_resampler(arraylike): + ... return np.sum(arraylike) + 5 ... >>> series.resample('3T').apply(custom_resampler) 2000-01-01 00:00:00 8 @@ -9594,8 +9565,6 @@ def tz_localize( - 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0 - Returns ------- Series or DataFrame @@ -10924,7 +10893,7 @@ def ewm( span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, - min_periods: int = 0, + min_periods: int | None = 0, adjust: bool_t = True, ignore_na: bool_t = False, axis: Axis = 0, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6deb5bb1a76f0..f694dcce809ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2959,8 +2959,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): fill_value : optional The scalar value to use for newly introduced missing values. - .. 
versionadded:: 0.24.0 - Returns ------- Series or DataFrame diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6903c8e99e489..b65f26c7174fc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -824,7 +824,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): - """ dict {group name -> group indices} """ + """dict {group name -> group indices}""" if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 return self.groupings[0].indices @@ -858,7 +858,7 @@ def size(self) -> Series: @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: - """ dict {group name -> group labels} """ + """dict {group name -> group labels}""" if len(self.groupings) == 1: return self.groupings[0].groups else: @@ -1132,7 +1132,7 @@ def __init__( @cache_readonly def groups(self): - """ dict {group name -> group labels} """ + """dict {group name -> group labels}""" # this is mainly for compat # GH 3881 result = { diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f56e13775460b..304c42321e72a 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,8 +1,6 @@ +from __future__ import annotations + import textwrap -from typing import ( - List, - Set, -) from pandas._libs import ( NaT, @@ -98,12 +96,12 @@ def get_objs_combined_axis( return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy) -def _get_distinct_objs(objs: List[Index]) -> List[Index]: +def _get_distinct_objs(objs: list[Index]) -> list[Index]: """ Return a list with distinct elements of "objs" (different ids). Preserves order. """ - ids: Set[int] = set() + ids: set[int] = set() res = [] for obj in objs: if id(obj) not in ids: @@ -113,7 +111,7 @@ def _get_distinct_objs(objs: List[Index]) -> List[Index]: def _get_combined_index( - indexes: List[Index], + indexes: list[Index], intersect: bool = False, sort: bool = False, copy: bool = False, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 871c6a4a1c41d..eaba30012a5b8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1305,8 +1305,6 @@ def to_flat_index(self): """ Identity method. - .. versionadded:: 0.24.0 - This is implemented for compatibility with subclass implementations when chaining. @@ -1388,8 +1386,6 @@ def to_frame(self, index: bool = True, name: Hashable = None) -> DataFrame: """ Create a DataFrame with a column containing the Index. - .. versionadded:: 0.24.0 - Parameters ---------- index : bool, default True @@ -2862,13 +2858,6 @@ def union(self, other, sort=None): * False : do not sort the result. - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default value from ``True`` to ``None`` - (without change in behaviour). - Returns ------- union : Index @@ -3024,7 +3013,7 @@ def _union(self, other: Index, sort): # Self may have duplicates # find indexes of things in "other" that are not in "self" - if self.is_unique: + if self._index_as_unique: indexer = self.get_indexer(other) missing = (indexer == -1).nonzero()[0] else: @@ -3069,13 +3058,6 @@ def intersection(self, other, sort=False): * None : sort the result, except when `self` and `other` are equal or when the values cannot be compared. - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default from ``True`` to ``False``, to match - the behaviour of 0.23.4 and earlier. 
- Returns ------- intersection : Index @@ -3116,7 +3098,6 @@ def _intersection(self, other: Index, sort=False): intersection specialized to the case with matching dtypes. """ # TODO(EA): setops-refactor, clean all this up - lvals = self._values if self.is_monotonic and other.is_monotonic: try: @@ -3128,21 +3109,36 @@ def _intersection(self, other: Index, sort=False): res = algos.unique1d(result) return ensure_wrapped_if_datetimelike(res) - try: - indexer = other.get_indexer(lvals) - except InvalidIndexError: - # InvalidIndexError raised by get_indexer if non-unique - indexer, _ = other.get_indexer_non_unique(lvals) + res_values = self._intersection_via_get_indexer(other, sort=sort) + res_values = _maybe_try_sort(res_values, sort) + return res_values - mask = indexer != -1 - indexer = indexer.take(mask.nonzero()[0]) + def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike: + """ + Find the intersection of two Indexes using get_indexer. - result = other.take(indexer).unique()._values - result = _maybe_try_sort(result, sort) + Returns + ------- + np.ndarray or ExtensionArray + The returned array will be unique. + """ + # Note: drop_duplicates vs unique matters for MultiIndex, though + # it should not, see GH#41823 + left_unique = self.drop_duplicates() + right_unique = other.drop_duplicates() + + # even though we are unique, we need get_indexer_for for IntervalIndex + indexer = left_unique.get_indexer_for(right_unique) + + mask = indexer != -1 - # Intersection has to be unique - assert Index(result).is_unique + taker = indexer.take(mask.nonzero()[0]) + if sort is False: + # sort bc we want the elements in the same order they are in self + # unnecessary in the case with sort=None bc we will sort later + taker = np.sort(taker) + result = left_unique.take(taker)._values return result @final @@ -3164,13 +3160,6 @@ def difference(self, other, sort=None): from comparing incomparable elements. * False : Do not sort the result. - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default value from ``True`` to ``None`` - (without change in behaviour). - Returns ------- difference : Index @@ -3196,6 +3185,10 @@ def difference(self, other, sort=None): # Note: we do not (yet) sort even if sort=None GH#24959 return self.rename(result_name) + if not self._should_compare(other): + # Nothing matches -> difference is everything + return self.rename(result_name) + result = self._difference(other, sort=sort) return self._wrap_setop_result(other, result) @@ -3203,7 +3196,7 @@ def _difference(self, other, sort): this = self._get_unique_index() - indexer = this.get_indexer(other) + indexer = this.get_indexer_for(other) indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) @@ -3229,13 +3222,6 @@ def symmetric_difference(self, other, result_name=None, sort=None): from comparing incomparable elements. * False : Do not sort the result. - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default value from ``True`` to ``None`` - (without change in behaviour). 
- Returns ------- symmetric_difference : Index @@ -3260,33 +3246,13 @@ def symmetric_difference(self, other, result_name=None, sort=None): if result_name is None: result_name = result_name_update - if not self._should_compare(other): - return self.union(other, sort=sort).rename(result_name) - elif not is_dtype_equal(self.dtype, other.dtype): - dtype = find_common_type([self.dtype, other.dtype]) - this = self.astype(dtype, copy=False) - that = other.astype(dtype, copy=False) - return this.symmetric_difference(that, sort=sort).rename(result_name) - - this = self._get_unique_index() - other = other._get_unique_index() - indexer = this.get_indexer_for(other) - - # {this} minus {other} - common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d( - np.arange(this.size), common_indexer, assume_unique=True - ) - left_diff = this._values.take(left_indexer) - - # {other} minus {this} - right_indexer = (indexer == -1).nonzero()[0] - right_diff = other._values.take(right_indexer) - - the_diff = concat_compat([left_diff, right_diff]) - the_diff = _maybe_try_sort(the_diff, sort) + left = self.difference(other, sort=False) + right = other.difference(self, sort=False) + result = left.union(right, sort=sort) - return Index(the_diff, name=result_name) + if result_name is not None: + result = result.rename(result_name) + return result @final def _assert_can_do_setop(self, other) -> bool: @@ -3427,7 +3393,7 @@ def get_indexer( method = missing.clean_reindex_fill_method(method) target = self._maybe_cast_listlike_indexer(target) - self._check_indexing_method(method) + self._check_indexing_method(method, limit, tolerance) if not self._index_as_unique: raise InvalidIndexError(self._requires_unique_msg) @@ -3469,39 +3435,55 @@ def _get_indexer( elif method == "nearest": indexer = self._get_nearest_indexer(target, limit, tolerance) else: - if tolerance is not None: - raise ValueError( - "tolerance argument only valid if doing pad, " - "backfill or nearest reindexing" - ) - if limit is not None: - raise ValueError( - "limit argument only valid if doing pad, " - "backfill or nearest reindexing" - ) - indexer = self._engine.get_indexer(target._get_engine_target()) return ensure_platform_int(indexer) @final - def _check_indexing_method(self, method: str_t | None) -> None: + def _check_indexing_method( + self, + method: str_t | None, + limit: int | None = None, + tolerance=None, + ) -> None: """ Raise if we have a get_indexer `method` that is not supported or valid. 
""" - # GH#37871 for now this is only for IntervalIndex and CategoricalIndex - if not (is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype)): - return + if method not in [None, "bfill", "backfill", "pad", "ffill", "nearest"]: + # in practice the clean_reindex_fill_method call would raise + # before we get here + raise ValueError("Invalid fill method") # pragma: no cover - if method is None: - return + if self._is_multi: + if method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " + "for MultiIndex; see GitHub issue 9365" + ) + elif method == "pad" or method == "backfill": + if tolerance is not None: + raise NotImplementedError( + "tolerance not implemented yet for MultiIndex" + ) - if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - raise NotImplementedError( - f"method {method} not yet implemented for {type(self).__name__}" - ) + if is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype): + # GH#37871 for now this is only for IntervalIndex and CategoricalIndex + if method is not None: + raise NotImplementedError( + f"method {method} not yet implemented for {type(self).__name__}" + ) - raise ValueError("Invalid fill method") + if method is None: + if tolerance is not None: + raise ValueError( + "tolerance argument only valid if doing pad, " + "backfill or nearest reindexing" + ) + if limit is not None: + raise ValueError( + "limit argument only valid if doing pad, " + "backfill or nearest reindexing" + ) def _convert_tolerance(self, tolerance, target: np.ndarray | Index) -> np.ndarray: # override this method on subclasses @@ -3698,43 +3680,6 @@ def is_int(v): return indexer - def _convert_listlike_indexer(self, keyarr): - """ - Parameters - ---------- - keyarr : list-like - Indexer to convert. - - Returns - ------- - indexer : numpy.ndarray or None - Return an ndarray or None if cannot convert. - keyarr : numpy.ndarray - Return tuple-safe keys. - """ - if isinstance(keyarr, Index): - pass - else: - keyarr = self._convert_arr_indexer(keyarr) - - indexer = None - return indexer, keyarr - - def _convert_arr_indexer(self, keyarr) -> np.ndarray: - """ - Convert an array-like indexer to the appropriate dtype. - - Parameters - ---------- - keyarr : array-like - Indexer to convert. - - Returns - ------- - converted_keyarr : array-like - """ - return com.asarray_tuplesafe(keyarr) - @final def _invalid_indexer(self, form: str_t, key) -> TypeError: """ @@ -6080,8 +6025,8 @@ def any(self, *args, **kwargs): Returns ------- - any : bool or array_like (if axis is specified) - A single element array_like may be converted to bool. + any : bool or array-like (if axis is specified) + A single element array-like may be converted to bool. See Also -------- @@ -6124,8 +6069,8 @@ def all(self, *args, **kwargs): Returns ------- - all : bool or array_like (if axis is specified) - A single element array_like may be converted to bool. + all : bool or array-like (if axis is specified) + A single element array-like may be converted to bool. 
See Also -------- @@ -6381,91 +6326,19 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: ------- np.ndarray or ExtensionArray """ - # Runtime import needed bc IntervalArray imports Index - from pandas.core.arrays import ( - DatetimeArray, - IntervalArray, - PeriodArray, - TimedeltaArray, - ) - - assert subarr.dtype == object, subarr.dtype - inferred = lib.infer_dtype(subarr, skipna=False) - - if inferred == "integer": - try: - data = _try_convert_to_int_array(subarr) - return data - except ValueError: - pass + result = lib.maybe_convert_objects( + subarr, + convert_datetime=True, + convert_timedelta=True, + convert_period=True, + convert_interval=True, + dtype_if_all_nat=np.dtype("datetime64[ns]"), + ) + if result.dtype.kind in ["b", "c"]: return subarr - - elif inferred in ["floating", "mixed-integer-float", "integer-na"]: - # TODO: Returns IntegerArray for integer-na case in the future - data = np.asarray(subarr).astype(np.float64, copy=False) - return data - - elif inferred == "interval": - ia_data = IntervalArray._from_sequence(subarr, copy=False) - return ia_data - elif inferred == "boolean": - # don't support boolean explicitly ATM - pass - elif inferred != "string": - if inferred.startswith("datetime"): - try: - data = DatetimeArray._from_sequence(subarr, copy=False) - return data - except (ValueError, OutOfBoundsDatetime): - # GH 27011 - # If we have mixed timezones, just send it - # down the base constructor - pass - - elif inferred.startswith("timedelta"): - tda = TimedeltaArray._from_sequence(subarr, copy=False) - return tda - elif inferred == "period": - parr = PeriodArray._from_sequence(subarr) - return parr - - return subarr - - -def _try_convert_to_int_array(data: np.ndarray) -> np.ndarray: - """ - Attempt to convert an array of data into an integer array. - - Parameters - ---------- - data : np.ndarray[object] - - Returns - ------- - int_array : data converted to either an ndarray[int64] or ndarray[uint64] - - Raises - ------ - ValueError if the conversion was not successful. - """ - try: - res = data.astype("i8", copy=False) - if (res == data).all(): - return res - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow), - # so let's try now with uint64. 
- try: - res = data.astype("u8", copy=False) - if (res == data).all(): - return res - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError + result = ensure_wrapped_if_datetimelike(result) + return result def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 554cf33e22555..228f58d47b8ed 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -269,6 +269,10 @@ def _is_dtype_compat(self, other) -> Categorical: raise TypeError( "categories must match existing categories when appending" ) + + elif other._is_multi: + # preempt raising NotImplementedError in isna call + raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex") else: values = other @@ -624,7 +628,7 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: return type(self)._simple_new(cat, name=name) def _delegate_method(self, name: str, *args, **kwargs): - """ method delegation to the ._values """ + """method delegation to the ._values""" method = getattr(self._values, name) if "inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 19167677257f7..df7fae0763c42 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -60,7 +60,6 @@ inherit_names, make_wrapped_arith_op, ) -from pandas.core.indexes.numeric import Int64Index from pandas.core.tools.timedeltas import to_timedelta if TYPE_CHECKING: @@ -475,9 +474,6 @@ def shift(self: _T, periods: int = 1, freq=None) -> _T: periods : int, default 1 Number of periods (or increments) to shift by, can be positive or negative. - - .. versionchanged:: 0.24.0 - freq : pandas.DateOffset, pandas.Timedelta or string, optional Frequency increment to shift by. If None, the index is shifted by its own `freq` attribute. @@ -782,11 +778,7 @@ def _union(self, other, sort): # that result.freq == self.freq return result else: - i8self = Int64Index._simple_new(self.asi8) - i8other = Int64Index._simple_new(other.asi8) - i8result = i8self._union(i8other, sort=sort) - result = type(self)(i8result, dtype=self.dtype, freq="infer") - return result + return super()._union(other, sort=sort)._with_freq("infer") # -------------------------------------------------------------------- # Join Methods diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index b1cabf92bf985..6ff20f7d009bc 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -1,12 +1,11 @@ """ Shared methods for Index subclasses backed by ExtensionArray. """ +from __future__ import annotations + from typing import ( Hashable, - List, - Type, TypeVar, - Union, ) import numpy as np @@ -117,7 +116,7 @@ def method(self, *args, **kwargs): return method -def inherit_names(names: List[str], delegate, cache: bool = False, wrap: bool = False): +def inherit_names(names: list[str], delegate, cache: bool = False, wrap: bool = False): """ Class decorator to pin attributes from an ExtensionArray to a Index subclass. 
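# --- Illustrative sketch (editor's addition, not part of the diff): the
# --- object-dtype inference rewritten in pandas/core/indexes/base.py above is
# --- what ordinary Index construction goes through, e.g.
import numpy as np
import pandas as pd

print(pd.Index(np.array([1, 2, 3], dtype=object)).dtype)      # int64
print(pd.Index(np.array([1.5, 2.0], dtype=object)).dtype)     # float64
print(pd.Index(np.array([True, False], dtype=object)).dtype)  # object ("b" kind left as-is)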
@@ -227,20 +226,20 @@ class ExtensionIndex(Index): # The base class already passes through to _data: # size, __len__, dtype - _data: Union[IntervalArray, NDArrayBackedExtensionArray] + _data: IntervalArray | NDArrayBackedExtensionArray - _data_cls: Union[ - Type[Categorical], - Type[DatetimeArray], - Type[TimedeltaArray], - Type[PeriodArray], - Type[IntervalArray], - ] + _data_cls: ( + type[Categorical] + | type[DatetimeArray] + | type[TimedeltaArray] + | type[PeriodArray] + | type[IntervalArray] + ) @classmethod def _simple_new( cls, - array: Union[IntervalArray, NDArrayBackedExtensionArray], + array: IntervalArray | NDArrayBackedExtensionArray, name: Hashable = None, ): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b906f88d98a46..072ab7dff8e5b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,7 +1,6 @@ """ define the IntervalIndex """ from __future__ import annotations -from functools import wraps from operator import ( le, lt, @@ -62,11 +61,9 @@ is_scalar, ) from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.missing import is_valid_na_for_dtype -from pandas.core.algorithms import ( - take_nd, - unique, -) +from pandas.core.algorithms import take_nd from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -93,7 +90,6 @@ TimedeltaIndex, timedelta_range, ) -from pandas.core.ops import get_op_result_name if TYPE_CHECKING: from pandas import CategoricalIndex @@ -151,59 +147,6 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -def setop_check(method): - """ - This is called to decorate the set operations of IntervalIndex - to perform the type check in advance. - """ - op_name = method.__name__ - - @wraps(method) - def wrapped(self, other, sort=False): - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) - - if op_name == "difference": - if not isinstance(other, IntervalIndex): - result = getattr(self.astype(object), op_name)(other, sort=sort) - return result.astype(self.dtype) - - elif not self._should_compare(other): - # GH#19016: ensure set op will not return a prohibited dtype - result = getattr(self.astype(object), op_name)(other, sort=sort) - return result.astype(self.dtype) - - return method(self, other, sort) - - return wrapped - - -def _setop(op_name: str): - """ - Implement set operation. - """ - - def func(self, other, sort=None): - # At this point we are assured - # isinstance(other, IntervalIndex) - # other.closed == self.closed - - result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) - result_name = get_op_result_name(self, other) - - # GH 19101: ensure empty results have correct dtype - if result.empty: - result = result._values.astype(self.dtype.subtype) - else: - result = result._values - - return type(self).from_tuples(result, closed=self.closed, name=result_name) - - func.__name__ = op_name - return setop_check(func) - - @Appender( _interval_shared_docs["class"] % { @@ -401,6 +344,8 @@ def __contains__(self, key: Any) -> bool: """ hash(key) if not isinstance(key, Interval): + if is_valid_na_for_dtype(key, self.dtype): + return self.hasnans return False try: @@ -472,8 +417,6 @@ def is_overlapping(self) -> bool: endpoints. Intervals that only have an open endpoint in common do not overlap. - .. 
versionadded:: 0.24.0 - Returns ------- bool @@ -678,6 +621,8 @@ def get_loc( if self.closed != key.closed: raise KeyError(key) mask = (self.left == key.left) & (self.right == key.right) + elif is_valid_na_for_dtype(key, self.dtype): + mask = self.isna() else: # assume scalar op_left = le if self.closed_left else lt @@ -693,7 +638,12 @@ def get_loc( raise KeyError(key) elif matches == 1: return mask.argmax() - return lib.maybe_booleans_to_slice(mask.view("u1")) + + res = lib.maybe_booleans_to_slice(mask.view("u1")) + if isinstance(res, slice) and res.stop is None: + # TODO: DO this in maybe_booleans_to_slice? + res = slice(res.start, len(self), res.step) + return res def _get_indexer( self, @@ -781,9 +731,9 @@ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray] indexer = np.concatenate(indexer) return ensure_platform_int(indexer), ensure_platform_int(missing) - @property + @cache_readonly def _index_as_unique(self) -> bool: - return not self.is_overlapping + return not self.is_overlapping and self._engine._na_count < 2 _requires_unique_msg = ( "cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique" @@ -852,82 +802,6 @@ def _format_data(self, name=None) -> str: # name argument is unused here; just for compat with base / categorical return self._data._format_data() + "," + self._format_space() - # -------------------------------------------------------------------- - # Set Operations - - def _intersection(self, other, sort): - """ - intersection specialized to the case with matching dtypes. - """ - # For IntervalIndex we also know other.closed == self.closed - if self.left.is_unique and self.right.is_unique: - taken = self._intersection_unique(other) - elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: - # Swap other/self if other is unique and self does not have - # multiple NaNs - taken = other._intersection_unique(self) - else: - # duplicates - taken = self._intersection_non_unique(other) - - if sort is None: - taken = taken.sort_values() - - return taken - - def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex: - """ - Used when the IntervalIndex does not have any common endpoint, - no matter left or right. - Return the intersection with another IntervalIndex. - - Parameters - ---------- - other : IntervalIndex - - Returns - ------- - IntervalIndex - """ - lindexer = self.left.get_indexer(other.left) - rindexer = self.right.get_indexer(other.right) - - match = (lindexer == rindexer) & (lindexer != -1) - indexer = lindexer.take(match.nonzero()[0]) - indexer = unique(indexer) - - return self.take(indexer) - - def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex: - """ - Used when the IntervalIndex does have some common endpoints, - on either sides. - Return the intersection with another IntervalIndex. 
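A brief aside on the NA handling added to `IntervalIndex.__contains__` and `IntervalIndex.get_loc` in the hunks above (the `is_valid_na_for_dtype` branches): a sketch of the intended lookups for an index containing one missing interval. The printed results are inferred from the new branches, not taken verbatim from this patch:

```python
import numpy as np
import pandas as pd

idx = pd.IntervalIndex([pd.Interval(0, 1), pd.Interval(1, 2), np.nan])

# __contains__: an NA key now reports membership via hasnans
print(np.nan in idx)        # True, because idx.hasnans is True

# get_loc: an NA key is resolved through the isna() mask;
# with a single missing value this returns its position (2 here)
print(idx.get_loc(np.nan))
```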
- - Parameters - ---------- - other : IntervalIndex - - Returns - ------- - IntervalIndex - """ - mask = np.zeros(len(self), dtype=bool) - - if self.hasnans and other.hasnans: - first_nan_loc = np.arange(len(self))[self.isna()][0] - mask[first_nan_loc] = True - - other_tups = set(zip(other.left, other.right)) - for i, tup in enumerate(zip(self.left, self.right)): - if tup in other_tups: - mask[i] = True - - return self[mask] - - _union = _setop("union") - _difference = _setop("difference") - # -------------------------------------------------------------------- @property diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6825ef4ba4198..4dff63ea22e00 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -214,8 +214,6 @@ class MultiIndex(Index): The unique labels for each level. codes : sequence of arrays Integers for each level designating which label at each location. - - .. versionadded:: 0.24.0 sortorder : optional int Level of sortedness (must be lexicographically sorted by that level). @@ -627,8 +625,6 @@ def from_frame(cls, df: DataFrame, sortorder=None, names=None) -> MultiIndex: """ Make a MultiIndex from a DataFrame. - .. versionadded:: 0.24.0 - Parameters ---------- df : DataFrame @@ -996,10 +992,6 @@ def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = Tr """ Set new codes on MultiIndex. Defaults to returning new index. - .. versionadded:: 0.24.0 - - New name for deprecated method `set_labels`. - Parameters ---------- codes : sequence or list of sequence @@ -1211,11 +1203,11 @@ def copy( return new_index def __array__(self, dtype=None) -> np.ndarray: - """ the array interface, return my values """ + """the array interface, return my values""" return self.values def view(self, cls=None): - """ this is defined as a copy with the same identity """ + """this is defined as a copy with the same identity""" result = self.copy() result._id = self._id return result @@ -1234,7 +1226,7 @@ def dtype(self) -> np.dtype: return np.dtype("O") def _is_memory_usage_qualified(self) -> bool: - """ return a boolean if we need a qualified .info display """ + """return a boolean if we need a qualified .info display""" def f(level): return "mixed" in level or "string" in level or "unicode" in level @@ -1250,7 +1242,7 @@ def memory_usage(self, deep: bool = False) -> int: @cache_readonly def nbytes(self) -> int: - """ return the number of bytes in the underlying data """ + """return the number of bytes in the underlying data""" return self._nbytes(False) def _nbytes(self, deep: bool = False) -> int: @@ -1591,7 +1583,7 @@ def is_monotonic_decreasing(self) -> bool: @cache_readonly def _inferred_type_levels(self) -> list[str]: - """ return a list of the inferred types, one for each level """ + """return a list of the inferred types, one for each level""" return [i.inferred_type for i in self.levels] @doc(Index.duplicated) @@ -1701,8 +1693,6 @@ def to_frame(self, index: bool = True, name=None) -> DataFrame: Column ordering is determined by the DataFrame constructor with data as a dict. - .. versionadded:: 0.24.0 - Parameters ---------- index : bool, default True @@ -1777,8 +1767,6 @@ def to_flat_index(self) -> Index: """ Convert a MultiIndex to an Index of Tuples containing the level values. - .. 
versionadded:: 0.24.0 - Returns ------- pd.Index @@ -2593,29 +2581,29 @@ def _get_values_for_loc(self, series: Series, loc, key): new_ser = series._constructor(new_values, index=new_index, name=series.name) return new_ser.__finalize__(series) - def _convert_listlike_indexer(self, keyarr): + def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None: """ + Analogous to get_indexer when we are partial-indexing on our first level. + Parameters ---------- - keyarr : list-like + keyarr : Index, np.ndarray, or ExtensionArray Indexer to convert. Returns ------- - tuple (indexer, keyarr) - indexer is an ndarray or None if cannot convert - keyarr are tuple-safe keys + np.ndarray[intp] or None """ - indexer, keyarr = super()._convert_listlike_indexer(keyarr) + indexer = None # are we indexing a specific level - if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple): - level = 0 - _, indexer = self.reindex(keyarr, level=level) + if len(keyarr) and not isinstance(keyarr[0], tuple): + _, indexer = self.reindex(keyarr, level=0) # take all if indexer is None: - indexer = np.arange(len(self)) + indexer = np.arange(len(self), dtype=np.intp) + return indexer check = self.levels[0].get_indexer(keyarr) mask = check == -1 @@ -2626,7 +2614,7 @@ def _convert_listlike_indexer(self, keyarr): # actually in Index anymore raise KeyError(f"{keyarr} not in index") - return indexer, keyarr + return indexer def _get_partial_string_timestamp_match_key(self, key): """ @@ -2685,20 +2673,11 @@ def _get_indexer( # gets here, and it is checking that we raise with method="nearest" if method == "pad" or method == "backfill": - if tolerance is not None: - raise NotImplementedError( - "tolerance not implemented yet for MultiIndex" - ) # TODO: get_indexer_with_fill docstring says values must be _sorted_ # but that doesn't appear to be enforced indexer = self._engine.get_indexer_with_fill( target=target._values, values=self._values, method=method, limit=limit ) - elif method == "nearest": - raise NotImplementedError( - "method='nearest' not implemented yet " - "for MultiIndex; see GitHub issue 9365" - ) else: indexer = self._engine.get_indexer(target._values) @@ -3624,9 +3603,7 @@ def _intersection(self, other, sort=False) -> MultiIndex: uniq_tuples = algos.unique(inner_tuples) if uniq_tuples is None: - left_unique = self.drop_duplicates() - indexer = left_unique.get_indexer(other.drop_duplicates()) - uniq_tuples = left_unique.take(np.sort(indexer[indexer != -1])) + uniq_tuples = self._intersection_via_get_indexer(other, sort) if sort is None: uniq_tuples = sorted(uniq_tuples) @@ -3694,9 +3671,9 @@ def symmetric_difference(self, other, result_name=None, sort=None): return type(self)( levels=[[] for _ in range(self.nlevels)], codes=[[] for _ in range(self.nlevels)], - names=tups.name, + names=tups.names, ) - return type(self).from_tuples(tups, names=tups.name) + return tups # -------------------------------------------------------------------- @@ -3900,7 +3877,7 @@ def maybe_droplevels(index: Index, key) -> Index: def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: """ - Coerce the array_like indexer to the smallest integer dtype that can encode all + Coerce the array-like indexer to the smallest integer dtype that can encode all of the given categories. 
Parameters diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ead1a2a4a544b..746246172b967 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -173,7 +173,7 @@ def _simple_new(cls, values: range, name: Hashable = None) -> RangeIndex: @cache_readonly def _constructor(self) -> type[Int64Index]: - """ return the class to use for construction """ + """return the class to use for construction""" return Int64Index @cache_readonly @@ -197,7 +197,7 @@ def _int64index(self) -> Int64Index: return res def _get_data_as_items(self): - """ return a list of tuples of start, stop, step """ + """return a list of tuples of start, stop, step""" rng = self._range return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] @@ -350,7 +350,7 @@ def dtype(self) -> np.dtype: @property def is_unique(self) -> bool: - """ return if the index has unique values """ + """return if the index has unique values""" return True @cache_readonly @@ -730,18 +730,6 @@ def _difference(self, other, sort=None): new_index = new_index[::-1] return new_index - def symmetric_difference(self, other, result_name: Hashable = None, sort=None): - if not isinstance(other, RangeIndex) or sort is not None: - return super().symmetric_difference(other, result_name, sort) - - left = self.difference(other) - right = other.difference(self) - result = left.union(right) - - if result_name is not None: - result = result.rename(result_name) - return result - # -------------------------------------------------------------------- def _concat(self, indexes: list[Index], name: Hashable) -> Index: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 85cbea39b9b98..c60ab06dd08f3 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,4 +1,5 @@ """ implement the TimedeltaIndex """ +from __future__ import annotations from pandas._libs import ( index as libindex, @@ -39,12 +40,6 @@ ) @inherit_names( [ - "_bool_ops", - "_object_ops", - "_field_ops", - "_datetimelike_ops", - "_datetimelike_methods", - "_other_ops", "components", "to_pytimedelta", "sum", @@ -162,7 +157,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? 
""" - return is_timedelta64_dtype(dtype) + return is_timedelta64_dtype(dtype) # aka self._data._is_recognized_dtype # ------------------------------------------------------------------- # Indexing Methods diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4f55459040bc0..3707e141bc447 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1286,13 +1286,21 @@ def _get_listlike_indexer(self, key, axis: int): """ ax = self.obj._get_axis(axis) - # Have the index compute an indexer or return None - # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key) - # We only act on all found values: - if indexer is not None and (indexer != -1).all(): - # _validate_read_indexer is a no-op if no -1s, so skip - return ax[indexer], indexer + keyarr = key + if not isinstance(keyarr, Index): + keyarr = com.asarray_tuplesafe(keyarr) + + if isinstance(ax, MultiIndex): + # get_indexer expects a MultiIndex or sequence of tuples, but + # we may be doing partial-indexing, so need an extra check + + # Have the index compute an indexer or return None + # if it cannot handle: + indexer = ax._convert_listlike_indexer(keyarr) + # We only act on all found values: + if indexer is not None and (indexer != -1).all(): + # _validate_read_indexer is a no-op if no -1s, so skip + return ax[indexer], indexer if ax._index_as_unique: indexer = ax.get_indexer_for(keyarr) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 31e32b053367b..76967cdc9b52e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -433,7 +433,7 @@ def replace_list( inplace: bool = False, regex: bool = False, ) -> T: - """ do a list replace """ + """do a list replace""" inplace = validate_bool_kwarg(inplace, "inplace") return self.apply_with_block( @@ -462,7 +462,7 @@ def any_extension_types(self) -> bool: @property def is_view(self) -> bool: - """ return a boolean if we are a single block and are a view """ + """return a boolean if we are a single block and are a view""" # TODO what is this used for? return False diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index f8ccb10655ea1..2bb14efad1ce7 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -2,11 +2,9 @@ Base class for the internal managers. Both BlockManager and ArrayManager inherit from this class. """ -from typing import ( - List, - Optional, - TypeVar, -) +from __future__ import annotations + +from typing import TypeVar from pandas._typing import ( DtypeObj, @@ -27,7 +25,7 @@ class DataManager(PandasObject): # TODO share more methods/attributes - axes: List[Index] + axes: list[Index] @property def items(self) -> Index: @@ -123,7 +121,7 @@ def equals(self, other: object) -> bool: def apply( self: T, f, - align_keys: Optional[List[str]] = None, + align_keys: list[str] | None = None, ignore_failures: bool = False, **kwargs, ) -> T: @@ -144,7 +142,7 @@ def array(self): return self.arrays[0] # type: ignore[attr-defined] -def interleaved_dtype(dtypes: List[DtypeObj]) -> Optional[DtypeObj]: +def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: """ Find the common dtype for `blocks`. 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c7769046c70b2..237d06402a0ee 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -169,21 +169,21 @@ def _consolidate_key(self): @property def is_view(self) -> bool: - """ return a boolean if I am possibly a view """ + """return a boolean if I am possibly a view""" values = self.values values = cast(np.ndarray, values) return values.base is not None @final - @property + @cache_readonly def _can_hold_na(self) -> bool: """ Can we store NA values in this Block? """ - values = self.values - if isinstance(values, np.ndarray): - return values.dtype.kind not in ["b", "i", "u"] - return values._can_hold_na + dtype = self.dtype + if isinstance(dtype, np.dtype): + return dtype.kind not in ["b", "i", "u"] + return dtype._can_hold_na @final @cache_readonly @@ -262,7 +262,7 @@ def make_block(self, values, placement=None) -> Block: def make_block_same_class( self, values, placement: BlockPlacement | None = None ) -> Block: - """ Wrap given values in a block of same type as self. """ + """Wrap given values in a block of same type as self.""" if placement is None: placement = self._mgr_locs @@ -291,7 +291,7 @@ def __len__(self) -> int: return len(self.values) def _slice(self, slicer): - """ return a slice of my values """ + """return a slice of my values""" return self.values[slicer] @@ -530,7 +530,7 @@ def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: @final def downcast(self, dtypes=None) -> list[Block]: - """ try to downcast each item to the dict of dtypes if present """ + """try to downcast each item to the dict of dtypes if present""" # turn it off completely if dtypes is False: return [self] @@ -617,7 +617,7 @@ def convert( @final def _can_hold_element(self, element: Any) -> bool: - """ require the same dtype as ourselves """ + """require the same dtype as ourselves""" element = extract_array(element, extract_numpy=True) return can_hold_element(self.values, element) @@ -638,14 +638,14 @@ def should_store(self, value: ArrayLike) -> bool: @final def to_native_types(self, na_rep="nan", quoting=None, **kwargs): - """ convert to our native types format """ + """convert to our native types format""" result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs) return self.make_block(result) # block actions # @final def copy(self, deep: bool = True): - """ copy constructor """ + """copy constructor""" values = self.values if deep: values = values.copy() @@ -781,14 +781,6 @@ def _replace_list( # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) - # https://github.com/pandas-dev/pandas/issues/40371 - # the following pairs check code caused a regression so we catch that case here - # until the issue is fixed properly in can_hold_element - - # error: "Iterable[Any]" has no attribute "tolist" - if hasattr(src_list, "tolist"): - src_list = src_list.tolist() # type: ignore[attr-defined] - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -1166,12 +1158,12 @@ def take_nd( return self.make_block_same_class(new_values, new_mgr_locs) def diff(self, n: int, axis: int = 1) -> list[Block]: - """ return block for the diff of the values """ + """return block for the diff of the values""" new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) return [self.make_block(values=new_values)] def shift(self, periods: int, axis: int = 0, 
fill_value: Any = None) -> list[Block]: - """ shift the block by periods, possibly upcast """ + """shift the block by periods, possibly upcast""" # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also @@ -1284,7 +1276,7 @@ def _unstack(self, unstacker, fill_value, new_placement): ------- blocks : list of Block New blocks of unstacked values. - mask : array_like of bool + mask : array-like of bool The mask of columns of `blocks` we should keep. """ new_values, mask = unstacker.get_new_values( @@ -1683,7 +1675,7 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock @property def is_view(self) -> bool: - """ return a boolean if I am possibly a view """ + """return a boolean if I am possibly a view""" # check the ndarray values of the DatetimeIndex values return self.values._ndarray.base is not None @@ -1779,7 +1771,7 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): class DatetimeTZBlock(DatetimeLikeBlock): - """ implement a datetime64 block with a tz attribute """ + """implement a datetime64 block with a tz attribute""" values: DatetimeArray @@ -1999,7 +1991,7 @@ def extract_pandas_array( def extend_blocks(result, blocks=None) -> list[Block]: - """ return a new extended blocks, given the result """ + """return a new extended blocks, given the result""" if blocks is None: blocks = [] if isinstance(result, list): @@ -2039,7 +2031,7 @@ def to_native_types( decimal=".", **kwargs, ) -> np.ndarray: - """ convert to our native types format """ + """convert to our native types format""" values = ensure_wrapped_if_datetimelike(values) if isinstance(values, (DatetimeArray, TimedeltaArray)): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 323aa45874d96..48f0b7f7f964b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -183,7 +183,7 @@ def blklocs(self): return self._blklocs def make_empty(self: T, axes=None) -> T: - """ return an empty BlockManager with the items axis of len 0 """ + """return an empty BlockManager with the items axis of len 0""" if axes is None: axes = [Index([])] + self.axes[1:] @@ -422,7 +422,7 @@ def replace_list( inplace: bool = False, regex: bool = False, ) -> T: - """ do a list replace """ + """do a list replace""" inplace = validate_bool_kwarg(inplace, "inplace") bm = self.apply( @@ -466,7 +466,7 @@ def any_extension_types(self) -> bool: @property def is_view(self) -> bool: - """ return a boolean if we are a single block and are a view """ + """return a boolean if we are a single block and are a view""" if len(self.blocks) == 1: return self.blocks[0].is_view @@ -516,7 +516,7 @@ def get_numeric_data(self: T, copy: bool = False) -> T: def _combine( self: T, blocks: list[Block], copy: bool = True, index: Index | None = None ) -> T: - """ return a new manager with the blocks """ + """return a new manager with the blocks""" if len(blocks) == 0: if self.ndim == 2: # retain our own Index dtype @@ -1502,7 +1502,7 @@ def _interleave( class SingleBlockManager(BaseBlockManager, SingleDataManager): - """ manage a single block with """ + """manage a single block with""" ndim = 1 _is_consolidated = True @@ -1596,12 +1596,12 @@ def _block(self) -> Block: @property def _blknos(self): - """ compat with BlockManager """ + """compat with BlockManager""" return None @property def _blklocs(self): - """ compat with BlockManager """ + """compat with BlockManager""" return None def getitem_mgr(self, indexer) -> SingleBlockManager: @@ -1759,7 
+1759,7 @@ def construction_error( axes: list[Index], e: ValueError | None = None, ): - """ raise a helpful message about our construction """ + """raise a helpful message about our construction""" passed = tuple(map(int, [tot_items] + list(block_shape))) # Correcting the user facing error message during dataframe construction if len(passed) <= 2: @@ -1885,7 +1885,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]: def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = True): - """ return an array of blocks that potentially have different dtypes """ + """return an array of blocks that potentially have different dtypes""" if not consolidate: return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 8849eb0670faa..424173ccc69f0 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -524,11 +524,11 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): Parameters ---------- - xi : array_like + xi : array-like sorted 1D array of x-coordinates - yi : array_like or list of array-likes + yi : array-like or list of array-likes yi[i][j] is the j-th derivative known at xi[i] - order: None or int or array_like of ints. Default: None. + order: None or int or array-like of ints. Default: None. Specifies the degree of local polynomials. If not None, some derivatives are ignored. der : int or list @@ -546,7 +546,7 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): Returns ------- - y : scalar or array_like + y : scalar or array-like The result, of length R or length M or M by R. """ from scipy import interpolate @@ -568,13 +568,13 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): Parameters ---------- - xi : array_like + xi : array-like A sorted list of x-coordinates, of length N. - yi : array_like + yi : array-like A 1-D array of real values. `yi`'s length along the interpolation axis must be equal to the length of `xi`. If N-D array, use axis parameter to select correct axis. - x : scalar or array_like + x : scalar or array-like Of length M. der : int, optional How many derivatives to extract; None for all potentially @@ -590,7 +590,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): Returns ------- - y : scalar or array_like + y : scalar or array-like The result, of length R or length M or M by R, """ @@ -609,14 +609,14 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat Parameters ---------- - xi : array_like, shape (n,) + xi : array-like, shape (n,) 1-d array containing values of the independent variable. Values must be real, finite and in strictly increasing order. - yi : array_like + yi : array-like Array containing values of the dependent variable. It can have arbitrary number of dimensions, but the length along ``axis`` (see below) must match the length of ``x``. Values must be finite. - x : scalar or array_like, shape (m,) + x : scalar or array-like, shape (m,) axis : int, optional Axis along which `y` is assumed to be varying. Meaning that for ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``. @@ -644,7 +644,7 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat tuple `(order, deriv_values)` allowing to specify arbitrary derivatives at curve ends: * `order`: the derivative order, 1 or 2. 
- * `deriv_value`: array_like containing derivative values, shape must + * `deriv_value`: array-like containing derivative values, shape must be the same as `y`, excluding ``axis`` dimension. For example, if `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D @@ -661,7 +661,7 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat Returns ------- - y : scalar or array_like + y : scalar or array-like The result, of shape (m,) References diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 673c482bced18..ecdf2624c8ec1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -191,7 +191,7 @@ def _has_infs(result) -> bool: def _get_fill_value( dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None ): - """ return the correct fill value for the dtype of the values """ + """return the correct fill value for the dtype of the values""" if fill_value is not None: return fill_value if _na_ok_dtype(dtype): @@ -245,7 +245,8 @@ def _maybe_get_mask( """ if mask is None: if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype): - return np.broadcast_to(False, values.shape) + # Boolean data cannot contain nulls, so signal via mask being None + return None if skipna or needs_i8_conversion(values.dtype): mask = isna(values) @@ -349,7 +350,7 @@ def _na_ok_dtype(dtype: DtypeObj) -> bool: def _wrap_results(result, dtype: np.dtype, fill_value=None): - """ wrap our results if needed """ + """wrap our results if needed""" if result is NaT: pass @@ -1435,8 +1436,15 @@ def _maybe_null_out( Dtype The product of all elements on a given axis. ( NaNs are treated as 1) """ - if mask is not None and axis is not None and isinstance(result, np.ndarray): - null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 + if axis is not None and isinstance(result, np.ndarray): + if mask is not None: + null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 + else: + # we have no nulls, kept mask=None in _maybe_get_mask + below_count = shape[axis] - min_count < 0 + new_shape = shape[:axis] + shape[axis + 1 :] + null_mask = np.broadcast_to(below_count, new_shape) + if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 9cccf1cff60a1..297769149e5f0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -149,7 +149,7 @@ def fill_binop(left, right, fill_value): def align_method_SERIES(left: Series, right, align_asobject: bool = False): - """ align lhs and rhs Series """ + """align lhs and rhs Series""" # ToDo: Different from align_method_FRAME, list, tuple and ndarray # are not coerced here # because Series has inconsistencies described in #13637 diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 06ca6ed806f25..9134ec7a73bea 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -1,10 +1,7 @@ """ Templating for ops docstrings """ -from typing import ( - Dict, - Optional, -) +from __future__ import annotations def make_flex_doc(op_name: str, typ: str) -> str: @@ -297,7 +294,7 @@ def make_flex_doc(op_name: str, typ: str) -> str: _returns_tuple = """2-Tuple of Series\n The result of the operation.""" -_op_descriptions: Dict[str, Dict[str, Optional[str]]] = { +_op_descriptions: dict[str, dict[str, str | None]] = { # Arithmetic Operators "add": { "op": "+", diff --git a/pandas/core/ops/mask_ops.py 
b/pandas/core/ops/mask_ops.py index 968833cd1ae44..d21c80b81b582 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -1,10 +1,7 @@ """ Ops for masked arrays. """ -from typing import ( - Optional, - Union, -) +from __future__ import annotations import numpy as np @@ -15,10 +12,10 @@ def kleene_or( - left: Union[bool, np.ndarray], - right: Union[bool, np.ndarray], - left_mask: Optional[np.ndarray], - right_mask: Optional[np.ndarray], + left: bool | np.ndarray, + right: bool | np.ndarray, + left_mask: np.ndarray | None, + right_mask: np.ndarray | None, ): """ Boolean ``or`` using Kleene logic. @@ -76,10 +73,10 @@ def kleene_or( def kleene_xor( - left: Union[bool, np.ndarray], - right: Union[bool, np.ndarray], - left_mask: Optional[np.ndarray], - right_mask: Optional[np.ndarray], + left: bool | np.ndarray, + right: bool | np.ndarray, + left_mask: np.ndarray | None, + right_mask: np.ndarray | None, ): """ Boolean ``xor`` using Kleene logic. @@ -125,10 +122,10 @@ def kleene_xor( def kleene_and( - left: Union[bool, libmissing.NAType, np.ndarray], - right: Union[bool, libmissing.NAType, np.ndarray], - left_mask: Optional[np.ndarray], - right_mask: Optional[np.ndarray], + left: bool | libmissing.NAType | np.ndarray, + right: bool | libmissing.NAType | np.ndarray, + left_mask: np.ndarray | None, + right_mask: np.ndarray | None, ): """ Boolean ``and`` using Kleene logic. diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6378432392a04..76e23f1bf77e0 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -953,8 +953,6 @@ def quantile(self, q=0.5, **kwargs): """ Return value at the given quantile. - .. versionadded:: 0.24.0 - Parameters ---------- q : float or array-like, default 0.5 (50% quantile) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c05130278f75b..143999a4677b3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -931,7 +931,7 @@ def _maybe_add_join_keys( result.insert(i, name or f"key_{i}", key_col) def _get_join_indexers(self) -> tuple[np.ndarray, np.ndarray]: - """ return the join indexers """ + """return the join indexers""" # Both returned ndarrays are np.intp return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how @@ -1692,7 +1692,7 @@ def _asof_by_function(direction: str): def _get_cython_type_upcast(dtype: DtypeObj) -> str: - """ Upcast a dtype to 'int64_t', 'double', or 'object' """ + """Upcast a dtype to 'int64_t', 'double', or 'object'""" if is_integer_dtype(dtype): return "int64_t" elif is_float_dtype(dtype): @@ -1883,10 +1883,10 @@ def _get_merge_keys(self): def _get_join_indexers(self) -> tuple[np.ndarray, np.ndarray]: # Both returned ndarrays are np.intp - """ return the join indexers """ + """return the join indexers""" def flip(xs) -> np.ndarray: - """ unlike np.transpose, this returns an array of tuples """ + """unlike np.transpose, this returns an array of tuples""" # error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has # no attribute "_values_for_argsort" xs = [ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 64daf2542e15a..7db30dc1ba9b9 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -552,7 +552,7 @@ def _convert_bin_to_datelike_type(bins, dtype): def _format_labels( bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None ): - """ based on the dtype, return our labels """ + """based on the dtype, return our 
labels""" closed = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f45a2adbdec7..59ea6710ea6cd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1972,8 +1972,6 @@ def mode(self, dropna=True) -> Series: dropna : bool, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.24.0 - Returns ------- Series @@ -2476,7 +2474,7 @@ def corr(self, other, method="pearson", min_periods=None) -> float: - spearman : Spearman rank correlation - callable: Callable with input two 1d ndarrays and returning a float. - .. versionadded:: 0.24.0 + .. warning:: Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable's behavior. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7643019ff8c55..323cb6bd9fedd 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,7 +13,10 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import FrameOrSeriesUnion +from pandas._typing import ( + DtypeObj, + FrameOrSeriesUnion, +) from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -209,8 +212,12 @@ def _validate(data): # see _libs/lib.pyx for list of inferred types allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal + # TODO: avoid kludge for tests.extension.test_numpy + from pandas.core.internals.managers import _extract_array + + data = _extract_array(data) + + values = getattr(data, "categories", data) # categorical / normal inferred_dtype = lib.infer_dtype(values, skipna=True) @@ -242,6 +249,7 @@ def _wrap_result( expand: bool | None = None, fill_value=np.nan, returns_string=True, + returns_bool: bool = False, ): from pandas import ( Index, @@ -319,11 +327,17 @@ def cons_row(x): else: index = self._orig.index # This is a mess. 
- dtype: str | None - if self._is_string and returns_string: - dtype = self._orig.dtype + dtype: DtypeObj | str | None + vdtype = getattr(result, "dtype", None) + if self._is_string: + if is_bool_dtype(vdtype): + dtype = result.dtype + elif returns_string: + dtype = self._orig.dtype + else: + dtype = vdtype else: - dtype = None + dtype = vdtype if expand: cons = self._orig._constructor_expanddim @@ -331,7 +345,7 @@ def cons_row(x): else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index) + result = cons(result, name=name, index=index, dtype=dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -369,7 +383,7 @@ def _get_series_list(self, others): if isinstance(others, ABCSeries): return [others] elif isinstance(others, ABCIndex): - return [Series(others._values, index=idx)] + return [Series(others._values, index=idx, dtype=others.dtype)] elif isinstance(others, ABCDataFrame): return [others[x] for x in others] elif isinstance(others, np.ndarray) and others.ndim == 2: @@ -547,7 +561,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): sep = "" if isinstance(self._orig, ABCIndex): - data = Series(self._orig, index=self._orig) + data = Series(self._orig, index=self._orig, dtype=self._orig.dtype) else: # Series data = self._orig @@ -3080,7 +3094,7 @@ def _result_dtype(arr): from pandas.core.arrays.string_ import StringDtype if isinstance(arr.dtype, StringDtype): - return arr.dtype.name + return arr.dtype else: return object diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 7ce4abe904f3b..02bdb7f181583 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -173,8 +173,7 @@ def scalar_rep(x): return self._str_map(scalar_rep, dtype=str) else: - from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_ import BaseStringArray def rep(x, r): if x is libmissing.NA: @@ -186,7 +185,7 @@ def rep(x, r): repeats = np.asarray(repeats, dtype=object) result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, (StringArray, ArrowStringArray)): + if isinstance(self, BaseStringArray): # Not going through map, so we have to do this here. 
result = type(self)._from_sequence(result) return result diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 6f5e8ab900dfd..6dfd67f5dc5ec 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -1,4 +1,4 @@ -from typing import Optional +from __future__ import annotations import numpy as np @@ -166,7 +166,7 @@ def to_numeric(arg, errors="raise", downcast=None): # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting - mask: Optional[np.ndarray] = None + mask: np.ndarray | None = None if isinstance(values, NumericArray): mask = values._mask values = values._data[~mask] diff --git a/pandas/core/tools/times.py b/pandas/core/tools/times.py index d5ccae9abc385..030cee3f678f4 100644 --- a/pandas/core/tools/times.py +++ b/pandas/core/tools/times.py @@ -1,11 +1,9 @@ +from __future__ import annotations + from datetime import ( datetime, time, ) -from typing import ( - List, - Optional, -) import numpy as np @@ -61,7 +59,7 @@ def _convert_listlike(arg, format): if infer_time_format and format is None: format = _guess_time_format_for_array(arg) - times: List[Optional[time]] = [] + times: list[time | None] = [] if format is not None: for element in arg: try: diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index 8a2e24b25268c..96907df3c48ad 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,11 +1,8 @@ """Common utilities for Numba operations""" +from __future__ import annotations + import types -from typing import ( - Callable, - Dict, - Optional, - Tuple, -) +from typing import Callable import numpy as np @@ -15,10 +12,10 @@ from pandas.util.version import Version GLOBAL_USE_NUMBA: bool = False -NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = {} +NUMBA_FUNC_CACHE: dict[tuple[Callable, str], Callable] = {} -def maybe_use_numba(engine: Optional[str]) -> bool: +def maybe_use_numba(engine: str | None) -> bool: """Signal whether to use numba routines.""" return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA) @@ -31,8 +28,8 @@ def set_use_numba(enable: bool = False) -> None: def get_jit_arguments( - engine_kwargs: Optional[Dict[str, bool]] = None, kwargs: Optional[Dict] = None -) -> Tuple[bool, bool, bool]: + engine_kwargs: dict[str, bool] | None = None, kwargs: dict | None = None +) -> tuple[bool, bool, bool]: """ Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. 
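As context for the `maybe_use_numba` / `get_jit_arguments` signatures touched above (the change itself is purely a typing modernization): these helpers back the public `engine=` / `engine_kwargs=` keywords on window operations, with compiled kernels cached in `NUMBA_FUNC_CACHE`. A hedged usage sketch, assuming numba is installed; the `except` branch is the non-numba fallback:

```python
import numpy as np
import pandas as pd

s = pd.Series(np.arange(10, dtype="float64"))

# Defaults that get_jit_arguments falls back to when engine_kwargs is None.
kwargs = {"nopython": True, "nogil": False, "parallel": False}

try:
    # engine="numba" is routed through maybe_use_numba(); the jitted kernel is
    # cached so repeated calls with the same function skip recompilation.
    out = s.rolling(3).apply(
        lambda x: x.mean(), raw=True, engine="numba", engine_kwargs=kwargs
    )
except ImportError:
    # numba not installed: use the default (cython) execution path instead.
    out = s.rolling(3).apply(lambda x: x.mean(), raw=True)

print(out.tail())
```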
diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 4187c56079060..c1d532d94eb83 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -41,6 +41,10 @@ GroupbyIndexer, ) from pandas.core.window.numba_ import generate_numba_ewma_func +from pandas.core.window.online import ( + EWMMeanState, + generate_online_numba_ewma_func, +) from pandas.core.window.rolling import ( BaseWindow, BaseWindowGroupby, @@ -263,7 +267,7 @@ def __init__( span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, - min_periods: int = 0, + min_periods: int | None = 0, adjust: bool = True, ignore_na: bool = False, axis: Axis = 0, @@ -273,7 +277,7 @@ def __init__( ): super().__init__( obj=obj, - min_periods=max(int(min_periods), 1), + min_periods=1 if min_periods is None else max(int(min_periods), 1), on=None, center=False, closed=None, @@ -338,6 +342,48 @@ def _get_window_indexer(self) -> BaseIndexer: """ return ExponentialMovingWindowIndexer() + def online(self, engine="numba", engine_kwargs=None): + """ + Return an ``OnlineExponentialMovingWindow`` object to calculate + exponentially moving window aggregations in an online method. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + engine: str, default ``'numba'`` + Execution engine to calculate online aggregations. + Applies to all supported aggregation methods. + + engine_kwargs : dict, default None + Applies to all supported aggregation methods. + + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to the function + + Returns + ------- + OnlineExponentialMovingWindow + """ + return OnlineExponentialMovingWindow( + obj=self.obj, + com=self.com, + span=self.span, + halflife=self.halflife, + alpha=self.alpha, + min_periods=self.min_periods, + adjust=self.adjust, + ignore_na=self.ignore_na, + axis=self.axis, + times=self.times, + engine=engine, + engine_kwargs=engine_kwargs, + selection=self._selection, + ) + @doc( _shared_docs["aggregate"], see_also=dedent( @@ -655,3 +701,167 @@ def _get_window_indexer(self) -> GroupbyIndexer: window_indexer=ExponentialMovingWindowIndexer, ) return window_indexer + + +class OnlineExponentialMovingWindow(ExponentialMovingWindow): + def __init__( + self, + obj: FrameOrSeries, + com: float | None = None, + span: float | None = None, + halflife: float | TimedeltaConvertibleTypes | None = None, + alpha: float | None = None, + min_periods: int | None = 0, + adjust: bool = True, + ignore_na: bool = False, + axis: Axis = 0, + times: str | np.ndarray | FrameOrSeries | None = None, + engine: str = "numba", + engine_kwargs: dict[str, bool] | None = None, + *, + selection=None, + ): + if times is not None: + raise NotImplementedError( + "times is not implemented with online operations." 
+ ) + super().__init__( + obj=obj, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + selection=selection, + ) + self._mean = EWMMeanState( + self._com, self.adjust, self.ignore_na, self.axis, obj.shape + ) + if maybe_use_numba(engine): + self.engine = engine + self.engine_kwargs = engine_kwargs + else: + raise ValueError("'numba' is the only supported engine") + + def reset(self): + """ + Reset the state captured by `update` calls. + """ + self._mean.reset() + + def aggregate(self, func, *args, **kwargs): + return NotImplementedError + + def std(self, bias: bool = False, *args, **kwargs): + return NotImplementedError + + def corr( + self, + other: FrameOrSeriesUnion | None = None, + pairwise: bool | None = None, + **kwargs, + ): + return NotImplementedError + + def cov( + self, + other: FrameOrSeriesUnion | None = None, + pairwise: bool | None = None, + bias: bool = False, + **kwargs, + ): + return NotImplementedError + + def var(self, bias: bool = False, *args, **kwargs): + return NotImplementedError + + def mean(self, *args, update=None, update_times=None, **kwargs): + """ + Calculate an online exponentially weighted mean. + + Parameters + ---------- + update: DataFrame or Series, default None + New values to continue calculating the + exponentially weighted mean from the last values and weights. + Values should be float64 dtype. + + ``update`` needs to be ``None`` the first time the + exponentially weighted mean is calculated. + + update_times: Series or 1-D np.ndarray, default None + New times to continue calculating the + exponentially weighted mean from the last values and weights. + If ``None``, values are assumed to be evenly spaced + in time. + This feature is currently unsupported. 
+ + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> df = pd.DataFrame({"a": range(5), "b": range(5, 10)}) + >>> online_ewm = df.head(2).ewm(0.5).online() + >>> online_ewm.mean() + a b + 0 0.00 5.00 + 1 0.75 5.75 + >>> online_ewm.mean(update=df.tail(3)) + a b + 2 1.615385 6.615385 + 3 2.550000 7.550000 + 4 3.520661 8.520661 + >>> online_ewm.reset() + >>> online_ewm.mean() + a b + 0 0.00 5.00 + 1 0.75 5.75 + """ + result_kwargs = {} + is_frame = True if self._selected_obj.ndim == 2 else False + if update_times is not None: + raise NotImplementedError("update_times is not implemented.") + else: + update_deltas = np.ones( + max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64 + ) + if update is not None: + if self._mean.last_ewm is None: + raise ValueError( + "Must call mean with update=None first before passing update" + ) + result_from = 1 + result_kwargs["index"] = update.index + if is_frame: + last_value = self._mean.last_ewm[np.newaxis, :] + result_kwargs["columns"] = update.columns + else: + last_value = self._mean.last_ewm + result_kwargs["name"] = update.name + np_array = np.concatenate((last_value, update.to_numpy())) + else: + result_from = 0 + result_kwargs["index"] = self._selected_obj.index + if is_frame: + result_kwargs["columns"] = self._selected_obj.columns + else: + result_kwargs["name"] = self._selected_obj.name + np_array = self._selected_obj.astype(np.float64).to_numpy() + ewma_func = generate_online_numba_ewma_func(self.engine_kwargs) + result = self._mean.run_ewm( + np_array if is_frame else np_array[:, np.newaxis], + update_deltas, + self.min_periods, + ewma_func, + ) + if not is_frame: + result = result.squeeze() + result = result[result_from:] + result = self._selected_obj._constructor(result, **result_kwargs) + return result diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index cddb3ef56250d..02cf31cad7b8d 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,10 +1,9 @@ +from __future__ import annotations + from textwrap import dedent from typing import ( Any, Callable, - Dict, - Optional, - Tuple, ) from pandas._typing import ( @@ -184,10 +183,10 @@ def apply( self, func: Callable[..., Any], raw: bool = False, - engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, - args: Optional[Tuple[Any, ...]] = None, - kwargs: Optional[Dict[str, Any]] = None, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + args: tuple[Any, ...] 
| None = None, + kwargs: dict[str, Any] | None = None, ): return super().apply( func, @@ -217,8 +216,8 @@ def apply( def sum( self, *args, - engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("sum", args, kwargs) @@ -243,8 +242,8 @@ def sum( def max( self, *args, - engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("max", args, kwargs) @@ -269,8 +268,8 @@ def max( def min( self, *args, - engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("min", args, kwargs) @@ -295,8 +294,8 @@ def min( def mean( self, *args, - engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("mean", args, kwargs) @@ -319,8 +318,8 @@ def mean( ) def median( self, - engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): return super().median(engine=engine, engine_kwargs=engine_kwargs, **kwargs) @@ -592,8 +591,8 @@ def quantile( ) def cov( self, - other: Optional[FrameOrSeriesUnion] = None, - pairwise: Optional[bool] = None, + other: FrameOrSeriesUnion | None = None, + pairwise: bool | None = None, ddof: int = 1, **kwargs, ): @@ -657,8 +656,8 @@ def cov( ) def corr( self, - other: Optional[FrameOrSeriesUnion] = None, - pairwise: Optional[bool] = None, + other: FrameOrSeriesUnion | None = None, + pairwise: bool | None = None, ddof: int = 1, **kwargs, ): diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 1ad80b2e4c908..cef023a647d7f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,11 +1,7 @@ """Indexer objects for computing start/end window bounds for rolling operations""" +from __future__ import annotations + from datetime import timedelta -from typing import ( - Dict, - Optional, - Tuple, - Type, -) import numpy as np @@ -45,7 +41,7 @@ class BaseIndexer: """Base class for window bounds calculations.""" def __init__( - self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs + self, index_array: np.ndarray | None = None, window_size: int = 0, **kwargs ): """ Parameters @@ -63,10 +59,10 @@ def __init__( def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: raise NotImplementedError @@ -78,10 +74,10 @@ class FixedWindowIndexer(BaseIndexer): def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: if center: offset = (self.window_size - 1) // 2 @@ -108,10 +104,10 @@ class VariableWindowIndexer(BaseIndexer): def 
get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: # error: Argument 4 to "calculate_variable_window_bounds" has incompatible # type "Optional[bool]"; expected "bool" @@ -132,7 +128,7 @@ class VariableOffsetWindowIndexer(BaseIndexer): def __init__( self, - index_array: Optional[np.ndarray] = None, + index_array: np.ndarray | None = None, window_size: int = 0, index=None, offset=None, @@ -146,10 +142,10 @@ def __init__( def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: # if windows is variable, default is 'right', otherwise default is 'both' if closed is None: @@ -216,10 +212,10 @@ class ExpandingIndexer(BaseIndexer): def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: return ( np.zeros(num_values, dtype=np.int64), @@ -257,10 +253,10 @@ class FixedForwardWindowIndexer(BaseIndexer): def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: if center: raise ValueError("Forward-looking windows can't have center=True") @@ -282,11 +278,11 @@ class GroupbyIndexer(BaseIndexer): def __init__( self, - index_array: Optional[np.ndarray] = None, + index_array: np.ndarray | None = None, window_size: int = 0, - groupby_indicies: Optional[Dict] = None, - window_indexer: Type[BaseIndexer] = BaseIndexer, - indexer_kwargs: Optional[Dict] = None, + groupby_indicies: dict | None = None, + window_indexer: type[BaseIndexer] = BaseIndexer, + indexer_kwargs: dict | None = None, **kwargs, ): """ @@ -318,10 +314,10 @@ def __init__( def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: # 1) For each group, get the indices that belong to the group # 2) Use the indices to calculate the start & end bounds of the window # 3) Append the window bounds in group order @@ -369,9 +365,9 @@ class ExponentialMovingWindowIndexer(BaseIndexer): def get_window_bounds( self, num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + ) -> tuple[np.ndarray, np.ndarray]: return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 
9407efd0bef2b..d00be0ea840a8 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -1,10 +1,9 @@ +from __future__ import annotations + import functools from typing import ( Any, Callable, - Dict, - Optional, - Tuple, ) import numpy as np @@ -20,10 +19,10 @@ def generate_numba_apply_func( - args: Tuple, - kwargs: Dict[str, Any], + args: tuple, + kwargs: dict[str, Any], func: Callable[..., Scalar], - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, name: str, ): """ @@ -81,7 +80,7 @@ def roll_apply( def generate_numba_ewma_func( - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, com: float, adjust: bool, ignore_na: bool, @@ -170,10 +169,10 @@ def ewma( def generate_numba_table_func( - args: Tuple, - kwargs: Dict[str, Any], + args: tuple, + kwargs: dict[str, Any], func: Callable[..., np.ndarray], - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, name: str, ): """ diff --git a/pandas/core/window/online.py b/pandas/core/window/online.py new file mode 100644 index 0000000000000..5a9e8d65255ae --- /dev/null +++ b/pandas/core/window/online.py @@ -0,0 +1,118 @@ +from typing import ( + Dict, + Optional, +) + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + get_jit_arguments, +) + + +def generate_online_numba_ewma_func(engine_kwargs: Optional[Dict[str, bool]]): + """ + Generate a numba jitted groupby ewma function specified by values + from engine_kwargs. + Parameters + ---------- + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + cache_key = (lambda x: x, "online_ewma") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] + + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def online_ewma( + values: np.ndarray, + deltas: np.ndarray, + minimum_periods: int, + old_wt_factor: float, + new_wt: float, + old_wt: np.ndarray, + adjust: bool, + ignore_na: bool, + ): + """ + Compute online exponentially weighted mean per column over 2D values. + + Takes the first observation as is, then computes the subsequent + exponentially weighted mean accounting minimum periods. 
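For readers skimming the jitted kernel that follows: per column it applies the standard adjusted/unadjusted exponentially weighted update and only emits a value once `minimum_periods` observations have been seen. A plain-NumPy sketch of that recurrence for a single 1-D series, ignoring the `deltas`/`ignore_na` handling and assuming the first value is not NaN (the helper name is invented for illustration):

```python
import numpy as np
import pandas as pd


def ewma_sketch(values, com, adjust=True, minimum_periods=1):
    alpha = 1.0 / (1.0 + com)
    old_wt_factor = 1.0 - alpha
    new_wt = 1.0 if adjust else alpha
    old_wt = 1.0

    result = np.empty(len(values))
    weighted_avg = values[0]
    nobs = int(not np.isnan(weighted_avg))
    result[0] = weighted_avg if nobs >= minimum_periods else np.nan

    for i in range(1, len(values)):
        cur = values[i]
        if not np.isnan(cur):
            nobs += 1
            # decay the accumulated weight, then blend in the new observation
            old_wt *= old_wt_factor
            weighted_avg = (old_wt * weighted_avg + new_wt * cur) / (old_wt + new_wt)
            old_wt = old_wt + new_wt if adjust else 1.0
        result[i] = weighted_avg if nobs >= minimum_periods else np.nan
    return result


vals = np.array([1.0, 3.0, 2.0, 5.0])
print(ewma_sketch(vals, com=0.5))                               # online recurrence
print(pd.Series(vals).ewm(com=0.5, adjust=True).mean().values)  # should match
```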
+ """ + result = np.empty(values.shape) + weighted_avg = values[0] + nobs = (~np.isnan(weighted_avg)).astype(np.int64) + result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) + + for i in range(1, len(values)): + cur = values[i] + is_observations = ~np.isnan(cur) + nobs += is_observations.astype(np.int64) + for j in numba.prange(len(cur)): + if not np.isnan(weighted_avg[j]): + if is_observations[j] or not ignore_na: + + # note that len(deltas) = len(vals) - 1 and deltas[i] is to be + # used in conjunction with vals[i+1] + old_wt[j] *= old_wt_factor ** deltas[j - 1] + if is_observations[j]: + # avoid numerical errors on constant series + if weighted_avg[j] != cur[j]: + weighted_avg[j] = ( + (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j]) + ) / (old_wt[j] + new_wt) + if adjust: + old_wt[j] += new_wt + else: + old_wt[j] = 1.0 + elif is_observations[j]: + weighted_avg[j] = cur[j] + + result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) + + return result, old_wt + + return online_ewma + + +class EWMMeanState: + def __init__(self, com, adjust, ignore_na, axis, shape): + alpha = 1.0 / (1.0 + com) + self.axis = axis + self.shape = shape + self.adjust = adjust + self.ignore_na = ignore_na + self.new_wt = 1.0 if adjust else alpha + self.old_wt_factor = 1.0 - alpha + self.old_wt = np.ones(self.shape[self.axis - 1]) + self.last_ewm = None + + def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func): + result, old_wt = ewm_func( + weighted_avg, + deltas, + min_periods, + self.old_wt_factor, + self.new_wt, + self.old_wt, + self.adjust, + self.ignore_na, + ) + self.old_wt = old_wt + self.last_ewm = result[-1] + return result + + def reset(self): + self.old_wt = np.ones(self.shape[self.axis - 1]) + self.last_ewm = None diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f7a4bf7c5ede5..719a4472fb9e3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -114,16 +114,10 @@ both sides. * If list of int, then indicates list of column numbers to be parsed. * If list of string, then indicates list of column names to be parsed. - - .. versionadded:: 0.24.0 - * If callable, then evaluate each column name against it and parse the column if the callable returns ``True``. Returns a subset of the columns according to behavior above. - - .. versionadded:: 0.24.0 - squeeze : bool, default False If the parsed data only contains one column then return a Series. dtype : Type name or dict of column -> type, default None @@ -680,8 +674,6 @@ class ExcelWriter(metaclass=abc.ABCMeta): (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' File mode to use (write or append). Append does not work with fsspec URLs. - - .. versionadded:: 0.24.0 storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. 
host, port, username, password, etc., if using a URL that will diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c105465cddd95..e0c5a2c6a7ff9 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,4 +1,4 @@ -from typing import List +from __future__ import annotations import numpy as np @@ -51,7 +51,7 @@ def empty_value(self) -> str: return "" @property - def sheet_names(self) -> List[str]: + def sheet_names(self) -> list[str]: """Return a list of sheet names present in the document""" from odf.table import Table @@ -78,7 +78,7 @@ def get_sheet_by_name(self, name: str): self.close() raise ValueError(f"sheet {name} not found") - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: """ Parse an ODF Table into a list of lists """ @@ -96,12 +96,12 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: empty_rows = 0 max_row_len = 0 - table: List[List[Scalar]] = [] + table: list[list[Scalar]] = [] for sheet_row in sheet_rows: sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 - table_row: List[Scalar] = [] + table_row: list[Scalar] = [] for sheet_cell in sheet_cells: if sheet_cell.qname == table_cell_name: diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 7b6634fff1c16..efef86329314b 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -1,13 +1,10 @@ +from __future__ import annotations + from collections import defaultdict import datetime from typing import ( Any, DefaultDict, - Dict, - List, - Optional, - Tuple, - Union, ) import pandas._libs.json as json @@ -25,13 +22,13 @@ class ODSWriter(ExcelWriter): def __init__( self, path: str, - engine: Optional[str] = None, + engine: str | None = None, date_format=None, datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, - if_sheet_exists: Optional[str] = None, - engine_kwargs: Optional[Dict[str, Any]] = None, + if_sheet_exists: str | None = None, + engine_kwargs: dict[str, Any] | None = None, ): from odf.opendocument import OpenDocumentSpreadsheet @@ -47,7 +44,7 @@ def __init__( ) self.book = OpenDocumentSpreadsheet() - self._style_dict: Dict[str, str] = {} + self._style_dict: dict[str, str] = {} def save(self) -> None: """ @@ -59,11 +56,11 @@ def save(self) -> None: def write_cells( self, - cells: List[ExcelCell], - sheet_name: Optional[str] = None, + cells: list[ExcelCell], + sheet_name: str | None = None, startrow: int = 0, startcol: int = 0, - freeze_panes: Optional[Tuple[int, int]] = None, + freeze_panes: tuple[int, int] | None = None, ) -> None: """ Write the frame cells using odf @@ -115,7 +112,7 @@ def write_cells( for row_nr in range(max(rows.keys()) + 1): wks.addElement(rows[row_nr]) - def _make_table_cell_attributes(self, cell) -> Dict[str, Union[int, str]]: + def _make_table_cell_attributes(self, cell) -> dict[str, int | str]: """Convert cell attributes to OpenDocument attributes Parameters @@ -128,7 +125,7 @@ def _make_table_cell_attributes(self, cell) -> Dict[str, Union[int, str]]: attributes : Dict[str, Union[int, str]] Dictionary with attributes and attribute values """ - attributes: Dict[str, Union[int, str]] = {} + attributes: dict[str, int | str] = {} style_name = self._process_style(cell.style) if style_name is not None: attributes["stylename"] = style_name @@ -137,7 +134,7 @@ def _make_table_cell_attributes(self, cell) -> 
Dict[str, Union[int, str]]: attributes["numbercolumnsspanned"] = cell.mergeend return attributes - def _make_table_cell(self, cell) -> Tuple[str, Any]: + def _make_table_cell(self, cell) -> tuple[str, Any]: """Convert cell data to an OpenDocument spreadsheet cell Parameters @@ -188,7 +185,7 @@ def _make_table_cell(self, cell) -> Tuple[str, Any]: ), ) - def _process_style(self, style: Dict[str, Any]) -> str: + def _process_style(self, style: dict[str, Any]) -> str: """Convert a style dictionary to a OpenDocument style sheet Parameters @@ -241,7 +238,7 @@ def _process_style(self, style: Dict[str, Any]) -> str: return name def _create_freeze_panes( - self, sheet_name: str, freeze_panes: Tuple[int, int] + self, sheet_name: str, freeze_panes: tuple[int, int] ) -> None: """ Create freeze panes in the sheet. diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 02b8090adcfdf..52a67336aaa82 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,4 +1,4 @@ -from typing import List +from __future__ import annotations from pandas._typing import ( FilePathOrBuffer, @@ -47,7 +47,7 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): return open_workbook(filepath_or_buffer) @property - def sheet_names(self) -> List[str]: + def sheet_names(self) -> list[str]: return self.book.sheets def get_sheet_by_name(self, name: str): @@ -74,7 +74,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: data: list[list[Scalar]] = [] prevous_row_number = -1 # When sparse=True the rows can have different lengths and empty rows are diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 6612b681a9171..7d8028de23257 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,7 +1,6 @@ -from typing import ( - List, - MutableMapping, -) +from __future__ import annotations + +from typing import MutableMapping from pandas.compat._optional import import_optional_dependency @@ -110,7 +109,7 @@ def _excel2num(x: str) -> int: return index - 1 -def _range2cols(areas: str) -> List[int]: +def _range2cols(areas: str) -> list[int]: """ Convert comma separated list of column names and ranges to indices. @@ -131,7 +130,7 @@ def _range2cols(areas: str) -> List[int]: >>> _range2cols('A,C,Z:AB') [0, 2, 25, 26, 27] """ - cols: List[int] = [] + cols: list[int] = [] for rng in areas.split(","): if ":" in rng: diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 27b3ae3fab9bc..7500a33b1f097 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -1,10 +1,6 @@ -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) +from __future__ import annotations + +from typing import Any import pandas._libs.json as json from pandas._typing import StorageOptions @@ -17,7 +13,7 @@ class _XlsxStyler: # Map from openpyxl-oriented styles to flatter xlsxwriter representation # Ordering necessary for both determinism and because some are keyed by # prefixes of others. 
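The `if_sheet_exists` argument threaded through these writer classes is the user-facing `ExcelWriter` option added in 1.3.0; it is only honoured together with append mode. A hedged sketch of the intended usage (file and sheet names are illustrative; append mode needs the openpyxl engine installed):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df.to_excel("report.xlsx", sheet_name="data", index=False)

# "replace" rewrites an existing sheet instead of raising; "new" adds a sheet
# under a fresh name; the default "error" keeps the old behaviour.
with pd.ExcelWriter("report.xlsx", mode="a", if_sheet_exists="replace") as writer:
    df.to_excel(writer, sheet_name="data", index=False)
```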
- STYLE_MAPPING: Dict[str, List[Tuple[Tuple[str, ...], str]]] = { + STYLE_MAPPING: dict[str, list[tuple[tuple[str, ...], str]]] = { "font": [ (("name",), "font_name"), (("sz",), "font_size"), @@ -177,8 +173,8 @@ def __init__( datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, - if_sheet_exists: Optional[str] = None, - engine_kwargs: Optional[Dict[str, Any]] = None, + if_sheet_exists: str | None = None, + engine_kwargs: dict[str, Any] | None = None, ): # Use the xlsxwriter module as the Excel writer. from xlsxwriter import Workbook diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 8d5bd4a9608d4..8a7605b80f6b4 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -1,8 +1,8 @@ +from __future__ import annotations + from typing import ( TYPE_CHECKING, Any, - Dict, - Optional, ) import pandas._libs.json as json @@ -28,8 +28,8 @@ def __init__( encoding=None, mode: str = "w", storage_options: StorageOptions = None, - if_sheet_exists: Optional[str] = None, - engine_kwargs: Optional[Dict[str, Any]] = None, + if_sheet_exists: str | None = None, + engine_kwargs: dict[str, Any] | None = None, ): # Use the xlwt module as the Excel writer. import xlwt @@ -76,7 +76,7 @@ def write_cells( wks.set_horz_split_pos(freeze_panes[0]) wks.set_vert_split_pos(freeze_panes[1]) - style_dict: Dict[str, XFStyle] = {} + style_dict: dict[str, XFStyle] = {} for cell in cells: val, fmt = self._value_with_fmt(cell.val) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d032c54395c6d..b5d819fefb370 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -110,12 +110,8 @@ def read_feather( or ``StringIO``. columns : sequence, default None If not provided, all columns are read. - - .. versionadded:: 0.24.0 use_threads : bool, default True Whether to parallelize reading using multiple threads. - - .. versionadded:: 0.24.0 {storage_options} .. versionadded:: 1.2.0 diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index f27bae2c161f3..956951a6f2f3d 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -1,12 +1,9 @@ """ Utilities for interpreting CSS from Stylers for formatting non-HTML outputs. """ +from __future__ import annotations import re -from typing import ( - Dict, - Optional, -) import warnings @@ -91,8 +88,8 @@ class CSSResolver: def __call__( self, declarations_str: str, - inherited: Optional[Dict[str, str]] = None, - ) -> Dict[str, str]: + inherited: dict[str, str] | None = None, + ) -> dict[str, str]: """ The given declarations to atomic properties. @@ -140,9 +137,9 @@ def __call__( def _update_initial( self, - props: Dict[str, str], - inherited: Dict[str, str], - ) -> Dict[str, str]: + props: dict[str, str], + inherited: dict[str, str], + ) -> dict[str, str]: # 1. resolve inherited, initial for prop, val in inherited.items(): if prop not in props: @@ -162,9 +159,9 @@ def _update_initial( def _update_font_size( self, - props: Dict[str, str], - inherited: Dict[str, str], - ) -> Dict[str, str]: + props: dict[str, str], + inherited: dict[str, str], + ) -> dict[str, str]: # 2. 
resolve relative font size if props.get("font-size"): props["font-size"] = self.size_to_pt( @@ -174,7 +171,7 @@ def _update_font_size( ) return props - def _get_font_size(self, props: Dict[str, str]) -> Optional[float]: + def _get_font_size(self, props: dict[str, str]) -> float | None: if props.get("font-size"): font_size_string = props["font-size"] return self._get_float_font_size_from_pt(font_size_string) @@ -184,7 +181,7 @@ def _get_float_font_size_from_pt(self, font_size_string: str) -> float: assert font_size_string.endswith("pt") return float(font_size_string.rstrip("pt")) - def _update_other_units(self, props: Dict[str, str]) -> Dict[str, str]: + def _update_other_units(self, props: dict[str, str]) -> dict[str, str]: font_size = self._get_font_size(props) # 3. TODO: resolve other font-relative units for side in self.SIDES: diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 915a17fc702c3..f078975e4b85a 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -48,7 +48,7 @@ class CSVFormatter: def __init__( self, formatter: DataFrameFormatter, - path_or_buf: FilePathOrBuffer[str] = "", + path_or_buf: FilePathOrBuffer[str] | FilePathOrBuffer[bytes] = "", sep: str = ",", cols: Sequence[Hashable] | None = None, index_label: IndexLabel | None = None, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index c6ff4e2180893..b285fa5f315ed 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -1,19 +1,17 @@ """ Utilities for conversion to writer-agnostic Excel representation. """ +from __future__ import annotations from functools import reduce import itertools import re from typing import ( Callable, - Dict, Hashable, Iterable, Mapping, - Optional, Sequence, - Union, cast, ) import warnings @@ -61,8 +59,8 @@ def __init__( col: int, val, style=None, - mergestart: Optional[int] = None, - mergeend: Optional[int] = None, + mergestart: int | None = None, + mergeend: int | None = None, ): self.row = row self.col = col @@ -135,9 +133,9 @@ class CSSToExcelConverter: # and __call__ make use of instance attributes. We leave them as # instancemethods so that users can easily experiment with extensions # without monkey-patching. - inherited: Optional[Dict[str, str]] + inherited: dict[str, str] | None - def __init__(self, inherited: Optional[str] = None): + def __init__(self, inherited: str | None = None): if inherited is not None: self.inherited = self.compute_css(inherited) else: @@ -145,7 +143,7 @@ def __init__(self, inherited: Optional[str] = None): compute_css = CSSResolver() - def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: + def __call__(self, declarations_str: str) -> dict[str, dict[str, str]]: """ Convert CSS declarations to ExcelWriter style. 
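The converter retyped above drives CSS-to-Excel styling (for example via `Styler.to_excel`): it turns a CSS declaration string into the nested style dictionary the Excel writers consume. A small sketch against the internal API shown here; being internal, the exact output shape may change between versions:

```python
from pandas.io.formats.excel import CSSToExcelConverter

converter = CSSToExcelConverter()
# build_xlstyle groups the result under "alignment", "border", "fill",
# "font" and "number_format", pruning entries that resolve to None.
style = converter("font-weight: bold; font-style: italic; text-align: center")
print(style)
```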
@@ -165,7 +163,7 @@ def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props: Mapping[str, str]) -> Dict[str, Dict[str, str]]: + def build_xlstyle(self, props: Mapping[str, str]) -> dict[str, dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -176,7 +174,7 @@ def build_xlstyle(self, props: Mapping[str, str]) -> Dict[str, Dict[str, str]]: # TODO: handle cell width and height: needs support in pandas.io.excel - def remove_none(d: Dict[str, str]) -> None: + def remove_none(d: dict[str, str]) -> None: """Remove key where value is None, through nested dicts""" for k, v in list(d.items()): if v is None: @@ -189,9 +187,7 @@ def remove_none(d: Dict[str, str]) -> None: remove_none(out) return out - def build_alignment( - self, props: Mapping[str, str] - ) -> Dict[str, Optional[Union[bool, str]]]: + def build_alignment(self, props: Mapping[str, str]) -> dict[str, bool | str | None]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), @@ -199,20 +195,20 @@ def build_alignment( "wrap_text": self._get_is_wrap_text(props), } - def _get_vertical_alignment(self, props: Mapping[str, str]) -> Optional[str]: + def _get_vertical_alignment(self, props: Mapping[str, str]) -> str | None: vertical_align = props.get("vertical-align") if vertical_align: return self.VERTICAL_MAP.get(vertical_align) return None - def _get_is_wrap_text(self, props: Mapping[str, str]) -> Optional[bool]: + def _get_is_wrap_text(self, props: Mapping[str, str]) -> bool | None: if props.get("white-space") is None: return None return bool(props["white-space"] not in ("nowrap", "pre", "pre-line")) def build_border( self, props: Mapping[str, str] - ) -> Dict[str, Dict[str, Optional[str]]]: + ) -> dict[str, dict[str, str | None]]: return { side: { "style": self._border_style( @@ -224,7 +220,7 @@ def build_border( for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style: Optional[str], width: Optional[str]): + def _border_style(self, style: str | None, width: str | None): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -263,7 +259,7 @@ def _border_style(self, style: Optional[str], width: Optional[str]): return "dashed" return "mediumDashed" - def _get_width_name(self, width_input: Optional[str]) -> Optional[str]: + def _get_width_name(self, width_input: str | None) -> str | None: width = self._width_to_float(width_input) if width < 1e-5: return None @@ -273,7 +269,7 @@ def _get_width_name(self, width_input: Optional[str]) -> Optional[str]: return "medium" return "thick" - def _width_to_float(self, width: Optional[str]) -> float: + def _width_to_float(self, width: str | None) -> float: if width is None: width = "2pt" return self._pt_to_float(width) @@ -289,12 +285,12 @@ def build_fill(self, props: Mapping[str, str]): if fill_color not in (None, "transparent", "none"): return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} - def build_number_format(self, props: Mapping[str, str]) -> Dict[str, Optional[str]]: + def build_number_format(self, props: Mapping[str, str]) -> dict[str, str | None]: return {"format_code": props.get("number-format")} def build_font( self, props: Mapping[str, str] - ) -> Dict[str, Optional[Union[bool, int, float, str]]]: + ) -> dict[str, bool | int | float | str | None]: font_names = 
self._get_font_names(props) decoration = self._get_decoration(props) return { @@ -316,13 +312,13 @@ def build_font( # 'condense': , } - def _get_is_bold(self, props: Mapping[str, str]) -> Optional[bool]: + def _get_is_bold(self, props: Mapping[str, str]) -> bool | None: weight = props.get("font-weight") if weight: return self.BOLD_MAP.get(weight) return None - def _get_is_italic(self, props: Mapping[str, str]) -> Optional[bool]: + def _get_is_italic(self, props: Mapping[str, str]) -> bool | None: font_style = props.get("font-style") if font_style: return self.ITALIC_MAP.get(font_style) @@ -335,12 +331,12 @@ def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]: else: return () - def _get_underline(self, decoration: Sequence[str]) -> Optional[str]: + def _get_underline(self, decoration: Sequence[str]) -> str | None: if "underline" in decoration: return "single" return None - def _get_shadow(self, props: Mapping[str, str]) -> Optional[bool]: + def _get_shadow(self, props: Mapping[str, str]) -> bool | None: if "text-shadow" in props: return bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) return None @@ -371,13 +367,13 @@ def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]: font_names.append(name) return font_names - def _get_font_size(self, props: Mapping[str, str]) -> Optional[float]: + def _get_font_size(self, props: Mapping[str, str]) -> float | None: size = props.get("font-size") if size is None: return size return self._pt_to_float(size) - def _select_font_family(self, font_names) -> Optional[int]: + def _select_font_family(self, font_names) -> int | None: family = None for name in font_names: family = self.FAMILY_MAP.get(name) @@ -386,7 +382,7 @@ def _select_font_family(self, font_names) -> Optional[int]: return family - def color_to_excel(self, val: Optional[str]) -> Optional[str]: + def color_to_excel(self, val: str | None) -> str | None: if val is None: return None @@ -463,14 +459,14 @@ def __init__( self, df, na_rep: str = "", - float_format: Optional[str] = None, - cols: Optional[Sequence[Hashable]] = None, - header: Union[Sequence[Hashable], bool] = True, + float_format: str | None = None, + cols: Sequence[Hashable] | None = None, + header: Sequence[Hashable] | bool = True, index: bool = True, - index_label: Optional[IndexLabel] = None, + index_label: IndexLabel | None = None, merge_cells: bool = False, inf_rep: str = "inf", - style_converter: Optional[Callable] = None, + style_converter: Callable | None = None, ): self.rowcounter = 0 self.na_rep = na_rep diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 485610af747f6..d1c19f348f901 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -19,6 +19,7 @@ IO, TYPE_CHECKING, Any, + AnyStr, Callable, Hashable, Iterable, @@ -1054,7 +1055,7 @@ def to_string( def to_csv( self, - path_or_buf: FilePathOrBuffer[str] | None = None, + path_or_buf: FilePathOrBuffer[AnyStr] | None = None, encoding: str | None = None, sep: str = ",", columns: Sequence[Hashable] | None = None, @@ -1541,7 +1542,7 @@ def __init__( self.date_format = date_format def _format_strings(self) -> list[str]: - """ we by definition have DO NOT have a TZ """ + """we by definition have DO NOT have a TZ""" values = self.values if not isinstance(values, DatetimeIndex): @@ -1729,7 +1730,7 @@ def get_format_datetime64( def get_format_datetime64_from_values( values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None ) -> str | None: - """ given values and a date_format, return a 
string format """ + """given values and a date_format, return a string format""" if isinstance(values, np.ndarray) and values.ndim > 1: # We don't actually care about the order of values, and DatetimeIndex # only accepts 1D values @@ -1743,7 +1744,7 @@ def get_format_datetime64_from_values( class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: - """ we by definition have a TZ """ + """we by definition have a TZ""" values = self.values.astype(object) ido = is_dates_only(values) formatter = self.formatter or get_format_datetime64( diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 7986d2e4338cb..0c927277e899a 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -1,17 +1,13 @@ """ Module for formatting output data in HTML. """ +from __future__ import annotations from textwrap import dedent from typing import ( Any, - Dict, Iterable, - List, Mapping, - Optional, - Tuple, - Union, cast, ) @@ -47,9 +43,9 @@ class HTMLFormatter: def __init__( self, formatter: DataFrameFormatter, - classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, - border: Optional[int] = None, - table_id: Optional[str] = None, + classes: str | list[str] | tuple[str, ...] | None = None, + border: int | None = None, + table_id: str | None = None, render_links: bool = False, ) -> None: self.fmt = formatter @@ -57,7 +53,7 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns - self.elements: List[str] = [] + self.elements: list[str] = [] self.bold_rows = self.fmt.bold_rows self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions @@ -78,7 +74,7 @@ def to_string(self) -> str: lines = [str(x) for x in lines] return "\n".join(lines) - def render(self) -> List[str]: + def render(self) -> list[str]: self._write_table() if self.should_show_dimensions: @@ -132,7 +128,7 @@ def write(self, s: Any, indent: int = 0) -> None: self.elements.append(" " * indent + rs) def write_th( - self, s: Any, header: bool = False, indent: int = 0, tags: Optional[str] = None + self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None ) -> None: """ Method for writing a formatted ", indent) def _write_regular_rows( - self, fmt_values: Mapping[int, List[str]], indent: int + self, fmt_values: Mapping[int, list[str]], indent: int ) -> None: is_truncated_horizontally = self.fmt.is_truncated_horizontally is_truncated_vertically = self.fmt.is_truncated_vertically @@ -421,7 +417,7 @@ def _write_regular_rows( else: index_values = self.fmt.tr_frame.index.format() - row: List[str] = [] + row: list[str] = [] for i in range(nrows): if is_truncated_vertically and i == (self.fmt.tr_row_num): @@ -453,7 +449,7 @@ def _write_regular_rows( ) def _write_hierarchical_rows( - self, fmt_values: Mapping[int, List[str]], indent: int + self, fmt_values: Mapping[int, list[str]], indent: int ) -> None: template = 'rowspan="{span}" valign="top"' @@ -585,10 +581,10 @@ class NotebookFormatter(HTMLFormatter): DataFrame._repr_html_() and DataFrame.to_html(notebook=True) """ - def _get_formatted_values(self) -> Dict[int, List[str]]: + def _get_formatted_values(self) -> dict[int, list[str]]: return {i: self.fmt.format_col(i) for i in range(self.ncols)} - def _get_columns_formatted_values(self) -> List[str]: + def _get_columns_formatted_values(self) -> list[str]: return self.columns.format() def write_style(self) -> None: @@ -619,7 +615,7 @@ def write_style(self) -> None: template = dedent("\n".join((template_first, template_mid, 
template_last))) self.write(template) - def render(self) -> List[str]: + def render(self) -> list[str]: self.write("<div>
") self.write_style() super().render() diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 476a3647207d6..e9e2b830e32cb 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -1,18 +1,15 @@ """ Module for formatting output data in Latex. """ +from __future__ import annotations + from abc import ( ABC, abstractmethod, ) from typing import ( Iterator, - List, - Optional, Sequence, - Tuple, - Type, - Union, ) import numpy as np @@ -23,8 +20,8 @@ def _split_into_full_short_caption( - caption: Optional[Union[str, Tuple[str, str]]] -) -> Tuple[str, str]: + caption: str | tuple[str, str] | None +) -> tuple[str, str]: """Extract full and short captions from caption string/tuple. Parameters @@ -75,7 +72,7 @@ def __init__( self, formatter: DataFrameFormatter, multicolumn: bool = False, - multicolumn_format: Optional[str] = None, + multicolumn_format: str | None = None, multirow: bool = False, ): self.fmt = formatter @@ -83,7 +80,7 @@ def __init__( self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow - self.clinebuf: List[List[int]] = [] + self.clinebuf: list[list[int]] = [] self.strcols = self._get_strcols() self.strrows = list(zip(*self.strcols)) @@ -140,7 +137,7 @@ def header_levels(self) -> int: nlevels += 1 return nlevels - def _get_strcols(self) -> List[List[str]]: + def _get_strcols(self) -> list[list[str]]: """String representation of the columns.""" if self.fmt.frame.empty: strcols = [[self._empty_info_line]] @@ -188,7 +185,7 @@ def _empty_info_line(self): f"Index: {self.frame.index}" ) - def _preprocess_row(self, row: Sequence[str]) -> List[str]: + def _preprocess_row(self, row: Sequence[str]) -> list[str]: """Preprocess elements of the row.""" if self.fmt.escape: crow = _escape_symbols(row) @@ -198,7 +195,7 @@ def _preprocess_row(self, row: Sequence[str]) -> List[str]: crow = _convert_to_bold(crow, self.index_levels) return crow - def _format_multicolumn(self, row: List[str]) -> List[str]: + def _format_multicolumn(self, row: list[str]) -> list[str]: r""" Combine columns belonging to a group to a single multicolumn entry according to self.multicolumn_format @@ -238,7 +235,7 @@ def append_col(): append_col() return row2 - def _format_multirow(self, row: List[str], i: int) -> List[str]: + def _format_multirow(self, row: list[str], i: int) -> list[str]: r""" Check following rows, whether row should be a multirow @@ -331,14 +328,14 @@ class TableBuilderAbstract(ABC): def __init__( self, formatter: DataFrameFormatter, - column_format: Optional[str] = None, + column_format: str | None = None, multicolumn: bool = False, - multicolumn_format: Optional[str] = None, + multicolumn_format: str | None = None, multirow: bool = False, - caption: Optional[str] = None, - short_caption: Optional[str] = None, - label: Optional[str] = None, - position: Optional[str] = None, + caption: str | None = None, + short_caption: str | None = None, + label: str | None = None, + position: str | None = None, ): self.fmt = formatter self.column_format = column_format @@ -477,7 +474,7 @@ def _create_row_iterator(self, over: str) -> RowStringIterator: multirow=self.multirow, ) - def _select_iterator(self, over: str) -> Type[RowStringIterator]: + def _select_iterator(self, over: str) -> type[RowStringIterator]: """Select proper iterator over table rows.""" if over == "header": return RowHeaderIterator @@ -696,13 +693,13 @@ def __init__( self, formatter: DataFrameFormatter, longtable: bool = False, - column_format: Optional[str] = 
None, + column_format: str | None = None, multicolumn: bool = False, - multicolumn_format: Optional[str] = None, + multicolumn_format: str | None = None, multirow: bool = False, - caption: Optional[Union[str, Tuple[str, str]]] = None, - label: Optional[str] = None, - position: Optional[str] = None, + caption: str | tuple[str, str] | None = None, + label: str | None = None, + position: str | None = None, ): self.fmt = formatter self.frame = self.fmt.frame @@ -743,7 +740,7 @@ def builder(self) -> TableBuilderAbstract: position=self.position, ) - def _select_builder(self) -> Type[TableBuilderAbstract]: + def _select_builder(self) -> type[TableBuilderAbstract]: """Select proper table builder.""" if self.longtable: return LongTableBuilder @@ -752,12 +749,12 @@ def _select_builder(self) -> Type[TableBuilderAbstract]: return TabularBuilder @property - def column_format(self) -> Optional[str]: + def column_format(self) -> str | None: """Column format.""" return self._column_format @column_format.setter - def column_format(self, input_column_format: Optional[str]) -> None: + def column_format(self, input_column_format: str | None) -> None: """Setter for column format.""" if input_column_format is None: self._column_format = ( @@ -790,7 +787,7 @@ def _get_index_format(self) -> str: return "l" * self.frame.index.nlevels if self.fmt.index else "" -def _escape_symbols(row: Sequence[str]) -> List[str]: +def _escape_symbols(row: Sequence[str]) -> list[str]: """Carry out string replacements for special symbols. Parameters @@ -822,7 +819,7 @@ def _escape_symbols(row: Sequence[str]) -> List[str]: ] -def _convert_to_bold(crow: Sequence[str], ilevels: int) -> List[str]: +def _convert_to_bold(crow: Sequence[str], ilevels: int) -> list[str]: """Convert elements in ``crow`` to bold.""" return [ f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index cbc407c2624f2..ac81fffcf353a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -1,6 +1,7 @@ """ Printing tools. """ +from __future__ import annotations import sys from typing import ( @@ -8,12 +9,9 @@ Callable, Dict, Iterable, - List, Mapping, - Optional, Sequence, Sized, - Tuple, TypeVar, Union, ) @@ -27,7 +25,7 @@ _VT = TypeVar("_VT") -def adjoin(space: int, *lists: List[str], **kwargs) -> str: +def adjoin(space: int, *lists: list[str], **kwargs) -> str: """ Glues together two sets of strings using the amount of space requested. The idea is to prettify. @@ -62,7 +60,7 @@ def adjoin(space: int, *lists: List[str], **kwargs) -> str: return "\n".join(out_lines) -def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> List[str]: +def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: """ Perform ljust, center, rjust against string or list-like """ @@ -99,7 +97,7 @@ def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> List[str def _pprint_seq( - seq: Sequence, _nest_lvl: int = 0, max_seq_items: Optional[int] = None, **kwds + seq: Sequence, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds ) -> str: """ internal. pprinter for iterables. you should probably use pprint_thing() @@ -134,7 +132,7 @@ def _pprint_seq( def _pprint_dict( - seq: Mapping, _nest_lvl: int = 0, max_seq_items: Optional[int] = None, **kwds + seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds ) -> str: """ internal. pprinter for iterables. 
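The caption handling being retyped above surfaces through `DataFrame.to_latex`, where `caption` may be a plain string or a `(full_caption, short_caption)` tuple that `_split_into_full_short_caption` unpacks. A short sketch with invented captions and label:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
print(
    df.to_latex(
        caption=("A full caption placed above the table", "Short caption"),
        label="tab:example",
        position="ht",
    )
)
```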
you should probably use pprint_thing() @@ -167,10 +165,10 @@ def _pprint_dict( def pprint_thing( thing: Any, _nest_lvl: int = 0, - escape_chars: Optional[EscapeChars] = None, + escape_chars: EscapeChars | None = None, default_escapes: bool = False, quote_strings: bool = False, - max_seq_items: Optional[int] = None, + max_seq_items: int | None = None, ) -> str: """ This function is the sanctioned way of converting objects @@ -196,7 +194,7 @@ def pprint_thing( """ def as_escaped_string( - thing: Any, escape_chars: Optional[EscapeChars] = escape_chars + thing: Any, escape_chars: EscapeChars | None = escape_chars ) -> str: translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): @@ -277,7 +275,7 @@ class TableSchemaFormatter(BaseFormatter): formatters[mimetype].enabled = False -def default_pprint(thing: Any, max_seq_items: Optional[int] = None) -> str: +def default_pprint(thing: Any, max_seq_items: int | None = None) -> str: return pprint_thing( thing, escape_chars=("\t", "\r", "\n"), @@ -290,7 +288,7 @@ def format_object_summary( obj, formatter: Callable, is_justify: bool = True, - name: Optional[str] = None, + name: str | None = None, indent_for_name: bool = True, line_break_each_value: bool = False, ) -> str: @@ -355,7 +353,7 @@ def format_object_summary( def _extend_line( s: str, line: str, value: str, display_width: int, next_line_prefix: str - ) -> Tuple[str, str]: + ) -> tuple[str, str]: if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: s += line.rstrip() @@ -363,7 +361,7 @@ def _extend_line( line += value return s, line - def best_len(values: List[str]) -> int: + def best_len(values: list[str]) -> int: if values: return max(adj.len(x) for x in values) else: @@ -463,8 +461,8 @@ def best_len(values: List[str]) -> int: def _justify( - head: List[Sequence[str]], tail: List[Sequence[str]] -) -> Tuple[List[Tuple[str, ...]], List[Tuple[str, ...]]]: + head: list[Sequence[str]], tail: list[Sequence[str]] +) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]: """ Justify items in head and tail, so they are right-aligned when stacked. @@ -509,7 +507,7 @@ def _justify( def format_object_attrs( obj: Sized, include_dtype: bool = True -) -> List[Tuple[str, Union[str, int]]]: +) -> list[tuple[str, str | int]]: """ Return a list of tuples of the (attr, formatted_value) for common attrs, including dtype, name, length @@ -526,7 +524,7 @@ def format_object_attrs( list of 2-tuple """ - attrs: List[Tuple[str, Union[str, int]]] = [] + attrs: list[tuple[str, str | int]] = [] if hasattr(obj, "dtype") and include_dtype: # error: "Sized" has no attribute "dtype" attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined] diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 20fc84a4df303..2610b7777207f 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -1,12 +1,10 @@ """ Module for formatting output data in console (to string). 
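Almost every hunk in this patch is the same mechanical rewrite: `typing.Optional`/`Dict`/`List`/`Tuple` become builtin generics and PEP 604 unions, kept valid on Python 3.7/3.8 by the postponed-evaluation import. A compact before/after sketch (function names invented for illustration):

```python
from __future__ import annotations  # annotations are stored as strings, so 3.7+ is fine

from typing import Dict, Optional  # old spellings, kept only for the "before" example


def old_style(sizes: Optional[Dict[str, int]] = None) -> Optional[Dict[str, int]]:
    return sizes


def new_style(sizes: dict[str, int] | None = None) -> dict[str, int] | None:
    return sizes
```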
""" +from __future__ import annotations + from shutil import get_terminal_size -from typing import ( - Iterable, - List, - Optional, -) +from typing import Iterable import numpy as np @@ -17,7 +15,7 @@ class StringFormatter: """Formatter for string representation of a dataframe.""" - def __init__(self, fmt: DataFrameFormatter, line_width: Optional[int] = None): + def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None): self.fmt = fmt self.adj = fmt.adj self.frame = fmt.frame @@ -29,7 +27,7 @@ def to_string(self) -> str: text = "".join([text, self.fmt.dimensions_info]) return text - def _get_strcols(self) -> List[List[str]]: + def _get_strcols(self) -> list[list[str]]: strcols = self.fmt.get_strcols() if self.fmt.is_truncated: strcols = self._insert_dot_separators(strcols) @@ -62,7 +60,7 @@ def _empty_info_line(self) -> str: def _need_to_wrap_around(self) -> bool: return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) - def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: + def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]: str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) index_length = len(str_index) @@ -79,14 +77,14 @@ def _adjusted_tr_col_num(self) -> int: return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num def _insert_dot_separator_horizontal( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: + self, strcols: list[list[str]], index_length: int + ) -> list[list[str]]: strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length) return strcols def _insert_dot_separator_vertical( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: + self, strcols: list[list[str]], index_length: int + ) -> list[list[str]]: n_header_rows = index_length - len(self.fmt.tr_frame) row_num = self.fmt.tr_row_num for ix, col in enumerate(strcols): @@ -114,7 +112,7 @@ def _insert_dot_separator_vertical( col.insert(row_num + n_header_rows, dot_str) return strcols - def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: + def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str: lwidth = self.line_width adjoin_width = 1 strcols = list(strcols_input) @@ -161,7 +159,7 @@ def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: start = end return "\n\n".join(str_lst) - def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: + def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str: from pandas import Series lines = self.adj.adjoin(1, *strcols).split("\n") @@ -197,7 +195,7 @@ def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: return self.adj.adjoin(1, *strcols) -def _binify(cols: List[int], line_width: int) -> List[int]: +def _binify(cols: list[int], line_width: int) -> list[int]: adjoin_width = 1 bins = [] curr_width = 0 diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 7b88d53dd7f4e..93c3843b36846 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -126,7 +126,7 @@ class Styler(StylerRenderer): ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with LaTeX-safe sequences. - ... versionadded:: 1.3.0 + .. versionadded:: 1.3.0 Attributes ---------- @@ -1597,9 +1597,6 @@ def background_gradient( Luminance threshold for determining text color in [0, 1]. Facilitates text visibility across varying background colors. All text is dark if 0, and light if 1, defaults to 0.408. - - .. 
versionadded:: 0.24.0 - vmin : float, optional Minimum data value that corresponds to colormap minimum value. If not specified the minimum value of the data (or gmap) will be used. @@ -1880,16 +1877,11 @@ def bar( Minimum bar value, defining the left hand limit of the bar drawing range, lower values are clipped to `vmin`. When None (default): the minimum value of the data will be used. - - .. versionadded:: 0.24.0 - vmax : float, optional Maximum bar value, defining the right hand limit of the bar drawing range, higher values are clipped to `vmax`. When None (default): the maximum value of the data will be used. - .. versionadded:: 0.24.0 - Returns ------- self : Styler @@ -2323,8 +2315,6 @@ def pipe(self, func: Callable, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``, and return the result. - .. versionadded:: 0.24.0 - Parameters ---------- func : function diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index c9dc87ec0588b..5be6ae0382d87 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -1,16 +1,11 @@ """ :mod:`pandas.io.formats.xml` is a module for formatting data in XML. """ +from __future__ import annotations import codecs import io -from typing import ( - Any, - Dict, - List, - Optional, - Union, -) +from typing import Any from pandas._typing import ( CompressionOptions, @@ -95,19 +90,19 @@ class BaseXMLFormatter: def __init__( self, frame: DataFrame, - path_or_buffer: Optional[FilePathOrBuffer] = None, - index: Optional[bool] = True, - root_name: Optional[str] = "data", - row_name: Optional[str] = "row", - na_rep: Optional[str] = None, - attr_cols: Optional[List[str]] = None, - elem_cols: Optional[List[str]] = None, - namespaces: Optional[Dict[Optional[str], str]] = None, - prefix: Optional[str] = None, + path_or_buffer: FilePathOrBuffer | None = None, + index: bool | None = True, + root_name: str | None = "data", + row_name: str | None = "row", + na_rep: str | None = None, + attr_cols: list[str] | None = None, + elem_cols: list[str] | None = None, + namespaces: dict[str | None, str] | None = None, + prefix: str | None = None, encoding: str = "utf-8", - xml_declaration: Optional[bool] = True, - pretty_print: Optional[bool] = True, - stylesheet: Optional[FilePathOrBuffer] = None, + xml_declaration: bool | None = True, + pretty_print: bool | None = True, + stylesheet: FilePathOrBuffer | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> None: @@ -175,7 +170,7 @@ def validate_encoding(self) -> None: codecs.lookup(self.encoding) - def process_dataframe(self) -> Dict[Union[int, str], Dict[str, Any]]: + def process_dataframe(self) -> dict[int | str, dict[str, Any]]: """ Adjust Data Frame to fit xml output. @@ -200,7 +195,7 @@ def handle_indexes(self) -> None: This method will add indexes into attr_cols or elem_cols. """ - indexes: List[str] = [ + indexes: list[str] = [ x for x in self.frame_dicts[0].keys() if x not in self.orig_cols ] @@ -233,7 +228,7 @@ def other_namespaces(self) -> dict: prefix. 
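The constructor arguments retyped here map onto `DataFrame.to_xml`, new in 1.3.0. A minimal sketch; the frame contents and element names are invented, and the default parser requires lxml (pass `parser="etree"` to stay on the standard library):

```python
import pandas as pd

df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4.0, float("nan")]})
print(
    df.to_xml(
        index=False,
        root_name="shapes",
        row_name="shape",
        na_rep="missing",      # rendered for the NaN in "sides"
        attr_cols=["shape"],   # written as XML attributes
        elem_cols=["sides"],   # written as child elements
        parser="etree",
    )
)
```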
""" - nmsp_dict: Dict[str, str] = {} + nmsp_dict: dict[str, str] = {} if self.namespaces and self.prefix is None: nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p != ""} @@ -262,10 +257,10 @@ def build_elems(self) -> None: raise AbstractMethodError(self) - def write_output(self) -> Optional[str]: + def write_output(self) -> str | None: xml_doc = self.build_tree() - out_str: Optional[str] + out_str: str | None if self.path_or_buffer is not None: with get_handle( diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 0d7b7893aa496..77ad40bac9319 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -87,8 +87,6 @@ def read_gbq( compliant with the SQL 2011 standard. For more information see `BigQuery Standard SQL Reference `__. - - .. versionchanged:: 0.24.0 location : str, optional Location where the query job should run. See the `BigQuery locations documentation @@ -112,8 +110,6 @@ def read_gbq( :class:`google.oauth2.service_account.Credentials` directly. *New in version 0.8.0 of pandas-gbq*. - - .. versionadded:: 0.24.0 use_bqstorage_api : bool, default False Use the `BigQuery Storage API `__ to diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 259850e9a7233..77582c46977c1 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ( ABC, abstractmethod, @@ -10,10 +12,6 @@ Any, Callable, Mapping, - Optional, - Tuple, - Type, - Union, ) import numpy as np @@ -78,12 +76,12 @@ def to_json( path_or_buf, obj: NDFrame, - orient: Optional[str] = None, + orient: str | None = None, date_format: str = "epoch", double_precision: int = 10, force_ascii: bool = True, date_unit: str = "ms", - default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool = False, compression: CompressionOptions = "infer", index: bool = True, @@ -102,7 +100,7 @@ def to_json( if orient == "table" and isinstance(obj, Series): obj = obj.to_frame(name=obj.name or "values") - writer: Type[Writer] + writer: type[Writer] if orient == "table" and isinstance(obj, DataFrame): writer = JSONTableWriter elif isinstance(obj, Series): @@ -143,13 +141,13 @@ class Writer(ABC): def __init__( self, obj, - orient: Optional[str], + orient: str | None, date_format: str, double_precision: int, ensure_ascii: bool, date_unit: str, index: bool, - default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + default_handler: Callable[[Any], JSONSerializable] | None = None, indent: int = 0, ): self.obj = obj @@ -187,7 +185,7 @@ def write(self): @property @abstractmethod - def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: """Object to write in JSON format.""" pass @@ -196,7 +194,7 @@ class SeriesWriter(Writer): _default_orient = "index" @property - def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: if not self.index and self.orient == "split": return {"name": self.obj.name, "data": self.obj.values} else: @@ -211,7 +209,7 @@ class FrameWriter(Writer): _default_orient = "columns" @property - def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: if not self.index and self.orient == "split": obj_to_write = self.obj.to_dict(orient="split") del obj_to_write["index"] @@ -243,13 +241,13 @@ class 
JSONTableWriter(FrameWriter): def __init__( self, obj, - orient: Optional[str], + orient: str | None, date_format: str, double_precision: int, ensure_ascii: bool, date_unit: str, index: bool, - default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + default_handler: Callable[[Any], JSONSerializable] | None = None, indent: int = 0, ): """ @@ -313,7 +311,7 @@ def __init__( self.index = index @property - def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: return {"schema": self.schema, "data": self.obj} @@ -326,7 +324,7 @@ def read_json( path_or_buf=None, orient=None, typ="frame", - dtype: Optional[DtypeArg] = None, + dtype: DtypeArg | None = None, convert_axes=None, convert_dates=True, keep_default_dates: bool = True, @@ -334,11 +332,11 @@ def read_json( precise_float: bool = False, date_unit=None, encoding=None, - encoding_errors: Optional[str] = "strict", + encoding_errors: str | None = "strict", lines: bool = False, - chunksize: Optional[int] = None, + chunksize: int | None = None, compression: CompressionOptions = "infer", - nrows: Optional[int] = None, + nrows: int | None = None, storage_options: StorageOptions = None, ): """ @@ -461,7 +459,7 @@ def read_json( How encoding errors are treated. `List of possible values `_ . - .. versionadded:: 1.3 + .. versionadded:: 1.3.0 lines : bool, default False Read the file as a json object per line. @@ -639,11 +637,11 @@ def __init__( date_unit, encoding, lines: bool, - chunksize: Optional[int], + chunksize: int | None, compression: CompressionOptions, - nrows: Optional[int], + nrows: int | None, storage_options: StorageOptions = None, - encoding_errors: Optional[str] = "strict", + encoding_errors: str | None = "strict", ): self.orient = orient @@ -663,7 +661,7 @@ def __init__( self.nrows_seen = 0 self.nrows = nrows self.encoding_errors = encoding_errors - self.handles: Optional[IOHandles] = None + self.handles: IOHandles | None = None if self.chunksize is not None: self.chunksize = validate_integer("chunksize", self.chunksize, 1) @@ -816,7 +814,7 @@ def __exit__(self, exc_type, exc_value, traceback): class Parser: - _split_keys: Tuple[str, ...] + _split_keys: tuple[str, ...] 
+ _split_keys: tuple[str, ...]
_default_orient: str _STAMP_UNITS = ("s", "ms", "us", "ns") @@ -831,7 +829,7 @@ def __init__( self, json, orient, - dtype: Optional[DtypeArg] = None, + dtype: DtypeArg | None = None, convert_axes=True, convert_dates=True, keep_default_dates=False, @@ -865,7 +863,7 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj: Optional[FrameOrSeriesUnion] = None + self.obj: FrameOrSeriesUnion | None = None def check_keys_split(self, decoded): """ diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 87ea109c20f43..60b2489005f48 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -3,11 +3,11 @@ https://specs.frictionlessdata.io/json-table-schema/ """ +from __future__ import annotations + from typing import ( TYPE_CHECKING, Any, - Dict, - Optional, cast, ) import warnings @@ -117,7 +117,7 @@ def convert_pandas_type_to_json_field(arr): name = "values" else: name = arr.name - field: Dict[str, JSONSerializable] = { + field: dict[str, JSONSerializable] = { "name": name, "type": as_json_table_type(dtype), } @@ -206,9 +206,9 @@ def convert_json_field_to_pandas_type(field): def build_table_schema( data: FrameOrSeries, index: bool = True, - primary_key: Optional[bool] = None, + primary_key: bool | None = None, version: bool = True, -) -> Dict[str, JSONSerializable]: +) -> dict[str, JSONSerializable]: """ Create a Table schema from ``data``. @@ -260,7 +260,7 @@ def build_table_schema( if index is True: data = set_default_names(data) - schema: Dict[str, Any] = {} + schema: dict[str, Any] = {} fields = [] if index: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index bccf3c3f1011b..b7523fada07d0 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -35,7 +35,7 @@ def get_engine(engine: str) -> BaseImpl: - """ return our implementation """ + """return our implementation""" if engine == "auto": engine = get_option("io.parquet.engine") @@ -388,16 +388,10 @@ def to_parquet( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - - .. versionadded:: 0.24.0 - partition_cols : str or list, optional, default None Column names by which to partition the dataset. Columns are partitioned in the order they are given. Must be None if path is not a string. - - .. versionadded:: 0.24.0 - {storage_options} .. 
versionadded:: 1.2.0 diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 3635d5b32faf4..670868c6f4261 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -18,10 +18,7 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import ( - FilePathOrBuffer, - Union, -) +from pandas._typing import FilePathOrBuffer from pandas.errors import ( EmptyDataError, ParserError, @@ -42,7 +39,7 @@ class PythonParser(ParserBase): - def __init__(self, f: Union[FilePathOrBuffer, list], **kwds): + def __init__(self, f: FilePathOrBuffer | list, **kwds): """ Workhorse function for processing nested list into DataFrame """ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a384846b7a063..06bdbe3054a15 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1,18 +1,13 @@ """ Module contains tools for processing files into DataFrames or other objects """ +from __future__ import annotations + from collections import abc import csv import sys from textwrap import fill -from typing import ( - Any, - Dict, - List, - Optional, - Set, - Type, -) +from typing import Any import warnings import numpy as np @@ -24,7 +19,6 @@ DtypeArg, FilePathOrBuffer, StorageOptions, - Union, ) from pandas.errors import ( AbstractMethodError, @@ -308,7 +302,7 @@ ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. This behavior was previously only the case for ``engine="python"``. - .. versionchanged:: 1.3 + .. versionchanged:: 1.3.0 ``encoding_errors`` is a new argument. ``encoding`` has no longer an influence on how encoding errors are handled. @@ -317,7 +311,7 @@ How encoding errors are treated. `List of possible values `_ . - .. versionadded:: 1.3 + .. versionadded:: 1.3.0 dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the @@ -331,14 +325,14 @@ If False, then these "bad lines" will be dropped from the DataFrame that is returned. - .. deprecated:: 1.3 + .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. warn_bad_lines : bool, default ``None`` If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. - .. deprecated:: 1.3 + .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' @@ -349,7 +343,7 @@ - 'warn', raise a warning when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. - .. versionadded:: 1.3 + .. versionadded:: 1.3.0 delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. 
``' '`` or ``'\t'``) will be @@ -413,8 +407,8 @@ _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None} -_deprecated_args: Set[str] = {"error_bad_lines", "warn_bad_lines"} +_deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None} +_deprecated_args: set[str] = {"error_bad_lines", "warn_bad_lines"} def validate_integer(name, val, min_val=0): @@ -518,7 +512,7 @@ def read_csv( prefix=lib.no_default, mangle_dupe_cols=True, # General Parsing Configuration - dtype: Optional[DtypeArg] = None, + dtype: DtypeArg | None = None, engine=None, converters=None, true_values=None, @@ -554,7 +548,7 @@ def read_csv( escapechar=None, comment=None, encoding=None, - encoding_errors: Optional[str] = "strict", + encoding_errors: str | None = "strict", dialect=None, # Error Handling error_bad_lines=None, @@ -616,7 +610,7 @@ def read_table( prefix=lib.no_default, mangle_dupe_cols=True, # General Parsing Configuration - dtype: Optional[DtypeArg] = None, + dtype: DtypeArg | None = None, engine=None, converters=None, true_values=None, @@ -659,7 +653,7 @@ def read_table( # TODO (2.0): set on_bad_lines to "error". # See _refine_defaults_read comment for why we do this. on_bad_lines=None, - encoding_errors: Optional[str] = "strict", + encoding_errors: str | None = "strict", # Internal delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], @@ -731,8 +725,6 @@ def read_fwf( infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. - - .. versionadded:: 0.24.0 **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. @@ -825,7 +817,7 @@ def _get_options_with_defaults(self, engine): kwds = self.orig_options options = {} - default: Optional[object] + default: object | None for argname, default in parser_defaults.items(): value = kwds.get(argname, default) @@ -1035,7 +1027,7 @@ def __next__(self): raise def _make_engine(self, engine="c"): - mapping: Dict[str, Type[ParserBase]] = { + mapping: dict[str, type[ParserBase]] = { "c": CParserWrapper, "python": PythonParser, "python-fwf": FixedWidthFieldParser, @@ -1149,7 +1141,7 @@ def TextParser(*args, **kwds): def _clean_na_values(na_values, keep_default_na=True): - na_fvalues: Union[Set, Dict] + na_fvalues: set | dict if na_values is None: if keep_default_na: na_values = STR_NA_VALUES @@ -1199,8 +1191,8 @@ def _floatify_na_values(na_values): def _stringify_na_values(na_values): - """ return a stringified and numeric for these values """ - result: List[Union[int, str, float]] = [] + """return a stringified and numeric for these values""" + result: list[int | str | float] = [] for x in na_values: result.append(str(x)) result.append(x) @@ -1224,17 +1216,17 @@ def _stringify_na_values(na_values): def _refine_defaults_read( - dialect: Union[str, csv.Dialect], - delimiter: Union[str, object], + dialect: str | csv.Dialect, + delimiter: str | object, delim_whitespace: bool, engine: str, - sep: Union[str, object], - error_bad_lines: Optional[bool], - warn_bad_lines: Optional[bool], - on_bad_lines: Optional[str], - names: Union[Optional[ArrayLike], object], - prefix: Union[Optional[str], object], - defaults: Dict[str, Any], + sep: str | object, + error_bad_lines: bool | None, + warn_bad_lines: bool | None, + on_bad_lines: str | None, + names: ArrayLike | None | object, + prefix: str | None | object, + defaults: dict[str, Any], ): """Validate/refine 
default values of input parameters of read_csv, read_table. @@ -1289,7 +1281,7 @@ def _refine_defaults_read( """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] - kwds: Dict[str, Any] = {} + kwds: dict[str, Any] = {} # gh-23761 # # When a dialect is passed, it overrides any of the overlapping @@ -1383,7 +1375,7 @@ def _refine_defaults_read( return kwds -def _extract_dialect(kwds: Dict[str, Any]) -> Optional[csv.Dialect]: +def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: """ Extract concrete csv dialect instance. @@ -1429,8 +1421,8 @@ def _validate_dialect(dialect: csv.Dialect) -> None: def _merge_with_dialect_properties( dialect: csv.Dialect, - defaults: Dict[str, Any], -) -> Dict[str, Any]: + defaults: dict[str, Any], +) -> dict[str, Any]: """ Merge default kwargs in TextFileReader with dialect parameters. @@ -1479,7 +1471,7 @@ def _merge_with_dialect_properties( return kwds -def _validate_skipfooter(kwds: Dict[str, Any]) -> None: +def _validate_skipfooter(kwds: dict[str, Any]) -> None: """ Check whether skipfooter is compatible with other kwargs in TextFileReader. diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b32eb9e308780..208b8a008ffe6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,7 +86,10 @@ ) from pandas.core.construction import extract_array from pandas.core.indexes.api import ensure_index -from pandas.core.internals import BlockManager +from pandas.core.internals import ( + ArrayManager, + BlockManager, +) from pandas.io.common import stringify_path from pandas.io.formats.printing import ( @@ -112,7 +115,7 @@ def _ensure_decoded(s): - """ if we have bytes, decode them to unicode """ + """if we have bytes, decode them to unicode""" if isinstance(s, np.bytes_): s = s.decode("UTF-8") return s @@ -274,7 +277,7 @@ def to_hdf( errors: str = "strict", encoding: str = "UTF-8", ): - """ store this object, close it if we opened it """ + """store this object, close it if we opened it""" if append: f = lambda store: store.append( key, @@ -592,7 +595,7 @@ def __fspath__(self): @property def root(self): - """ return the root node """ + """return the root node""" self._check_if_open() assert self._handle is not None # for mypy return self._handle.root @@ -611,7 +614,7 @@ def __delitem__(self, key: str): return self.remove(key) def __getattr__(self, name: str): - """ allow attribute access to get stores """ + """allow attribute access to get stores""" try: return self.get(name) except (KeyError, ClosedFileError): @@ -1453,8 +1456,6 @@ def walk(self, where="/"): child groups (following an alphanumerical order) is also traversed, following the same procedure. - .. 
versionadded:: 0.24.0 - Parameters ---------- where : str, default "/" @@ -1491,7 +1492,7 @@ def walk(self, where="/"): yield (g._v_pathname.rstrip("/"), groups, leaves) def get_node(self, key: str) -> Node | None: - """ return the node with the key or None if it does not exist """ + """return the node with the key or None if it does not exist""" self._check_if_open() if not key.startswith("/"): key = "/" + key @@ -1507,7 +1508,7 @@ def get_node(self, key: str) -> Node | None: return node def get_storer(self, key: str) -> GenericFixed | Table: - """ return the storer object for a key, raise if not in the file """ + """return the storer object for a key, raise if not in the file""" group = self.get_node(key) if group is None: raise KeyError(f"No object named {key} in the file") @@ -1624,7 +1625,7 @@ def _check_if_open(self): raise ClosedFileError(f"{self._path} file is not open!") def _validate_format(self, format: str) -> str: - """ validate / deprecate formats """ + """validate / deprecate formats""" # validate try: format = _FORMAT_MAP[format.lower()] @@ -1641,7 +1642,7 @@ def _create_storer( encoding: str = "UTF-8", errors: str = "strict", ) -> GenericFixed | Table: - """ return a suitable class to operate """ + """return a suitable class to operate""" cls: type[GenericFixed] | type[Table] if value is not None and not isinstance(value, (Series, DataFrame)): @@ -2019,7 +2020,7 @@ def kind_attr(self) -> str: return f"{self.name}_kind" def set_pos(self, pos: int): - """ set the position of this column in the Table """ + """set the position of this column in the Table""" self.pos = pos if pos is not None and self.typ is not None: self.typ._v_pos = pos @@ -2036,7 +2037,7 @@ def __repr__(self) -> str: ) def __eq__(self, other: Any) -> bool: - """ compare 2 col items """ + """compare 2 col items""" return all( getattr(self, a, None) == getattr(other, a, None) for a in ["name", "cname", "axis", "pos"] @@ -2047,7 +2048,7 @@ def __ne__(self, other) -> bool: @property def is_indexed(self) -> bool: - """ return whether I am an indexed column """ + """return whether I am an indexed column""" if not hasattr(self.table, "cols"): # e.g. if infer hasn't been called yet, self.table will be None. 
return False @@ -2092,7 +2093,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): return new_pd_index, new_pd_index def take_data(self): - """ return the values""" + """return the values""" return self.values @property @@ -2105,12 +2106,12 @@ def description(self): @property def col(self): - """ return my current col description """ + """return my current col description""" return getattr(self.description, self.cname, None) @property def cvalues(self): - """ return my cython values """ + """return my cython values""" return self.values def __iter__(self): @@ -2141,7 +2142,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool): self.set_attr() def validate_col(self, itemsize=None): - """ validate this column: return the compared against itemsize """ + """validate this column: return the compared against itemsize""" # validate this column for string truncation (or reset to the max size) if _ensure_decoded(self.kind) == "string": c = self.col @@ -2200,17 +2201,17 @@ def update_info(self, info): idx[key] = value def set_info(self, info): - """ set my state from the passed info """ + """set my state from the passed info""" idx = info.get(self.name) if idx is not None: self.__dict__.update(idx) def set_attr(self): - """ set the kind for this column """ + """set the kind for this column""" setattr(self.attrs, self.kind_attr, self.kind) def validate_metadata(self, handler: AppendableTable): - """ validate that kind=category does not change the categories """ + """validate that kind=category does not change the categories""" if self.meta == "category": new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) @@ -2225,13 +2226,13 @@ def validate_metadata(self, handler: AppendableTable): ) def write_metadata(self, handler: AppendableTable): - """ set the meta data """ + """set the meta data""" if self.metadata is not None: handler.write_metadata(self.cname, self.metadata) class GenericIndexCol(IndexCol): - """ an index which is not represented in the data of the table """ + """an index which is not represented in the data of the table""" @property def is_indexed(self) -> bool: @@ -2330,7 +2331,7 @@ def __repr__(self) -> str: ) def __eq__(self, other: Any) -> bool: - """ compare 2 col items """ + """compare 2 col items""" return all( getattr(self, a, None) == getattr(other, a, None) for a in ["name", "cname", "dtype", "pos"] @@ -2347,7 +2348,7 @@ def set_data(self, data: ArrayLike): self.kind = _dtype_to_kind(dtype_name) def take_data(self): - """ return the data """ + """return the data""" return self.data @classmethod @@ -2388,7 +2389,7 @@ def get_atom_string(cls, shape, itemsize): @classmethod def get_atom_coltype(cls, kind: str) -> type[Col]: - """ return the PyTables column class for this column """ + """return the PyTables column class for this column""" if kind.startswith("uint"): k4 = kind[4:] col_name = f"UInt{k4}Col" @@ -2419,7 +2420,7 @@ def shape(self): @property def cvalues(self): - """ return my cython values """ + """return my cython values""" return self.data def validate_attr(self, append): @@ -2537,7 +2538,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): return self.values, converted def set_attr(self): - """ set the data for this column """ + """set the data for this column""" setattr(self.attrs, self.kind_attr, self.values) setattr(self.attrs, self.meta_attr, self.meta) assert self.dtype is not None @@ -2545,7 +2546,7 @@ def set_attr(self): class DataIndexableCol(DataCol): - """ 
represent a data column that can be indexed """ + """represent a data column that can be indexed""" is_data_indexable = True @@ -2572,7 +2573,7 @@ def get_atom_timedelta64(cls, shape): class GenericDataIndexableCol(DataIndexableCol): - """ represent a generic pytables data column """ + """represent a generic pytables data column""" pass @@ -2621,7 +2622,7 @@ def is_old_version(self) -> bool: @property def version(self) -> tuple[int, int, int]: - """ compute and set our version """ + """compute and set our version""" version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) try: version = tuple(int(x) for x in version.split(".")) @@ -2636,7 +2637,7 @@ def pandas_type(self): return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) def __repr__(self) -> str: - """ return a pretty representation of myself """ + """return a pretty representation of myself""" self.infer_axes() s = self.shape if s is not None: @@ -2647,7 +2648,7 @@ def __repr__(self) -> str: return self.pandas_type def set_object_info(self): - """ set my pandas type & version """ + """set my pandas type & version""" self.attrs.pandas_type = str(self.pandas_kind) self.attrs.pandas_version = str(_version) @@ -2684,16 +2685,16 @@ def attrs(self): return self.group._v_attrs def set_attrs(self): - """ set our object attributes """ + """set our object attributes""" pass def get_attrs(self): - """ get our object attributes """ + """get our object attributes""" pass @property def storable(self): - """ return my storable """ + """return my storable""" return self.group @property @@ -2705,13 +2706,13 @@ def nrows(self): return getattr(self.storable, "nrows", None) def validate(self, other): - """ validate against an existing storable """ + """validate against an existing storable""" if other is None: return return True def validate_version(self, where=None): - """ are we trying to operate on an old version? 
""" + """are we trying to operate on an old version?""" return True def infer_axes(self): @@ -2754,7 +2755,7 @@ def delete(self, where=None, start: int | None = None, stop: int | None = None): class GenericFixed(Fixed): - """ a generified fixed version """ + """a generified fixed version""" _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} _reverse_index_map = {v: k for k, v in _index_type_map.items()} @@ -2836,12 +2837,12 @@ def is_exists(self) -> bool: return True def set_attrs(self): - """ set our object attributes """ + """set our object attributes""" self.attrs.encoding = self.encoding self.attrs.errors = self.errors def get_attrs(self): - """ retrieve our attributes """ + """retrieve our attributes""" self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) for n in self.attributes: @@ -2851,7 +2852,7 @@ def write(self, obj, **kwargs): self.set_attrs() def read_array(self, key: str, start: int | None = None, stop: int | None = None): - """ read an array for the specified node (off of group """ + """read an array for the specified node (off of group""" import tables node = getattr(self.group, key) @@ -3008,7 +3009,7 @@ def read_index_node( return index def write_array_empty(self, key: str, value: ArrayLike): - """ write a 0-len array """ + """write a 0-len array""" # ugly hack for length 0 axes arr = np.empty((1,) * value.ndim) self._handle.create_array(self.group, key, arr) @@ -3208,6 +3209,11 @@ def read( def write(self, obj, **kwargs): super().write(obj, **kwargs) + + # TODO(ArrayManager) HDFStore relies on accessing the blocks + if isinstance(obj._mgr, ArrayManager): + obj = obj._as_manager("block") + data = obj._mgr if not data.is_consolidated(): data = data.consolidate() @@ -3296,7 +3302,7 @@ def table_type_short(self) -> str: return self.table_type.split("_")[0] def __repr__(self) -> str: - """ return a pretty representation of myself """ + """return a pretty representation of myself""" self.infer_axes() jdc = ",".join(self.data_columns) if len(self.data_columns) else "" dc = f",dc->[{jdc}]" @@ -3314,14 +3320,14 @@ def __repr__(self) -> str: ) def __getitem__(self, c: str): - """ return the axis for c """ + """return the axis for c""" for a in self.axes: if c == a.name: return a return None def validate(self, other): - """ validate against an existing table """ + """validate against an existing table""" if other is None: return @@ -3377,12 +3383,12 @@ def validate_multiindex( @property def nrows_expected(self) -> int: - """ based on our axes, compute the expected nrows """ + """based on our axes, compute the expected nrows""" return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property def is_exists(self) -> bool: - """ has this table been created """ + """has this table been created""" return "table" in self.group @property @@ -3391,7 +3397,7 @@ def storable(self): @property def table(self): - """ return the table group (this is my storable) """ + """return the table group (this is my storable)""" return self.storable @property @@ -3408,7 +3414,7 @@ def axes(self): @property def ncols(self) -> int: - """ the number of total columns in the values axes """ + """the number of total columns in the values axes""" return sum(len(a.values) for a in self.values_axes) @property @@ -3426,7 +3432,7 @@ def data_orientation(self): ) def queryables(self) -> dict[str, Any]: - """ return a dict of the kinds allowable columns for this object """ + """return a dict of the kinds 
allowable columns for this object""" # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here axis_names = {0: "index", 1: "columns"} @@ -3442,16 +3448,16 @@ def queryables(self) -> dict[str, Any]: return dict(d1 + d2 + d3) # type: ignore[operator] def index_cols(self): - """ return a list of my index cols """ + """return a list of my index cols""" # Note: each `i.cname` below is assured to be a str. return [(i.axis, i.cname) for i in self.index_axes] def values_cols(self) -> list[str]: - """ return a list of my values cols """ + """return a list of my values cols""" return [i.cname for i in self.values_axes] def _get_metadata_path(self, key: str) -> str: - """ return the metadata pathname for this key """ + """return the metadata pathname for this key""" group = self.group._v_pathname return f"{group}/meta/{key}/meta" @@ -3479,13 +3485,13 @@ def write_metadata(self, key: str, values: np.ndarray): ) def read_metadata(self, key: str): - """ return the meta data array for this key """ + """return the meta data array for this key""" if getattr(getattr(self.group, "meta", None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) return None def set_attrs(self): - """ set our table type & indexables """ + """set our table type & indexables""" self.attrs.table_type = str(self.table_type) self.attrs.index_cols = self.index_cols() self.attrs.values_cols = self.values_cols() @@ -3498,7 +3504,7 @@ def set_attrs(self): self.attrs.info = self.info def get_attrs(self): - """ retrieve our attributes """ + """retrieve our attributes""" self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] self.data_columns = getattr(self.attrs, "data_columns", None) or [] self.info = getattr(self.attrs, "info", None) or {} @@ -3510,7 +3516,7 @@ def get_attrs(self): self.values_axes = [a for a in self.indexables if not a.is_an_indexable] def validate_version(self, where=None): - """ are we trying to operate on an old version? 
""" + """are we trying to operate on an old version?""" if where is not None: if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: ws = incompatibility_doc % ".".join(str(x) for x in self.version) @@ -3540,7 +3546,7 @@ def validate_min_itemsize(self, min_itemsize): @cache_readonly def indexables(self): - """ create/cache the indexables if they don't exist """ + """create/cache the indexables if they don't exist""" _indexables = [] desc = self.description @@ -3732,7 +3738,7 @@ def _read_axes( @classmethod def get_object(cls, obj, transposed: bool): - """ return the data for this obj """ + """return the data for this obj""" return obj def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): @@ -4017,6 +4023,10 @@ def _get_blocks_and_items( ): # Helper to clarify non-state-altering parts of _create_axes + # TODO(ArrayManager) HDFStore relies on accessing the blocks + if isinstance(frame._mgr, ArrayManager): + frame = frame._as_manager("block") + def get_blk_items(mgr): return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] @@ -4067,7 +4077,7 @@ def get_blk_items(mgr): return blocks, blk_items def process_axes(self, obj, selection: Selection, columns=None): - """ process axes filters """ + """process axes filters""" # make a copy to avoid side effects if columns is not None: columns = list(columns) @@ -4131,7 +4141,7 @@ def create_description( fletcher32: bool, expectedrows: int | None, ) -> dict[str, Any]: - """ create the description of the table from the axes & values """ + """create the description of the table from the axes & values""" # provided expected rows if its passed if expectedrows is None: expectedrows = max(self.nrows_expected, 10000) @@ -4256,7 +4266,7 @@ def write(self, **kwargs): class AppendableTable(Table): - """ support the new appendable table formats """ + """support the new appendable table formats""" table_type = "appendable" @@ -4485,7 +4495,7 @@ def delete(self, where=None, start: int | None = None, stop: int | None = None): class AppendableFrameTable(AppendableTable): - """ support the new appendable table formats """ + """support the new appendable table formats""" pandas_kind = "frame_table" table_type = "appendable_frame" @@ -4498,7 +4508,7 @@ def is_transposed(self) -> bool: @classmethod def get_object(cls, obj, transposed: bool): - """ these are written transposed """ + """these are written transposed""" if transposed: obj = obj.T return obj @@ -4585,7 +4595,7 @@ def read( class AppendableSeriesTable(AppendableFrameTable): - """ support the new appendable table formats """ + """support the new appendable table formats""" pandas_kind = "series_table" table_type = "appendable_series" @@ -4601,7 +4611,7 @@ def get_object(cls, obj, transposed: bool): return obj def write(self, obj, data_columns=None, **kwargs): - """ we are going to write this as a frame table """ + """we are going to write this as a frame table""" if not isinstance(obj, DataFrame): name = obj.name or "values" obj = obj.to_frame(name) @@ -4634,13 +4644,13 @@ def read( class AppendableMultiSeriesTable(AppendableSeriesTable): - """ support the new appendable table formats """ + """support the new appendable table formats""" pandas_kind = "series_table" table_type = "appendable_multiseries" def write(self, obj, **kwargs): - """ we are going to write this as a frame table """ + """we are going to write this as a frame table""" name = obj.name or "values" newobj, self.levels = self.validate_multiindex(obj) assert isinstance(self.levels, list) # for mypy @@ 
-4651,7 +4661,7 @@ def write(self, obj, **kwargs): class GenericTable(AppendableFrameTable): - """ a table that read/writes the generic pytables table format """ + """a table that read/writes the generic pytables table format""" pandas_kind = "frame_table" table_type = "generic_table" @@ -4668,7 +4678,7 @@ def storable(self): return getattr(self.group, "table", None) or self.group def get_attrs(self): - """ retrieve our attributes """ + """retrieve our attributes""" self.non_index_axes = [] self.nan_rep = None self.levels = [] @@ -4679,7 +4689,7 @@ def get_attrs(self): @cache_readonly def indexables(self): - """ create the indexables from the table description """ + """create the indexables from the table description""" d = self.description # TODO: can we get a typ for this? AFAICT it is the only place @@ -4717,7 +4727,7 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): - """ a frame with a multi-index """ + """a frame with a multi-index""" table_type = "appendable_multiframe" obj_type = DataFrame @@ -4784,7 +4794,7 @@ def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataF def _get_tz(tz: tzinfo) -> str | tzinfo: - """ for a tz-aware type, return an encoded zone """ + """for a tz-aware type, return an encoded zone""" zone = timezones.get_timezone(tz) return zone @@ -5232,7 +5242,7 @@ def __init__( self.condition, self.filter = self.terms.evaluate() def generate(self, where): - """ where can be a : dict,list,tuple,string """ + """where can be a : dict,list,tuple,string""" if where is None: return None diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index f7e1c56cbb196..6ced3febd78f4 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -138,7 +138,7 @@ def _parse_date(datestr: str) -> datetime: - """ Given a date in xport format, return Python date. """ + """Given a date in xport format, return Python date.""" try: # e.g. "16FEB11:10:07:55" return datetime.strptime(datestr, "%d%b%y:%H:%M:%S") diff --git a/pandas/io/sql.py b/pandas/io/sql.py index a347e7a99be8b..b9d5b18b85e02 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -692,9 +692,6 @@ def to_sql( Details and a sample callable implementation can be found in the section :ref:`insert method `. - - .. versionadded:: 0.24.0 - engine : {'auto', 'sqlalchemy'}, default 'auto' SQL engine library to use. If 'auto', then the option ``io.sql.engine`` is used. The default ``io.sql.engine`` @@ -1354,7 +1351,7 @@ def insert_records( def get_engine(engine: str) -> BaseEngine: - """ return our implementation """ + """return our implementation""" if engine == "auto": engine = get_option("io.sql.engine") @@ -1739,9 +1736,6 @@ def to_sql( Details and a sample callable implementation can be found in the section :ref:`insert method `. - - .. versionadded:: 0.24.0 - engine : {'auto', 'sqlalchemy'}, default 'auto' SQL engine library to use. If 'auto', then the option ``io.sql.engine`` is used. The default ``io.sql.engine`` @@ -2202,8 +2196,6 @@ def to_sql( Details and a sample callable implementation can be found in the section :ref:`insert method `. - - .. 
versionadded:: 0.24.0 """ if dtype: if not is_dict_like(dtype): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1fef33558dd9a..ffaebb3c10ae2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1095,15 +1095,15 @@ def __init__( self._setup_dtype() def __enter__(self) -> StataReader: - """ enter context manager """ + """enter context manager""" return self def __exit__(self, exc_type, exc_value, traceback) -> None: - """ exit context manager """ + """exit context manager""" self.close() def close(self) -> None: - """ close the handle if its open """ + """close the handle if its open""" self.path_or_buf.close() def _set_encoding(self) -> None: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 5d3db13610845..00d87b707580d 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -430,7 +430,7 @@ def hist_frame( y : label or position, optional Allows plotting of one column versus another. If not specified, all numerical columns are used. - color : str, array_like, or dict, optional + color : str, array-like, or dict, optional The color for each of the DataFrame's columns. Possible values are: - A single color string referred to by name, RGB or RGBA code, @@ -1571,7 +1571,7 @@ def scatter(self, x, y, s=None, c=None, **kwargs): y : int or str The column name or column position to be used as vertical coordinates for each point. - s : str, scalar or array_like, optional + s : str, scalar or array-like, optional The size of each point. Possible values are: - A string with the name of the column to be used for marker's size. @@ -1584,7 +1584,7 @@ def scatter(self, x, y, s=None, c=None, **kwargs): .. versionchanged:: 1.1.0 - c : str, int or array_like, optional + c : str, int or array-like, optional The color of each point. Possible values are: - A single color string referred to by name, RGB or RGBA code, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 7c6a718b34e89..7e3bf0b224e0e 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import contextlib import datetime as pydt from datetime import ( @@ -6,13 +8,7 @@ tzinfo, ) import functools -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) +from typing import Any from dateutil.relativedelta import relativedelta import matplotlib.dates as dates @@ -169,7 +165,7 @@ def convert(value, unit, axis): return value @staticmethod - def axisinfo(unit, axis) -> Optional[units.AxisInfo]: + def axisinfo(unit, axis) -> units.AxisInfo | None: if unit != "time": return None @@ -319,7 +315,7 @@ def try_parse(values): return values @staticmethod - def axisinfo(unit: Optional[tzinfo], axis) -> units.AxisInfo: + def axisinfo(unit: tzinfo | None, axis) -> units.AxisInfo: """ Return the :class:`~matplotlib.units.AxisInfo` for *unit*. @@ -447,7 +443,7 @@ def autoscale(self): return self.nonsingular(vmin, vmax) -def _from_ordinal(x, tz: Optional[tzinfo] = None) -> datetime: +def _from_ordinal(x, tz: tzinfo | None = None) -> datetime: ix = int(x) dt = datetime.fromordinal(ix) remainder = float(x) - ix @@ -476,7 +472,7 @@ def _from_ordinal(x, tz: Optional[tzinfo] = None) -> datetime: # ------------------------------------------------------------------------- -def _get_default_annual_spacing(nyears) -> Tuple[int, int]: +def _get_default_annual_spacing(nyears) -> tuple[int, int]: """ Returns a default spacing between consecutive ticks for annual data. 
""" @@ -1027,8 +1023,8 @@ def __init__( freq = to_offset(freq) self.format = None self.freq = freq - self.locs: List[Any] = [] # unused, for matplotlib compat - self.formatdict: Optional[Dict[Any, Any]] = None + self.locs: list[Any] = [] # unused, for matplotlib compat + self.formatdict: dict[Any, Any] | None = None self.isminor = minor_locator self.isdynamic = dynamic_mode self.offset = 0 diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 8b7070e945439..38984238ecf65 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import subprocess import sys -from typing import List import pytest @@ -46,7 +47,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules: List[str] = ["np", "datetime"] + deprecated_modules: list[str] = ["np", "datetime"] # misc misc = ["IndexSlice", "NaT", "NA"] @@ -98,13 +99,13 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_classes: List[str] = [] + deprecated_classes: list[str] = [] # these should be deprecated in the future - deprecated_classes_in_future: List[str] = ["SparseArray"] + deprecated_classes_in_future: list[str] = ["SparseArray"] # external modules exposed in pandas namespace - modules: List[str] = [] + modules: list[str] = [] # top-level functions funcs = [ @@ -181,10 +182,10 @@ class TestPDApi(Base): funcs_to = ["to_datetime", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future - deprecated_funcs_in_future: List[str] = [] + deprecated_funcs_in_future: list[str] = [] # these are already deprecated; awaiting removal - deprecated_funcs: List[str] = [] + deprecated_funcs: list[str] = [] # private modules in pandas namespace private_modules = [ diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 9ca7d0b465250..844bdd4bd1944 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1,14 +1,13 @@ # Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. 
# Specifically for numeric dtypes +from __future__ import annotations + from collections import abc from decimal import Decimal from itertools import combinations import operator -from typing import ( - Any, - List, -) +from typing import Any import numpy as np import pytest @@ -56,8 +55,8 @@ def adjust_negative_zero(zero, expected): # TODO: remove this kludge once mypy stops giving false positives here # List comprehension has incompatible type List[PandasObject]; expected List[RangeIndex] # See GH#29725 -ser_or_index: List[Any] = [Series, Index] -lefts: List[Any] = [RangeIndex(10, 40, 10)] +ser_or_index: list[Any] = [Series, Index] +lefts: list[Any] = [RangeIndex(10, 40, 10)] lefts.extend( [ cls([10, 20, 30], dtype=dtype) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index bb78e29924ba2..5f93442cae4f6 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -444,7 +444,7 @@ def test_cmp_series_period_series_mixed_freq(self): class TestPeriodIndexSeriesComparisonConsistency: - """ Test PeriodIndex and Period Series Ops consistency """ + """Test PeriodIndex and Period Series Ops consistency""" # TODO: needs parametrization+de-duplication @@ -1306,7 +1306,7 @@ def test_ops_series_period(self): class TestPeriodIndexSeriesMethods: - """ Test PeriodIndex and Period Series Ops consistency """ + """Test PeriodIndex and Period Series Ops consistency""" def _check(self, values, func, expected): idx = PeriodIndex(values) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index c765416368726..930d890ee91d4 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -100,6 +100,13 @@ def test_fillna_iterable_category(self, named): tm.assert_categorical_equal(result, expected) + # Case where the Point is not among our categories; we want ValueError, + # not NotImplementedError GH#41914 + cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object)) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + cat.fillna(Point(0, 0)) + def test_fillna_array(self): # accept Categorical or ndarray value if it holds appropriate values cat = Categorical(["A", "B", "C", None, None]) diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 29998831777f8..bea94095452bd 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -1,7 +1,6 @@ -from typing import ( - Any, - List, -) +from __future__ import annotations + +from typing import Any import numpy as np import pytest @@ -12,7 +11,7 @@ # integer dtypes arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] -scalars: List[Any] = [2] * len(arrays) +scalars: list[Any] = [2] * len(arrays) # floating dtypes arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] scalars += [0.2, 0.2] diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c9533e239abe0..5731f02430a9d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -14,37 +14,17 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, -) - -skip_if_no_pyarrow = td.skip_if_no("pyarrow", 
min_version="1.0.0") - - -@pytest.fixture( - params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] -) -def dtype(request): - return request.param +from pandas.core.arrays.string_arrow import ArrowStringArray @pytest.fixture -def dtype_object(dtype): - if dtype == "string": - return pd.StringDtype - else: - return ArrowStringDtype +def dtype(string_storage): + return pd.StringDtype(storage=string_storage) -@pytest.fixture( - params=[ - pd.arrays.StringArray, - pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), - ] -) -def cls(request): - return request.param +@pytest.fixture +def cls(dtype): + return dtype.construct_array_type() def test_repr(dtype): @@ -52,11 +32,11 @@ def test_repr(dtype): expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = f"0 a\n1 \n2 b\nName: A, dtype: {dtype}" + expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype == "arrow_string" else "StringArray" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: {dtype}" + arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -94,7 +74,7 @@ def test_setitem_with_scalar_string(dtype): def test_astype_roundtrip(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "ValueError: Could not convert object to NumPy datetime" mark = pytest.mark.xfail(reason=reason, raises=ValueError) request.node.add_marker(mark) @@ -115,7 +95,7 @@ def test_astype_roundtrip(dtype, request): def test_add(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "unsupported operand type(s) for +: 'ArrowStringArray' and " "'ArrowStringArray'" @@ -143,7 +123,7 @@ def test_add(dtype, request): def test_add_2d(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) @@ -159,7 +139,7 @@ def test_add_2d(dtype, request): def test_add_sequence(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "unsupported operand type(s) for +: 'ArrowStringArray' and 'list'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) @@ -177,7 +157,7 @@ def test_add_sequence(dtype, request): def test_mul(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) @@ -258,7 +238,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ def test_comparison_methods_array(all_compare_operators, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": mark = pytest.mark.xfail( raises=AssertionError, reason="left is not an ExtensionArray" ) @@ -366,7 +346,7 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "'ArrowStringArray' object has no attribute 'max'" mark = pytest.mark.xfail(raises=AttributeError, reason=reason) request.node.add_marker(mark) @@ -383,7 +363,7 @@ def test_min_max(method, skipna, dtype, request): 
@pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": if box is pd.array: raises = TypeError reason = "'<=' not supported between instances of 'str' and 'NoneType'" @@ -413,7 +393,7 @@ def test_reduce_missing(skipna, dtype): def test_fillna_args(dtype, request): # GH 37987 - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "Regex pattern \"Cannot set non-string value '1' into " "a StringArray.\" does not match 'Scalar must be NA or str'" @@ -444,14 +424,14 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype == "arrow_string": + if dtype.storage == "pyarrow": expected = pa.chunked_array(expected) assert arr.equals(expected) @td.skip_if_no("pyarrow") -def test_arrow_roundtrip(dtype, dtype_object): +def test_arrow_roundtrip(dtype, string_storage2): # roundtrip possible from arrow 1.0.0 import pyarrow as pa @@ -459,15 +439,17 @@ def test_arrow_roundtrip(dtype, dtype_object): df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" - result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) - tm.assert_frame_equal(result, df) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA @td.skip_if_no("pyarrow") -def test_arrow_load_from_zero_chunks(dtype, dtype_object): +def test_arrow_load_from_zero_chunks(dtype, string_storage2): # GH-41040 import pyarrow as pa @@ -477,9 +459,11 @@ def test_arrow_load_from_zero_chunks(dtype, dtype_object): assert table.field("a").type == "string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) - tm.assert_frame_equal(result, df) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): @@ -523,10 +507,10 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype, request): +def test_memory_usage(dtype): # GH 33963 - if dtype == "arrow_string": + if dtype.storage == "pyarrow": pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 3db8333798e36..c3f951adf7f89 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -5,16 +5,47 @@ from pandas.compat import pa_version_under1p0 -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, ) +from pandas.core.arrays.string_arrow import ArrowStringArray - -@pytest.mark.skipif( 
+skip_if_no_pyarrow = pytest.mark.skipif( pa_version_under1p0, reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray", ) + + +@skip_if_no_pyarrow +def test_eq_all_na(): + a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_config(string_storage): + with pd.option_context("string_storage", string_storage): + assert StringDtype().storage == string_storage + result = pd.array(["a", "b"]) + assert result.dtype.storage == string_storage + + expected = ( + StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"]) + ) + tm.assert_equal(result, expected) + + +def test_config_bad_storage_raises(): + msg = re.escape("Value must be one of python|pyarrow") + with pytest.raises(ValueError, match=msg): + pd.options.mode.string_storage = "foo" + + +@skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) def test_constructor_not_string_type_raises(array, chunked): @@ -37,6 +68,55 @@ def test_constructor_not_string_type_raises(array, chunked): ArrowStringArray(arr) +@skip_if_no_pyarrow +def test_from_sequence_wrong_dtype_raises(): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + StringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + @pytest.mark.skipif( not pa_version_under1p0, reason="pyarrow is installed", @@ -45,7 +125,7 @@ def test_pyarrow_not_installed_raises(): msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray") with pytest.raises(ImportError, match=msg): - ArrowStringDtype() + StringDtype(storage="pyarrow") with pytest.raises(ImportError, 
match=msg): ArrowStringArray([]) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index bfe588883d9f3..61d56df485ab1 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -18,7 +18,6 @@ IntegerArray, IntervalArray, SparseArray, - StringArray, TimedeltaArray, ) from pandas.core.arrays import ( @@ -132,8 +131,16 @@ ([1, None], "Int16", pd.array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", StringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), + ( + ["a", None], + "string", + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), + ( + ["a", None], + pd.StringDtype(), + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), @@ -253,8 +260,14 @@ def test_array_copy(): ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), # string - (["a", "b"], StringArray._from_sequence(["a", "b"])), - (["a", None], StringArray._from_sequence(["a", None])), + ( + ["a", "b"], + pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]), + ), + ( + ["a", None], + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False])), ([True, None], BooleanArray._from_sequence([True, None])), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index c6f8efe7b939e..3f3f3a5ee8d18 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1,8 +1,6 @@ +from __future__ import annotations + import re -from typing import ( - Type, - Union, -) import numpy as np import pytest @@ -80,7 +78,7 @@ def timedelta_index(): class SharedTests: - index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + index_cls: type[DatetimeIndex | PeriodIndex | TimedeltaIndex] @pytest.fixture def arr1d(self): @@ -298,7 +296,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, request): + def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage): if isinstance(arr1d, DatetimeArray): tz = arr1d.tz ts1, ts2 = arr1d[1:3] @@ -341,14 +339,17 @@ def test_searchsorted_castable_strings(self, arr1d, box, request): ): arr.searchsorted("foo") - with pytest.raises( - TypeError, - match=re.escape( - f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got 'StringArray' instead." - ), - ): - arr.searchsorted([str(arr[1]), "baz"]) + arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + + with pd.option_context("string_storage", string_storage): + with pytest.raises( + TypeError, + match=re.escape( + f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{arr_type}' instead." 
+ ), + ): + arr.searchsorted([str(arr[1]), "baz"]) def test_getitem_near_implementation_bounds(self): # We only check tz-naive for DTA bc the bounds are slightly different diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index ceb882ff9c963..16ce709a5b021 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -47,7 +47,7 @@ def _get_foo(self): foo = property(_get_foo, _set_foo, doc="foo property") def bar(self, *args, **kwargs): - """ a test bar method """ + """a test bar method""" pass class Delegate(PandasDelegate, PandasObject): diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 4151781f0dbf5..10f391a49d98f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -242,6 +242,7 @@ def test_value_counts_datetime64(index_or_series): expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s]) tm.assert_series_equal(result, expected_s) + assert s.dtype == "datetime64[ns]" unique = s.unique() assert unique.dtype == "datetime64[ns]" diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 0467bb1dad676..7cf319e1d134c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1,11 +1,8 @@ +from __future__ import annotations + from functools import reduce from itertools import product import operator -from typing import ( - Dict, - List, - Type, -) import warnings import numpy as np @@ -147,8 +144,8 @@ def lhs(request): @td.skip_if_no_ne class TestEvalNumexprPandas: - exclude_cmp: List[str] = [] - exclude_bool: List[str] = [] + exclude_cmp: list[str] = [] + exclude_bool: list[str] = [] engine = "numexpr" parser = "pandas" @@ -1125,7 +1122,7 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): @td.skip_if_no_ne class TestOperationsNumExprPandas: - exclude_arith: List[str] = [] + exclude_arith: list[str] = [] engine = "numexpr" parser = "pandas" @@ -1629,7 +1626,7 @@ def test_simple_in_ops(self): @td.skip_if_no_ne class TestOperationsNumExprPython(TestOperationsNumExprPandas): - exclude_arith: List[str] = ["in", "not in"] + exclude_arith: list[str] = ["in", "not in"] engine = "numexpr" parser = "python" @@ -1723,7 +1720,7 @@ class TestOperationsPythonPython(TestOperationsNumExprPython): class TestOperationsPythonPandas(TestOperationsNumExprPandas): - exclude_arith: List[str] = [] + exclude_arith: list[str] = [] engine = "python" parser = "pandas" @@ -1878,7 +1875,7 @@ def test_invalid_parser(): pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") -_parsers: Dict[str, Type[BaseExprVisitor]] = { +_parsers: dict[str, type[BaseExprVisitor]] = { "python": PythonExprVisitor, "pytables": pytables.PyTablesExprVisitor, "pandas": PandasExprVisitor, diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 616f46624bfd7..a2244c4aab923 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,5 +1,6 @@ +from __future__ import annotations + from datetime import datetime -from typing import List import numpy as np import pytest @@ -24,12 +25,12 @@ # EA & Actual Dtypes def to_ea_dtypes(dtypes): - """ convert list of string dtypes to EA dtype """ + """convert list of string dtypes to EA dtype""" return [getattr(pd, dt + "Dtype") for dt in dtypes] def to_numpy_dtypes(dtypes): - """ convert list of string dtypes to numpy dtype """ + 
"""convert list of string dtypes to numpy dtype""" return [getattr(np, dt) for dt in dtypes if isinstance(dt, str)] @@ -287,7 +288,7 @@ def test_is_string_dtype_nullable(nullable_string_dtype): assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) -integer_dtypes: List = [] +integer_dtypes: list = [] @pytest.mark.parametrize( @@ -319,7 +320,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes: List = [] +signed_integer_dtypes: list = [] @pytest.mark.parametrize( @@ -355,7 +356,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes: List = [] +unsigned_integer_dtypes: list = [] @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 3c541a309e42a..3c798d82b9485 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -14,6 +14,7 @@ from decimal import Decimal from fractions import Fraction from io import StringIO +import itertools from numbers import Number import re @@ -658,8 +659,9 @@ def test_maybe_convert_objects_datetime(self): ) tm.assert_numpy_array_equal(out, exp) + # with convert_timedelta=True, the nan is a valid NA value for td64 arr = np.array([np.timedelta64(1, "s"), np.nan], dtype=object) - exp = arr.copy() + exp = exp[::-1] out = lib.maybe_convert_objects( arr, convert_datetime=True, convert_timedelta=True ) @@ -716,6 +718,16 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): # no OutOfBoundsDatetime/OutOfBoundsTimedeltas tm.assert_numpy_array_equal(out, arr) + def test_maybe_convert_objects_mixed_datetimes(self): + ts = Timestamp("now") + vals = [ts, ts.to_pydatetime(), ts.to_datetime64(), pd.NaT, np.nan, None] + + for data in itertools.permutations(vals): + data = np.array(list(data), dtype=object) + expected = DatetimeIndex(data)._data._ndarray + result = lib.maybe_convert_objects(data, convert_datetime=True) + tm.assert_numpy_array_equal(result, expected) + def test_maybe_convert_objects_timedelta64_nat(self): obj = np.timedelta64("NaT", "ns") arr = np.array([obj], dtype=object) diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index 23a07b2031bf5..67a62978aa1bc 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -2,12 +2,11 @@ import pandas as pd -pytest.importorskip("pyarrow", minversion="0.13.0") - -from pandas.tests.extension.arrow.arrays import ArrowStringDtype # isort:skip +pytest.importorskip("pyarrow", minversion="1.0.0") def test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=ArrowStringDtype()) - assert isinstance(result.dtype, ArrowStringDtype) + result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, pd.StringDtype) + assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 99a5666926e10..9c59c79f677de 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -48,16 +48,14 @@ def test_astype_str(self, data): @pytest.mark.parametrize( "nullable_string_dtype", [ - "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ], ) def test_astype_string(self, data, 
nullable_string_dtype): # GH-33465 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - result = pd.Series(data[:5]).astype(nullable_string_dtype) expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 2a27f670fa046..ca22973d0b4d3 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -1,7 +1,4 @@ -from typing import ( - Optional, - Type, -) +from __future__ import annotations import pytest @@ -67,10 +64,10 @@ class BaseArithmeticOpsTests(BaseOpsUtil): * divmod_exc = TypeError """ - series_scalar_exc: Optional[Type[TypeError]] = TypeError - frame_scalar_exc: Optional[Type[TypeError]] = TypeError - series_array_exc: Optional[Type[TypeError]] = TypeError - divmod_exc: Optional[Type[TypeError]] = TypeError + series_scalar_exc: type[TypeError] | None = TypeError + frame_scalar_exc: type[TypeError] | None = TypeError + series_array_exc: type[TypeError] | None = TypeError + divmod_exc: type[TypeError] | None = TypeError def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 0f7bd59411eb5..c6a35d8fa5b38 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -20,7 +20,7 @@ def check_reduce(self, s, op_name, skipna): class BaseNoReduceTests(BaseReduceTests): - """ we don't define any reductions """ + """we don't define any reductions""" @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 2402c70a166b7..13dec96b144ff 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -14,9 +14,11 @@ class CustomBlock(ExtensionBlock): _holder = np.ndarray - # error: Cannot override final attribute "_can_hold_na" - # (previously declared in base class "Block") - _can_hold_na = False # type: ignore[misc] + + # Cannot override final attribute "_can_hold_na" + @property # type: ignore[misc] + def _can_hold_na(self) -> bool: + return False @pytest.fixture diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49aee76e10f6a..3d0edb70d1ced 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,16 +18,13 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas.core.arrays.string_ import StringDtype -from pandas.core.arrays.string_arrow import ArrowStringDtype from pandas.tests.extension import base def split_array(arr): - if not isinstance(arr.dtype, ArrowStringDtype): + if arr.dtype.storage != "pyarrow": pytest.skip("chunked array n/a") def _split_array(arr): @@ -49,16 +46,9 @@ def chunked(request): return request.param -@pytest.fixture( - params=[ - StringDtype, - pytest.param( - ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ] -) -def dtype(request): - return request.param() +@pytest.fixture +def dtype(string_storage): + return StringDtype(storage=string_storage) @pytest.fixture @@ -104,24 +94,28 @@ def data_for_grouping(dtype, chunked): class TestDtype(base.BaseDtypeTests): - pass + def test_eq_with_str(self, dtype): 
+ assert dtype == f"string[{dtype.storage}]" + super().test_eq_with_str(dtype) class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): - if isinstance(data.dtype, ArrowStringDtype): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_view(data) class TestConstructors(base.BaseConstructorsTests): - pass + def test_from_dtype(self, data): + # base test uses string representation of dtype + pass class TestReshaping(base.BaseReshapingTests): - def test_transpose(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_transpose(self, data, request): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_transpose(data) @@ -132,8 +126,8 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - def test_setitem_preserves_views(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_setitem_preserves_views(self, data, request): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_setitem_preserves_views(data) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 65f228f2be411..a1603ea3dc17a 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -1,4 +1,4 @@ -from typing import List +from __future__ import annotations from pandas import ( DataFrame, @@ -39,7 +39,7 @@ def _check_mixed_int(df, dtype=None): assert df.dtypes["D"] == dtypes["D"] -def zip_frames(frames: List[DataFrame], axis: int = 1) -> DataFrame: +def zip_frames(frames: list[DataFrame], axis: int = 1) -> DataFrame: """ take a list of frames, zip them together under the assumption that these all have the first frames' index/columns. 
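Note: the string-array test hunks above migrate from the old ArrowStringDtype to the storage-parametrized pd.StringDtype and the new "string_storage" option that this branch introduces. A minimal usage sketch of that API, assuming pandas built from this branch with pyarrow >= 1.0.0 available (illustrative only, not part of the patch):

    import pandas as pd

    # Explicit storage selection via the dtype string aliases used in the tests.
    python_backed = pd.array(["a", None, "c"], dtype="string[python]")
    arrow_backed = pd.array(["a", None, "c"], dtype="string[pyarrow]")

    # StringDtype(storage=...) compares equal to the corresponding alias,
    # mirroring TestDtype.test_eq_with_str above.
    assert pd.StringDtype(storage="pyarrow") == "string[pyarrow]"

    # The global option decides which backend a plain "string" dtype resolves to.
    with pd.option_context("string_storage", "pyarrow"):
        s = pd.Series(["x", "y", None], dtype="string")
        assert s.dtype.storage == "pyarrow"
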
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1583b3f91bea2..881f8db305240 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -584,10 +584,10 @@ def test_astype_empty_dtype_dict(self): @pytest.mark.parametrize( "data, dtype", [ - (["x", "y", "z"], "string"), + (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), (["x", "y", "z"], "category"), @@ -598,8 +598,6 @@ def test_astype_empty_dtype_dict(self): @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): # https://github.com/pandas-dev/pandas/issues/35471 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - df = DataFrame(Series(data, dtype=dtype)) if errors == "ignore": expected = df diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 09b33831ed5ec..7258f5eceb54a 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -139,7 +139,7 @@ def test_clip_against_unordered_columns(self): tm.assert_frame_equal(result_lower_upper, expected_lower_upper) def test_clip_with_na_args(self, float_frame): - """Should process np.nan argument as None """ + """Should process np.nan argument as None""" # GH#17276 tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index dd7bf0aada449..a2d539d784d3c 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -9,7 +9,7 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -18,11 +18,12 @@ def test_convert_dtypes(self, convert_integer, expected): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.convert_dtypes(True, True, convert_integer, False) + with pd.option_context("string_storage", string_storage): + result = df.convert_dtypes(True, True, convert_integer, False) expected = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=expected), - "b": pd.Series(["x", "y", "z"], dtype="string"), + "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"), } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 3da3d82ec77f9..352d95156bf98 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -29,17 +29,16 @@ def test_cov(self, float_frame, float_string_frame): frame = float_frame.copy() frame["A"][:5] = np.nan frame["B"][5:10] = np.nan - result = float_frame.cov(min_periods=len(float_frame) - 8) - expected = float_frame.cov() + result = frame.cov(min_periods=len(frame) - 8) + expected = frame.cov() expected.loc["A", "B"] = np.nan expected.loc["B", "A"] = np.nan + tm.assert_frame_equal(result, expected) # regular - 
float_frame["A"][:5] = np.nan - float_frame["B"][:10] = np.nan - cov = float_frame.cov() - - tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) + result = frame.cov() + expected = frame["A"].cov(frame["C"]) + tm.assert_almost_equal(result["A"]["C"], expected) # exclude non-numeric types result = float_string_frame.cov() @@ -101,10 +100,7 @@ def test_corr_scipy_method(self, float_frame, method): # --------------------------------------------------------------------- @td.skip_if_no_scipy - def test_corr_non_numeric(self, float_frame, float_string_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - + def test_corr_non_numeric(self, float_string_frame): # exclude non-numeric types result = float_string_frame.corr() expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() @@ -143,27 +139,27 @@ def test_corr_constant(self, meth): assert isna(rs.values).all() @td.skip_if_no_scipy - def test_corr_int_and_boolean(self): + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_int_and_boolean(self, meth): # when dtypes of pandas series are different # then ndarray will have dtype=object, # so it need to be properly handled df = DataFrame({"a": [True, False], "b": [1, 0]}) expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) - for meth in ["pearson", "kendall", "spearman"]: - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - result = df.corr(meth) - tm.assert_frame_equal(result, expected) + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + result = df.corr(meth) + tm.assert_frame_equal(result, expected) - def test_corr_cov_independent_index_column(self): + @pytest.mark.parametrize("method", ["cov", "corr"]) + def test_corr_cov_independent_index_column(self, method): # GH#14617 df = DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) - for method in ["cov", "corr"]: - result = getattr(df, method)() - assert result.index is not result.columns - assert result.index.equals(result.columns) + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) def test_corr_invalid_method(self): # GH#22298 @@ -174,10 +170,10 @@ def test_corr_invalid_method(self): def test_corr_int(self): # dtypes other than float64 GH#1761 - df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - df3.cov() - df3.corr() + df.cov() + df.corr() @td.skip_if_no_scipy @pytest.mark.parametrize( @@ -232,6 +228,16 @@ def test_calc_corr_small_numbers(self): expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"]) tm.assert_frame_equal(result, expected) + @td.skip_if_no_scipy + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_min_periods_greater_than_length(self, method): + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.corr(method=method, min_periods=3) + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 46a5a47e091dd..a89e089f3d8a2 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1,11 +1,8 @@ +from __future__ import 
annotations + from datetime import datetime from io import StringIO import re -from typing import ( - Dict, - List, - Union, -) import numpy as np import pytest @@ -24,12 +21,12 @@ @pytest.fixture -def mix_ab() -> Dict[str, List[Union[int, str]]]: +def mix_ab() -> dict[str, list[int | str]]: return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture -def mix_abc() -> Dict[str, List[Union[float, str]]]: +def mix_abc() -> dict[str, list[float | str]]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index ba8fe25401e8c..2c96cf291c154 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import is_numpy_dev - from pandas import ( CategoricalDtype, DataFrame, @@ -173,28 +171,20 @@ def test_to_records_with_categorical(self): ), ), # Pass in a type instance. - pytest.param( + ( {"column_dtypes": str}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[("index", " gets split as the all-NaN is inferred as float - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, - ).astype(object) - result = df.groupby("key").min() - expected = DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]} - ).set_index("key") - tm.assert_frame_equal(result, expected) - - # same but with numbers - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, - ).astype(object) - result = df.groupby("key").min() - expected = DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} - ).set_index("key") - tm.assert_frame_equal(result, expected) - - def test_groupby_index_object_dtype(): # GH 40014 df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]}) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8ce7841bcc2c2..63ae54cafc900 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1593,20 +1593,6 @@ def test_agg_cython_category_not_implemented_fallback(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func", ["min", "max"]) -def test_aggregate_categorical_lost_index(func: str): - # GH: 28641 groupby drops index, when grouping over categorical column with min/max - ds = Series(["b"], dtype="category").cat.as_ordered() - df = DataFrame({"A": [1997], "B": ds}) - result = df.groupby("A").agg({"B": func}) - expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) - - # ordered categorical dtype should be preserved - expected["B"] = expected["B"].astype(ds.dtype) - - tm.assert_frame_equal(result, expected) - - def test_aggregate_categorical_with_isnan(): # GH 29837 df = DataFrame( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 95bb010015f62..5434fc49e2174 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas._libs.tslibs import iNaT from pandas.errors import UnsupportedFunctionCall import pandas as pd @@ -52,74 +51,6 @@ def dtypes_for_minmax(request): return (dtype, min_val, max_val) -def test_max_min_non_numeric(): - # #2700 - aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) - - result = 
aa.groupby("nn").max() - assert "ss" in result - - result = aa.groupby("nn").max(numeric_only=False) - assert "ss" in result - - result = aa.groupby("nn").min() - assert "ss" in result - - result = aa.groupby("nn").min(numeric_only=False) - assert "ss" in result - - -def test_max_min_object_multiple_columns(using_array_manager): - # GH#41111 case where the aggregation is valid for some columns but not - # others; we split object blocks column-wise, consistent with - # DataFrame._reduce - - df = DataFrame( - { - "A": [1, 1, 2, 2, 3], - "B": [1, "foo", 2, "bar", False], - "C": ["a", "b", "c", "d", "e"], - } - ) - df._consolidate_inplace() # should already be consolidate, but double-check - if not using_array_manager: - assert len(df._mgr.blocks) == 2 - - gb = df.groupby("A") - - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = gb.max(numeric_only=False) - # "max" is valid for column "C" but not for "B" - ei = Index([1, 2, 3], name="A") - expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) - tm.assert_frame_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = gb.min(numeric_only=False) - # "min" is valid for column "C" but not for "B" - ei = Index([1, 2, 3], name="A") - expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) - tm.assert_frame_equal(result, expected) - - -def test_min_date_with_nans(): - # GH26321 - dates = pd.to_datetime( - Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" - ).dt.date - df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) - - result = df.groupby("b", as_index=False)["c"].min()["c"] - expected = pd.to_datetime( - Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" - ).dt.date - tm.assert_series_equal(result, expected) - - result = df.groupby("b")["c"].min() - expected.index.name = "b" - tm.assert_series_equal(result, expected) - - def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) @@ -664,38 +595,6 @@ def test_max_nan_bug(): assert not r["File"].isna().any() -def test_max_inat(): - # GH#40767 dont interpret iNaT as NaN - ser = Series([1, iNaT]) - gb = ser.groupby([1, 1]) - - result = gb.max(min_count=2) - expected = Series({1: 1}, dtype=np.int64) - tm.assert_series_equal(result, expected, check_exact=True) - - result = gb.min(min_count=2) - expected = Series({1: iNaT}, dtype=np.int64) - tm.assert_series_equal(result, expected, check_exact=True) - - # not enough entries -> gets masked to NaN - result = gb.min(min_count=3) - expected = Series({1: np.nan}) - tm.assert_series_equal(result, expected, check_exact=True) - - -def test_max_inat_not_all_na(): - # GH#40767 dont interpret iNaT as NaN - - # make sure we dont round iNaT+1 to iNaT - ser = Series([1, iNaT, 2, iNaT + 1]) - gb = ser.groupby([1, 2, 3, 3]) - result = gb.min(min_count=2) - - # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy - expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - tm.assert_series_equal(result, expected, check_exact=True) - - def test_nlargest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) b = Series(list("a" * 5 + "b" * 5)) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py new file mode 100644 index 0000000000000..25a57d24e04ef --- /dev/null +++ b/pandas/tests/groupby/test_min_max.py @@ -0,0 +1,178 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_max_min_non_numeric(): + # #2700 + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) + + result = aa.groupby("nn").max() + assert "ss" in result + + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result + + result = aa.groupby("nn").min() + assert "ss" in result + + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result + + +def test_max_min_object_multiple_columns(using_array_manager): + # GH#41111 case where the aggregation is valid for some columns but not + # others; we split object blocks column-wise, consistent with + # DataFrame._reduce + + df = DataFrame( + { + "A": [1, 1, 2, 2, 3], + "B": [1, "foo", 2, "bar", False], + "C": ["a", "b", "c", "d", "e"], + } + ) + df._consolidate_inplace() # should already be consolidate, but double-check + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + gb = df.groupby("A") + + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): + result = gb.max(numeric_only=False) + # "max" is valid for column "C" but not for "B" + ei = Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): + result = gb.min(numeric_only=False) + # "min" is valid for column "C" but not for "B" + ei = Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + +def test_min_date_with_nans(): + # GH26321 + dates = pd.to_datetime( + Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" + ).dt.date + df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) + + result = df.groupby("b", as_index=False)["c"].min()["c"] + expected = pd.to_datetime( + Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" + ).dt.date + tm.assert_series_equal(result, expected) + + result = df.groupby("b")["c"].min() + expected.index.name = "b" + tm.assert_series_equal(result, expected) + + +def test_max_inat(): + # GH#40767 dont interpret iNaT as NaN + ser = Series([1, iNaT]) + gb = ser.groupby([1, 1]) + + result = gb.max(min_count=2) + expected = Series({1: 1}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + result = gb.min(min_count=2) + expected = Series({1: iNaT}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + # not enough entries -> gets masked to NaN + result = gb.min(min_count=3) + expected = Series({1: np.nan}) + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_max_inat_not_all_na(): + # GH#40767 dont interpret iNaT as NaN + + # make sure we dont round iNaT+1 to iNaT + ser = Series([1, iNaT, 2, iNaT + 1]) + gb = ser.groupby([1, 2, 3, 3]) + result = gb.min(min_count=2) + + # Note: in converting to float64, 
the iNaT + 1 maps to iNaT, i.e. is lossy + expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) + tm.assert_series_equal(result, expected, check_exact=True) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_column(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a")["b"], func)() + idx = pd.Int64Index([1, 2], name="a") + expected = Series(periods, index=idx, name="b") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_frame(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a"), func)() + idx = pd.Int64Index([1, 2], name="a") + expected = DataFrame({"b": periods}, index=idx) + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_numeric_object_dtype(): + # https://github.com/pandas-dev/pandas/issues/39329 + # simplified case: multiple object columns where one is all-NaN + # -> gets split as the all-NaN is inferred as float + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, + ).astype(object) + result = df.groupby("key").min() + expected = DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]} + ).set_index("key") + tm.assert_frame_equal(result, expected) + + # same but with numbers + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, + ).astype(object) + result = df.groupby("key").min() + expected = DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} + ).set_index("key") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_aggregate_categorical_lost_index(func: str): + # GH: 28641 groupby drops index, when grouping over categorical column with min/max + ds = Series(["b"], dtype="category").cat.as_ordered() + df = DataFrame({"A": [1997], "B": ds}) + result = df.groupby("A").agg({"B": func}) + expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 2648155c938b0..1ed8f3a903439 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -5,6 +5,7 @@ Categorical, CategoricalIndex, Index, + MultiIndex, ) @@ -79,3 +80,11 @@ def test_equals_non_category(self): other = Index(["A", "B", "D", np.nan]) assert not ci.equals(other) + + def test_equals_multiindex(self): + # dont raise NotImplementedError when calling is_dtype_compat + + mi = MultiIndex.from_arrays([["A", "B", "C", "D"], range(4)]) + ci = mi.to_flat_index().astype("category") + + assert not ci.equals(mi) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 0ea3abcaefcf2..cef756b709f70 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from datetime import datetime import gc -from typing import Type import numpy as np import pytest @@ -36,7 +37,7 @@ class Base: Base class for index sub-class tests. 
""" - _index_cls: Type[Index] + _index_cls: type[Index] @pytest.fixture def simple_index(self): diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 3abc6e348748a..a5a921f42c3ef 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -6,9 +6,11 @@ from pandas.errors import InvalidIndexError from pandas import ( + NA, CategoricalIndex, Interval, IntervalIndex, + NaT, Timedelta, date_range, timedelta_range, @@ -168,6 +170,20 @@ def test_get_loc_non_scalar_errors(self, key): with pytest.raises(InvalidIndexError, match=msg): idx.get_loc(key) + def test_get_indexer_with_nans(self): + # GH#41831 + index = IntervalIndex([np.nan, Interval(1, 2), np.nan]) + + expected = np.array([True, False, True]) + for key in [None, np.nan, NA]: + assert key in index + result = index.get_loc(key) + tm.assert_numpy_array_equal(result, expected) + + for key in [NaT, np.timedelta64("NaT", "ns"), np.datetime64("NaT", "ns")]: + with pytest.raises(KeyError, match=str(key)): + index.get_loc(key) + class TestGetIndexer: @pytest.mark.parametrize( @@ -326,6 +342,17 @@ def test_get_indexer_non_monotonic(self): expected = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_with_nans(self): + # GH#41831 + index = IntervalIndex([np.nan, np.nan]) + other = IntervalIndex([np.nan]) + + assert not index._index_as_unique + + result = index.get_indexer_for(other) + expected = np.array([0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestSliceLocs: def test_slice_locs_with_interval(self): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index cd61fcaa835a4..2ba60999aa36d 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -247,6 +247,16 @@ def test_is_unique_interval(self, closed): idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed) assert idx.is_unique is True + # unique NaN + idx = IntervalIndex.from_tuples([(np.NaN, np.NaN)], closed=closed) + assert idx.is_unique is True + + # non-unique NaN + idx = IntervalIndex.from_tuples( + [(np.NaN, np.NaN), (np.NaN, np.NaN)], closed=closed + ) + assert idx.is_unique is False + def test_monotonic(self, closed): # increasing non-overlapping idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], closed=closed) @@ -318,6 +328,16 @@ def test_monotonic(self, closed): assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is True + def test_is_monotonic_with_nans(self): + # GH#41831 + index = IntervalIndex([np.nan, np.nan]) + + assert not index.is_monotonic + assert not index._is_strictly_monotonic_increasing + assert not index.is_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing + assert not index.is_monotonic_decreasing + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index fba94960ddaad..9e1097ce5951f 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -445,6 +445,18 @@ def test_get_indexer_crossing_levels(self): expected = np.array([7, 15], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) + def 
test_get_indexer_kwarg_validation(self): + # GH#41918 + mi = MultiIndex.from_product([range(3), ["A", "B"]]) + + msg = "limit argument only valid if doing pad, backfill or nearest" + with pytest.raises(ValueError, match=msg): + mi.get_indexer(mi[:-1], limit=4) + + msg = "tolerance argument only valid if doing pad, backfill or nearest" + with pytest.raises(ValueError, match=msg): + mi.get_indexer(mi[:-1], tolerance="piano") + def test_getitem(idx): # scalar diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 5937f43102190..7765a4b6b4412 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -138,6 +138,16 @@ def test_constructor_mixed_nat_objs_infers_object(self, swap_objs): tm.assert_index_equal(Index(data), expected) tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + @pytest.mark.parametrize("swap_objs", [True, False]) + def test_constructor_datetime_and_datetime64(self, swap_objs): + data = [Timestamp(2021, 6, 8, 9, 42), np.datetime64("now")] + if swap_objs: + data = data[::-1] + expected = DatetimeIndex(data) + + tm.assert_index_equal(Index(data), expected) + tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + class TestDtypeEnforced: # check we don't silently ignore the dtype keyword diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index f7c2266e39fcc..8cde03af1ff92 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -26,7 +26,7 @@ def _axify(obj, key, axis): class Base: - """ indexing comprehensive base class """ + """indexing comprehensive base class""" _kinds = {"series", "frame"} _typs = { @@ -120,7 +120,7 @@ def generate_indices(self, f, values=False): return itertools.product(*axes) def get_value(self, name, f, i, values=False): - """ return the value for the location i """ + """return the value for the location i""" # check against values if values: return f.values[i] @@ -153,7 +153,7 @@ def check_values(self, f, func, values=False): def check_result(self, method, key, typs=None, axes=None, fails=None): def _eq(axis, obj, key): - """ compare equal for these 2 keys """ + """compare equal for these 2 keys""" axified = _axify(obj, key, axis) try: getattr(obj, method).__getitem__(axified) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 503e39041a49f..ccb16c5d97ecc 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -114,6 +114,21 @@ def test_loc_getitem_frame(self): with pytest.raises(KeyError, match=r"\[10\] not in index"): df.loc[[10, 4]] + def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl): + # GH#41831 + + index = IntervalIndex([np.nan, np.nan]) + key = index[:-1] + + obj = frame_or_series(range(2), index=index) + if frame_or_series is DataFrame and indexer_sl is tm.setitem: + obj = obj.T + + result = indexer_sl(obj)[key] + expected = obj + + tm.assert_equal(result, expected) + class TestIntervalIndexInsideMultiIndex: def test_mi_intervalindex_slicing_with_scalar(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 26f2ba577d184..7911cd7f12e0c 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1,9 +1,7 @@ +from __future__ import annotations + from datetime import timedelta import itertools -from typing import ( - Dict, - List, -) 
import numpy as np import pytest @@ -79,7 +77,7 @@ class TestSetitemCoercion(CoercionBase): def _assert_setitem_series_conversion( self, original_series, loc_value, expected_series, expected_dtype ): - """ test series value's coercion triggered by assignment """ + """test series value's coercion triggered by assignment""" temp = original_series.copy() temp[1] = loc_value tm.assert_series_equal(temp, expected_series) @@ -273,7 +271,7 @@ def test_setitem_series_no_coercion_from_values_list(self): def _assert_setitem_index_conversion( self, original_series, loc_key, expected_index, expected_dtype ): - """ test index's coercion triggered by assign key """ + """test index's coercion triggered by assign key""" temp = original_series.copy() temp[loc_key] = 5 exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) @@ -367,7 +365,7 @@ class TestInsertIndexCoercion(CoercionBase): method = "insert" def _assert_insert_conversion(self, original, value, expected, expected_dtype): - """ test coercion triggered by insert """ + """test coercion triggered by insert""" target = original.copy() res = target.insert(1, value) tm.assert_index_equal(res, expected) @@ -555,7 +553,7 @@ class TestWhereCoercion(CoercionBase): def _assert_where_conversion( self, original, cond, values, expected, expected_dtype ): - """ test coercion triggered by where """ + """test coercion triggered by where""" target = original.copy() res = target.where(cond, values) tm.assert_equal(res, expected) @@ -869,7 +867,7 @@ def test_has_comprehensive_tests(self): raise NotImplementedError def _assert_fillna_conversion(self, original, value, expected, expected_dtype): - """ test coercion triggered by fillna """ + """test coercion triggered by fillna""" target = original.copy() res = target.fillna(value) tm.assert_equal(res, expected) @@ -1024,7 +1022,7 @@ class TestReplaceSeriesCoercion(CoercionBase): klasses = ["series"] method = "replace" - rep: Dict[str, List] = {} + rep: dict[str, list] = {} rep["object"] = ["a", "b"] rep["int64"] = [4, 5] rep["float64"] = [1.1, 2.2] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 425c68725c595..c945bd6b95ee1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -33,7 +33,7 @@ class TestFancy: - """ pure get/set item & fancy indexing """ + """pure get/set item & fancy indexing""" def test_setitem_ndarray_1d(self): # GH5508 diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 61bbd4e12e1ba..0f4a30cfa9cf9 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -561,7 +561,7 @@ def test_astype(self, t): def test_convert(self): def _compare(old_mgr, new_mgr): - """ compare the blocks, numeric compare ==, object don't """ + """compare the blocks, numeric compare ==, object don't""" old_blocks = set(old_mgr.blocks) new_blocks = set(new_mgr.blocks) assert len(old_blocks) == len(new_blocks) diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 601b50fb469cb..dede9127821fd 100644 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -125,7 +125,7 @@ def _create_sp_frame(): def create_data(): - """ create the pickle data """ + """create the pickle data""" data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], "B": [0, 1, 0, 1, 0], diff --git a/pandas/tests/io/parser/conftest.py 
b/pandas/tests/io/parser/conftest.py index 1eb52ab78e1a0..e11746c118ff7 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,8 +1,6 @@ +from __future__ import annotations + import os -from typing import ( - List, - Optional, -) import pytest @@ -13,9 +11,9 @@ class BaseParser: - engine: Optional[str] = None + engine: str | None = None low_memory = True - float_precision_choices: List[Optional[str]] = [] + float_precision_choices: list[str | None] = [] def update_kwargs(self, kwargs): kwargs = kwargs.copy() diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py index 6a9d5745ab457..67c3a2902dbcb 100644 --- a/pandas/tests/io/pytables/common.py +++ b/pandas/tests/io/pytables/common.py @@ -30,7 +30,7 @@ def safe_close(store): def create_tempfile(path): - """ create an unopened named temporary file """ + """create an unopened named temporary file""" return os.path.join(tempfile.gettempdir(), path) diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py index 38ffcb3b0e8ec..988f78c5ae843 100644 --- a/pandas/tests/io/pytables/conftest.py +++ b/pandas/tests/io/pytables/conftest.py @@ -11,7 +11,7 @@ def setup_path(): @pytest.fixture(scope="module", autouse=True) def setup_mode(): - """ Reset testing mode fixture""" + """Reset testing mode fixture""" tm.reset_testing_mode() yield tm.set_testing_mode() diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 2569eb0c9e786..719b54a57a6c7 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -25,7 +25,7 @@ ensure_clean_store, ) -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single @pytest.mark.filterwarnings("ignore:object name:tables.exceptions.NaturalNameWarning") @@ -714,6 +714,10 @@ def check(obj, comparator): tm.assert_frame_equal(store.select("df2"), df) +# TODO(ArrayManager) currently we rely on falling back to BlockManager, but +# the conversion from AM->BM converts the invalid object dtype column into +# a datetime64 column no longer raising an error +@td.skip_array_manager_not_yet_implemented def test_append_raise(setup_path): with ensure_clean_store(setup_path) as store: diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 0b3d56ebf959e..d2348ca8e314d 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, DataFrame, @@ -19,7 +17,6 @@ pytestmark = [ pytest.mark.single, - td.skip_array_manager_not_yet_implemented, # pytables https://github.com/PyTables/PyTables/issues/822 pytest.mark.filterwarnings( "ignore:a closed node found in the registry:UserWarning" diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index 4688d7d2be40a..c7200385aa998 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,15 +1,11 @@ import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path tables = pytest.importorskip("tables") -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.fixture def pytables_hdf5_file(): diff --git 
a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 6cfe80ae5c87c..f3a43f669b1d5 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -18,9 +16,6 @@ from pandas.io.pytables import read_hdf -# TODO(ArrayManager) HDFStore relies on accessing the blocks -pytestmark = td.skip_array_manager_not_yet_implemented - def test_complex_fixed(setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 30b07fb572324..2ae330e5139be 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -6,8 +6,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( CategoricalIndex, DataFrame, @@ -27,7 +25,7 @@ _maybe_adjust_name, ) -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_pass_spec_to_storer(setup_path): diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 943b1bb06b1f3..88e2b5f080282 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -4,7 +4,6 @@ import pytest from pandas.compat import is_platform_little_endian -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -27,7 +26,7 @@ Term, ) -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_mode(setup_path): diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 1dc2c9411ed7b..02b79bd0fdbc1 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,7 +1,5 @@ import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, HDFStore, @@ -13,7 +11,7 @@ tables, ) -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 20278914a4838..4f8c7c84a9fcc 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -29,7 +29,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_format_type(setup_path): diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py index fe474b7503e60..9adb0a6d227da 100644 --- a/pandas/tests/io/pytables/test_pytables_missing.py +++ b/pandas/tests/io/pytables/test_pytables_missing.py @@ -5,8 +5,6 @@ import pandas as pd import pandas._testing as tm -pytestmark = td.skip_array_manager_not_yet_implemented - @td.skip_if_installed("tables") def test_pytables_raises(): diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 5d1deb45eba8b..1c9e63c66aadb 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -25,7 +25,7 @@ from pandas.io.pytables import TableIterator -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def 
test_read_missing_key_close_store(setup_path): diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py index c6e2904f7e670..16772d03c6d26 100644 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -3,7 +3,6 @@ import pytest from pandas._libs.tslibs import Timestamp -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -18,7 +17,7 @@ ensure_clean_store, ) -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_retain_index_attributes(setup_path): diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index ce075943efe8a..97edc3cdffdf7 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -30,7 +30,7 @@ _default_compressor = "blosc" -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_conv_read_write(setup_path): diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index fc19a3bd63c74..56d48945d5852 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -4,7 +4,6 @@ import pytest from pandas._libs.tslibs import Timestamp -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -28,7 +27,7 @@ from pandas.io.pytables import Term -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_select_columns_in_where(setup_path): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index c61864bbc0a76..856a2ca15ec4a 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -10,8 +10,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -42,8 +40,7 @@ read_hdf, ) -# TODO(ArrayManager) HDFStore relies on accessing the blocks -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_context(setup_path): diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index 05c9f0c650986..75b04f332e054 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,7 +1,5 @@ import numpy as np -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Series, @@ -14,8 +12,6 @@ read_hdf, ) -pytestmark = td.skip_array_manager_not_yet_implemented - class TestHDFStoreSubclass: # GH 33748 diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py index 181f63563665b..5e42dbde4b9f1 100644 --- a/pandas/tests/io/pytables/test_time_series.py +++ b/pandas/tests/io/pytables/test_time_series.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Series, @@ -12,7 +10,7 @@ ) from pandas.tests.io.pytables.common import ensure_clean_store -pytestmark = [pytest.mark.single, td.skip_array_manager_not_yet_implemented] +pytestmark = pytest.mark.single def test_store_datetime_fractional_secs(setup_path): diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 
4aa6f94ca38e9..36fa79d0bb7e3 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -24,9 +24,6 @@ ensure_clean_store, ) -# TODO(ArrayManager) HDFStore relies on accessing the blocks -pytestmark = td.skip_array_manager_not_yet_implemented - def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index a244f3904553d..e6be3f0567f67 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -35,22 +35,11 @@ def _skip_if_no_private_key_path(): pytest.skip("Cannot run integration tests without a private key json file path") -def _in_travis_environment(): - return "TRAVIS_BUILD_DIR" in os.environ and "GBQ_PROJECT_ID" in os.environ - - def _get_project_id(): - if _in_travis_environment(): - return os.environ.get("GBQ_PROJECT_ID") return PROJECT_ID or os.environ.get("GBQ_PROJECT_ID") def _get_private_key_path(): - if _in_travis_environment(): - return os.path.join( - *[os.environ.get("TRAVIS_BUILD_DIR"), "ci", "travis_gbq.json"] - ) - private_key_path = PRIVATE_KEY_JSON_PATH if not private_key_path: private_key_path = os.environ.get("GBQ_GOOGLE_APPLICATION_CREDENTIALS") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ae6425cd93ac5..d100c584b698a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -810,12 +810,11 @@ def test_additional_extension_arrays(self, pa): check_round_trip(df, pa) @td.skip_if_no("pyarrow", min_version="1.0.0") - def test_pyarrow_backed_string_array(self, pa): + def test_pyarrow_backed_string_array(self, pa, string_storage): # test ArrowStringArray supported through the __arrow_array__ protocol - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="arrow_string")}) - check_round_trip(df, pa, expected=df) + df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) + with pd.option_context("string_storage", string_storage): + check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) @td.skip_if_no("pyarrow") def test_additional_extension_types(self, pa): diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index c588c3c3ca0bd..1e2973075f98e 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -1,9 +1,10 @@ +from __future__ import annotations + from io import ( BytesIO, StringIO, ) import os -from typing import Union import numpy as np import pytest @@ -963,7 +964,7 @@ def test_stylesheet_file_like(datapath, mode): def test_stylesheet_io(datapath, mode): xsl_path = datapath("io", "data", "xml", "row_field_output.xsl") - xsl_obj: Union[BytesIO, StringIO] + xsl_obj: BytesIO | StringIO with open(xsl_path, mode) as f: if mode == "rb": diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 95751b6090a06..823d155360908 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1,9 +1,10 @@ +from __future__ import annotations + from io import ( BytesIO, StringIO, ) import os -from typing import Union from urllib.error import HTTPError import numpy as np @@ -792,7 +793,7 @@ def test_stylesheet_io(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - xsl_obj: Union[BytesIO, StringIO] + xsl_obj: BytesIO | StringIO with open(xsl, mode) as f: if mode == "rb": 
@@ -942,7 +943,7 @@ def test_stylesheet_file_close(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - xsl_obj: Union[BytesIO, StringIO] + xsl_obj: BytesIO | StringIO with open(xsl, mode) as f: if mode == "rb": diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 329ace02f4207..ccd0bc3d16896 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1753,7 +1753,7 @@ def _check(axes): @td.skip_if_no_scipy def test_memory_leak(self): - """ Check that every plot type gets properly collected. """ + """Check that every plot type gets properly collected.""" import gc import weakref diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 7f0d1802580b9..adda95f4c5aa0 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -272,7 +272,7 @@ def test_parallel_coordinates(self, iris): # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): - """ For #15908 """ + """For #15908""" from pandas.plotting import parallel_coordinates df = DataFrame( diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index bf3e6d822ab19..450bd8b05ea43 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -36,7 +36,7 @@ @pytest.fixture def create_index(_index_factory): def _create_index(*args, **kwargs): - """ return the _index_factory created using the args, kwargs """ + """return the _index_factory created using the args, kwargs""" return _index_factory(*args, **kwargs) return _create_index diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 1f99c2888aad5..359c3cea62f9c 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -42,7 +42,7 @@ def _index_factory(): @pytest.fixture def create_index(_index_factory): def _create_index(*args, **kwargs): - """ return the _index_factory created using the args, kwargs """ + """return the _index_factory created using the args, kwargs""" return _index_factory(*args, **kwargs) return _create_index diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 671f0ad2d26c7..6746158179964 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -40,7 +40,7 @@ def setup_method(self, datapath): ) def test_examples1(self): - """ doc-string examples """ + """doc-string examples""" left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) @@ -52,7 +52,7 @@ def test_examples1(self): tm.assert_frame_equal(result, expected) def test_examples2(self): - """ doc-string examples """ + """doc-string examples""" trades = pd.DataFrame( { "time": to_datetime( @@ -136,7 +136,7 @@ def test_examples2(self): tm.assert_frame_equal(result, expected) def test_examples3(self): - """ doc-string examples """ + """doc-string examples""" # GH14887 left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) @@ -150,7 +150,7 @@ def test_examples3(self): tm.assert_frame_equal(result, expected) def test_examples4(self): - """ doc-string examples """ + """doc-string examples""" # GH14887 left = 
pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index 7aea45755f940..fcec06524efab 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -6,17 +6,19 @@ from pandas import ( Categorical, DataFrame, - DatetimeIndex, Index, Series, - TimedeltaIndex, Timestamp, date_range, period_range, timedelta_range, ) import pandas._testing as tm -from pandas.core.arrays import PeriodArray +from pandas.core.arrays import ( + DatetimeArray, + PeriodArray, + TimedeltaArray, +) from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.indexes.accessors import Properties @@ -178,9 +180,9 @@ def test_dt_accessor_api_for_categorical(self): get_ops = lambda x: x._datetimelike_ops test_data = [ - ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr), + ("Datetime", get_ops(DatetimeArray), s_dr, c_dr), ("Period", get_ops(PeriodArray), s_pr, c_pr), - ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr), + ("Timedelta", get_ops(TimedeltaArray), s_tdr, c_tdr), ] assert isinstance(c_dr.dt, Properties) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 62a9099fab1ad..076de881eaf96 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -32,7 +32,10 @@ timedelta_range, ) import pandas._testing as tm -from pandas.core.arrays import PeriodArray +from pandas.core.arrays import ( + PeriodArray, + TimedeltaArray, +) import pandas.core.common as com @@ -59,7 +62,7 @@ def test_dt_namespace_accessor(self): "month_name", "isocalendar", ] - ok_for_td = TimedeltaIndex._datetimelike_ops + ok_for_td = TimedeltaArray._datetimelike_ops ok_for_td_methods = [ "components", "to_pytimedelta", diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 3f850dfbc6a39..13054062defb4 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -158,7 +158,7 @@ def test_setitem_series_object_dtype(self, indexer, ser_index): expected = Series([Series([42], index=[ser_index]), 0], dtype="object") tm.assert_series_equal(ser, expected) - @pytest.mark.parametrize("index, exp_value", [(0, 42.0), (1, np.nan)]) + @pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)]) def test_setitem_series(self, index, exp_value): # GH#38303 ser = Series([0, 0]) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ffaecf1576364..99a7ba910eb74 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -249,10 +249,10 @@ def test_td64_series_astype_object(self): @pytest.mark.parametrize( "data, dtype", [ - (["x", "y", "z"], "string"), + (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), (["x", "y", "z"], "category"), @@ -263,9 +263,6 @@ def test_td64_series_astype_object(self): @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): # https://github.com/pandas-dev/pandas/issues/35471 - - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - ser = Series(data, dtype=dtype) if errors == 
"ignore": expected = ser diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 7dbc194669a62..e4803a9cd3038 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -61,7 +61,7 @@ def test_series_clipping_with_na_values( tm.assert_series_equal(s_clipped_lower, expected_lower) def test_clip_with_na_args(self): - """Should process np.nan argument as None """ + """Should process np.nan argument as None""" # GH#17276 s = Series([1, 2, 3]) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 9a64877cb92ff..d9d6641d54237 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -11,7 +11,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 class TestUpdate: @@ -87,12 +86,12 @@ def test_update_from_non_series(self, series, other, expected): @pytest.mark.parametrize( "data, other, expected, dtype", [ - (["a", None], [None, "b"], ["a", "b"], "string"), + (["a", None], [None, "b"], ["a", "b"], "string[python]"), pytest.param( ["a", None], [None, "b"], ["a", "b"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), ([1, None], [None, 2], [1, 2], "Int64"), diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index ec8b5bfa11ad5..6cbf2dd606692 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -6,6 +6,7 @@ MultiIndex, Series, _testing as tm, + get_option, ) from pandas.core import strings as strings @@ -128,7 +129,9 @@ def test_api_per_method( def test_api_for_categorical(any_string_method, any_string_dtype, request): # https://github.com/pandas-dev/pandas/issues/10661 - if any_string_dtype == "arrow_string": + if any_string_dtype == "string[pyarrow]" or ( + any_string_dtype == "string" and get_option("string_storage") == "pyarrow" + ): # unsupported operand type(s) for +: 'ArrowStringArray' and 'str' mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented") request.node.add_marker(mark) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a0e3399bee49f..e100fef3490ba 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -402,7 +402,7 @@ def test_subsets_multiindex_dtype(self): class TestSorted: - """ everything you wanted to test about sorting """ + """everything you wanted to test about sorting""" def test_sort_non_lexsorted(self): # degenerate case where we sort but don't diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index db63785988977..0227a07877db0 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -1,11 +1,9 @@ """ Assertion helpers and base class for offsets tests """ +from __future__ import annotations + from datetime import datetime -from typing import ( - Optional, - Type, -) from dateutil.tz.tz import tzlocal import pytest @@ -61,7 +59,7 @@ class WeekDay: class Base: - _offset: Optional[Type[DateOffset]] = None + _offset: type[DateOffset] | None = None d = Timestamp(datetime(2008, 1, 2)) timezones = [ diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 8872b76cd9bce..08dbc1345b9d4 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ 
b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,6 +1,8 @@ """ Tests of pandas.tseries.offsets """ +from __future__ import annotations + from datetime import ( datetime, timedelta, diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py new file mode 100644 index 0000000000000..461c62c07326d --- /dev/null +++ b/pandas/tests/window/test_online.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.filterwarnings("ignore:\\nThe keyword argument") +class TestEWM: + def test_invalid_update(self): + df = DataFrame({"a": range(5), "b": range(5)}) + online_ewm = df.head(2).ewm(0.5).online() + with pytest.raises( + ValueError, + match="Must call mean with update=None first before passing update", + ): + online_ewm.mean(update=df.head(1)) + + @pytest.mark.slow + @pytest.mark.parametrize( + "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")] + ) + def test_online_vs_non_online_mean( + self, obj, nogil, parallel, nopython, adjust, ignore_na + ): + expected = obj.ewm(0.5, adjust=adjust, ignore_na=ignore_na).mean() + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + + online_ewm = ( + obj.head(2) + .ewm(0.5, adjust=adjust, ignore_na=ignore_na) + .online(engine_kwargs=engine_kwargs) + ) + # Test resetting once + for _ in range(2): + result = online_ewm.mean() + tm.assert_equal(result, expected.head(2)) + + result = online_ewm.mean(update=obj.tail(3)) + tm.assert_equal(result, expected.tail(3)) + + online_ewm.reset() + + @pytest.mark.xfail(raises=NotImplementedError) + @pytest.mark.parametrize( + "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")] + ) + def test_update_times_mean( + self, obj, nogil, parallel, nopython, adjust, ignore_na, halflife_with_times + ): + times = Series( + np.array( + ["2020-01-01", "2020-01-05", "2020-01-07", "2020-01-17", "2020-01-21"], + dtype="datetime64", + ) + ) + expected = obj.ewm( + 0.5, + adjust=adjust, + ignore_na=ignore_na, + times=times, + halflife=halflife_with_times, + ).mean() + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + online_ewm = ( + obj.head(2) + .ewm( + 0.5, + adjust=adjust, + ignore_na=ignore_na, + times=times.head(2), + halflife=halflife_with_times, + ) + .online(engine_kwargs=engine_kwargs) + ) + # Test resetting once + for _ in range(2): + result = online_ewm.mean() + tm.assert_equal(result, expected.head(2)) + + result = online_ewm.mean(update=obj.tail(3), update_times=times.tail(3)) + tm.assert_equal(result, expected.tail(3)) + + online_ewm.reset() diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index 5ca3abb916ce0..3d59cef4d4f77 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -6,6 +6,7 @@ # This file is dual licensed under the terms of the Apache License, Version # 2.0, and the BSD License. See the LICENSE file in the root of this repository # for complete details. 
+from __future__ import annotations
 
 import collections
 import itertools
@@ -13,8 +14,6 @@
 from typing import (
     Callable,
     Iterator,
-    List,
-    Optional,
     SupportsInt,
     Tuple,
     Union,
@@ -49,7 +48,7 @@ def __gt__(self, other: object) -> bool:
     def __ge__(self, other: object) -> bool:
         return True
 
-    def __neg__(self: object) -> "NegativeInfinityType":
+    def __neg__(self: object) -> NegativeInfinityType:
         return NegativeInfinity
 
 
@@ -115,7 +114,7 @@ def __neg__(self: object) -> InfinityType:
 )
 
 
-def parse(version: str) -> Union["LegacyVersion", "Version"]:
+def parse(version: str) -> LegacyVersion | Version:
     """
     Parse the given version string and return either a :class:`Version` object
     or a :class:`LegacyVersion` object depending on if the given version is
@@ -134,7 +133,7 @@ class InvalidVersion(ValueError):
 
 
 class _BaseVersion:
-    _key: Union[CmpKey, LegacyCmpKey]
+    _key: CmpKey | LegacyCmpKey
 
     def __hash__(self) -> int:
         return hash(self._key)
@@ -142,13 +141,13 @@ def __hash__(self) -> int:
     # Please keep the duplicated `isinstance` check
     # in the six comparisons hereunder
     # unless you find a way to avoid adding overhead function calls.
-    def __lt__(self, other: "_BaseVersion") -> bool:
+    def __lt__(self, other: _BaseVersion) -> bool:
         if not isinstance(other, _BaseVersion):
             return NotImplemented
 
         return self._key < other._key
 
-    def __le__(self, other: "_BaseVersion") -> bool:
+    def __le__(self, other: _BaseVersion) -> bool:
         if not isinstance(other, _BaseVersion):
             return NotImplemented
 
@@ -160,13 +159,13 @@ def __eq__(self, other: object) -> bool:
 
         return self._key == other._key
 
-    def __ge__(self, other: "_BaseVersion") -> bool:
+    def __ge__(self, other: _BaseVersion) -> bool:
         if not isinstance(other, _BaseVersion):
             return NotImplemented
 
         return self._key >= other._key
 
-    def __gt__(self, other: "_BaseVersion") -> bool:
+    def __gt__(self, other: _BaseVersion) -> bool:
         if not isinstance(other, _BaseVersion):
             return NotImplemented
 
@@ -279,7 +278,7 @@ def _legacy_cmpkey(version: str) -> LegacyCmpKey:
 
     # This scheme is taken from pkg_resources.parse_version setuptools prior to
     # it's adoption of the packaging library.
-    parts: List[str] = []
+    parts: list[str] = []
     for part in _parse_version_parts(version.lower()):
         if part.startswith("*"):
             # remove "-" before a prerelease tag
@@ -400,25 +399,25 @@ def epoch(self) -> int:
         return _epoch
 
     @property
-    def release(self) -> Tuple[int, ...]:
-        _release: Tuple[int, ...] = self._version.release
+    def release(self) -> tuple[int, ...]:
+        _release: tuple[int, ...] = self._version.release
         return _release
 
     @property
-    def pre(self) -> Optional[Tuple[str, int]]:
-        _pre: Optional[Tuple[str, int]] = self._version.pre
+    def pre(self) -> tuple[str, int] | None:
+        _pre: tuple[str, int] | None = self._version.pre
         return _pre
 
     @property
-    def post(self) -> Optional[int]:
+    def post(self) -> int | None:
         return self._version.post[1] if self._version.post else None
 
     @property
-    def dev(self) -> Optional[int]:
+    def dev(self) -> int | None:
         return self._version.dev[1] if self._version.dev else None
 
     @property
-    def local(self) -> Optional[str]:
+    def local(self) -> str | None:
         if self._version.local:
             return ".".join(str(x) for x in self._version.local)
         else:
@@ -467,8 +466,8 @@ def micro(self) -> int:
 
 
 def _parse_letter_version(
-    letter: str, number: Union[str, bytes, SupportsInt]
-) -> Optional[Tuple[str, int]]:
+    letter: str, number: str | bytes | SupportsInt
+) -> tuple[str, int] | None:
 
     if letter:
         # We consider there to be an implicit 0 in a pre-release if there is
@@ -505,7 +504,7 @@ def _parse_letter_version(
 _local_version_separators = re.compile(r"[\._-]")
 
 
-def _parse_local_version(local: str) -> Optional[LocalType]:
+def _parse_local_version(local: str) -> LocalType | None:
     """
     Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
     """
@@ -519,11 +518,11 @@ def _parse_local_version(local: str) -> Optional[LocalType]:
 
 def _cmpkey(
     epoch: int,
-    release: Tuple[int, ...],
-    pre: Optional[Tuple[str, int]],
-    post: Optional[Tuple[str, int]],
-    dev: Optional[Tuple[str, int]],
-    local: Optional[Tuple[SubLocalType]],
+    release: tuple[int, ...],
+    pre: tuple[str, int] | None,
+    post: tuple[str, int] | None,
+    dev: tuple[str, int] | None,
+    local: tuple[SubLocalType] | None,
 ) -> CmpKey:
 
     # When we compare a release version, we want to compare it with all of the
diff --git a/pyproject.toml b/pyproject.toml
index 01d28777eb47e..3947856d94d01 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ requires = [
 # build-backend = "setuptools.build_meta"
 
 [tool.black]
-target-version = ['py37', 'py38']
+target-version = ['py37', 'py38', 'py39']
 exclude = '''
 (
     asv_bench/env
diff --git a/requirements-dev.txt b/requirements-dev.txt
index f454bfd15236c..332059341df48 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,7 +6,7 @@ python-dateutil>=2.7.3
 pytz
 asv
 cython>=0.29.21
-black==20.8b1
+black==21.5b2
 cpplint
 flake8==3.9.2
 flake8-bugbear==21.3.2
@@ -51,7 +51,7 @@ blosc
 bottleneck>=1.2.1
 ipykernel
 ipython>=7.11.1
-jinja2<3.0.0
+jinja2
 matplotlib>=2.2.2
 numexpr>=2.7.0
 scipy>=1.2
diff --git a/scripts/no_bool_in_generic.py b/scripts/no_bool_in_generic.py
index f80eff56b2729..f63ae4ae1659c 100644
--- a/scripts/no_bool_in_generic.py
+++ b/scripts/no_bool_in_generic.py
@@ -10,23 +10,18 @@
 The function `visit` is adapted from a function by the same name in pyupgrade:
 https://github.com/asottile/pyupgrade/blob/5495a248f2165941c5d3b82ac3226ba7ad1fa59d/pyupgrade/_data.py#L70-L113
 """
+from __future__ import annotations
 
 import argparse
 import ast
 import collections
-from typing import (
-    Dict,
-    List,
-    Optional,
-    Sequence,
-    Tuple,
-)
+from typing import Sequence
 
 
-def visit(tree: ast.Module) -> Dict[int, List[int]]:
+def visit(tree: ast.Module) -> dict[int, list[int]]:
     "Step through tree, recording when nodes are in annotations."
     in_annotation = False
-    nodes: List[Tuple[bool, ast.AST]] = [(in_annotation, tree)]
+    nodes: list[tuple[bool, ast.AST]] = [(in_annotation, tree)]
     to_replace = collections.defaultdict(list)
 
     while nodes:
@@ -62,7 +57,7 @@ def replace_bool_with_bool_t(to_replace, content: str) -> str:
     return "\n".join(new_lines)
 
 
-def check_for_bool_in_generic(content: str) -> Tuple[bool, str]:
+def check_for_bool_in_generic(content: str) -> tuple[bool, str]:
     tree = ast.parse(content)
     to_replace = visit(tree)
 
@@ -74,7 +69,7 @@ def check_for_bool_in_generic(content: str) -> Tuple[bool, str]:
     return mutated, replace_bool_with_bool_t(to_replace, content)
 
 
-def main(argv: Optional[Sequence[str]] = None) -> None:
+def main(argv: Sequence[str] | None = None) -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("paths", nargs="*")
     args = parser.parse_args(argv)
diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
index cbf3e84044d53..46cfae8e31208 100644
--- a/scripts/tests/test_validate_docstrings.py
+++ b/scripts/tests/test_validate_docstrings.py
@@ -82,6 +82,12 @@ def missing_whitespace_after_comma(self):
         """
         pass
 
+    def write_array_like_with_hyphen_not_underscore(self):
+        """
+        In docstrings, use array-like over array_like
+        """
+        pass
+
 
 class TestValidator:
     def _import_path(self, klass=None, func=None):
@@ -172,6 +178,11 @@ def test_bad_class(self, capsys):
             "missing_whitespace_after_comma",
             ("flake8 error: E231 missing whitespace after ',' (3 times)",),
         ),
+        (
+            "BadDocstrings",
+            "write_array_like_with_hyphen_not_underscore",
+            ("Use 'array-like' rather than 'array_like' in docstrings",),
+        ),
     ],
 )
 def test_bad_docstrings(self, capsys, klass, func, msgs):
diff --git a/scripts/use_pd_array_in_core.py b/scripts/use_pd_array_in_core.py
index 531084683bdb1..61ba070e52f1b 100644
--- a/scripts/use_pd_array_in_core.py
+++ b/scripts/use_pd_array_in_core.py
@@ -9,13 +9,12 @@
 
 """
 
+from __future__ import annotations
+
 import argparse
 import ast
 import sys
-from typing import (
-    Optional,
-    Sequence,
-)
+from typing import Sequence
 
 ERROR_MESSAGE = (
     "{path}:{lineno}:{col_offset}: "
@@ -62,7 +61,7 @@ def use_pd_array(content: str, path: str) -> None:
     visitor.visit(tree)
 
 
-def main(argv: Optional[Sequence[str]] = None) -> None:
+def main(argv: Sequence[str] | None = None) -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("paths", nargs="*")
     args = parser.parse_args(argv)
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index d0f32bb554cf9..9b65204403612 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -13,6 +13,8 @@
 $ ./validate_docstrings.py
 $ ./validate_docstrings.py pandas.DataFrame.head
 """
+from __future__ import annotations
+
 import argparse
 import doctest
 import glob
@@ -22,10 +24,6 @@
 import subprocess
 import sys
 import tempfile
-from typing import (
-    List,
-    Optional,
-)
 
 try:
     from io import StringIO
@@ -56,6 +54,7 @@
 ERROR_MSGS = {
     "GL04": "Private classes ({mentioned_private_classes}) should not be "
     "mentioned in public docstrings",
+    "GL05": "Use 'array-like' rather than 'array_like' in docstrings.",
     "SA05": "{reference_name} in `See Also` section does not need `pandas` "
     "prefix, use {right_reference} instead.",
     "EX02": "Examples do not pass tests:\n{doctest_log}",
@@ -198,6 +197,9 @@ def validate_pep8(self):
             error_count, error_code, message = error_message.split(maxsplit=2)
             yield error_code, message, int(error_count)
 
+    def non_hyphenated_array_like(self):
+        return "array_like" in self.raw_doc
+
 
 def pandas_validate(func_name: str):
     """
@@ -258,6 +260,9 @@ def pandas_validate(func_name: str):
             pandas_error("EX04", imported_library=wrong_import)
         )
 
+    if doc.non_hyphenated_array_like():
+        result["errors"].append(pandas_error("GL05"))
+
     return result
 
 
@@ -315,7 +320,7 @@ def validate_all(prefix, ignore_deprecated=False):
 
 def print_validate_all_results(
     prefix: str,
-    errors: Optional[List[str]],
+    errors: list[str] | None,
     output_format: str,
     ignore_deprecated: bool,
 ):
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
index aa17afc4c33ea..9aca47dbddbf2 100755
--- a/scripts/validate_rst_title_capitalization.py
+++ b/scripts/validate_rst_title_capitalization.py
@@ -11,14 +11,12 @@
 From the command-line:
     python scripts/validate_rst_title_capitalization.py
 """
+from __future__ import annotations
+
 import argparse
 import re
 import sys
-from typing import (
-    Iterable,
-    List,
-    Tuple,
-)
+from typing import Iterable
 
 CAPITALIZATION_EXCEPTIONS = {
     "pandas",
@@ -201,7 +199,7 @@ def correct_title_capitalization(title: str) -> str:
     return correct_title
 
 
-def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]:
+def find_titles(rst_file: str) -> Iterable[tuple[str, int]]:
     """
     Algorithm to identify particular text that should be considered headings in
     an RST file.
@@ -237,7 +235,7 @@ def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]:
 
         previous_line = line
 
 
-def main(source_paths: List[str]) -> int:
+def main(source_paths: list[str]) -> int:
     """
     The main method to print all headings with incorrect capitalization.
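The scripts above all carry the same annotation modernization: add `from __future__ import annotations` (PEP 563) so annotations are stored as strings and never evaluated at runtime, which lets built-in generics (`list[str]`, `tuple[bool, str]`) and PEP 604 unions (`Sequence[str] | None`) replace `typing.List`, `typing.Tuple`, and `typing.Optional` while still running on Python 3.7 and 3.8. A minimal, self-contained sketch of the pattern; the function below is hypothetical and only illustrates the style, it is not part of this diff:

from __future__ import annotations  # PEP 563: annotations become lazily evaluated strings

from typing import Sequence  # abstract collection types still come from typing here


# Under the future import, the annotations below are never evaluated at runtime,
# so the PEP 585/604 spellings are safe even on Python 3.7/3.8.
def main(argv: Sequence[str] | None = None) -> list[str]:
    # Hypothetical helper, used only to show the annotation style.
    return list(argv) if argv is not None else []


print(main(["a", "b"]))  # ['a', 'b']
print(main())            # []
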
cell.
@@ -164,11 +160,11 @@ def write_th(
         self._write_cell(s, kind="th", indent=indent, tags=tags)
 
-    def write_td(self, s: Any, indent: int = 0, tags: Optional[str] = None) -> None:
+    def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
         self._write_cell(s, kind="td", indent=indent, tags=tags)
 
     def _write_cell(
-        self, s: Any, kind: str = "td", indent: int = 0, tags: Optional[str] = None
+        self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
     ) -> None:
         if tags is not None:
             start_tag = f"<{kind} {tags}>"
@@ -198,8 +194,8 @@ def write_tr(
         indent: int = 0,
         indent_delta: int = 0,
         header: bool = False,
-        align: Optional[str] = None,
-        tags: Optional[Dict[int, str]] = None,
+        align: str | None = None,
+        tags: dict[int, str] | None = None,
         nindex_levels: int = 0,
     ) -> None:
         if tags is None:
@@ -389,7 +385,7 @@ def _write_header(self, indent: int) -> None:
 
         self.write("</thead>", indent)
 
-    def _get_formatted_values(self) -> Dict[int, List[str]]:
+    def _get_formatted_values(self) -> dict[int, list[str]]:
         with option_context("display.max_colwidth", None):
             fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)}
         return fmt_values
@@ -407,7 +403,7 @@ def _write_body(self, indent: int) -> None:
         self.write("
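For context on the new pandas/tests/window/test_online.py added above: those tests exercise the experimental online exponentially weighted moving window. A minimal usage sketch distilled from the tests themselves; it assumes pandas >= 1.3 with numba installed, since the tests are skipped without numba:

import pandas as pd

df = pd.DataFrame({"a": range(5), "b": range(5)})

# Seed the online EWM state from the first two rows.
online_ewm = df.head(2).ewm(0.5).online()

# The first call must not pass `update=`; it computes the mean over the seed rows.
result = online_ewm.mean()

# Later calls fold in new observations incrementally; per test_online_vs_non_online_mean
# this matches the tail of df.ewm(0.5).mean() computed over the full frame.
result = online_ewm.mean(update=df.tail(3))

# reset() clears the accumulated state so the cycle can start again.
online_ewm.reset()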