diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a337ccbc98650..025b6f1813df7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,68 +125,32 @@ jobs: - name: Check ipython directive errors run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" - - name: Merge website and docs - run: | - mkdir -p pandas_web/docs - cp -r web/build/* pandas_web/ - cp -r doc/build/html/* pandas_web/docs/ - if: github.event_name == 'push' - - name: Install Rclone run: sudo apt install rclone -y if: github.event_name == 'push' - name: Set up Rclone run: | - RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf - mkdir -p `dirname $RCLONE_CONFIG_PATH` - echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH - echo "type = swift" >> $RCLONE_CONFIG_PATH - echo "env_auth = false" >> $RCLONE_CONFIG_PATH - echo "auth_version = 3" >> $RCLONE_CONFIG_PATH - echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH - echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH - echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH - echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH - echo "domain = default" >> $RCLONE_CONFIG_PATH - echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH - echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH - echo "region = BHS" >> $RCLONE_CONFIG_PATH + CONF=$HOME/.config/rclone/rclone.conf + mkdir -p `dirname $CONF` + echo "[ovh_host]" > $CONF + echo "type = swift" >> $CONF + echo "env_auth = false" >> $CONF + echo "auth_version = 3" >> $CONF + echo "auth = https://auth.cloud.ovh.net/v3/" >> $CONF + echo "endpoint_type = public" >> $CONF + echo "tenant_domain = default" >> $CONF + echo "tenant = 2977553886518025" >> $CONF + echo "domain = default" >> $CONF + echo "user = w4KGs3pmDxpd" >> $CONF + echo "key = ${{ secrets.ovh_object_store_key }}" >> $CONF + echo "region = BHS" >> $CONF if: github.event_name == 'push' - name: Sync web with OVH - run: rclone sync pandas_web ovh_cloud_pandas_web:dev - if: github.event_name == 'push' - - - name: Create git repo to upload the built docs to GitHub pages - run: | - cd pandas_web - git init - touch .nojekyll - echo "dev.pandas.io" > CNAME - printf "User-agent: *\nDisallow: /" > robots.txt - git add --all . - git config user.email "pandas-dev@python.org" - git config user.name "pandas-bot" - git commit -m "pandas web and documentation in master" + run: rclone sync --exclude pandas-docs/** web/build ovh_host:prod if: github.event_name == 'push' - # For this task to work, next steps are required: - # 1. Generate a pair of private/public keys (i.e. `ssh-keygen -t rsa -b 4096 -C "your_email@example.com"`) - # 2. Go to https://github.com/pandas-dev/pandas/settings/secrets - # 3. Click on "Add a new secret" - # 4. Name: "github_pagas_ssh_key", Value: - # 5. 
The public key needs to be upladed to https://github.com/pandas-dev/pandas-dev.github.io/settings/keys - - name: Install GitHub pages ssh deployment key - uses: shimataro/ssh-key-action@v2 - with: - key: ${{ secrets.github_pages_ssh_key }} - known_hosts: 'github.com,192.30.252.128 ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==' - if: github.event_name == 'push' - - - name: Publish web and docs to GitHub pages - run: | - cd pandas_web - git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git - git push -f origin master || true + - name: Sync dev docs with OVH + run: rclone sync doc/build/html ovh_host:prod/pandas-docs/dev if: github.event_name == 'push' diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE index 2f444cb44d505..ce1b07b783e74 100644 --- a/LICENSES/HAVEN_LICENSE +++ b/LICENSES/HAVEN_LICENSE @@ -1,2 +1,21 @@ -YEAR: 2013-2016 -COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller +# MIT License + +Copyright (c) 2019 Hadley Wickham; RStudio; and Evan Miller + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
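The benchmark diff just below adds ``MixedFrameWithSeriesAxis0``, which times DataFrame flex ops against a Series broadcast along ``axis=0``. As a minimal sketch of the operation being measured (public pandas API, shown here for orientation only; it is not part of the patch):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Same shape of setup the benchmark uses: a mixed-dtype frame
    # plus one of its own columns as the Series operand.
    arr = np.arange(10 ** 6).reshape(100, -1)
    df = pd.DataFrame(arr)
    df["C"] = 1.0  # float column makes the frame mixed-dtype
    ser = df[0]

    # Flex ops align the Series with the row labels when axis=0:
    df.add(ser, axis=0)  # arithmetic op
    df.ge(ser, axis=0)   # comparison op

The ``getattr(self.df, opname)(self.ser, axis=0)`` call in the benchmark dispatches to exactly these methods, one per ``opname`` parameter.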
diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index d1e94f62967f4..5a8b109c21858 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -50,6 +50,36 @@ def time_frame_op_with_scalar(self, dtype, scalar, op): op(self.df, scalar) +class MixedFrameWithSeriesAxis0: + params = [ + [ + "eq", + "ne", + "lt", + "le", + "ge", + "gt", + "add", + "sub", + "div", + "floordiv", + "mul", + "pow", + ] + ] + param_names = ["opname"] + + def setup(self, opname): + arr = np.arange(10 ** 6).reshape(100, -1) + df = DataFrame(arr) + df["C"] = 1.0 + self.df = df + self.ser = df[0] + + def time_frame_op_with_series_axis0(self, opname): + getattr(self.df, opname)(self.ser, axis=0) + + class Ops: params = [[True, False], ["default", 1]] diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 2b24bab85bc57..dc6f45f810f3d 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,5 +1,6 @@ import numpy as np +import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range from .pandas_vb_common import tm @@ -118,4 +119,48 @@ def time_frame_from_range(self): self.df = DataFrame(self.data) +class FromArrays: + + goal_time = 0.2 + + def setup(self): + N_rows = 1000 + N_cols = 1000 + self.float_arrays = [np.random.randn(N_rows) for _ in range(N_cols)] + self.sparse_arrays = [ + pd.arrays.SparseArray(np.random.randint(0, 2, N_rows), dtype="float64") + for _ in range(N_cols) + ] + self.int_arrays = [ + pd.array(np.random.randint(1000, size=N_rows), dtype="Int64") + for _ in range(N_cols) + ] + self.index = pd.Index(range(N_rows)) + self.columns = pd.Index(range(N_cols)) + + def time_frame_from_arrays_float(self): + self.df = DataFrame._from_arrays( + self.float_arrays, + index=self.index, + columns=self.columns, + verify_integrity=False, + ) + + def time_frame_from_arrays_int(self): + self.df = DataFrame._from_arrays( + self.int_arrays, + index=self.index, + columns=self.columns, + verify_integrity=False, + ) + + def time_frame_from_arrays_sparse(self): + self.df = DataFrame._from_arrays( + self.sparse_arrays, + index=self.index, + columns=self.columns, + verify_integrity=False, + ) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f7e1e395a76bc..5133bbd285b50 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -11,7 +11,7 @@ class Methods: ["int", "float"], ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], ) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): N = 10 ** 5 @@ -72,7 +72,7 @@ class ExpandingMethods: ["int", "float"], ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], ) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, dtype, method): N = 10 ** 5 @@ -86,7 +86,7 @@ def time_expanding(self, constructor, dtype, method): class EWMMethods: params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"]) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): N = 10 ** 5 @@ -104,7 +104,7 @@ class 
VariableWindowMethods(Methods): ["int", "float"], ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], ) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): N = 10 ** 5 diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ac78ca53679fd..7a09b03648fa7 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -45,7 +45,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor: def setup(self): N = 1000 - self.arr = np.arange(N) self.sparse = scipy.sparse.rand(N, N, 0.005) def time_from_scipy(self): diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 42a039af46e94..d042bda77d4e8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,4 +1,10 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml +trigger: +- master + +pr: +- master + jobs: # Mac and Linux use the same template - template: ci/azure/posix.yml diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index c9a2e4eefd19d..880fdc46f43f5 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -24,7 +24,7 @@ jobs: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" PATTERN: "slow" - # pandas does not use the language (zh_CN), but should support diferent encodings (utf8) + # pandas does not use the language (zh_CN), but should support different encodings (utf8) # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" @@ -38,11 +38,11 @@ jobs: LC_ALL: "it_IT.utf8" EXTRA_APT: "language-pack-it xsel" - py36_32bit: - ENV_FILE: ci/deps/azure-36-32bit.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network and not clipboard" - BITS32: "yes" + #py36_32bit: + # ENV_FILE: ci/deps/azure-36-32bit.yaml + # CONDA_PY: "36" + # PATTERN: "not slow and not network and not clipboard" + # BITS32: "yes" py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e2dc543360a62..e6a761b91f353 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -267,11 +267,6 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-nonzero -reindex -searchsorted -to_dict" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests generic.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard" - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests groupby.py' ; echo $MSG pytest -q --doctest-modules pandas/core/groupby/groupby.py -k"-cumcount -describe -pipe" RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -311,6 +306,17 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/arrays/boolean.py RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests base.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/base.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests construction.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/construction.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests generic.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/generic.py + RET=$(($RET + $?)) ; echo $MSG "DONE" fi ### DOCSTRINGS ### @@ -320,6 +326,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then 
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Validate correct capitalization among titles in documentation' ; echo $MSG + $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development/contributing.rst + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DEPENDENCIES ### diff --git a/doc/redirects.csv b/doc/redirects.csv index ef93955c14fe6..b59ccf649ee21 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -7,13 +7,10 @@ release,whatsnew/index # getting started install,getting_started/install -10min,getting_started/10min -basics,getting_started/basics comparison_with_r,getting_started/comparison/comparison_with_r comparison_with_sql,getting_started/comparison/comparison_with_sql comparison_with_sas,getting_started/comparison/comparison_with_sas comparison_with_stata,getting_started/comparison/comparison_with_stata -dsintro,getting_started/dsintro overview,getting_started/overview tutorials,getting_started/tutorials @@ -38,6 +35,9 @@ text,user_guide/text timedeltas,user_guide/timedeltas timeseries,user_guide/timeseries visualization,user_guide/visualization +10min,user_guide/10min +basics,user_guide/basics +dsintro,user_guide/dsintro # development contributing,development/contributing @@ -49,7 +49,25 @@ internals,development/internals # api moved function reference/api/pandas.io.json.json_normalize,pandas.json_normalize -# api rename +# rename due to refactors +reference/api/pandas.core.window.Rolling,pandas.core.window.rolling.Rolling +reference/api/pandas.core.window.Rolling.aggregate,pandas.core.window.rolling.Rolling.aggregate +reference/api/pandas.core.window.Rolling.apply,pandas.core.window.rolling.Rolling.apply +reference/api/pandas.core.window.Rolling.corr,pandas.core.window.rolling.Rolling.corr +reference/api/pandas.core.window.Rolling.count,pandas.core.window.rolling.Rolling.count +reference/api/pandas.core.window.Rolling.cov,pandas.core.window.rolling.Rolling.cov +reference/api/pandas.core.window.Rolling.kurt,pandas.core.window.rolling.Rolling.kurt +reference/api/pandas.core.window.Rolling.max,pandas.core.window.rolling.Rolling.max +reference/api/pandas.core.window.Rolling.mean,pandas.core.window.rolling.Rolling.mean +reference/api/pandas.core.window.Rolling.median,pandas.core.window.rolling.Rolling.median +reference/api/pandas.core.window.Rolling.min,pandas.core.window.rolling.Rolling.min +reference/api/pandas.core.window.Rolling.quantile,pandas.core.window.rolling.Rolling.quantile +reference/api/pandas.core.window.Rolling.skew,pandas.core.window.rolling.Rolling.skew +reference/api/pandas.core.window.Rolling.std,pandas.core.window.rolling.Rolling.std +reference/api/pandas.core.window.Rolling.sum,pandas.core.window.rolling.Rolling.sum +reference/api/pandas.core.window.Rolling.var,pandas.core.window.rolling.Rolling.var + +# api url change (generated -> reference/api rename) api,reference/index generated/pandas.api.extensions.ExtensionArray.argsort,../reference/api/pandas.api.extensions.ExtensionArray.argsort generated/pandas.api.extensions.ExtensionArray.astype,../reference/api/pandas.api.extensions.ExtensionArray.astype diff --git a/doc/source/_static/question_mark_noback.svg b/doc/source/_static/question_mark_noback.svg new file mode 100644 index 0000000000000..3abb4b806d20a --- /dev/null +++ b/doc/source/_static/question_mark_noback.svg @@ -0,0 +1,72 @@ [The 72 added lines are SVG/XML markup for a question-mark ("?") icon used by the docs theme; the markup did not survive text extraction and is not reproduced here.]
diff --git a/doc/source/conf.py b/doc/source/conf.py index a95cd4ab696f7..35833627f6c05 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -195,7 +195,7 @@ # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = "pandas_sphinx_theme" +html_theme = "pydata_sphinx_theme" # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst index 1b223cf5f026b..fa7532a68a06d 100644 --- a/doc/source/development/code_style.rst +++ b/doc/source/development/code_style.rst @@ -21,9 +21,9 @@ Patterns foo.__class__ ------------- -*pandas* uses 'type(foo)' instead 'foo.__class__' as it makes the code more -readable. +pandas uses 'type(foo)' instead of 'foo.__class__' as it makes the code more +readable. For example: **Good:** @@ -50,7 +50,7 @@ Concatenated strings f-strings ~~~~~~~~~ -*pandas* uses f-strings formatting instead of '%' and '.format()' string formatters. +pandas uses f-string formatting instead of '%' and '.format()' string formatters. The convention of using f-strings on a string that is concatenated over several lines, is to prefix only the lines containing values which need to be interpreted. @@ -114,7 +114,7 @@ For example: Representation function (aka 'repr()') -------------------------------------- -*pandas* uses 'repr()' instead of '%r' and '!r'. +pandas uses 'repr()' instead of '%r' and '!r'. The use of 'repr()' will only happen when the value is not an obvious string. diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index f904781178656..88782701b096c 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -53,7 +53,7 @@ Feel free to ask questions on the `mailing list Bug reports and enhancement requests ==================================== -Bug reports are an important part of making *pandas* more stable. Having a complete bug report +Bug reports are an important part of making pandas more stable. Having a complete bug report will allow others to reproduce the bug and provide insight into fixing. See `this stackoverflow article `_ and `this blogpost `_ @@ -75,14 +75,14 @@ Bug reports must: ... ``` -#. Include the full version string of *pandas* and its dependencies. You can use the built-in function:: +#. Include the full version string of pandas and its dependencies. You can use the built-in function:: >>> import pandas as pd >>> pd.show_versions() #. Explain why the current behavior is wrong/not desired and what you expect instead. -The issue will then show up to the *pandas* community and be open to comments/ideas from others. +The issue will then show up to the pandas community and be open to comments/ideas from others. .. _contributing.github: Working with the code ===================== Now that you have an issue you want to fix, enhancement to add, or documentation to improve, -you need to learn how to work with GitHub and the *pandas* code base. +you need to learn how to work with GitHub and the pandas code base. .. _contributing.version_control: Version control, Git, and GitHub -------------------------------- -To the new user, working with Git is one of the more daunting aspects of contributing to *pandas*. 
+To the new user, working with Git is one of the more daunting aspects of contributing to pandas. It can very quickly become overwhelming, but sticking to the guidelines below will help keep the process straightforward and mostly trouble free. As always, if you are having difficulties please feel free to ask for help. @@ -146,7 +146,7 @@ requires a C compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing.documentation` but you won't be able to build the documentation locally before pushing your changes. -Using a Docker Container +Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ Instead of manually setting up a development environment, you can use Docker to @@ -221,7 +221,7 @@ environment: `_ * Make sure your conda is up to date (``conda update conda``) * Make sure that you have :ref:`cloned the repository ` -* ``cd`` to the *pandas* source directory +* ``cd`` to the pandas source directory We'll now kick off a three-step process: @@ -330,7 +330,7 @@ The above can be simplified to:: This changes your working directory to the shiny-new-feature branch. Keep any changes in this branch specific to one bug or feature so it is clear -what the branch brings to *pandas*. You can have many shiny-new-features +what the branch brings to pandas. You can have many shiny-new-features and switch in between them using the git checkout command. When creating this branch, make sure your master branch is up to date with @@ -349,9 +349,9 @@ you created the branch, check the section on Contributing to the documentation ================================= -Contributing to the documentation benefits everyone who uses *pandas*. +Contributing to the documentation benefits everyone who uses pandas. We encourage you to help us improve the documentation, and -you don't have to be an expert on *pandas* to do so! In fact, +you don't have to be an expert on pandas to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help @@ -361,7 +361,7 @@ the next person. :local: -About the *pandas* documentation +About the pandas documentation -------------------------------- The documentation is written in **reStructuredText**, which is almost like writing @@ -372,7 +372,7 @@ complex changes to the documentation as well. Some other important things to know about the docs: -* The *pandas* documentation consists of two parts: the docstrings in the code +* The pandas documentation consists of two parts: the docstrings in the code itself and the docs in this folder ``doc/``. The docstrings provide a clear explanation of the usage of the individual @@ -452,7 +452,7 @@ This will identify methods documented in ``doc/source/reference`` that are not a class methods, and existing methods that are not documented in ``doc/source/reference``. -Updating a *pandas* docstring +Updating a pandas docstring ----------------------------- When improving a single function or method's docstring, it is not necessarily @@ -477,7 +477,7 @@ When doing a PR with a docstring update, it is good to post the output of the validation script in a comment on github. -How to build the *pandas* documentation +How to build the pandas documentation --------------------------------------- Requirements @@ -543,7 +543,7 @@ And you'll have the satisfaction of seeing your new and improved documentation! 
Building master branch documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When pull requests are merged into the *pandas* ``master`` branch, the main parts of +When pull requests are merged into the pandas ``master`` branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted `here `__, see also the :ref:`Continuous Integration ` section. @@ -563,7 +563,7 @@ Writing good code is not just about what you write. It is also about *how* you write it. During :ref:`Continuous Integration ` testing, several tools will be run to check your code for stylistic errors. Generating any warnings will cause the test to fail. -Thus, good style is a requirement for submitting code to *pandas*. +Thus, good style is a requirement for submitting code to pandas. There is a tool in pandas to help contributors verify their changes before contributing them to the project:: @@ -601,7 +601,7 @@ set in the ``pandas.compat._optional.VERSIONS`` dict. C (cpplint) ~~~~~~~~~~~ -*pandas* uses the `Google `_ +pandas uses the `Google `_ standard. Google provides an open source style checker called ``cpplint``, but we use a fork of it that can be found `here `__. Here are *some* of the more common ``cpplint`` issues: @@ -652,7 +652,7 @@ fixes manually. Python (PEP8 / black) ~~~~~~~~~~~~~~~~~~~~~ -*pandas* follows the `PEP8 `_ standard +pandas follows the `PEP8 `_ standard and uses `Black `_ and `Flake8 `_ to ensure a consistent code format throughout the project. @@ -703,7 +703,7 @@ Note that these commands can be run analogously with ``black``. Import formatting ~~~~~~~~~~~~~~~~~ -*pandas* uses `isort `__ to standardise import +pandas uses `isort `__ to standardise import formatting across the codebase. A guide to import layout as per pep8 can be found `here `__. @@ -754,7 +754,7 @@ You can then verify the changes look ok, then git :ref:`commit `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. +Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. -For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module +For example, quite a few functions in pandas accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module .. code-block:: python @@ -919,10 +919,10 @@ For example, quite a few functions in *pandas* accept a ``dtype`` argument. This This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. 
-Validating Type Hints +Validating type hints ~~~~~~~~~~~~~~~~~~~~~ -*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running +pandas uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running .. code-block:: shell @@ -933,7 +933,7 @@ Validating Type Hints Testing with continuous integration ----------------------------------- -The *pandas* test suite will run automatically on `Travis-CI `__ and +The pandas test suite will run automatically on `Travis-CI `__ and `Azure Pipelines `__ continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, @@ -959,7 +959,7 @@ This is an example of a green build. Test-driven development/code writing ------------------------------------ -*pandas* is serious about testing and strongly encourages contributors to embrace +pandas is serious about testing and strongly encourages contributors to embrace `test-driven development (TDD) `_. This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired @@ -968,10 +968,10 @@ So, before actually writing any code, you should write your tests. Often the te taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. -Adding tests is one of the most common requests after code is pushed to *pandas*. Therefore, +Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. -Like many packages, *pandas* uses `pytest +Like many packages, pandas uses `pytest `_ and the convenient extensions in `numpy.testing `_. @@ -1018,7 +1018,7 @@ E.g. "# brief comment, see GH#28907" Transitioning to ``pytest`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*pandas* existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. +pandas existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. .. code-block:: python @@ -1220,7 +1220,7 @@ Running the test suite ---------------------- The tests can then be run directly inside your Git clone (without having to -install *pandas*) by typing:: +install pandas) by typing:: pytest pandas @@ -1272,9 +1272,9 @@ Running the performance test suite ---------------------------------- Performance matters and it is worth considering whether your code has introduced -performance regressions. *pandas* is in the process of migrating to +performance regressions. pandas is in the process of migrating to `asv benchmarks `__ -to enable easy monitoring of the performance of critical *pandas* operations. +to enable easy monitoring of the performance of critical pandas operations. These benchmarks are all found in the ``pandas/asv_bench`` directory. asv supports both python2 and python3. @@ -1361,7 +1361,7 @@ directive. This should also be put in the docstring when adding a new function or method (`example `__) or a new keyword argument (`example `__). -Contributing your changes to *pandas* +Contributing your changes to pandas ===================================== .. 
_contributing.commit-code: @@ -1386,7 +1386,7 @@ Doing 'git status' again should give something like:: # modified: /relative/path/to/file-you-added.py # -Finally, commit your changes to your local repository with an explanatory message. *Pandas* +Finally, commit your changes to your local repository with an explanatory message. pandas uses a convention for commit message prefixes and layout. Here are some common prefixes along with general guidelines for when to use them: @@ -1434,7 +1434,7 @@ like:: upstream git://github.com/pandas-dev/pandas.git (fetch) upstream git://github.com/pandas-dev/pandas.git (push) -Now your code is on GitHub, but it is not yet a part of the *pandas* project. For that to +Now your code is on GitHub, but it is not yet a part of the pandas project. For that to happen, a pull request needs to be submitted on GitHub. Review your code @@ -1539,7 +1539,7 @@ The branch will still exist on GitHub, so to delete it there do:: .. _Gitter: https://gitter.im/pydata/pandas -Tips for a successful Pull Request +Tips for a successful pull request ================================== If you have made it to the `Review your code`_ phase, one of the core contributors may diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 1c99b341f6c5a..efa165e0a2d0c 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -17,7 +17,7 @@ Also, it is a common practice to generate online (html) documentation automatically from docstrings. `Sphinx `_ serves this purpose. -Next example gives an idea on how a docstring looks like: +The next example gives an idea of what a docstring looks like: .. code-block:: python @@ -26,8 +26,8 @@ Next example gives an idea on how a docstring looks like: Add up two integer numbers. This function simply wraps the `+` operator, and does not - do anything interesting, except for illustrating what is - the docstring of a very simple function. + do anything interesting, except for illustrating what + the docstring of a very simple function looks like. Parameters ---------- @@ -56,14 +56,14 @@ Next example gives an idea on how a docstring looks like: """ return num1 + num2 -Some standards exist about docstrings, so they are easier to read, and they can -be exported to other formats such as html or pdf. +Some standards regarding docstrings exist, which make them easier to read, and allow them +to be easily exported to other formats such as html or pdf. The first conventions every Python docstring should follow are defined in `PEP-257 `_. -As PEP-257 is quite open, and some other standards exist on top of it. In the -case of pandas, the numpy docstring convention is followed. The conventions is +As PEP-257 is quite broad, other more specific standards also exist. In the +case of pandas, the numpy docstring convention is followed. These conventions are explained in this document: * `numpydoc docstring guide `_ @@ -80,11 +80,11 @@ about reStructuredText can be found in: * `Quick reStructuredText reference `_ * `Full reStructuredText specification `_ -Pandas has some helpers for sharing docstrings between related classes, see +pandas has some helpers for sharing docstrings between related classes, see :ref:`docstring.sharing`. -The rest of this document will summarize all the above guides, and will -provide additional convention specific to the pandas project. 
+The rest of this document will summarize all the above guidelines, and will +provide additional conventions specific to the pandas project. .. _docstring.tutorial: @@ -101,9 +101,9 @@ left before or after the docstring. The text starts in the next line after the opening quotes. The closing quotes have their own line (meaning that they are not at the end of the last sentence). -In rare occasions reST styles like bold text or italics will be used in +On rare occasions reST styles like bold text or italics will be used in docstrings, but is it common to have inline code, which is presented between -backticks. It is considered inline code: +backticks. The following are considered inline code: * The name of a parameter * Python code, a module, function, built-in, type, literal... (e.g. ``os``, @@ -235,8 +235,8 @@ The extended summary provides details on what the function does. It should not go into the details of the parameters, or discuss implementation notes, which go in other sections. -A blank line is left between the short summary and the extended summary. And -every paragraph in the extended summary is finished by a dot. +A blank line is left between the short summary and the extended summary. +Every paragraph in the extended summary ends with a dot. The extended summary should provide details on why the function is useful and their use cases, if it is not too generic. @@ -542,19 +542,19 @@ first (not an alias like ``np``). If the function is in a module which is not the main one, like ``scipy.sparse``, list the full module (e.g. ``scipy.sparse.coo_matrix``). -This section, as the previous, also has a header, "See Also" (note the capital -S and A). Also followed by the line with hyphens, and preceded by a blank line. +This section has a header, "See Also" (note the capital +S and A), followed by the line with hyphens and preceded by a blank line. After the header, we will add a line for each related method or function, followed by a space, a colon, another space, and a short description that -illustrated what this method or function does, why is it relevant in this -context, and what are the key differences between the documented function and -the one referencing. The description must also finish with a dot. +illustrates what this method or function does, why it is relevant in this +context, and what the key differences are between the documented function and +the one being referenced. The description must also end with a dot. -Note that in "Returns" and "Yields", the description is located in the -following line than the type. But in this section it is located in the same -line, with a colon in between. If the description does not fit in the same -line, it can continue in the next ones, but it has to be indented in them. +Note that in "Returns" and "Yields", the description is located on the line +after the type. In this section, however, it is located on the same +line, with a colon in between. If the description does not fit on the same +line, it can continue onto other lines which must be further indented. For example: @@ -587,7 +587,7 @@ Section 6: Notes ~~~~~~~~~~~~~~~~ This is an optional section used for notes about the implementation of the -algorithm. Or to document technical aspects of the function behavior. 
Feel free to skip it, unless you are familiar with the implementation of the algorithm, or you discover some counter-intuitive behavior while writing the @@ -600,15 +600,15 @@ This section follows the same format as the extended summary section. Section 7: Examples ~~~~~~~~~~~~~~~~~~~ -This is one of the most important sections of a docstring, even if it is -placed in the last position. As often, people understand concepts better -with examples, than with accurate explanations. +This is one of the most important sections of a docstring, despite being +placed in the last position, as often people understand concepts better +by example than through accurate explanations. Examples in docstrings, besides illustrating the usage of the function or -method, must be valid Python code, that in a deterministic way returns the -presented output, and that can be copied and run by users. +method, must be valid Python code, that returns the given output in a +deterministic way, and that can be copied and run by users. -They are presented as a session in the Python terminal. `>>>` is used to +Examples are presented as a session in the Python terminal. `>>>` is used to present code. `...` is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). Comments describing the examples can @@ -636,7 +636,7 @@ A simple example could be: Return the first elements of the Series. This function is mainly useful to preview the values of the - Series without displaying the whole of it. + Series without displaying all of it. Parameters ---------- @@ -932,7 +932,7 @@ plot will be generated automatically when building the documentation. Sharing docstrings ------------------ -Pandas has a system for sharing docstrings, with slight variations, between +pandas has a system for sharing docstrings, with slight variations, between classes. This helps us keep docstrings consistent, while keeping things clear for the user reading. It comes at the cost of some complexity when writing. diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 270f20e8118bc..98e3ffcf74ad1 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -7,7 +7,7 @@ Extending pandas **************** While pandas provides a rich set of methods, containers, and data types, your -needs may not be fully satisfied. Pandas offers a few options for extending +needs may not be fully satisfied. pandas offers a few options for extending pandas. .. _extending.register-accessors: @@ -80,8 +80,8 @@ Extension types The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and experimental. They may change between versions without warning. -Pandas defines an interface for implementing data types and arrays that *extend* -NumPy's type system. Pandas itself uses the extension system for some types +pandas defines an interface for implementing data types and arrays that *extend* +NumPy's type system. pandas itself uses the extension system for some types that aren't built into NumPy (categorical, period, interval, datetime with timezone). @@ -122,7 +122,7 @@ This class provides all the array-like functionality. ExtensionArrays are limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the ``dtype`` attribute. 
-Pandas makes no restrictions on how an extension array is created via its +pandas makes no restrictions on how an extension array is created via its ``__new__`` or ``__init__``, and puts no restrictions on how you store your data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for ``Categorical``). @@ -224,7 +224,7 @@ for an example. As part of your implementation, we require that you defer to pandas when a pandas container (:class:`Series`, :class:`DataFrame`, :class:`Index`) is detected in ``inputs``. -If any of those is present, you should return ``NotImplemented``. Pandas will take care of +If any of those is present, you should return ``NotImplemented``. pandas will take care of unboxing the array from the container and re-calling the ufunc with the unwrapped input. .. _extending.extension.testing: diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 748caae295460..8f1c3d5d818c2 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -85,20 +85,14 @@ if you compute the levels and codes yourself, please be careful. Values ~~~~~~ -Pandas extends NumPy's type system with custom types, like ``Categorical`` or +pandas extends NumPy's type system with custom types, like ``Categorical`` or datetimes with a timezone, so we have multiple notions of "values". For 1-D containers (``Index`` classes and ``Series``) we have the following convention: -* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, - ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, - this returns the codes, not the array of objects. * ``cls._values`` refers is the "best possible" array. This could be an - ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the - process of removing the index subclasses here so that it's always an - ``ndarray`` or ``ExtensionArray``). + ``ndarray`` or ``ExtensionArray``. -So, for example, ``Series[category]._values`` is a ``Categorical``, while -``Series[category]._ndarray_values`` is the underlying codes. +So, for example, ``Series[category]._values`` is a ``Categorical``. .. _ref-subclassing-pandas: diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index e65b66fc243c5..9ae9d47b89341 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -13,7 +13,7 @@ The main contributing guide is available at :ref:`contributing`. Roles ----- -Pandas uses two levels of permissions: **triage** and **core** team members. +pandas uses two levels of permissions: **triage** and **core** team members. Triage members can label and close issues and pull requests. @@ -25,7 +25,7 @@ GitHub publishes the full `list of permissions`_. Tasks ----- -Pandas is largely a volunteer project, so these tasks shouldn't be read as +pandas is largely a volunteer project, so these tasks shouldn't be read as "expectations" of triage and maintainers. Rather, they're general descriptions of what it means to be a maintainer. diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst index 1d19408692cda..803f1b7002de0 100644 --- a/doc/source/development/meeting.rst +++ b/doc/source/development/meeting.rst @@ -25,7 +25,7 @@ This calendar shows all the developer meetings. 
You can subscribe to this calendar with the following links: * `iCal `__ -* `Google calendar `__ +* `Google calendar `__ Additionally, we'll sometimes have one-off meetings on specific topics. These will be published on the same calendar. diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 224948738341e..b7cc3db3ad260 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -11,7 +11,7 @@ Version Policy .. versionchanged:: 1.0.0 -Pandas uses a loose variant of semantic versioning (`SemVer`_) to govern +pandas uses a loose variant of semantic versioning (`SemVer`_) to govern deprecations, API compatibility, and version numbering. A pandas release number is made up of ``MAJOR.MINOR.PATCH``. @@ -23,7 +23,7 @@ and how to migrate existing code to the new behavior. Whenever possible, a deprecation path will be provided rather than an outright breaking change. -Pandas will introduce deprecations in **minor** releases. These deprecations +pandas will introduce deprecations in **minor** releases. These deprecations will preserve the existing behavior while emitting a warning that provide guidance on: @@ -39,19 +39,19 @@ deprecation removed in the next next major release (2.0.0). .. note:: - Pandas will sometimes make *behavior changing* bug fixes, as part of + pandas will sometimes make *behavior changing* bug fixes, as part of minor or patch releases. Whether or not a change is a bug fix or an API-breaking change is a judgement call. We'll do our best, and we invite you to participate in development discussion on the issue tracker or mailing list. These policies do not apply to features marked as **experimental** in the documentation. -Pandas may change the behavior of experimental features at any time. +pandas may change the behavior of experimental features at any time. Python Support ~~~~~~~~~~~~~~ -Pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in +pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in pandas **major** releases. .. _SemVer: https://semver.org diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index fafe63d80249c..e57ff82add278 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -22,8 +22,8 @@ See :ref:`roadmap.evolution` for proposing changes to this document. Extensibility ------------- -Pandas :ref:`extending.extension-types` allow for extending NumPy types with custom -data types and array storage. Pandas uses extension types internally, and provides +pandas :ref:`extending.extension-types` allow for extending NumPy types with custom +data types and array storage. pandas uses extension types internally, and provides an interface for 3rd-party libraries to define their own custom data types. Many parts of pandas still unintentionally convert data to a NumPy array. @@ -71,7 +71,7 @@ Block manager rewrite We'd like to replace pandas current internal data structures (a collection of 1 or 2-D arrays) with a simpler collection of 1-D arrays. -Pandas internal data model is quite complex. A DataFrame is made up of +pandas internal data model is quite complex. A DataFrame is made up of one or more 2-dimensional "blocks", with one or more blocks per dtype. This collection of 2-D arrays is managed by the BlockManager. 
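To make the ``_values`` convention from the internals.rst hunk above concrete: after this change there is a single "best possible" array per 1-D container, either an ``ndarray`` or an ``ExtensionArray``. A small illustrative sketch (``_values`` is a private attribute, so this is orientation only; behavior as of the pandas 1.0-era code this patch targets):

.. code-block:: python

    import pandas as pd

    ser = pd.Series(pd.Categorical(["a", "b", "a"]))

    type(ser._values)  # Categorical -- the "best possible" array (private API)
    ser.to_numpy()     # object ndarray materialized from the categories

The removed ``_ndarray_values`` (which returned the integer codes for a ``Categorical``) also disappears from the extension API reference in the extensions.rst hunk later in this patch.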
@@ -132,7 +132,7 @@ Some specific goals include Performance monitoring ---------------------- -Pandas uses `airspeed velocity `__ to +pandas uses `airspeed velocity `__ to monitor for performance regressions. ASV itself is a fabulous tool, but requires some additional work to be integrated into an open source project's workflow. @@ -155,7 +155,7 @@ We'd like to fund improvements and maintenance of these tools to Roadmap Evolution ----------------- -Pandas continues to evolve. The direction is primarily determined by community +pandas continues to evolve. The direction is primarily determined by community interest. Everyone is welcome to review existing items on the roadmap and to propose a new item. diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 6a03c06de3699..e3c8f8f5ccbcd 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -75,7 +75,7 @@ Filtering in SQL is done via a WHERE clause. LIMIT 5; DataFrames can be filtered in multiple ways; the most intuitive of which is using -`boolean indexing `_. +:ref:`boolean indexing `. .. ipython:: python diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index a2f8f79f22ae4..9ac8c58e1d8f2 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -9,8 +9,6 @@ Getting started Installation ------------ -Before you can use pandas, you’ll need to get it installed. - .. raw:: html 
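The SQL-comparison hunk just above swaps an external link for a cross-reference to the user guide's boolean indexing section. For orientation, the pattern being referenced looks like the following (the ``tips`` columns mirror the dataset used throughout that comparison page; values here are illustrative):

.. code-block:: python

    import pandas as pd

    tips = pd.DataFrame(
        {"total_bill": [16.99, 10.34, 23.68, 21.01], "tip": [1.01, 1.66, 3.50, 3.31]}
    )

    # SQL: SELECT * FROM tips WHERE total_bill > 20 LIMIT 5;
    tips[tips["total_bill"] > 20].head(5)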
@@ -23,7 +21,7 @@ Before you can use pandas, you’ll need to get it installed.

-Pandas is part of the `Anaconda `__ distribution and can be +pandas is part of the `Anaconda `__ distribution and can be installed with Anaconda or Miniconda: .. raw:: html @@ -49,7 +47,7 @@ installed with Anaconda or Miniconda:

-Pandas can be installed via pip from `PyPI `__. +pandas can be installed via pip from `PyPI `__. .. raw:: html @@ -103,7 +101,7 @@ Intro to pandas

- What kind of data does Pandas handle? + What kind of data does pandas handle?
@@ -117,8 +115,8 @@ Intro to pandas
-When working with tabular data, such as data stored in spreadsheets or databases, Pandas is the right tool for you. Pandas will help you -to explore, clean and process your data. In Pandas, a data table is called a :class:`DataFrame`. +When working with tabular data, such as data stored in spreadsheets or databases, pandas is the right tool for you. pandas will help you +to explore, clean and process your data. In pandas, a data table is called a :class:`DataFrame`. .. image:: ../_static/schemas/01_table_dataframe.svg :align: center @@ -164,7 +162,7 @@ to explore, clean and process your data. In Pandas, a data table is called a :cl
-Pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these +pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data. .. image:: ../_static/schemas/02_io_readwrite.svg @@ -212,7 +210,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the
Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the -data you need are available in Pandas. +data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg :align: center @@ -258,7 +256,7 @@ data you need are available in Pandas.
-Pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg @@ -492,7 +490,7 @@ Multiple tables can be concatenated both column wise as row wise and database-li
-Pandas has great support for time series and has an extensive set of tools for working with dates, times, and time-indexed data. +pandas has great support for time series and has an extensive set of tools for working with dates, times, and time-indexed data. .. raw:: html @@ -535,7 +533,7 @@ Pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. Pandas provides a wide range of functions to cleaning textual data and extract useful information from it. +Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. .. raw:: html @@ -568,9 +566,8 @@ Coming from... -------------- -Currently working with other software for data manipulation in a tabular format? You're probably familiar to typical -data operations and know *what* to do with your tabular data, but lacking the syntax to execute these operations. Get to know -the pandas syntax by looking for equivalents from the software you already know: +Are you familiar with other software for manipulating tabular data? Learn +the pandas-equivalent operations compared to software you already know: .. raw:: html 
R project logo
-

The R programming language provides the data.frame data structure and multiple packages, +

The R programming language provides the dataframe data structure and multiple packages, such as tidyverse use and extend data.frames for convenient data handling functionalities similar to pandas.

@@ -597,7 +594,7 @@ the pandas syntax by looking for equivalents from the software you already know:
SQL logo
-

Already familiar to SELECT, GROUP BY, JOIN,...? +

Already familiar with SELECT, GROUP BY, JOIN, etc.? Most of these SQL manipulations do have equivalents in pandas.

.. container:: custom-button @@ -615,7 +612,7 @@ the pandas syntax by looking for equivalents from the software you already know:

The data set included in the STATA statistical software suite corresponds - to the pandas data.frame. Many of the operations known from STATA have an equivalent + to the pandas dataframe. Many of the operations known from STATA have an equivalent in pandas.

.. container:: custom-button @@ -632,8 +629,8 @@ the pandas syntax by looking for equivalents from the software you already know: SAS logo

The SAS statistical software suite - also provides the data set corresponding to the pandas data.frame. - Also vectorized operations, filtering, string processing operations,... from SAS have similar + also provides the data set corresponding to the pandas dataframe. + Also SAS vectorized operations, filtering, string processing operations, and more have similar functions in pandas.

.. container:: custom-button @@ -648,11 +645,16 @@ the pandas syntax by looking for equivalents from the software you already know:
-Community tutorials -------------------- +Tutorials +--------- + +For a quick overview of pandas functionality, see :ref:`10 Minutes to pandas<10min>`. + +You can also reference the pandas `cheat sheet `_ +for a succinct guide for manipulating data with pandas. The community produces a wide variety of tutorials available online. Some of the -material is enlisted in the community contributed :ref:`tutorials`. +material is enlisted in the community contributed :ref:`communitytutorials`. .. If you update this toctree, also update the manual toctree in the @@ -664,9 +666,6 @@ material is enlisted in the community contributed :ref:`tutorials`. install overview - 10min intro_tutorials/index - basics - dsintro comparison/index tutorials diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 797bdbcf25d17..1b3bcb799d5ce 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -225,7 +225,7 @@ The method :meth:`~DataFrame.info` provides technical information about a
To user guide -For a complete overview of the input and output possibilites from and to pandas, see the user guide section about :ref:`reader and writer functions `. +For a complete overview of the input and output possibilities from and to pandas, see the user guide section about :ref:`reader and writer functions `. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 7a4347905ad8d..4167166a3f34a 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -88,7 +88,7 @@ name of the column of interest. Each column in a :class:`DataFrame` is a :class:`Series`. As a single column is -selected, the returned object is a pandas :class:`DataFrame`. We can verify this +selected, the returned object is a pandas :class:`Series`. We can verify this by checking the type of the output: .. ipython:: python @@ -101,7 +101,7 @@ And have a look at the ``shape`` of the output: titanic["Age"].shape -:attr:`DataFrame.shape` is an attribute (remember :ref:`tutorial on reading and writing <10min_tut_02_read_write>`, do not use parantheses for attributes) of a +:attr:`DataFrame.shape` is an attribute (remember :ref:`tutorial on reading and writing <10min_tut_02_read_write>`, do not use parentheses for attributes) of a pandas ``Series`` and ``DataFrame`` containing the number of rows and columns: *(nrows, ncolumns)*. A pandas Series is 1-dimensional and only the number of rows is returned. diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index f317e7a1f91b4..b6b3c97f2405b 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -165,7 +165,7 @@ index. For example: .. note:: The existence of multiple row/column indices at the same time has not been mentioned within these tutorials. *Hierarchical indexing* - or *MultiIndex* is an advanced and powerfull pandas feature to analyze + or *MultiIndex* is an advanced and powerful pandas feature to analyze higher dimensional data. Multi-indexing is out of scope for this pandas introduction. For the diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 3ff64875d807b..936d00f68e3f0 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -188,7 +188,7 @@ Which passenger of the titanic has the longest name? titanic["Name"].str.len() -To get the longest name we first have to get the lenghts of each of the +To get the longest name we first have to get the lengths of each of the names in the ``Name`` column. By using pandas string methods, the :meth:`Series.str.len` function is applied to each of the names individually (element-wise). diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 434d791474807..fce4aa4beba60 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -1,24 +1,12 @@ -.. _tutorials: +.. _communitytutorials: {{ header }} -********* -Tutorials -********* +******************* +Community Tutorials +******************* -This is a guide to many pandas tutorials, geared mainly for new users. 
-
-Internal guides
-===============
-
-pandas' own :ref:`10 Minutes to pandas<10min>`.
-
-More complex recipes are in the :ref:`Cookbook`.
-
-A handy pandas `cheat sheet `_.
-
-Community guides
-================
+This is a guide to many pandas tutorials by the community, geared mainly for new users.

 pandas Cookbook by Julia Evans
 ------------------------------
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index 7eb25790f6a7a..4aba8f709fba0 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -119,7 +119,6 @@ programming language.
    :titlesonly:
 {% endif %}
 {% if not single_doc %}
-   What's New in 1.1.0
    getting_started/index
    user_guide/index
 {% endif -%}
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index 78fdfbfd28144..4c0763e091b75 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -37,7 +37,6 @@ objects.
    api.extensions.ExtensionArray._from_factorized
    api.extensions.ExtensionArray._from_sequence
    api.extensions.ExtensionArray._from_sequence_of_strings
-   api.extensions.ExtensionArray._ndarray_values
    api.extensions.ExtensionArray._reduce
    api.extensions.ExtensionArray._values_for_argsort
    api.extensions.ExtensionArray._values_for_factorize
diff --git a/doc/source/getting_started/10min.rst b/doc/source/user_guide/10min.rst
similarity index 100%
rename from doc/source/getting_started/10min.rst
rename to doc/source/user_guide/10min.rst
diff --git a/doc/source/getting_started/basics.rst b/doc/source/user_guide/basics.rst
similarity index 100%
rename from doc/source/getting_started/basics.rst
rename to doc/source/user_guide/basics.rst
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 4afdb14e5c39e..e51b5c9097951 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -794,8 +794,7 @@ The :ref:`Resample ` docs.

 `Time grouping with some missing values
 `__

-`Valid frequency arguments to Grouper
-`__
+Valid frequency arguments to Grouper :ref:`Timeseries `

 `Grouping using a MultiIndex
 `__
diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/user_guide/dsintro.rst
similarity index 97%
rename from doc/source/getting_started/dsintro.rst
rename to doc/source/user_guide/dsintro.rst
index 200d567a62732..075787d3b9d5b 100644
--- a/doc/source/getting_started/dsintro.rst
+++ b/doc/source/user_guide/dsintro.rst
@@ -397,6 +397,28 @@ The result will be a DataFrame with the same index as the input Series, and
 with one column whose name is the original name of the Series (only if no other
 column name provided).

+.. _basics.dataframe.from_list_dataclasses:
+
+From a list of dataclasses
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.1.0
+
+Data Classes, as introduced in `PEP557 `__,
+can be passed into the DataFrame constructor.
+Passing a list of dataclasses is equivalent to passing a list of dictionaries.
+
+Please be aware that all values in the list should be dataclasses; mixing
+types in the list will result in a ``TypeError``.
+
+.. ipython:: python
+
+   from dataclasses import make_dataclass
+
+   Point = make_dataclass("Point", [("x", int), ("y", int)])
+
+   pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
+
 **Missing data**

 Much more will be said on this topic in the :ref:`Missing data
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
index 30b1c0b4eac0d..8226e72779588 100644
--- a/doc/source/user_guide/index.rst
+++ b/doc/source/user_guide/index.rst
@@ -12,6 +12,8 @@ pandas approaches the problem, with many examples throughout.

 Users brand-new to pandas should start with :ref:`10min`.

+For a high-level summary of the pandas fundamentals, see :ref:`dsintro` and :ref:`basics`.
+
 Further information on any specific method can be obtained in the
 :ref:`api`.

@@ -21,6 +23,9 @@ Further information on any specific method can be obtained in the
 .. toctree::
    :maxdepth: 2

+   10min
+   dsintro
+   basics
    io
    indexing
    advanced
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 2bd3ff626f2e1..fb815b3a975d1 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -374,7 +374,7 @@ For getting values with a boolean array:

    df1.loc['a'] > 0
    df1.loc[:, df1.loc['a'] > 0]

-NA values in a boolean array propogate as ``False``:
+NA values in a boolean array propagate as ``False``:

 .. versionchanged:: 1.0.2

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index c34247a49335d..f3aff0654530e 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5005,7 +5005,7 @@ Possible values are:
   This usually provides better performance for analytic databases
   like *Presto* and *Redshift*, but has worse performance for
   traditional SQL backend if the table contains many columns.
-  For more information check the SQLAlchemy `documention
+  For more information check the SQLAlchemy `documentation
   `__.
- callable with signature ``(pd_table, conn, keys, data_iter)``:
  This can be used to implement a more performant insertion method based on
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 43bb4966ec5bf..cddc3cb2600fd 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -246,6 +246,7 @@ We'll import ``dask.dataframe`` and notice that the API feels similar to pandas.
 We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in.

 .. ipython:: python
+   :okwarning:

    import dask.dataframe as dd

@@ -258,7 +259,7 @@ Inspecting the ``ddf`` object, we see a few things

 * There are familiar methods like ``.groupby``, ``.sum``, etc.
 * There are new attributes like ``.npartitions`` and ``.divisions``

-The partitions and divisions are how Dask parallizes computation. A **Dask**
+The partitions and divisions are how Dask parallelizes computation. A **Dask**
 DataFrame is made up of many **Pandas** DataFrames. A single method call on a
 Dask DataFrame ends up making many pandas method calls, and Dask knows how to
 coordinate everything to get the result.
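To make the partition model described above concrete, here is a minimal sketch (an illustration added alongside the diff, not part of it; ``dd.from_pandas``, ``npartitions``, and ``compute`` are standard Dask APIs):

.. code-block:: python

   import pandas as pd
   import dask.dataframe as dd

   pdf = pd.DataFrame({"x": range(10)})

   # a Dask DataFrame is a collection of pandas DataFrames: here,
   # three partitions, each holding a contiguous slice of ``pdf``
   ddf = dd.from_pandas(pdf, npartitions=3)
   ddf.npartitions  # 3

   # one Dask call fans out into one pandas ``sum`` per partition;
   # Dask then combines the per-partition results
   ddf["x"].sum().compute()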
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 1f2f8818c8458..fd8dda4fe365e 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -620,8 +620,8 @@ "aligns = ['left','zero','mid']\n", "for align in aligns:\n", " row = \"{}\".format(align)\n", - " for serie in [test1,test2,test3]:\n", - " s = serie.copy()\n", + " for series in [test1,test2,test3]:\n", + " s = series.copy()\n", " s.name=''\n", " row += \"{}\".format(s.to_frame().style.bar(align=align, \n", " color=['#d65f5f', '#5fba7d'], \n", diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index cbfeb0352c283..50333b54ca903 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.0 .. toctree:: :maxdepth: 2 + v1.0.3 v1.0.2 v1.0.1 v1.0.0 diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 85de0150a5a28..c756bc87e9b89 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -397,7 +397,7 @@ Other enhancements - :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). - :meth:`DataFrame.to_sql` now supports writing ``TIMESTAMP WITH TIME ZONE`` types for supported databases. For databases that don't support timezones, datetime data will be stored as timezone unaware local timestamps. See the :ref:`io.sql_datetime_data` for implications (:issue:`9086`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) +- :func:`to_timedelta` now supports iso-formatted timedelta strings (:issue:`21877`) - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` objects in the constructor (:issue:`2193`) - :class:`DatetimeIndex` has gained the :attr:`DatetimeIndex.timetz` attribute. This returns the local time with timezone information. (:issue:`21358`) - :meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, and :meth:`~Timestamp.floor` for :class:`DatetimeIndex` and :class:`Timestamp` diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index ef3bb8161d13f..c42aab6de4cc3 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -16,7 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :class:`DataFrame` setting values with a slice (e.g. 
``df[-4:] = 1``) indexing by label instead of position (:issue:`31469`) -- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`) +- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`) - Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`) - Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`) - Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 808e6ae709ce9..c3f144e2f0cb3 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_102: -What's new in 1.0.2 (February ??, 2020) ---------------------------------------- +What's new in 1.0.2 (March 12, 2020) +------------------------------------ These are the changes in pandas 1.0.2. See :ref:`release` for a full changelog including other versions of pandas. @@ -15,16 +15,35 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`) -- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) -- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) -- Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) -- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) +**Groupby** + +- Fixed regression in :meth:`groupby(..).agg() ` which was failing on frames with :class:`MultiIndex` columns and a custom function (:issue:`31777`) +- Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`) +- Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression in :meth:`groupby(..).nunique() ` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) +- Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`) +- Fixed regression in :meth:`groupby(..).agg() ` calling a user-provided function an extra time on an empty input (:issue:`31760`) + +**I/O** + +- Fixed regression in :meth:`read_csv` in which the ``encoding`` option was not recognized with certain file-like objects (:issue:`31819`) +- Fixed regression in :meth:`DataFrame.to_excel` when the ``columns`` keyword argument is passed (:issue:`31677`) +- Fixed regression in :class:`ExcelFile` where the stream passed into the function was closed by the destructor. (:issue:`31467`) - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). 
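For context on the :meth:`read_csv` ``encoding`` regression listed above, the affected pattern looked roughly like the following (a minimal sketch, assuming the issue concerned binary file-like objects; the buffer contents are made up):

.. code-block:: python

   import io

   import pandas as pd

   # a binary file-like object combined with an explicit ``encoding`` --
   # the combination in which the option went unrecognized (:issue:`31819`)
   buf = io.BytesIO("a,b\n1,2\n".encode("utf-8"))
   df = pd.read_csv(buf, encoding="utf-8")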
+
+**Reindexing/alignment**
+
+- Fixed regression in :meth:`Series.align` when ``other`` is a :class:`DataFrame` and ``method`` is not ``None`` (:issue:`31785`)
+- Fixed regression in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a (tz-aware) index and ``method=nearest`` (:issue:`26683`)
+- Fixed regression where :meth:`DataFrame.reindex_like` on a :class:`DataFrame` subclass raised an ``AssertionError`` (:issue:`31925`)
 - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
-- Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)
-- Joining on :class:`DatetimeIndex` or :class:`TimedeltaIndex` will preserve ``freq`` in simple cases (:issue:`32166`)
--
+
+**Other**
+
+- Fixed regression in joining on :class:`DatetimeIndex` or :class:`TimedeltaIndex` to preserve ``freq`` in simple cases (:issue:`32166`)
+- Fixed regression in :meth:`Series.shift` with ``datetime64`` dtype when passing an integer ``fill_value`` (:issue:`32591`)
+- Fixed regression in the repr of an object-dtype :class:`Index` with bools and missing values (:issue:`32146`)
+
 .. ---------------------------------------------------------------------------

@@ -62,8 +81,9 @@ Bug fixes

 **Datetimelike**

-- Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`)
+- Bug in :meth:`Series.astype` not copying for tz-naive and tz-aware ``datetime64`` dtype (:issue:`32490`)
 - Bug where :func:`to_datetime` would raise when passed ``pd.NA`` (:issue:`32213`)
+- Improved error message when subtracting two :class:`Timestamp` objects that result in an out-of-bounds :class:`Timedelta` (:issue:`31774`)

 **Categorical**

@@ -74,15 +94,26 @@ Bug fixes

 **I/O**

 - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
+- Bug in :meth:`pandas.json_normalize` when a value in the meta path is not iterable (:issue:`31507`)
 - Fixed pickling of ``pandas.NA``. Previously a new object was returned, which broke computations relying on ``NA`` being a singleton (:issue:`31847`)
 - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).

 **Experimental dtypes**

-- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
+- Fixed bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
+- Fixed bug in :meth:`DataFrame.convert_dtypes` for series with a mix of integers and strings (:issue:`32117`)
+- Fixed bug in :meth:`DataFrame.convert_dtypes` where ``BooleanDtype`` columns were converted to ``Int64`` (:issue:`32287`)
 - Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`)
-- Fixed bug where :meth:`GroupBy.first` and :meth:`GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`)
-- Fix bug in :meth:`Series.convert_dtypes` for series with mix of integers and strings (:issue:`32117`)
+- Fixed bug where :meth:`pandas.core.groupby.GroupBy.first` and :meth:`pandas.core.groupby.GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`)
+- Fixed bug where :meth:`DataFrameGroupBy.mean`, :meth:`DataFrameGroupBy.median`, :meth:`DataFrameGroupBy.var`, and :meth:`DataFrameGroupBy.std` would raise a ``TypeError`` on ``Int64`` dtype columns (:issue:`32219`)
+
+**Strings**
+
+- Using ``pd.NA`` with :meth:`Series.str.repeat` now correctly outputs a null value instead of raising an error for vector inputs (:issue:`31632`)
+
+**Rolling**
+
+- Fixed rolling operations with a variable window (defined by time duration) on a decreasing time index (:issue:`32385`).

 .. ---------------------------------------------------------------------------

@@ -91,4 +122,4 @@ Bug fixes

 Contributors
 ~~~~~~~~~~~~

-.. contributors:: v1.0.1..v1.0.2|HEAD
+.. contributors:: v1.0.1..v1.0.2
diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst
new file mode 100644
index 0000000000000..26d06433bda0c
--- /dev/null
+++ b/doc/source/whatsnew/v1.0.3.rst
@@ -0,0 +1,29 @@
+
+.. _whatsnew_103:
+
+What's new in 1.0.3 (March 17, 2020)
+------------------------------------
+
+These are the changes in pandas 1.0.3. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_103.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in ``resample.agg`` when the underlying data is non-writeable (:issue:`31710`)
+- Fixed regression in :class:`DataFrame` exponentiation with reindexing (:issue:`32685`)
+
+.. _whatsnew_103.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.0.2..v1.0.3|HEAD
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 5310419bfc100..692df075f25cb 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -64,9 +64,12 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^

 - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
+- :meth:`Styler.highlight_null` now accepts a ``subset`` argument (:issue:`31345`)
 - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
 - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
 - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
+- Positional slicing on an :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
+- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
 -
 .. ---------------------------------------------------------------------------

@@ -87,10 +90,12 @@ Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 - :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously a ``AttributeError`` was raised (:issue:`31126`)
-- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std`` and :meth:`~DataFrameGroupby.var``)
+- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`)
   now raise a ``TypeError`` if a not-accepted keyword argument is passed into it.
-  Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median``) (:issue:`31485`)
+  Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
 - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
+- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
+-

 .. _whatsnew_110.api_breaking.indexing_raises_key_errors:

@@ -164,14 +169,46 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss

 .. ---------------------------------------------------------------------------

+.. _whatsnew_110.api_breaking.assignment_to_multiple_columns:
+
+Assignment to multiple columns of a DataFrame when some columns do not exist
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns are constructed with the right values. (:issue:`13658`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]})
+   df
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [3]: df[['a', 'c']] = 1
+   In [4]: df
+   Out[4]:
+      a  b
+   0  1  1
+   1  1  1
+   2  1  1
+
+*New behavior*:
+
+.. ipython:: python
+
+   df[['a', 'c']] = 1
+   df
+
 .. _whatsnew_110.deprecations:

 Deprecations
 ~~~~~~~~~~~~

 - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`)
 - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
--
--
+- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
+- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` and will raise in a future version (:issue:`32515`)
+- :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`)
 .. ---------------------------------------------------------------------------

@@ -183,8 +220,14 @@ Performance improvements

 - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`)
 - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`)
--
--
+- Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`)
+- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
+  avoiding the need to create these again on the new index. This can speed up many operations that depend on creating copies of
+  existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
+- Significant performance improvement when creating a :class:`DataFrame` with
+  sparse values from ``scipy.sparse`` matrices using the
+  :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
+  :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).

 .. ---------------------------------------------------------------------------

@@ -200,7 +243,7 @@ Categorical

 - Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`)
 - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
 - Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
--
+- :meth:`Categorical.fillna` now accepts a :class:`Categorical` ``other`` argument (:issue:`32420`)

 Datetimelike
 ^^^^^^^^^^^^
@@ -210,6 +253,7 @@ Datetimelike
 - Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
 - Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
 - :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`)
+- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`)

 Timedelta
 ^^^^^^^^^
@@ -228,14 +272,16 @@ Timezones

 Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31271`)
--
+- Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently failing (:issue:`32394`)
+- Bug in :meth:`to_numeric` with ``downcast="unsigned"`` failing for empty data (:issue:`32493`)
+- Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`)
 -

 Conversion
 ^^^^^^^^^^
 - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`)
-- Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`34202`)
--
+- Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`)
+- Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`)

 Strings
 ^^^^^^^
@@ -246,7 +292,6 @@ Strings

 Interval
 ^^^^^^^^
-
 -
 -

@@ -260,13 +305,15 @@ Indexing

 - Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`)
 - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`)
 - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`)
--
+- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`)
+- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`)
+- Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`)

 Missing
 ^^^^^^^
--
--
+- Calling :meth:`fillna` on an empty Series now correctly returns a shallow-copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
+

 MultiIndex
 ^^^^^^^^^^
@@ -292,7 +339,7 @@ MultiIndex

 I/O
 ^^^
-- Bug in :meth:`read_json` where integer overflow was occuring when json contains big number strings. (:issue:`30320`)
+- Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`)
 - `read_csv` will now raise a ``ValueError`` when the arguments `header` and `prefix` both are not `None`. (:issue:`27394`)
 - Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`)
 - Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
+- Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`)
+- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the :class:`DataFrame` (:issue:`31251`)
+- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
+- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
+- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)
+- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`)
+- Bug in :meth:`read_csv` was raising an ``IndexError`` when ``header=None`` and there were 2 extra data columns

 Plotting

@@ -329,11 +383,13 @@ Reshaping

 - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`)
 - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`)
 - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`)
-
+- Bug in :meth:`DataFrame.apply` where the callback was called with a :class:`Series` parameter even though ``raw=True`` was requested (:issue:`32423`).
+- Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`)
+- :meth:`DataFrame.agg` now provides a more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`)

 Sparse
 ^^^^^^
-
+- Creating a :class:`SparseArray` from a timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`)
 -
 -

@@ -350,6 +406,12 @@ Other
   instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
 - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
 - Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
+- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
+- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if the left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
+- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`)
+- Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`)
+- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`)
+- Bug in :meth:`DataFrame.__dir__` causing a segfault when using unicode surrogates in a column name (:issue:`25509`)

 .. ---------------------------------------------------------------------------

diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
index e4859157f73de..0084036f1e75c 100755
--- a/doc/sphinxext/announce.py
+++ b/doc/sphinxext/announce.py
@@ -122,14 +122,15 @@ def build_string(revision_range, heading="Contributors"):
     components["uline"] = "=" * len(components["heading"])
     components["authors"] = "* " + "\n* ".join(components["authors"])

+    # Don't change this to an fstring. It breaks the formatting.
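+    # (Likely because textwrap.dedent must see the raw template: an f-string
+    # would interpolate the multi-line author list first, leaving lines with
+    # no common leading whitespace, so dedent would strip nothing.)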
     tpl = textwrap.dedent(
-        f"""\
-    {components['heading']}
-    {components['uline']}
+        """\
+    {heading}
+    {uline}

-    {components['author_message']}
-    {components['authors']}"""
-    )
+    {author_message}
+    {authors}"""
+    ).format(**components)

     return tpl

diff --git a/environment.yml b/environment.yml
index cbdaf8e6c4217..532c36038fcaf 100644
--- a/environment.yml
+++ b/environment.yml
@@ -104,5 +104,5 @@ dependencies:
   - pyreadstat  # pandas.read_spss
   - tabulate>=0.8.3  # DataFrame.to_markdown
   - pip:
-    - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
+    - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
     - git+https://github.com/numpy/numpydoc
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 27b3095d8cb4f..e7ac3b8442c6d 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -848,11 +848,13 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
         return val != val

+# GH#31710 use memoryviews once cython 0.30 is released so we can
+# use `const rank_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_last(rank_t[:, :] out,
                int64_t[:] counts,
-               rank_t[:, :] values,
+               ndarray[rank_t, ndim=2] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
     """
@@ -867,7 +869,9 @@ def group_last(rank_t[:, :] out,

     assert min_count == -1, "'min_count' only used in add and prod"

-    if not len(values) == len(labels):
+    # TODO(cython 3.0):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")

     nobs = np.zeros((out).shape, dtype=np.int64)
@@ -937,11 +941,13 @@ def group_last(rank_t[:, :] out,
             raise RuntimeError("empty group with uint64_t")

+# GH#31710 use memoryviews once cython 0.30 is released so we can
+# use `const rank_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_nth(rank_t[:, :] out,
               int64_t[:] counts,
-              rank_t[:, :] values,
+              ndarray[rank_t, ndim=2] values,
              const int64_t[:] labels, int64_t rank=1,
              Py_ssize_t min_count=-1):
     """
@@ -956,7 +962,9 @@ def group_nth(rank_t[:, :] out,

     assert min_count == -1, "'min_count' only used in add and prod"

-    if not len(values) == len(labels):
+    # TODO(cython 3.0):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")

     nobs = np.zeros((out).shape, dtype=np.int64)
@@ -1235,7 +1243,7 @@ ctypedef fused groupby_t:
 @cython.boundscheck(False)
 def group_max(groupby_t[:, :] out,
               int64_t[:] counts,
-              groupby_t[:, :] values,
+              ndarray[groupby_t, ndim=2] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1):
     """
@@ -1250,7 +1258,9 @@ def group_max(groupby_t[:, :] out,

     assert min_count == -1, "'min_count' only used in add and prod"

-    if not len(values) == len(labels):
+    # TODO(cython 3.0):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")

     nobs = np.zeros((out).shape, dtype=np.int64)
@@ -1308,7 +1318,7 @@ def group_max(groupby_t[:, :] out,
 @cython.boundscheck(False)
 def group_min(groupby_t[:, :] out,
               int64_t[:] counts,
-              groupby_t[:, :] values,
+              ndarray[groupby_t, ndim=2] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1):
     """
@@ -1323,7 +1333,9 @@ def group_min(groupby_t[:, :] out,

     assert min_count == -1, "'min_count' only used in add and prod"

-    if not len(values) == len(labels):
+    # TODO(cython 3.0):
+    # Instead of `labels.shape[0]` use
`len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 811025a4b5764..3ce3bc519b311 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA +cdef extern from "Python.h": + void PyErr_Clear() + {{py: # name, dtype, c_type @@ -193,7 +196,7 @@ cdef class StringVector: append_data_string(self.data, x) - cdef extend(self, ndarray[:] x): + cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) @@ -238,7 +241,7 @@ cdef class ObjectVector: self.external_view_exists = True return self.ao - cdef extend(self, ndarray[:] x): + cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) @@ -671,7 +674,7 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string wont recognize + # GH#31499 if we have a np.str_ get_c_string won't recognize # it as a str, even though isinstance does. v = get_c_string(val) else: @@ -706,7 +709,7 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string wont recognize + # GH#31499 if we have a np.str_ get_c_string won't recognize # it as a str, even though isinstance does. v = get_c_string(val) else: @@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. v = get_c_string(val) + if v == NULL: + PyErr_Clear() + v = get_c_string(repr(val)) vecs[i] = v # compute diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 437406cbbd819..3bebd7e23fb5a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,4 +1,5 @@ import cython +from collections import defaultdict from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx @@ -7,7 +8,9 @@ cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np -from numpy cimport int64_t +cimport numpy as cnp +from numpy cimport NPY_INT64, int64_t +cnp.import_array() from pandas._libs.algos import ensure_int64 @@ -29,7 +32,11 @@ cdef class BlockPlacement: self._has_slice = False self._has_array = False - if isinstance(val, slice): + if isinstance(val, int): + slc = slice(val, val + 1, 1) + self._as_slice = slc + self._has_slice = True + elif isinstance(val, slice): slc = slice_canonize(val) if slc.start != slc.stop: @@ -105,7 +112,9 @@ cdef class BlockPlacement: Py_ssize_t start, stop, end, _ if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) - self._as_array = np.arange(start, stop, step, dtype=np.int64) + # NOTE: this is the C-optimized equivalent of + # np.arange(start, stop, step, dtype=np.int64) + self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) self._has_array = True return self._as_array @@ -303,7 +312,10 @@ cdef slice_getitem(slice slc, ind): return slice(s_start, s_stop, s_step) else: - return np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind] + # NOTE: + # this is the C-optimized equivalent of + # `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]` + return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] 
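The ``NOTE`` comments in the two ``internals.pyx`` hunks above state the equivalence the Cython code relies on; in pure NumPy terms it is simply the following (an illustration only, not part of the diff):

.. code-block:: python

   import numpy as np

   start, stop, step = 0, 10, 2

   # what cnp.PyArray_Arange(start, stop, step, NPY_INT64) computes,
   # spelled with the Python-level API the comments reference
   arr = np.arange(start, stop, step, dtype=np.int64)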
 @cython.boundscheck(False)
@@ -369,64 +381,50 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
         Py_ssize_t i, start, stop, n, diff

     object blkno
-    list group_order
-    dict group_dict
-    int64_t[:] res_view
+    object group_dict = defaultdict(list)

     n = blknos.shape[0]
-
-    if n == 0:
-        return
-
+    result = list()
     start = 0
     cur_blkno = blknos[start]

-    if group is False:
+    if n == 0:
+        pass
+    elif group is False:
         for i in range(1, n):
             if blknos[i] != cur_blkno:
-                yield cur_blkno, slice(start, i)
+                result.append((cur_blkno, slice(start, i)))

                 start = i
                 cur_blkno = blknos[i]

-        yield cur_blkno, slice(start, n)
+        result.append((cur_blkno, slice(start, n)))
     else:
-        group_order = []
-        group_dict = {}
-
         for i in range(1, n):
             if blknos[i] != cur_blkno:
-                if cur_blkno not in group_dict:
-                    group_order.append(cur_blkno)
-                    group_dict[cur_blkno] = [(start, i)]
-                else:
-                    group_dict[cur_blkno].append((start, i))
+                group_dict[cur_blkno].append((start, i))

                 start = i
                 cur_blkno = blknos[i]

-        if cur_blkno not in group_dict:
-            group_order.append(cur_blkno)
-            group_dict[cur_blkno] = [(start, n)]
-        else:
-            group_dict[cur_blkno].append((start, n))
+        group_dict[cur_blkno].append((start, n))

-        for blkno in group_order:
-            slices = group_dict[blkno]
+        for blkno, slices in group_dict.items():
             if len(slices) == 1:
-                yield blkno, slice(slices[0][0], slices[0][1])
+                result.append((blkno, slice(slices[0][0], slices[0][1])))
             else:
                 tot_len = sum(stop - start for start, stop in slices)
-                result = np.empty(tot_len, dtype=np.int64)
-                res_view = result
+                arr = np.empty(tot_len, dtype=np.int64)

                 i = 0
                 for start, stop in slices:
                     for diff in range(start, stop):
-                        res_view[i] = diff
+                        arr[i] = diff
                         i += 1

-                yield blkno, result
+                result.append((blkno, arr))
+
+    return result

 def get_blkno_placements(blknos, group: bool = True):
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 61d6a660a0357..6aa9a8b2dedfd 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -94,20 +94,6 @@ cdef:
     float64_t NaN = np.NaN

-def values_from_object(obj: object):
-    """
-    Return my values or the object if we are say an ndarray.
-    """
-    func: object
-
-    func = getattr(obj, '_internal_get_values', None)
-    if func is not None:
-        # Includes DataFrame, for which we get frame.values
-        obj = func()
-
-    return obj
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def memory_usage_of_objects(arr: object[:]) -> int64_t:
@@ -2024,8 +2010,6 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
             except (TypeError, ValueError) as err:
                 if not seen.coerce_numeric:
                     raise type(err)(f"{err} at position {i}")
-                elif "uint64" in str(err):  # Exception from check functions.
-                    raise

                 seen.saw_null()
                 floats[i] = NaN
@@ -2075,7 +2059,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         If an array-like object contains only timedelta values or NaT is
         encountered, whether to convert and return an array of m8[ns] dtype.
     convert_to_nullable_integer : bool, default False
-        If an array-like object contains only interger values (and NaN) is
+        If an array-like object containing only integer values (and NaN) is
         encountered, whether to convert and return an IntegerArray.
Returns @@ -2298,7 +2282,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return uints else: return ints - elif seen.is_bool: + elif seen.is_bool and not seen.nan_: return bools.view(np.bool_) return objects diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index abe1484e3763d..c0971b91a2fa1 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -100,7 +100,7 @@ def scalar_compare(object[:] values, object val, object op): @cython.wraparound(False) @cython.boundscheck(False) -def vec_compare(object[:] left, object[:] right, object op): +def vec_compare(ndarray[object] left, ndarray[object] right, object op): """ Compare the elements of `left` with the elements of `right` pointwise, with the comparison operation described by `op`. diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2fd227694800c..c6b68d9a0ab5c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -241,9 +241,9 @@ cdef extern from "parser/io.h": void* buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) - void *new_file_source(char *fname, size_t buffer_size) + void *new_file_source(char *fname, size_t buffer_size) except NULL - void *new_rd_source(object obj) + void *new_rd_source(object obj) except NULL int del_file_source(void *src) int del_rd_source(void *src) @@ -638,7 +638,8 @@ cdef class TextReader: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') - if self.encoding and isinstance(source, (io.BufferedIOBase, io.RawIOBase)): + if (self.encoding and hasattr(source, "read") and + not hasattr(source, "encoding")): source = io.TextIOWrapper( source, self.encoding.decode('utf-8'), newline='') @@ -666,26 +667,12 @@ cdef class TextReader: ptr = new_file_source(source, self.parser.chunksize) self.parser.cb_io = &buffer_file_bytes self.parser.cb_cleanup = &del_file_source - - if ptr == NULL: - if not os.path.exists(source): - - raise FileNotFoundError( - ENOENT, - f'File {usource} does not exist', - usource) - raise IOError('Initializing from file failed') - self.parser.source = ptr elif hasattr(source, 'read'): # e.g., StringIO ptr = new_rd_source(source) - if ptr == NULL: - raise IOError('Initializing parser from file-like ' - 'object failed') - self.parser.source = ptr self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source @@ -805,7 +792,6 @@ cdef class TextReader: self._tokenize_rows(1) header = [ self.names ] - data_line = 0 if self.parser.lines < 1: field_count = len(header[0]) @@ -1330,8 +1316,8 @@ cdef class TextReader: else: if self.header is not None: j = i - self.leading_cols - # hack for #2442 - if j == len(self.header[0]): + # generate extra (bogus) headers if there are more columns than headers + if j >= len(self.header[0]): return j else: return self.header[0][j] @@ -1595,8 +1581,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col, cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, int64_t width): cdef: - Py_ssize_t i - coliter_t it const char *word = NULL char *data ndarray result @@ -1641,15 +1625,11 @@ cdef _try_double(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset, object na_flist): cdef: int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL - char *p_end + Py_ssize_t lines float64_t *data float64_t NA = na_values[np.float64] kh_float64_t *na_fset ndarray result - khiter_t k bint use_na_flist = len(na_flist) > 0 lines = line_end - line_start @@ -1684,7 
+1664,7 @@ cdef inline int _try_double_nogil(parser_t *parser, coliter_t it const char *word = NULL char *p_end - khiter_t k, k64 + khiter_t k64 na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1747,11 +1727,10 @@ cdef _try_uint64(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset): cdef: int error - Py_ssize_t i, lines + Py_ssize_t lines coliter_t it uint64_t *data ndarray result - khiter_t k uint_state state lines = line_end - line_start @@ -1821,13 +1800,11 @@ cdef _try_int64(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset): cdef: int error, na_count = 0 - Py_ssize_t i, lines + Py_ssize_t lines coliter_t it int64_t *data ndarray result - int64_t NA = na_values[np.int64] - khiter_t k lines = line_end - line_start result = np.empty(lines, dtype=np.int64) @@ -1855,7 +1832,6 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1891,9 +1867,7 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, const kh_str_starts_t *false_hashset): cdef: int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL + Py_ssize_t lines uint8_t *data ndarray result @@ -1925,7 +1899,6 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1980,10 +1953,8 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: Py_ssize_t i - khiter_t k kh_str_starts_t *table int ret = 0 - object val table = kh_init_str_starts() @@ -2011,7 +1982,6 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: cdef kh_float64_t* kset_float64_from_list(values) except NULL: # caller takes responsibility for freeing the hash table cdef: - Py_ssize_t i khiter_t k kh_float64_t *table int ret = 0 @@ -2149,7 +2119,6 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, char* c_encoding): cdef: - int error Py_ssize_t i, lines coliter_t it const char *word = NULL diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index b27072aa66708..29a5a73ef08d0 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -177,6 +177,8 @@ cdef class _BaseGrouper: object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ._data._block, 'mgr_locs', + slice(len(vslider.buf))) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', self.name) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 091ca42cb71dd..d853ddf3de7d4 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -34,18 +34,21 @@ cdef class IntIndex(SparseIndex): length : integer indices : array-like Contains integers corresponding to the indices. + check_integrity : bool, default=True + Check integrity of the input. 
""" cdef readonly: Py_ssize_t length, npoints ndarray indices - def __init__(self, Py_ssize_t length, indices): + def __init__(self, Py_ssize_t length, indices, bint check_integrity=True): self.length = length self.indices = np.ascontiguousarray(indices, dtype=np.int32) self.npoints = len(self.indices) - self.check_integrity() + if check_integrity: + self.check_integrity() def __reduce__(self): args = (self.length, self.indices) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 7fbe7a04d5b22..2ada0a4bd173d 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -34,6 +34,9 @@ int floatify(PyObject *str, double *result, int *maybe_int) { data = PyBytes_AS_STRING(str); } else if (PyUnicode_Check(str)) { tmp = PyUnicode_AsUTF8String(str); + if (tmp == NULL) { + return -1; + } data = PyBytes_AS_STRING(tmp); } else { PyErr_SetString(PyExc_TypeError, "Invalid object type"); diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 1e3295fcb6fc7..51504527de5a2 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -28,6 +28,7 @@ The full license is in the LICENSE file, distributed with this software. void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); if (fs == NULL) { + PyErr_NoMemory(); return NULL; } @@ -41,17 +42,20 @@ void *new_file_source(char *fname, size_t buffer_size) { int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); if (required == 0) { free(fs); + PyErr_SetFromWindowsErr(0); return NULL; } wname = (wchar_t*)malloc(required * sizeof(wchar_t)); if (wname == NULL) { free(fs); + PyErr_NoMemory(); return NULL; } if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) { free(wname); free(fs); + PyErr_SetFromWindowsErr(0); return NULL; } fs->fd = _wopen(wname, O_RDONLY | O_BINARY); @@ -62,6 +66,7 @@ void *new_file_source(char *fname, size_t buffer_size) { #endif if (fs->fd == -1) { free(fs); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); return NULL; } @@ -71,6 +76,7 @@ void *new_file_source(char *fname, size_t buffer_size) { if (fs->buffer == NULL) { close(fs->fd); free(fs); + PyErr_NoMemory(); return NULL; } @@ -83,6 +89,10 @@ void *new_file_source(char *fname, size_t buffer_size) { void *new_rd_source(PyObject *obj) { rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); + if (rds == NULL) { + PyErr_NoMemory(); + return NULL; + } /* hold on to this object */ Py_INCREF(obj); rds->obj = obj; @@ -220,20 +230,15 @@ void *new_mmap(char *fname) { mm = (memory_map *)malloc(sizeof(memory_map)); if (mm == NULL) { - fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return (NULL); + return NULL; } mm->fd = open(fname, O_RDONLY | O_BINARY); if (mm->fd == -1) { - fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", - fname, errno); free(mm); return NULL; } if (fstat(mm->fd, &stat) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", - errno); close(mm->fd); free(mm); return NULL; @@ -242,8 +247,6 @@ void *new_mmap(char *fname) { mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); if (mm->memmap == MAP_FAILED) { - /* XXX Eventually remove this print statement. 
*/ - fprintf(stderr, "new_file_buffer: mmap() failed.\n"); close(mm->fd); free(mm); return NULL; diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2188ff6b0d464..7ba1a6cd398c9 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1189,8 +1189,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); + if (word_deletions >= 1) { + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); + } else { + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ + char_count = 0; + } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index fc4bdef8463af..4c25ab572bebe 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -67,7 +67,7 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { } /* Convert PyDatetime To ISO C-string. mutates len */ -char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, +char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len) { npy_datetimestruct dts; int ret; @@ -98,7 +98,7 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, return result; } -npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { +npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; int ret; @@ -116,3 +116,29 @@ npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); return NpyDateTimeToEpoch(npy_dt, base); } + +/* Converts the int64_t representation of a duration to ISO; mutates len */ +char *int64ToIsoDuration(int64_t value, size_t *len) { + pandas_timedeltastruct tds; + int ret_code; + + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + PyObject_Free(result); + return NULL; + } + + return result; +} diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h index 45455f4d6128b..23e36999be43f 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.h +++ b/pandas/_libs/src/ujson/python/date_conversions.h @@ -4,7 +4,6 @@ #define PY_SSIZE_T_CLEAN #include #include -#include "datetime.h" // Scales value inplace from nanosecond resolution to unit resolution int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); @@ -23,9 +22,11 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base); // up to precision `base` e.g. 
base="s" yields 2020-01-03T00:00:00Z
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
-char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len);
+char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len);

 // Convert a Python Date/Datetime to Unix epoch with resolution base
-npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base);
+npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base);
+
+char *int64ToIsoDuration(int64_t value, size_t *len);

 #endif
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
index 8cfc20ffd2c1c..965d6aec2c1cf 100644
--- a/pandas/_libs/src/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -165,7 +165,6 @@ void *initObjToJSON(void) {
         cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index");
         cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series");
-        cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta");
         Py_DECREF(mod_pandas);
     }

@@ -222,28 +221,19 @@ static PyObject *get_values(PyObject *obj) {

     PRINTMARK();

-    if (PyObject_HasAttrString(obj, "_internal_get_values")) {
+    if (PyObject_TypeCheck(obj, cls_index) || PyObject_TypeCheck(obj, cls_series)) {
+        // The special cases to worry about are dt64tz and category[dt64tz].
+        // In both cases we want the UTC-localized datetime64 ndarray,
+        // without going through an object array of Timestamps.
         PRINTMARK();
-        values = PyObject_CallMethod(obj, "_internal_get_values", NULL);
-
-        if (values == NULL) {
-            // Clear so we can subsequently try another method
-            PyErr_Clear();
-        } else if (!PyArray_CheckExact(values)) {
-            // Didn't get a numpy array, so keep trying
-            PRINTMARK();
-            Py_DECREF(values);
-            values = NULL;
-        }
-    }
-
-    if ((values == NULL) && PyObject_HasAttrString(obj, "get_block_values")) {
-        PRINTMARK();
-        values = PyObject_CallMethod(obj, "get_block_values", NULL);
+        values = PyObject_GetAttrString(obj, "values");

         if (values == NULL) {
             // Clear so we can subsequently try another method
             PyErr_Clear();
+        } else if (PyObject_HasAttrString(values, "__array__")) {
+            // We may have gotten a Categorical or Sparse array so call np.array
+            values = PyObject_CallMethod(values, "__array__", NULL);
         } else if (!PyArray_CheckExact(values)) {
             // Didn't get a numpy array, so keep trying
             PRINTMARK();
@@ -366,6 +356,12 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
     return int64ToIso(GET_TC(tc)->longValue, base, len);
 }

+/* JSON callback.
returns a char* and mutates the pointer to *len */ +static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { + return int64ToIsoDuration(GET_TC(tc)->longValue, len); +} + /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { @@ -780,7 +776,7 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { goto BLKRET; } - tmp = get_values(block); + tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL); if (!tmp) { ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; Py_DECREF(block); @@ -1266,7 +1262,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { } else if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = get_values(obj); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); if (!GET_TC(tc)->itemValue) { return 0; } @@ -1454,7 +1450,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, 1000000000LL; // nanoseconds per second } else { // datetime.* objects don't follow above rules - nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + nanosecVal = + PyDateTimeToEpoch(item, NPY_FR_ns); } } } @@ -1466,37 +1463,13 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, strncpy(cLabel, "null", len); } else { if (enc->datetimeIso) { - // TODO: Vectorized Timedelta function if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - PyObject *td = - PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *iso = - PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - len = strlen(PyUnicode_AsUTF8(iso)); - cLabel = PyObject_Malloc(len + 1); - memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1); - Py_DECREF(iso); + cLabel = int64ToIsoDuration(nanosecVal, &len); } else { if (type_num == NPY_DATETIME) { cLabel = int64ToIso(nanosecVal, base, &len); } else { - cLabel = PyDateTimeToIso((PyDateTime_Date *)item, - base, &len); + cLabel = PyDateTimeToIso(item, base, &len); } } if (cLabel == NULL) { @@ -1623,7 +1596,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + if (enc->npyType == NPY_TIMEDELTA) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + } else { + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + } // Currently no way to pass longVal to iso function, so use // state management GET_TC(tc)->longValue = longVal; @@ -1704,7 +1681,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; @@ -1730,7 +1708,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; @@ -1743,28 +1722,30 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - unit = ((PyObjectEncoder 
*)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - // TODO: Add some kind of error handling here - } - - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - PRINTMARK(); - goto INVALID; - } - + PRINTMARK(); if (value == get_nat()) { PRINTMARK(); tc->type = JT_NULL; return; - } + } else if (enc->datetimeIso) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; + } else { + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&value, unit) != 0) { + // TODO: Add some kind of error handling here + } - GET_TC(tc)->longValue = value; + exc = PyErr_Occurred(); - PRINTMARK(); - tc->type = JT_LONG; + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } + + tc->type = JT_LONG; + } + GET_TC(tc)->longValue = value; return; } else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); @@ -1935,7 +1916,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterNext = NpyArr_iterNext; pc->iterGetName = NpyArr_iterGetName; - pc->newObj = get_values(obj); + pc->newObj = PyObject_GetAttrString(obj, "values"); if (!pc->newObj) { goto INVALID; } diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 0e57b563d4d25..c6b8c3e876390 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -65,7 +65,7 @@ cpdef assert_dict_equal(a, b, bint compare_keys=True): cpdef assert_almost_equal(a, b, check_less_precise=False, bint check_dtype=True, - obj=None, lobj=None, robj=None): + obj=None, lobj=None, robj=None, index_values=None): """ Check that left and right objects are almost equal. @@ -89,6 +89,12 @@ cpdef assert_almost_equal(a, b, robj : str, default None Specify right object name being compared, internally used to show appropriate assertion message + index_values : ndarray, default None + Specify shared index values of objects being compared, internally used + to show appropriate assertion message + + .. versionadded:: 1.1.0 + """ cdef: int decimal @@ -171,7 +177,7 @@ cpdef assert_almost_equal(a, b, from pandas._testing import raise_assert_detail msg = (f"{obj} values are different " f"({np.round(diff * 100.0 / na, 5)} %)") - raise_assert_detail(obj, msg, lobj, robj) + raise_assert_detail(obj, msg, lobj, robj, index_values=index_values) return True diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b78b623bfa187..94e757624c136 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -14,7 +14,7 @@ PyDateTime_IMPORT cimport numpy as cnp -from numpy cimport float64_t, int64_t, ndarray +from numpy cimport float64_t, int64_t, ndarray, uint8_t import numpy as np cnp.import_array() @@ -351,7 +351,6 @@ def format_array_from_datetime( def array_with_unit_to_datetime( ndarray values, - ndarray mask, object unit, str errors='coerce' ): @@ -373,8 +372,6 @@ def array_with_unit_to_datetime( ---------- values : ndarray of object Date-like objects to convert. - mask : boolean ndarray - Not-a-time mask for non-nullable integer types conversion, can be None. unit : object Time unit to use during conversion. 
errors : str, default 'raise' @@ -395,6 +392,7 @@ def array_with_unit_to_datetime( bint need_to_iterate = True ndarray[int64_t] iresult ndarray[object] oresult + ndarray mask object tz = None assert is_ignore or is_coerce or is_raise @@ -404,9 +402,6 @@ def array_with_unit_to_datetime( result = values.astype('M8[ns]') else: result, tz = array_to_datetime(values.astype(object), errors=errors) - if mask is not None: - iresult = result.view('i8') - iresult[mask] = NPY_NAT return result, tz m = cast_from_unit(None, unit) @@ -419,9 +414,8 @@ def array_with_unit_to_datetime( if values.dtype.kind == "i": # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) - # If no mask, fill mask by comparing to NPY_NAT constant - if mask is None: - mask = iresult == NPY_NAT + # fill by comparing to NPY_NAT constant + mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 2c72cec18f096..3c30460a74ece 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -286,6 +286,10 @@ cdef class _Timestamp(datetime): # coerce if necessary if we are a Timestamp-like if (PyDateTime_Check(self) and (PyDateTime_Check(other) or is_datetime64_object(other))): + # both_timestamps is to determine whether Timedelta(self - other) + # should raise the OOB error, or fall back returning a timedelta. + both_timestamps = (isinstance(other, _Timestamp) and + isinstance(self, _Timestamp)) if isinstance(self, _Timestamp): other = type(self)(other) else: @@ -301,7 +305,14 @@ cdef class _Timestamp(datetime): from pandas._libs.tslibs.timedeltas import Timedelta try: return Timedelta(self.value - other.value) - except (OverflowError, OutOfBoundsDatetime): + except (OverflowError, OutOfBoundsDatetime) as err: + if isinstance(other, _Timestamp): + if both_timestamps: + raise OutOfBoundsDatetime( + "Result is too large for pandas.Timedelta. Convert inputs " + "to datetime.datetime with 'Timestamp.to_pydatetime()' " + "before subtracting." 
+ ) from err pass elif is_datetime64_object(self): # GH#28286 cython semantics for __rsub__, `other` is actually diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 7fec4ba5e7d25..ec397a470f2ec 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,10 +1,20 @@ from cpython.object cimport ( + Py_EQ, + Py_GE, + Py_GT, + Py_LE, + Py_LT, + Py_NE, PyObject_RichCompare, - Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE) +) -from cpython.datetime cimport (datetime, timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.version cimport PY_MINOR_VERSION @@ -16,20 +26,19 @@ from numpy cimport int64_t cnp.import_array() from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, get_timedelta64_value) + get_datetime64_value, + get_timedelta64_value, +) cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - get_nat, is_integer_object, is_float_object, is_datetime64_object, - is_timedelta64_object) from pandas._libs.missing cimport C_NA # ---------------------------------------------------------------------- # Constants -nat_strings = {'NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'} +nat_strings = {"NaT", "nat", "NAT", "nan", "NaN", "NAN"} -cdef int64_t NPY_NAT = get_nat() +cdef int64_t NPY_NAT = util.get_nat() iNaT = NPY_NAT # python-visible constant cdef bint _nat_scalar_rules[6] @@ -61,7 +70,7 @@ def _make_nat_func(func_name, doc): def _make_error_func(func_name, cls): def f(*args, **kwargs): - raise ValueError("NaTType does not support " + func_name) + raise ValueError(f"NaTType does not support {func_name}") f.__name__ = func_name if isinstance(cls, str): @@ -73,9 +82,9 @@ def _make_error_func(func_name, cls): cdef _nat_divide_op(self, other): - if PyDelta_Check(other) or is_timedelta64_object(other) or other is c_NaT: + if PyDelta_Check(other) or util.is_timedelta64_object(other) or other is c_NaT: return np.nan - if is_integer_object(other) or is_float_object(other): + if util.is_integer_object(other) or util.is_float_object(other): return c_NaT return NotImplemented @@ -103,7 +112,7 @@ cdef class _NaT(datetime): def __richcmp__(_NaT self, object other, int op): cdef: - int ndim = getattr(other, 'ndim', -1) + int ndim = getattr(other, "ndim", -1) if ndim == -1: return _nat_scalar_rules[op] @@ -114,11 +123,13 @@ cdef class _NaT(datetime): return result elif ndim == 0: - if is_datetime64_object(other): + if util.is_datetime64_object(other): return _nat_scalar_rules[op] else: - raise TypeError(f'Cannot compare type {type(self).__name__} ' - f'with type {type(other).__name__}') + raise TypeError( + f"Cannot compare type {type(self).__name__} " + f"with type {type(other).__name__}" + ) # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` is invariant under `_reverse_ops`, @@ -134,19 +145,19 @@ cdef class _NaT(datetime): return c_NaT elif PyDelta_Check(other): return c_NaT - elif is_datetime64_object(other) or is_timedelta64_object(other): + elif util.is_datetime64_object(other) or util.is_timedelta64_object(other): return c_NaT - elif hasattr(other, 'delta'): + elif hasattr(other, "delta"): # Timedelta, offsets.Tick, offsets.Week return c_NaT - elif is_integer_object(other) or util.is_period_object(other): + elif util.is_integer_object(other) or util.is_period_object(other): # For Period compat # TODO: the integer behavior 
is deprecated, remove it return c_NaT elif util.is_array(other): - if other.dtype.kind in 'mM': + if other.dtype.kind in "mM": # If we are adding to datetime64, we treat NaT as timedelta # Either way, result dtype is datetime64 result = np.empty(other.shape, dtype="datetime64[ns]") @@ -171,19 +182,19 @@ cdef class _NaT(datetime): return c_NaT elif PyDelta_Check(other): return c_NaT - elif is_datetime64_object(other) or is_timedelta64_object(other): + elif util.is_datetime64_object(other) or util.is_timedelta64_object(other): return c_NaT - elif hasattr(other, 'delta'): + elif hasattr(other, "delta"): # offsets.Tick, offsets.Week return c_NaT - elif is_integer_object(other) or util.is_period_object(other): + elif util.is_integer_object(other) or util.is_period_object(other): # For Period compat # TODO: the integer behavior is deprecated, remove it return c_NaT elif util.is_array(other): - if other.dtype.kind == 'm': + if other.dtype.kind == "m": if not is_rsub: # NaT - timedelta64 we treat NaT as datetime64, so result # is datetime64 @@ -197,15 +208,16 @@ cdef class _NaT(datetime): result.fill("NaT") return result - elif other.dtype.kind == 'M': + elif other.dtype.kind == "M": # We treat NaT as a datetime, so regardless of whether this is # NaT - other or other - NaT, the result is timedelta64 result = np.empty(other.shape, dtype="timedelta64[ns]") result.fill("NaT") return result - raise TypeError(f"Cannot subtract NaT from ndarray with " - f"dtype {other.dtype}") + raise TypeError( + f"Cannot subtract NaT from ndarray with dtype {other.dtype}" + ) return NotImplemented @@ -225,19 +237,19 @@ cdef class _NaT(datetime): return _nat_divide_op(self, other) def __mul__(self, other): - if is_integer_object(other) or is_float_object(other): + if util.is_integer_object(other) or util.is_float_object(other): return NaT return NotImplemented @property def asm8(self) -> np.datetime64: - return np.datetime64(NPY_NAT, 'ns') + return np.datetime64(NPY_NAT, "ns") def to_datetime64(self) -> np.datetime64: """ Return a numpy.datetime64 object with 'ns' precision. """ - return np.datetime64('NaT', 'ns') + return np.datetime64('NaT', "ns") def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ @@ -260,14 +272,14 @@ cdef class _NaT(datetime): return self.to_datetime64() def __repr__(self) -> str: - return 'NaT' + return "NaT" def __str__(self) -> str: - return 'NaT' + return "NaT" - def isoformat(self, sep='T') -> str: + def isoformat(self, sep="T") -> str: # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. - return 'NaT' + return "NaT" def __hash__(self): return NPY_NAT @@ -308,7 +320,9 @@ cdef class _NaT(datetime): class NaTType(_NaT): - """(N)ot-(A)-(T)ime, the time equivalent of NaN""" + """ + (N)ot-(A)-(T)ime, the time equivalent of NaN. + """ def __new__(cls): cdef _NaT base @@ -338,7 +352,7 @@ class NaTType(_NaT): return _nat_rdivide_op(self, other) def __rmul__(self, other): - if is_integer_object(other) or is_float_object(other): + if util.is_integer_object(other) or util.is_float_object(other): return c_NaT return NotImplemented @@ -379,10 +393,11 @@ class NaTType(_NaT): # These are the ones that can get their docstrings from datetime. 
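    # An illustrative aside, not part of the patch: the util.is_array
    # branches of _NaT.__add__/__sub__ above fill the result elementwise
    # with NaT, e.g.
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> pd.NaT + np.array([1, 2], dtype="timedelta64[ns]")
    #   array(['NaT', 'NaT'], dtype='datetime64[ns]')
    #   >>> pd.NaT - np.array(["2020-01-01"], dtype="datetime64[ns]")
    #   array(['NaT'], dtype='timedelta64[ns]')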
# nan methods - weekday = _make_nan_func('weekday', datetime.weekday.__doc__) - isoweekday = _make_nan_func('isoweekday', datetime.isoweekday.__doc__) - total_seconds = _make_nan_func('total_seconds', timedelta.total_seconds.__doc__) - month_name = _make_nan_func('month_name', # noqa:E128 + weekday = _make_nan_func("weekday", datetime.weekday.__doc__) + isoweekday = _make_nan_func("isoweekday", datetime.isoweekday.__doc__) + total_seconds = _make_nan_func("total_seconds", timedelta.total_seconds.__doc__) + month_name = _make_nan_func( + "month_name", """ Return the month name of the Timestamp with specified locale. @@ -396,8 +411,10 @@ class NaTType(_NaT): month_name : string .. versionadded:: 0.23.0 - """) - day_name = _make_nan_func('day_name', # noqa:E128 + """, + ) + day_name = _make_nan_func( + "day_name", """ Return the day name of the Timestamp with specified locale. @@ -411,73 +428,79 @@ class NaTType(_NaT): day_name : string .. versionadded:: 0.23.0 - """) + """, + ) # _nat_methods - date = _make_nat_func('date', datetime.date.__doc__) - - utctimetuple = _make_error_func('utctimetuple', datetime) - timetz = _make_error_func('timetz', datetime) - timetuple = _make_error_func('timetuple', datetime) - strftime = _make_error_func('strftime', datetime) - isocalendar = _make_error_func('isocalendar', datetime) - dst = _make_error_func('dst', datetime) - ctime = _make_error_func('ctime', datetime) - time = _make_error_func('time', datetime) - toordinal = _make_error_func('toordinal', datetime) - tzname = _make_error_func('tzname', datetime) - utcoffset = _make_error_func('utcoffset', datetime) + date = _make_nat_func("date", datetime.date.__doc__) + + utctimetuple = _make_error_func("utctimetuple", datetime) + timetz = _make_error_func("timetz", datetime) + timetuple = _make_error_func("timetuple", datetime) + strftime = _make_error_func("strftime", datetime) + isocalendar = _make_error_func("isocalendar", datetime) + dst = _make_error_func("dst", datetime) + ctime = _make_error_func("ctime", datetime) + time = _make_error_func("time", datetime) + toordinal = _make_error_func("toordinal", datetime) + tzname = _make_error_func("tzname", datetime) + utcoffset = _make_error_func("utcoffset", datetime) # "fromisocalendar" was introduced in 3.8 if PY_MINOR_VERSION >= 8: - fromisocalendar = _make_error_func('fromisocalendar', datetime) + fromisocalendar = _make_error_func("fromisocalendar", datetime) # ---------------------------------------------------------------------- # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. - strptime = _make_error_func('strptime', # noqa:E128 + strptime = _make_error_func( + "strptime", """ Timestamp.strptime(string, format) Function is not implemented. Use pd.to_datetime(). - """ + """, ) - utcfromtimestamp = _make_error_func('utcfromtimestamp', # noqa:E128 + utcfromtimestamp = _make_error_func( + "utcfromtimestamp", """ Timestamp.utcfromtimestamp(ts) Construct a naive UTC datetime from a POSIX timestamp. - """ + """, ) - fromtimestamp = _make_error_func('fromtimestamp', # noqa:E128 + fromtimestamp = _make_error_func( + "fromtimestamp", """ Timestamp.fromtimestamp(ts) timestamp[, tz] -> tz's local time from POSIX timestamp. - """ + """, ) - combine = _make_error_func('combine', # noqa:E128 + combine = _make_error_func( + "combine", """ Timestamp.combine(date, time) date, time -> datetime with same date and time fields. 
- """ + """, ) - utcnow = _make_error_func('utcnow', # noqa:E128 + utcnow = _make_error_func( + "utcnow", """ Timestamp.utcnow() Return a new Timestamp representing UTC day and time. - """ + """, ) - timestamp = _make_error_func('timestamp', # noqa:E128 - """Return POSIX timestamp as float.""") + timestamp = _make_error_func("timestamp", "Return POSIX timestamp as float.") # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or # return NaT create functions that raise, for binding to NaTType - astimezone = _make_error_func('astimezone', # noqa:E128 + astimezone = _make_error_func( + "astimezone", """ Convert tz-aware Timestamp to another time zone. @@ -495,8 +518,10 @@ class NaTType(_NaT): ------ TypeError If Timestamp is tz-naive. - """) - fromordinal = _make_error_func('fromordinal', # noqa:E128 + """, + ) + fromordinal = _make_error_func( + "fromordinal", """ Timestamp.fromordinal(ordinal, freq=None, tz=None) @@ -511,17 +536,21 @@ class NaTType(_NaT): Offset to apply to the Timestamp. tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. - """) + """, + ) # _nat_methods - to_pydatetime = _make_nat_func('to_pydatetime', # noqa:E128 + to_pydatetime = _make_nat_func( + "to_pydatetime", """ Convert a Timestamp object to a native Python datetime object. If warn=True, issue a warning if nanoseconds is nonzero. - """) + """, + ) - now = _make_nat_func('now', # noqa:E128 + now = _make_nat_func( + "now", """ Timestamp.now(tz=None) @@ -532,8 +561,10 @@ class NaTType(_NaT): ---------- tz : str or timezone object, default None Timezone to localize to. - """) - today = _make_nat_func('today', # noqa:E128 + """, + ) + today = _make_nat_func( + "today", """ Timestamp.today(cls, tz=None) @@ -545,8 +576,10 @@ class NaTType(_NaT): ---------- tz : str or timezone object, default None Timezone to localize to. - """) - round = _make_nat_func('round', # noqa:E128 + """, + ) + round = _make_nat_func( + "round", """ Round the Timestamp to the specified resolution. @@ -586,8 +619,10 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted - """) - floor = _make_nat_func('floor', # noqa:E128 + """, + ) + floor = _make_nat_func( + "floor", """ return a new Timestamp floored to this resolution. @@ -623,8 +658,10 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. - """) - ceil = _make_nat_func('ceil', # noqa:E128 + """, + ) + ceil = _make_nat_func( + "ceil", """ return a new Timestamp ceiled to this resolution. @@ -660,9 +697,11 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. - """) + """, + ) - tz_convert = _make_nat_func('tz_convert', # noqa:E128 + tz_convert = _make_nat_func( + "tz_convert", """ Convert tz-aware Timestamp to another time zone. @@ -680,8 +719,10 @@ timedelta}, default 'raise' ------ TypeError If Timestamp is tz-naive. - """) - tz_localize = _make_nat_func('tz_localize', # noqa:E128 + """, + ) + tz_localize = _make_nat_func( + "tz_localize", """ Convert naive Timestamp to local time zone, or remove timezone from tz-aware Timestamp. @@ -733,8 +774,10 @@ default 'raise' ------ TypeError If the Timestamp is tz-aware and tz is not None. - """) - replace = _make_nat_func('replace', # noqa:E128 + """, + ) + replace = _make_nat_func( + "replace", """ implements datetime.replace, handles nanoseconds. 
@@ -754,7 +797,8 @@ default 'raise' Returns ------- Timestamp with fields replaced - """) + """, + ) c_NaT = NaTType() # C-visible @@ -772,7 +816,7 @@ cdef inline bint checknull_with_nat(object val): cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): """ - Determine if we have a null for a timedelta/datetime (or integer versions) + Determine if we have a null for a timedelta/datetime (or integer versions). Parameters ---------- @@ -782,7 +826,7 @@ cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): Returns ------- - null_datetimelike : bool + bool """ if val is None: return True diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index b59a1101e0bf7..9a8a8fdae6d2f 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,12 +1,15 @@ from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE -from cpython.datetime cimport (datetime, date, - PyDateTime_IMPORT, - PyDateTime_GET_YEAR, PyDateTime_GET_MONTH, - PyDateTime_GET_DAY, PyDateTime_DATE_GET_HOUR, - PyDateTime_DATE_GET_MINUTE, - PyDateTime_DATE_GET_SECOND, - PyDateTime_DATE_GET_MICROSECOND) +from cpython.datetime cimport ( + PyDateTime_DATE_GET_HOUR, + PyDateTime_DATE_GET_MICROSECOND, + PyDateTime_DATE_GET_MINUTE, + PyDateTime_DATE_GET_SECOND, + PyDateTime_GET_DAY, + PyDateTime_GET_MONTH, + PyDateTime_GET_YEAR, + PyDateTime_IMPORT, +) PyDateTime_IMPORT from numpy cimport int64_t diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 48a3886c20a3a..0849ba0f29624 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -114,7 +114,18 @@ def apply_index_wraps(func): # Note: normally we would use `@functools.wraps(func)`, but this does # not play nicely with cython class methods def wrapper(self, other): - result = func(self, other) + + is_index = getattr(other, "_typ", "") == "datetimeindex" + + # operate on DatetimeArray + arr = other._data if is_index else other + + result = func(self, arr) + + if is_index: + # Wrap DatetimeArray result back to DatetimeIndex + result = type(other)._simple_new(result, name=other.name) + if self.normalize: result = result.to_period('D').to_timestamp() return result @@ -509,7 +520,7 @@ class _BaseOffset: state = self.__dict__.copy() # we don't want to actually pickle the calendar object - # as its a np.busyday; we recreate on deserilization + # as its a np.busyday; we recreate on deserialization if 'calendar' in state: del state['calendar'] try: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index ebdf7a1e29216..74b95a2f3076f 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -349,7 +349,7 @@ cpdef bint _does_string_look_like_datetime(str py_string): elif py_string in _not_datelike_strings: return False else: - # xstrtod with such paramaters copies behavior of python `float` + # xstrtod with such parameters copies behavior of python `float` # cast; for example, " 35.e-1 " is valid string for this cast so, # for correctly xstrtod call necessary to pass these params: # b'.' 
- a dot is used as separator, b'e' - an exponential form of @@ -577,8 +577,8 @@ def try_parse_date_and_time(object[:] dates, object[:] times, object[:] result n = len(dates) - # Cast to avoid build warning see GH#26757 - if len(times) != n: + # TODO(cython 3.0): Use len instead of `shape[0]` + if times.shape[0] != n: raise ValueError('Length of dates and times must be equal') result = np.empty(n, dtype='O') @@ -614,8 +614,8 @@ def try_parse_year_month_day(object[:] years, object[:] months, object[:] result n = len(years) - # Cast to avoid build warning see GH#26757 - if len(months) != n or len(days) != n: + # TODO(cython 3.0): Use len instead of `shape[0]` + if months.shape[0] != n or days.shape[0] != n: raise ValueError('Length of years/months/days must all be equal') result = np.empty(n, dtype='O') @@ -640,10 +640,14 @@ def try_parse_datetime_components(object[:] years, double micros n = len(years) - # Cast to avoid build warning see GH#26757 - if (len(months) != n or len(days) != n or - len(hours) != n or len(minutes) != n or - len(seconds) != n): + # TODO(cython 3.0): Use len instead of `shape[0]` + if ( + months.shape[0] != n + or days.shape[0] != n + or hours.shape[0] != n + or minutes.shape[0] != n + or seconds.shape[0] != n + ): raise ValueError('Length of all datetime components must be equal') result = np.empty(n, dtype='O') diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index a8a47e2e90f93..f647098140528 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -21,7 +21,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include -#include #include #include @@ -313,15 +312,14 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) * to convert to UTC time. * - * While the C API has PyDate_* and PyDateTime_* functions, the following - * implementation just asks for attributes, and thus supports - * datetime duck typing. The tzinfo time zone conversion would require - * this style of access anyway. + * The following implementation just asks for attributes, and thus + * supports datetime duck typing. The tzinfo time zone conversion + * requires this style of access as well. * * Returns -1 on error, 0 on success, and 1 (with no error set) * if obj doesn't have the needed date or datetime attributes. */ -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyObject *dtobj, npy_datetimestruct *out) { // Assumes that obj is a valid datetime object PyObject *tmp; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 549d38409ca83..0bbc24ed822c5 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -22,7 +22,6 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt
 #endif  // NPY_NO_DEPRECATED_API

 #include 
-#include 

 typedef struct {
     npy_int64 days;
@@ -35,7 +34,7 @@ extern const npy_datetimestruct _NS_MAX_DTS;
 // stuff pandas needs
 // ----------------------------------------------------------------------------

-int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj,
+int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
                                          npy_datetimestruct *out);

 npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
index 54ed6ecff21e2..b245ae5880ecb 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
@@ -905,3 +905,37 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
                  outlen);
     return -1;
 }
+
+
+int make_iso_8601_timedelta(pandas_timedeltastruct *tds,
+                            char *outstr, size_t *outlen) {
+    *outlen = 0;
+    *outlen += snprintf(outstr, 60,  // NOLINT
+                        "P%" NPY_INT64_FMT
+                        "DT%" NPY_INT32_FMT
+                        "H%" NPY_INT32_FMT
+                        "M%" NPY_INT32_FMT,
+                        tds->days, tds->hrs, tds->min, tds->sec);
+    outstr += *outlen;
+
+    if (tds->ns != 0) {
+        *outlen += snprintf(outstr, 12,  // NOLINT
+                            ".%03" NPY_INT32_FMT
+                            "%03" NPY_INT32_FMT
+                            "%03" NPY_INT32_FMT
+                            "S", tds->ms, tds->us, tds->ns);
+    } else if (tds->us != 0) {
+        *outlen += snprintf(outstr, 9,  // NOLINT
+                            ".%03" NPY_INT32_FMT
+                            "%03" NPY_INT32_FMT
+                            "S", tds->ms, tds->us);
+    } else if (tds->ms != 0) {
+        *outlen += snprintf(outstr, 6,  // NOLINT
+                            ".%03" NPY_INT32_FMT "S", tds->ms);
+    } else {
+        *outlen += snprintf(outstr, 2,  // NOLINT
+                            "%s", "S");
+    }
+
+    return 0;
+}
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
index 880c34ea77638..200a71ff0c2b7 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
@@ -79,4 +79,14 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
 int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
                            NPY_DATETIMEUNIT base);
+
+/*
+ * Converts a pandas_timedeltastruct to an ISO 8601 string.
+ *
+ * Mutates outlen to provide size of (non-NULL terminated) string.
+ *
+ * Currently has no error handling
+ */
+int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr,
+                            size_t *outlen);
 #endif  // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index dfe050c7bbff7..a48c3365947dc 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -4,7 +4,7 @@ import time
 import locale
 import calendar
 import re
-from datetime import date as datetime_date
+import datetime

 from _thread import allocate_lock as _thread_allocate_lock

@@ -288,20 +288,20 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise'
         elif iso_year != -1 and iso_week != -1:
             year, julian = _calc_julian_from_V(iso_year, iso_week, weekday + 1)
-        # Cannot pre-calculate datetime_date() since can change in Julian
+        # Cannot pre-calculate datetime.date() since can change in Julian
         # calculation and thus could have different value for the day of the wk
         # calculation.
         try:
             if julian == -1:
                 # Need to add 1 to result since first day of the year is 1, not
                 # 0.
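                # Worked example, illustrative rather than patch content:
                # in the leap year 2020, March 1 is day-of-year 61, which
                # the ordinal arithmetic just below reproduces:
                #   >>> import datetime
                #   >>> d = datetime.date(2020, 3, 1)
                #   >>> d.toordinal() - datetime.date(2020, 1, 1).toordinal() + 1
                #   61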
- ordinal = datetime_date(year, month, day).toordinal() - julian = ordinal - datetime_date(year, 1, 1).toordinal() + 1 + ordinal = datetime.date(year, month, day).toordinal() + julian = ordinal - datetime.date(year, 1, 1).toordinal() + 1 else: # Assume that if they bothered to include Julian day it will # be accurate. - datetime_result = datetime_date.fromordinal( - (julian - 1) + datetime_date(year, 1, 1).toordinal()) + datetime_result = datetime.date.fromordinal( + (julian - 1) + datetime.date(year, 1, 1).toordinal()) year = datetime_result.year month = datetime_result.month day = datetime_result.day @@ -311,7 +311,7 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise' continue raise if weekday == -1: - weekday = datetime_date(year, month, day).weekday() + weekday = datetime.date(year, month, day).weekday() dts.year = year dts.month = month @@ -649,7 +649,7 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, cdef: int first_weekday, week_0_length, days_to_week - first_weekday = datetime_date(year, 1, 1).weekday() + first_weekday = datetime.date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's # easier to just shift the view to Sunday being the first day of the # week. @@ -692,14 +692,14 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) cdef: int correction, ordinal - correction = datetime_date(iso_year, 1, 4).isoweekday() + 3 + correction = datetime.date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: - ordinal += datetime_date(iso_year, 1, 1).toordinal() + ordinal += datetime.date(iso_year, 1, 1).toordinal() iso_year -= 1 - ordinal -= datetime_date(iso_year, 1, 1).toordinal() + ordinal -= datetime.date(iso_year, 1, 1).toordinal() return iso_year, ordinal diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 7bd02b734beeb..457f3eb0749c2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1167,7 +1167,7 @@ class Timedelta(_Timedelta): Possible values: - * 'Y', 'M', 'W', 'D', 'T', 'S', 'L', 'U', or 'N' + * 'W', 'D', 'T', 'S', 'L', 'U', or 'N' * 'days' or 'day' * 'hours', 'hour', 'hr', or 'h' * 'minutes', 'minute', 'min', or 'm' diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 80b9144042041..a90d2f77e44d1 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1013,7 +1013,7 @@ def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- @@ -1030,7 +1030,7 @@ def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. 
Parameters ---------- diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 2d01d1964c043..8a1e7feb57ace 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -44,6 +44,7 @@ def calculate_variable_window_bounds( cdef: bint left_closed = False bint right_closed = False + int index_growth_sign = 1 ndarray[int64_t, ndim=1] start, end int64_t start_bound, end_bound Py_ssize_t i, j @@ -58,6 +59,9 @@ def calculate_variable_window_bounds( if closed in ['left', 'both']: left_closed = True + if index[num_values - 1] < index[0]: + index_growth_sign = -1 + start = np.empty(num_values, dtype='int64') start.fill(-1) end = np.empty(num_values, dtype='int64') @@ -78,7 +82,7 @@ def calculate_variable_window_bounds( # end is end of slice interval (not including) for i in range(1, num_values): end_bound = index[i] - start_bound = index[i] - window_size + start_bound = index[i] - index_growth_sign * window_size # left endpoint is closed if left_closed: @@ -88,13 +92,13 @@ def calculate_variable_window_bounds( # within the constraint start[i] = i for j in range(start[i - 1], i): - if index[j] > start_bound: + if (index[j] - start_bound) * index_growth_sign > 0: start[i] = j break # end bound is previous end # or current index - if index[end[i - 1]] <= end_bound: + if (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 9e95dea979577..ebf98232da58b 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -36,7 +36,7 @@ def write_csv_rows( """ # In crude testing, N>100 yields little marginal improvement cdef: - Py_ssize_t i, j, k = len(data_index), N = 100, ncols = len(cols) + Py_ssize_t i, j = 0, k = len(data_index), N = 100, ncols = len(cols) list rows # pre-allocate rows diff --git a/pandas/_testing.py b/pandas/_testing.py index 33ec4e4886aa6..e69263b81e1aa 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -32,8 +32,8 @@ is_datetime64tz_dtype, is_extension_array_dtype, is_interval_dtype, - is_list_like, is_number, + is_numeric_dtype, is_period_dtype, is_sequence, is_timedelta64_dtype, @@ -417,10 +417,7 @@ def rands_array(nchars, size, dtype="O"): .view((np.str_, nchars)) .reshape(size) ) - if dtype is None: - return retval - else: - return retval.astype(dtype) + return retval.astype(dtype) def randu_array(nchars, size, dtype="O"): @@ -432,10 +429,7 @@ def randu_array(nchars, size, dtype="O"): .view((np.unicode_, nchars)) .reshape(size) ) - if dtype is None: - return retval - else: - return retval.astype(dtype) + return retval.astype(dtype) def rands(nchars): @@ -448,16 +442,6 @@ def rands(nchars): return "".join(np.random.choice(RANDS_CHARS, nchars)) -def randu(nchars): - """ - Generate one random unicode string. - - See `randu_array` if you want to create an array of random unicode strings. 
- - """ - return "".join(np.random.choice(RANDU_CHARS, nchars)) - - def close(fignum=None): from matplotlib.pyplot import get_fignums, close as _close @@ -724,10 +708,7 @@ def repr_class(x): # return Index as it is to include values in the error message return x - try: - return type(x).__name__ - except AttributeError: - return repr(type(x)) + return type(x).__name__ if exact == "equiv": if type(left) != type(right): @@ -843,10 +824,14 @@ def assert_categorical_equal( left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", ) else: + try: + lc = left.categories.sort_values() + rc = right.categories.sort_values() + except TypeError: + # e.g. '<' not supported between instances of 'int' and 'str' + lc, rc = left.categories, right.categories assert_index_equal( - left.categories.sort_values(), - right.categories.sort_values(), - obj=f"{obj}.categories", + lc, rc, obj=f"{obj}.categories", ) assert_index_equal( left.categories.take(left.codes), @@ -903,9 +888,16 @@ def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): assert_attr_equal("freq", left, right, obj=obj) -def raise_assert_detail(obj, message, left, right, diff=None): +def raise_assert_detail(obj, message, left, right, diff=None, index_values=None): __tracebackhide__ = True + msg = f"""{obj} are different + +{message}""" + + if isinstance(index_values, np.ndarray): + msg += f"\n[index]: {pprint_thing(index_values)}" + if isinstance(left, np.ndarray): left = pprint_thing(left) elif is_categorical_dtype(left): @@ -916,9 +908,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): elif is_categorical_dtype(right): right = repr(right) - msg = f"""{obj} are different - -{message} + msg += f""" [left]: {left} [right]: {right}""" @@ -936,6 +926,7 @@ def assert_numpy_array_equal( err_msg=None, check_same=None, obj="numpy array", + index_values=None, ): """ Check that 'np.ndarray' is equivalent. @@ -955,6 +946,8 @@ def assert_numpy_array_equal( obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message. + index_values : numpy.ndarray, default None + optional index (shared by both left and right), used in output. """ __tracebackhide__ = True @@ -992,7 +985,7 @@ def _raise(left, right, err_msg): diff = diff * 100.0 / left.size msg = f"{obj} values are different ({np.round(diff, 5)} %)" - raise_assert_detail(obj, msg, left, right) + raise_assert_detail(obj, msg, left, right, index_values=index_values) raise AssertionError(err_msg) @@ -1037,7 +1030,8 @@ def assert_extension_array_equal( if hasattr(left, "asi8") and type(right) == type(left): # Avoid slow object-dtype comparisons - assert_numpy_array_equal(left.asi8, right.asi8) + # np.asarray for case where we have a np.MaskedArray + assert_numpy_array_equal(np.asarray(left.asi8), np.asarray(right.asi8)) return left_na = np.asarray(left.isna()) @@ -1086,7 +1080,7 @@ def assert_series_equal( Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True - Whether to check the Series class is identical. + Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. @@ -1119,10 +1113,7 @@ def assert_series_equal( _check_isinstance(left, right, Series) if check_series_type: - # ToDo: There are some tests using rhs is sparse - # lhs is dense. 
Should use assert_class_equal in future - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) + assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): @@ -1147,8 +1138,8 @@ def assert_series_equal( # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if ( - is_categorical_dtype(left) - and is_categorical_dtype(right) + is_categorical_dtype(left.dtype) + and is_categorical_dtype(right.dtype) and not check_categorical ): pass @@ -1156,53 +1147,54 @@ def assert_series_equal( assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact: + if not is_numeric_dtype(left.dtype): + raise AssertionError("check_exact may only be used with numeric Series") + assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), + left._values, + right._values, check_dtype=check_dtype, obj=str(obj), + index_values=np.asarray(left.index), ) - elif check_datetimelike_compat: + elif check_datetimelike_compat and ( + needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) + ): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case - if needs_i8_conversion(left) or needs_i8_conversion(right): - - # datetimelike may have different objects (e.g. datetime.datetime - # vs Timestamp) but will compare equal - if not Index(left._values).equals(Index(right._values)): - msg = ( - f"[datetimelike_compat=True] {left._values} " - f"is not equal to {right._values}." - ) - raise AssertionError(msg) - else: - assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, + + # datetimelike may have different objects (e.g. datetime.datetime + # vs Timestamp) but will compare equal + if not Index(left._values).equals(Index(right._values)): + msg = ( + f"[datetimelike_compat=True] {left._values} " + f"is not equal to {right._values}." ) - elif is_interval_dtype(left) or is_interval_dtype(right): + raise AssertionError(msg) + elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) - elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): - # .values is an ndarray, but ._values is the ExtensionArray. 
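        # Illustrative sketch of the distinction the removed comment above
        # draws, assuming a tz-aware Series (not part of the patch):
        #
        #   >>> import pandas as pd
        #   >>> s = pd.Series(pd.date_range("2020-01-01", periods=2, tz="UTC"))
        #   >>> s.values.dtype             # plain ndarray; timezone is dropped
        #   dtype('<M8[ns]')
        #   >>> type(s._values).__name__   # ExtensionArray; timezone preserved
        #   'DatetimeArray'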
- # TODO: Use .array - assert is_extension_array_dtype(right.dtype) + elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): + _testing.assert_almost_equal( + left._values, + right._values, + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj=str(obj), + ) + elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): + assert_extension_array_equal(left._values, right._values) + elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): + # DatetimeArray or TimedeltaArray assert_extension_array_equal(left._values, right._values) - elif ( - is_extension_array_dtype(left) - and not is_categorical_dtype(left) - and is_extension_array_dtype(right) - and not is_categorical_dtype(right) - ): - assert_extension_array_equal(left.array, right.array) else: _testing.assert_almost_equal( - left._internal_get_values(), - right._internal_get_values(), + left._values, + right._values, check_less_precise=check_less_precise, check_dtype=check_dtype, obj=str(obj), + index_values=np.asarray(left.index), ) # metadata comparison @@ -1491,14 +1483,7 @@ def to_array(obj): # Sparse -def assert_sp_array_equal( - left, - right, - check_dtype=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, -): +def assert_sp_array_equal(left, right): """ Check that the left and right SparseArray are equal. @@ -1506,38 +1491,17 @@ def assert_sp_array_equal( ---------- left : SparseArray right : SparseArray - check_dtype : bool, default True - Whether to check the data dtype is identical. - check_kind : bool, default True - Whether to just the kind of the sparse index for each column. - check_fill_value : bool, default True - Whether to check that left.fill_value matches right.fill_value - consolidate_block_indices : bool, default False - Whether to consolidate contiguous blocks for sparse arrays with - a BlockIndex. Some operations, e.g. concat, will end up with - block indices that could be consolidated. Setting this to true will - create a new BlockIndex for that array, with consolidated - block indices. """ _check_isinstance(left, right, pd.arrays.SparseArray) - assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) + assert_numpy_array_equal(left.sp_values, right.sp_values) # SparseIndex comparison assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex) - if not check_kind: - left_index = left.sp_index.to_block_index() - right_index = right.sp_index.to_block_index() - else: - left_index = left.sp_index - right_index = right.sp_index - - if consolidate_block_indices and left.kind == "block": - # we'll probably remove this hack... 
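    # Usage sketch, illustrative only: with the keyword arguments removed
    # above, kind, fill_value and dtype are always compared strictly, and
    # equal inputs pass silently (the function returns None):
    #
    #   >>> import pandas as pd
    #   >>> import pandas._testing as tm
    #   >>> tm.assert_sp_array_equal(pd.arrays.SparseArray([0, 1, 0, 2]),
    #   ...                          pd.arrays.SparseArray([0, 1, 0, 2]))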
- left_index = left_index.to_int_index().to_block_index() - right_index = right_index.to_int_index().to_block_index() + left_index = left.sp_index + right_index = right.sp_index if not left_index.equals(right_index): raise_assert_detail( @@ -1547,11 +1511,9 @@ def assert_sp_array_equal( # Just ensure a pass - if check_fill_value: - assert_attr_equal("fill_value", left, right) - if check_dtype: - assert_attr_equal("dtype", left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + assert_attr_equal("fill_value", left, right) + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense()) # ----------------------------------------------------------------------------- @@ -1734,32 +1696,6 @@ def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None) return df -def all_index_generator(k=10): - """ - Generator which can be iterated over to get instances of all the various - index classes. - - Parameters - ---------- - k: length of each of the index instances - """ - all_make_index_funcs = [ - makeIntIndex, - makeFloatIndex, - makeStringIndex, - makeUnicodeIndex, - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeBoolIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - ] - for make_index_func in all_make_index_funcs: - yield make_index_func(k=k) - - def index_subclass_makers_generator(): make_index_funcs = [ makeDateIndex, @@ -2103,53 +2039,6 @@ def _gen_unique_rand(rng, _extra_size): return i.tolist(), j.tolist() -def makeMissingCustomDataframe( - nrows, - ncols, - density=0.9, - random_state=None, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - Parameters - ---------- - Density : float, optional - Float in (0, 1) that gives the percentage of non-missing numbers in - the DataFrame. - random_state : {np.random.RandomState, int}, optional - Random number generator or random seed. - - See makeCustomDataframe for descriptions of the rest of the parameters. - """ - df = makeCustomDataframe( - nrows, - ncols, - c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, - r_ndupe_l=r_ndupe_l, - dtype=dtype, - c_idx_type=c_idx_type, - r_idx_type=r_idx_type, - ) - - i, j = _create_missing_idx(nrows, ncols, density, random_state) - df.values[i, j] = np.nan - return df - - def makeMissingDataframe(density=0.9, random_state=None): df = makeDataFrame() i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) @@ -2304,7 +2193,7 @@ def network( Notes ----- - * ``raise_on_error`` supercedes ``check_before_test`` + * ``raise_on_error`` supersedes ``check_before_test`` Returns ------- @@ -2397,7 +2286,6 @@ def wrapper(*args, **kwargs): def assert_produces_warning( expected_warning=Warning, filter_level="always", - clear=None, check_stacklevel=True, raise_on_extra_warnings=True, ): @@ -2427,12 +2315,6 @@ class for all warnings. To check that no warning is returned, from each module * "once" - print the warning the first time it is generated - clear : str, default None - If not ``None`` then remove any previously raised warnings from - the ``__warningsregistry__`` to ensure that no warning messages are - suppressed by this context manager. 
If ``None`` is specified, - the ``__warningsregistry__`` keeps track of which warnings have been - shown, and does not show them again. check_stacklevel : bool, default True If True, displays the line that called the function containing the warning to show were the function is called. Otherwise, the @@ -2465,19 +2347,6 @@ class for all warnings. To check that no warning is returned, with warnings.catch_warnings(record=True) as w: - if clear is not None: - # make sure that we are clearing these warnings - # if they have happened before - # to guarantee that we will catch them - if not is_list_like(clear): - clear = [clear] - for m in clear: - try: - m.__warningregistry__.clear() - except AttributeError: - # module may not have __warningregistry__ - pass - saw_warning = False warnings.simplefilter(filter_level) yield w diff --git a/pandas/conftest.py b/pandas/conftest.py index dcfc523315c8b..903e1a5dec132 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,23 @@ +""" +This file is very long and growing, but it was decided to not split it yet, as +it's still manageable (2020-03-17, ~1.1k LoC). See gh-31989 + +Instead of splitting it was decided to define sections here: +- Configuration / Settings +- Autouse fixtures +- Common arguments +- Missing values & co. +- Classes +- Indices +- Series' +- DataFrames +- Operators & Operations +- Data sets/files +- Time zones +- Dtypes +- Misc +""" + from collections import abc from datetime import date, time, timedelta, timezone from decimal import Decimal @@ -19,19 +39,11 @@ from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex -hypothesis.settings.register_profile( - "ci", - # Hypothesis timing checks are tuned for scalars by default, so we bump - # them from 200ms to 500ms per test case as the global default. If this - # is too short for a specific test, (a) try to make it faster, and (b) - # if it really is slow add `@settings(deadline=...)` with a working value, - # or `deadline=None` to entirely disable timeouts for that test. - deadline=500, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), -) -hypothesis.settings.load_profile("ci") - +# ---------------------------------------------------------------- +# Configuration / Settings +# ---------------------------------------------------------------- +# pytest def pytest_addoption(parser): parser.addoption("--skip-slow", action="store_true", help="skip slow tests") parser.addoption("--skip-network", action="store_true", help="skip network tests") @@ -66,6 +78,55 @@ def pytest_runtest_setup(item): pytest.skip("skipping high memory test since --run-high-memory was not set") +# Hypothesis +hypothesis.settings.register_profile( + "ci", + # Hypothesis timing checks are tuned for scalars by default, so we bump + # them from 200ms to 500ms per test case as the global default. If this + # is too short for a specific test, (a) try to make it faster, and (b) + # if it really is slow add `@settings(deadline=...)` with a working value, + # or `deadline=None` to entirely disable timeouts for that test. 
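    # (Illustrative, with a hypothetical test name: the per-test override
    # suggested above would look like
    #     @hypothesis.settings(deadline=None)
    #     def test_some_slow_case(...): ...
    # )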
+ deadline=500, + suppress_health_check=(hypothesis.HealthCheck.too_slow,), +) +hypothesis.settings.load_profile("ci") + +# Registering these strategies makes them globally available via st.from_type, +# which is use for offsets in tests/tseries/offsets/test_offsets_properties.py +for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) + ) + +for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, + st.builds( + cls, + n=st.integers(-5, 5), + normalize=st.booleans(), + month=st.integers(min_value=1, max_value=12), + ), + ) + +for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, + st.builds( + cls, + n=st.integers(-24, 24), + normalize=st.booleans(), + startingMonth=st.integers(min_value=1, max_value=12), + ), + ) + + +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests(): """ @@ -83,16 +144,9 @@ def add_imports(doctest_namespace): doctest_namespace["pd"] = pd -@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) -def spmatrix(request): - """ - Yields scipy sparse matrix classes. - """ - from scipy import sparse - - return getattr(sparse, request.param + "_matrix") - - +# ---------------------------------------------------------------- +# Common arguments +# ---------------------------------------------------------------- @pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") def axis(request): """ @@ -112,19 +166,6 @@ def axis_series(request): return request.param -@pytest.fixture -def ip(): - """ - Get an instance of IPython.InteractiveShell. - - Will raise a skip if IPython is not installed. - """ - pytest.importorskip("IPython", minversion="6.0.0") - from IPython.core.interactiveshell import InteractiveShell - - return InteractiveShell() - - @pytest.fixture(params=[True, False, None]) def observed(request): """ @@ -146,938 +187,987 @@ def ordered_fixture(request): return request.param -_all_arithmetic_operators = [ - "__add__", - "__radd__", - "__sub__", - "__rsub__", - "__mul__", - "__rmul__", - "__floordiv__", - "__rfloordiv__", - "__truediv__", - "__rtruediv__", - "__pow__", - "__rpow__", - "__mod__", - "__rmod__", -] - - -@pytest.fixture(params=_all_arithmetic_operators) -def all_arithmetic_operators(request): +@pytest.fixture(params=["first", "last", False]) +def keep(request): """ - Fixture for dunder names for common arithmetic operations. + Valid values for the 'keep' parameter used in + .duplicated or .drop_duplicates """ return request.param -@pytest.fixture( - params=[ - operator.add, - ops.radd, - operator.sub, - ops.rsub, - operator.mul, - ops.rmul, - operator.truediv, - ops.rtruediv, - operator.floordiv, - ops.rfloordiv, - operator.mod, - ops.rmod, - operator.pow, - ops.rpow, - ] -) -def all_arithmetic_functions(request): +@pytest.fixture(params=["left", "right", "both", "neither"]) +def closed(request): """ - Fixture for operator and roperator arithmetic functions. - - Notes - ----- - This includes divmod and rdivmod, whereas all_arithmetic_operators - does not. + Fixture for trying all interval closed parameters. 
""" return request.param -_all_numeric_reductions = [ - "sum", - "max", - "min", - "mean", - "prod", - "std", - "var", - "median", - "kurt", - "skew", -] - - -@pytest.fixture(params=_all_numeric_reductions) -def all_numeric_reductions(request): +@pytest.fixture(params=["left", "right", "both", "neither"]) +def other_closed(request): """ - Fixture for numeric reduction names. + Secondary closed fixture to allow parametrizing over all pairs of closed. """ return request.param -_all_boolean_reductions = ["all", "any"] - - -@pytest.fixture(params=_all_boolean_reductions) -def all_boolean_reductions(request): +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) +def compression(request): """ - Fixture for boolean reduction names. + Fixture for trying common compression types in compression tests. """ return request.param -_cython_table = pd.core.base.SelectionMixin._cython_table.items() - - -@pytest.fixture(params=list(_cython_table)) -def cython_table_items(request): +@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) +def compression_only(request): """ - Yields a tuple of a function and its corresponding name. Correspond to - the list of aggregator "Cython functions" used on selected table items. + Fixture for trying common compression types in compression tests excluding + uncompressed case. """ return request.param -def _get_cython_table_params(ndframe, func_names_and_expected): +@pytest.fixture(params=[True, False]) +def writable(request): """ - Combine frame, functions from SelectionMixin._cython_table - keys and expected result. - - Parameters - ---------- - ndframe : DataFrame or Series - func_names_and_expected : Sequence of two items - The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value. - - Returns - ------- - list - List of three items (DataFrame, function, expected result) + Fixture that an array is writable. """ - results = [] - for func_name, expected in func_names_and_expected: - results.append((ndframe, func_name, expected)) - results += [ - (ndframe, func, expected) - for func, name in _cython_table - if name == func_name - ] - return results + return request.param -@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) -def all_compare_operators(request): +@pytest.fixture(params=["inner", "outer", "left", "right"]) +def join_type(request): """ - Fixture for dunder names for common compare operations - - * >= - * > - * == - * != - * < - * <= + Fixture for trying all types of join operations. """ return request.param -@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"]) -def compare_operators_no_eq_ne(request): +@pytest.fixture(params=["nlargest", "nsmallest"]) +def nselect_method(request): """ - Fixture for dunder names for compare operations except == and != - - * >= - * > - * < - * <= + Fixture for trying all nselect methods. """ return request.param -@pytest.fixture( - params=["__and__", "__rand__", "__or__", "__ror__", "__xor__", "__rxor__"] -) -def all_logical_operators(request): +# ---------------------------------------------------------------- +# Missing values & co. +# ---------------------------------------------------------------- +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN"), pd.NA]) +def nulls_fixture(request): """ - Fixture for dunder names for common logical operations - - * | - * & - * ^ + Fixture for each null type in pandas. 
""" return request.param -@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) -def compression(request): - """ - Fixture for trying common compression types in compression tests. - """ - return request.param +nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture -@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) -def compression_only(request): +@pytest.fixture(params=[None, np.nan, pd.NaT]) +def unique_nulls_fixture(request): """ - Fixture for trying common compression types in compression tests excluding - uncompressed case. + Fixture for each null type in pandas, each null type exactly once. """ return request.param -@pytest.fixture(params=[True, False]) -def writable(request): - """ - Fixture that an array is writable. - """ - return request.param +# Generate cartesian product of unique_nulls_fixture: +unique_nulls_fixture2 = unique_nulls_fixture -@pytest.fixture(scope="module") -def datetime_tz_utc(): - """ - Yields the UTC timezone object from the datetime module. +# ---------------------------------------------------------------- +# Classes +# ---------------------------------------------------------------- +@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) +def index_or_series(request): """ - return timezone.utc + Fixture to parametrize over Index and Series, made necessary by a mypy + bug, giving an error: + List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" -@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) -def utc_fixture(request): - """ - Fixture to provide variants of UTC timezone strings and tzinfo objects. + See GH#29725 """ return request.param -@pytest.fixture(params=["inner", "outer", "left", "right"]) -def join_type(request): +@pytest.fixture +def dict_subclass(): """ - Fixture for trying all types of join operations. + Fixture for a dictionary subclass. """ - return request.param + class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) -@pytest.fixture -def strict_data_files(pytestconfig): - """ - Returns the configuration for the test setting `--strict-data-files`. - """ - return pytestconfig.getoption("--strict-data-files") + return TestSubDict @pytest.fixture -def datapath(strict_data_files): +def non_mapping_dict_subclass(): + """ + Fixture for a non-mapping dictionary subclass. """ - Get the path to a data file. - Parameters - ---------- - path : str - Path to the file, relative to ``pandas/tests/`` + class TestNonDictMapping(abc.Mapping): + def __init__(self, underlying_dict): + self._data = underlying_dict - Returns - ------- - path including ``pandas/tests``. + def __getitem__(self, key): + return self._data.__getitem__(key) - Raises - ------ - ValueError - If the path doesn't exist and the --strict-data-files option is set. - """ - BASE_PATH = os.path.join(os.path.dirname(__file__), "tests") + def __iter__(self): + return self._data.__iter__() - def deco(*args): - path = os.path.join(BASE_PATH, *args) - if not os.path.exists(path): - if strict_data_files: - raise ValueError( - f"Could not find file {path} and --strict-data-files is set." 
- ) - else: - pytest.skip(f"Could not find {path}.") - return path + def __len__(self): + return self._data.__len__() - return deco + return TestNonDictMapping +# ---------------------------------------------------------------- +# Indices +# ---------------------------------------------------------------- @pytest.fixture -def iris(datapath): +def multiindex_year_month_day_dataframe_random_data(): """ - The iris dataset as a DataFrame. + DataFrame with 3 level MultiIndex (year, month, day) covering + first 100 business days from 2000-01-01 with random data """ - return pd.read_csv(datapath("data", "iris.csv")) + tdf = tm.makeTimeDataFrame(100) + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() + # use Int64Index, to make sure things work + ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index.set_names(["year", "month", "day"], inplace=True) + return ymd -@pytest.fixture(params=["nlargest", "nsmallest"]) -def nselect_method(request): +def _create_multiindex(): """ - Fixture for trying all nselect methods. + MultiIndex used to test the general functionality of this object """ - return request.param + # See Also: tests.multi.conftest.idx + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) -@pytest.fixture(params=["left", "right", "both", "neither"]) -def closed(request): - """ - Fixture for trying all interval closed parameters. - """ - return request.param + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi -@pytest.fixture(params=["left", "right", "both", "neither"]) -def other_closed(request): +indices_dict = { + "unicode": tm.makeUnicodeIndex(100), + "string": tm.makeStringIndex(100), + "datetime": tm.makeDateIndex(100), + "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), + "period": tm.makePeriodIndex(100), + "timedelta": tm.makeTimedeltaIndex(100), + "int": tm.makeIntIndex(100), + "uint": tm.makeUIntIndex(100), + "range": tm.makeRangeIndex(100), + "float": tm.makeFloatIndex(100), + "bool": tm.makeBoolIndex(10), + "categorical": tm.makeCategoricalIndex(100), + "interval": tm.makeIntervalIndex(100), + "empty": Index([]), + "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + "multi": _create_multiindex(), + "repeats": Index([0, 0, 1, 1, 2, 2]), +} + + +@pytest.fixture(params=indices_dict.keys()) +def indices(request): """ - Secondary closed fixture to allow parametrizing over all pairs of closed. + Fixture for many "simple" kinds of indices. + + These indices are unlikely to cover corner cases, e.g. + - no names + - no NaTs/NaNs + - no values near implementation bounds + - ... """ - return request.param + # copy to avoid mutation, e.g. setting .name + return indices_dict[request.param].copy() -@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN"), pd.NA]) -def nulls_fixture(request): +# ---------------------------------------------------------------- +# Series' +# ---------------------------------------------------------------- +@pytest.fixture +def empty_series(): + return pd.Series([], index=[], dtype=np.float64) + + +@pytest.fixture +def string_series(): """ - Fixture for each null type in pandas. 
+ Fixture for Series of floats with Index of unique strings """ - return request.param + s = tm.makeStringSeries() + s.name = "series" + return s -nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture +@pytest.fixture +def object_series(): + """ + Fixture for Series of dtype object with Index of unique strings + """ + s = tm.makeObjectSeries() + s.name = "objects" + return s -@pytest.fixture(params=[None, np.nan, pd.NaT]) -def unique_nulls_fixture(request): +@pytest.fixture +def datetime_series(): """ - Fixture for each null type in pandas, each null type exactly once. + Fixture for Series of floats with DatetimeIndex """ - return request.param + s = tm.makeTimeSeries() + s.name = "ts" + return s -# Generate cartesian product of unique_nulls_fixture: -unique_nulls_fixture2 = unique_nulls_fixture +def _create_series(index): + """ Helper for the _series dict """ + size = len(index) + data = np.random.randn(size) + return pd.Series(data, index=index, name="a") -TIMEZONES = [ - None, - "UTC", - "US/Eastern", - "Asia/Tokyo", - "dateutil/US/Pacific", - "dateutil/Asia/Singapore", - tzutc(), - tzlocal(), - FixedOffset(300), - FixedOffset(0), - FixedOffset(-300), - timezone.utc, - timezone(timedelta(hours=1)), - timezone(timedelta(hours=-1), name="foo"), -] -TIMEZONE_IDS = [repr(i) for i in TIMEZONES] +_series = { + f"series-with-{index_id}-index": _create_series(index) + for index_id, index in indices_dict.items() +} -@td.parametrize_fixture_doc(str(TIMEZONE_IDS)) -@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS) -def tz_naive_fixture(request): +@pytest.fixture +def series_with_simple_index(indices): """ - Fixture for trying timezones including default (None): {0} + Fixture for tests on series with changing types of indices. """ - return request.param + return _create_series(indices) -@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:])) -@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:]) -def tz_aware_fixture(request): +_narrow_dtypes = [ + np.float16, + np.float32, + np.int8, + np.int16, + np.int32, + np.uint8, + np.uint16, + np.uint32, +] +_narrow_series = { + f"{dtype.__name__}-series": tm.makeFloatSeries(name="a").astype(dtype) + for dtype in _narrow_dtypes +} + + +@pytest.fixture(params=_narrow_series.keys()) +def narrow_series(request): """ - Fixture for trying explicit timezones: {0} + Fixture for Series with low precision data types """ - return request.param + # copy to avoid mutation, e.g. setting .name + return _narrow_series[request.param].copy() -# Generate cartesian product of tz_aware_fixture: -tz_aware_fixture2 = tz_aware_fixture +_index_or_series_objs = {**indices_dict, **_series, **_narrow_series} + + +@pytest.fixture(params=_index_or_series_objs.keys()) +def index_or_series_obj(request): + """ + Fixture for tests on indexes, series and series with a narrow dtype + copy to avoid mutation, e.g. 
setting .name + """ + return _index_or_series_objs[request.param].copy(deep=True) # ---------------------------------------------------------------- -# Dtypes +# DataFrames # ---------------------------------------------------------------- +@pytest.fixture +def float_frame(): + """ + Fixture for DataFrame of floats with index of unique strings -UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] -UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"] -SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] -SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"] -ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES -ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES - -FLOAT_DTYPES = [float, "float32", "float64"] -COMPLEX_DTYPES = [complex, "complex64", "complex128"] -STRING_DTYPES = [str, "str", "U"] + Columns are ['A', 'B', 'C', 'D']. -DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] -TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 -BOOL_DTYPES = [bool, "bool"] -BYTES_DTYPES = [bytes, "bytes"] -OBJECT_DTYPES = [object, "object"] + [30 rows x 4 columns] + """ + return DataFrame(tm.getSeriesData()) -ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = ( - ALL_REAL_DTYPES - + COMPLEX_DTYPES - + STRING_DTYPES - + DATETIME64_DTYPES - + TIMEDELTA64_DTYPES - + BOOL_DTYPES - + OBJECT_DTYPES - + BYTES_DTYPES -) + +# ---------------------------------------------------------------- +# Operators & Operations +# ---------------------------------------------------------------- +_all_arithmetic_operators = [ + "__add__", + "__radd__", + "__sub__", + "__rsub__", + "__mul__", + "__rmul__", + "__floordiv__", + "__rfloordiv__", + "__truediv__", + "__rtruediv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", +] -@pytest.fixture(params=STRING_DTYPES) -def string_dtype(request): +@pytest.fixture(params=_all_arithmetic_operators) +def all_arithmetic_operators(request): """ - Parametrized fixture for string dtypes. - - * str - * 'str' - * 'U' + Fixture for dunder names for common arithmetic operations. """ return request.param -@pytest.fixture(params=BYTES_DTYPES) -def bytes_dtype(request): +@pytest.fixture( + params=[ + operator.add, + ops.radd, + operator.sub, + ops.rsub, + operator.mul, + ops.rmul, + operator.truediv, + ops.rtruediv, + operator.floordiv, + ops.rfloordiv, + operator.mod, + ops.rmod, + operator.pow, + ops.rpow, + ] +) +def all_arithmetic_functions(request): """ - Parametrized fixture for bytes dtypes. + Fixture for operator and roperator arithmetic functions. - * bytes - * 'bytes' + Notes + ----- + This includes divmod and rdivmod, whereas all_arithmetic_operators + does not. 
""" return request.param -@pytest.fixture(params=OBJECT_DTYPES) -def object_dtype(request): - """ - Parametrized fixture for object dtypes. - - * object - * 'object' - """ - return request.param +_all_numeric_reductions = [ + "sum", + "max", + "min", + "mean", + "prod", + "std", + "var", + "median", + "kurt", + "skew", +] -@pytest.fixture(params=DATETIME64_DTYPES) -def datetime64_dtype(request): +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): """ - Parametrized fixture for datetime64 dtypes. - - * 'datetime64[ns]' - * 'M8[ns]' + Fixture for numeric reduction names. """ return request.param -@pytest.fixture(params=TIMEDELTA64_DTYPES) -def timedelta64_dtype(request): - """ - Parametrized fixture for timedelta64 dtypes. - - * 'timedelta64[ns]' - * 'm8[ns]' - """ - return request.param +_all_boolean_reductions = ["all", "any"] -@pytest.fixture(params=FLOAT_DTYPES) -def float_dtype(request): +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): """ - Parameterized fixture for float dtypes. - - * float - * 'float32' - * 'float64' + Fixture for boolean reduction names. """ return request.param -@pytest.fixture(params=COMPLEX_DTYPES) -def complex_dtype(request): +@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) +def all_compare_operators(request): """ - Parameterized fixture for complex dtypes. + Fixture for dunder names for common compare operations - * complex - * 'complex64' - * 'complex128' + * >= + * > + * == + * != + * < + * <= """ return request.param -@pytest.fixture(params=SIGNED_INT_DTYPES) -def sint_dtype(request): +@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"]) +def compare_operators_no_eq_ne(request): """ - Parameterized fixture for signed integer dtypes. + Fixture for dunder names for compare operations except == and != - * int - * 'int8' - * 'int16' - * 'int32' - * 'int64' + * >= + * > + * < + * <= """ return request.param -@pytest.fixture(params=UNSIGNED_INT_DTYPES) -def uint_dtype(request): +@pytest.fixture( + params=["__and__", "__rand__", "__or__", "__ror__", "__xor__", "__rxor__"] +) +def all_logical_operators(request): """ - Parameterized fixture for unsigned integer dtypes. + Fixture for dunder names for common logical operations - * 'uint8' - * 'uint16' - * 'uint32' - * 'uint64' + * | + * & + * ^ """ return request.param -@pytest.fixture(params=ALL_INT_DTYPES) -def any_int_dtype(request): +# ---------------------------------------------------------------- +# Data sets/files +# ---------------------------------------------------------------- +@pytest.fixture +def strict_data_files(pytestconfig): """ - Parameterized fixture for any integer dtype. - - * int - * 'int8' - * 'uint8' - * 'int16' - * 'uint16' - * 'int32' - * 'uint32' - * 'int64' - * 'uint64' + Returns the configuration for the test setting `--strict-data-files`. """ - return request.param + return pytestconfig.getoption("--strict-data-files") -@pytest.fixture(params=ALL_EA_INT_DTYPES) -def any_nullable_int_dtype(request): +@pytest.fixture +def datapath(strict_data_files): """ - Parameterized fixture for any nullable integer dtype. + Get the path to a data file. - * 'UInt8' - * 'Int8' - * 'UInt16' - * 'Int16' - * 'UInt32' - * 'Int32' - * 'UInt64' - * 'Int64' - """ - return request.param + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + Returns + ------- + path including ``pandas/tests``. 
-@pytest.fixture(params=ALL_REAL_DTYPES) -def any_real_dtype(request): + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. """ - Parameterized fixture for any (purely) real numeric dtype. + BASE_PATH = os.path.join(os.path.dirname(__file__), "tests") - * int - * 'int8' - * 'uint8' - * 'int16' - * 'uint16' - * 'int32' - * 'uint32' - * 'int64' - * 'uint64' - * float - * 'float32' - * 'float64' + def deco(*args): + path = os.path.join(BASE_PATH, *args) + if not os.path.exists(path): + if strict_data_files: + raise ValueError( + f"Could not find file {path} and --strict-data-files is set." + ) + else: + pytest.skip(f"Could not find {path}.") + return path + + return deco + + +@pytest.fixture +def iris(datapath): + """ + The iris dataset as a DataFrame. + """ + return pd.read_csv(datapath("data", "iris.csv")) + + +# ---------------------------------------------------------------- +# Time zones +# ---------------------------------------------------------------- +TIMEZONES = [ + None, + "UTC", + "US/Eastern", + "Asia/Tokyo", + "dateutil/US/Pacific", + "dateutil/Asia/Singapore", + tzutc(), + tzlocal(), + FixedOffset(300), + FixedOffset(0), + FixedOffset(-300), + timezone.utc, + timezone(timedelta(hours=1)), + timezone(timedelta(hours=-1), name="foo"), +] +TIMEZONE_IDS = [repr(i) for i in TIMEZONES] + + +@td.parametrize_fixture_doc(str(TIMEZONE_IDS)) +@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS) +def tz_naive_fixture(request): + """ + Fixture for trying timezones including default (None): {0} """ return request.param -@pytest.fixture(params=ALL_NUMPY_DTYPES) -def any_numpy_dtype(request): +@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:])) +@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:]) +def tz_aware_fixture(request): """ - Parameterized fixture for all numpy dtypes. + Fixture for trying explicit timezones: {0} + """ + return request.param + + +# Generate cartesian product of tz_aware_fixture: +tz_aware_fixture2 = tz_aware_fixture + + +@pytest.fixture(scope="module") +def datetime_tz_utc(): + """ + Yields the UTC timezone object from the datetime module. + """ + return timezone.utc + + +@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) +def utc_fixture(request): + """ + Fixture to provide variants of UTC timezone strings and tzinfo objects. 
+ """ + return request.param + + +# ---------------------------------------------------------------- +# Dtypes +# ---------------------------------------------------------------- + +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"] +SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] +SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES +ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES + +FLOAT_DTYPES = [float, "float32", "float64"] +COMPLEX_DTYPES = [complex, "complex64", "complex128"] +STRING_DTYPES = [str, "str", "U"] + +DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] +TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] + +BOOL_DTYPES = [bool, "bool"] +BYTES_DTYPES = [bytes, "bytes"] +OBJECT_DTYPES = [object, "object"] + +ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES +ALL_NUMPY_DTYPES = ( + ALL_REAL_DTYPES + + COMPLEX_DTYPES + + STRING_DTYPES + + DATETIME64_DTYPES + + TIMEDELTA64_DTYPES + + BOOL_DTYPES + + OBJECT_DTYPES + + BYTES_DTYPES +) + + +@pytest.fixture(params=STRING_DTYPES) +def string_dtype(request): + """ + Parametrized fixture for string dtypes. - * bool - * 'bool' - * int - * 'int8' - * 'uint8' - * 'int16' - * 'uint16' - * 'int32' - * 'uint32' - * 'int64' - * 'uint64' - * float - * 'float32' - * 'float64' - * complex - * 'complex64' - * 'complex128' * str * 'str' * 'U' + """ + return request.param + + +@pytest.fixture(params=BYTES_DTYPES) +def bytes_dtype(request): + """ + Parametrized fixture for bytes dtypes. + * bytes * 'bytes' - * 'datetime64[ns]' - * 'M8[ns]' - * 'timedelta64[ns]' - * 'm8[ns]' + """ + return request.param + + +@pytest.fixture(params=OBJECT_DTYPES) +def object_dtype(request): + """ + Parametrized fixture for object dtypes. + * object * 'object' """ return request.param -# categoricals are handled separately -_any_skipna_inferred_dtype = [ - ("string", ["a", np.nan, "c"]), - ("string", ["a", pd.NA, "c"]), - ("bytes", [b"a", np.nan, b"c"]), - ("empty", [np.nan, np.nan, np.nan]), - ("empty", []), - ("mixed-integer", ["a", np.nan, 2]), - ("mixed", ["a", np.nan, 2.0]), - ("floating", [1.0, np.nan, 2.0]), - ("integer", [1, np.nan, 2]), - ("mixed-integer-float", [1, np.nan, 2.0]), - ("decimal", [Decimal(1), np.nan, Decimal(2)]), - ("boolean", [True, np.nan, False]), - ("boolean", [True, pd.NA, False]), - ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), - ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), - ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), - # The following two dtypes are commented out due to GH 23554 - # ('complex', [1 + 1j, np.nan, 2 + 2j]), - # ('timedelta64', [np.timedelta64(1, 'D'), - # np.nan, np.timedelta64(2, 'D')]), - ("timedelta", [timedelta(1), np.nan, timedelta(2)]), - ("time", [time(1), np.nan, time(2)]), - ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), - ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), -] -ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id +@pytest.fixture(params=DATETIME64_DTYPES) +def datetime64_dtype(request): + """ + Parametrized fixture for datetime64 dtypes. 
+ + * 'datetime64[ns]' + * 'M8[ns]' + """ + return request.param -@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) -def any_skipna_inferred_dtype(request): +@pytest.fixture(params=TIMEDELTA64_DTYPES) +def timedelta64_dtype(request): """ - Fixture for all inferred dtypes from _libs.lib.infer_dtype + Parametrized fixture for timedelta64 dtypes. - The covered (inferred) types are: - * 'string' - * 'empty' - * 'bytes' - * 'mixed' - * 'mixed-integer' - * 'mixed-integer-float' - * 'floating' - * 'integer' - * 'decimal' - * 'boolean' - * 'datetime64' - * 'datetime' - * 'date' - * 'timedelta' - * 'time' - * 'period' - * 'interval' + * 'timedelta64[ns]' + * 'm8[ns]' + """ + return request.param - Returns - ------- - inferred_dtype : str - The string for the inferred dtype from _libs.lib.infer_dtype - values : np.ndarray - An array of object dtype that will be inferred to have - `inferred_dtype` - Examples - -------- - >>> import pandas._libs.lib as lib - >>> - >>> def test_something(any_skipna_inferred_dtype): - ... inferred_dtype, values = any_skipna_inferred_dtype - ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype +@pytest.fixture(params=FLOAT_DTYPES) +def float_dtype(request): """ - inferred_dtype, values = request.param - values = np.array(values, dtype=object) # object dtype to avoid casting + Parameterized fixture for float dtypes. - # correctness of inference tested in tests/dtypes/test_inference.py - return inferred_dtype, values + * float + * 'float32' + * 'float64' + """ + return request.param -@pytest.fixture( - params=[ - getattr(pd.offsets, o) - for o in pd.offsets.__all__ - if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) - ] -) -def tick_classes(request): +@pytest.fixture(params=COMPLEX_DTYPES) +def complex_dtype(request): """ - Fixture for Tick based datetime offsets available for a time series. + Parameterized fixture for complex dtypes. + + * complex + * 'complex64' + * 'complex128' """ return request.param -# ---------------------------------------------------------------- -# Global setup for tests using Hypothesis +@pytest.fixture(params=SIGNED_INT_DTYPES) +def sint_dtype(request): + """ + Parameterized fixture for signed integer dtypes. + * int + * 'int8' + * 'int16' + * 'int32' + * 'int64' + """ + return request.param -# Registering these strategies makes them globally available via st.from_type, -# which is use for offsets in tests/tseries/offsets/test_offsets_properties.py -for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): - cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy( - cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) - ) -for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): - cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy( - cls, - st.builds( - cls, - n=st.integers(-5, 5), - normalize=st.booleans(), - month=st.integers(min_value=1, max_value=12), - ), - ) +@pytest.fixture(params=UNSIGNED_INT_DTYPES) +def uint_dtype(request): + """ + Parameterized fixture for unsigned integer dtypes. 
-for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): - cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy( - cls, - st.builds( - cls, - n=st.integers(-24, 24), - normalize=st.booleans(), - startingMonth=st.integers(min_value=1, max_value=12), - ), - ) + * 'uint8' + * 'uint16' + * 'uint32' + * 'uint64' + """ + return request.param -@pytest.fixture -def datetime_series(): +@pytest.fixture(params=ALL_INT_DTYPES) +def any_int_dtype(request): """ - Fixture for Series of floats with DatetimeIndex + Parameterized fixture for any integer dtype. + + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' """ - s = tm.makeTimeSeries() - s.name = "ts" - return s + return request.param -@pytest.fixture -def float_frame(): +@pytest.fixture(params=ALL_EA_INT_DTYPES) +def any_nullable_int_dtype(request): """ - Fixture for DataFrame of floats with index of unique strings + Parameterized fixture for any nullable integer dtype. - Columns are ['A', 'B', 'C', 'D']. + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + """ + return request.param - A B C D - P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 - qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 - tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 - wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 - M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 - QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 - r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 - ... ... ... ... ... - IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 - lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 - qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 - yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 - 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 - eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 - xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 - [30 rows x 4 columns] +@pytest.fixture(params=ALL_REAL_DTYPES) +def any_real_dtype(request): """ - return DataFrame(tm.getSeriesData()) - + Parameterized fixture for any (purely) real numeric dtype. -@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) -def index_or_series(request): + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' """ - Fixture to parametrize over Index and Series, made necessary by a mypy - bug, giving an error: + return request.param - List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" - See GH#29725 +@pytest.fixture(params=ALL_NUMPY_DTYPES) +def any_numpy_dtype(request): + """ + Parameterized fixture for all numpy dtypes. + + * bool + * 'bool' + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + * complex + * 'complex64' + * 'complex128' + * str + * 'str' + * 'U' + * bytes + * 'bytes' + * 'datetime64[ns]' + * 'M8[ns]' + * 'timedelta64[ns]' + * 'm8[ns]' + * object + * 'object' """ return request.param -@pytest.fixture -def dict_subclass(): - """ - Fixture for a dictionary subclass. 
- """ - - class TestSubDict(dict): - def __init__(self, *args, **kwargs): - dict.__init__(self, *args, **kwargs) - - return TestSubDict +# categoricals are handled separately +_any_skipna_inferred_dtype = [ + ("string", ["a", np.nan, "c"]), + ("string", ["a", pd.NA, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), + ("mixed", ["a", np.nan, 2.0]), + ("floating", [1.0, np.nan, 2.0]), + ("integer", [1, np.nan, 2]), + ("mixed-integer-float", [1, np.nan, 2.0]), + ("decimal", [Decimal(1), np.nan, Decimal(2)]), + ("boolean", [True, np.nan, False]), + ("boolean", [True, pd.NA, False]), + ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), + ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), + ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + # The following two dtypes are commented out due to GH 23554 + # ('complex', [1 + 1j, np.nan, 2 + 2j]), + # ('timedelta64', [np.timedelta64(1, 'D'), + # np.nan, np.timedelta64(2, 'D')]), + ("timedelta", [timedelta(1), np.nan, timedelta(2)]), + ("time", [time(1), np.nan, time(2)]), + ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), +] +ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id -@pytest.fixture -def non_mapping_dict_subclass(): - """ - Fixture for a non-mapping dictionary subclass. +@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) +def any_skipna_inferred_dtype(request): """ + Fixture for all inferred dtypes from _libs.lib.infer_dtype - class TestNonDictMapping(abc.Mapping): - def __init__(self, underlying_dict): - self._data = underlying_dict - - def __getitem__(self, key): - return self._data.__getitem__(key) - - def __iter__(self): - return self._data.__iter__() - - def __len__(self): - return self._data.__len__() - - return TestNonDictMapping - - -def _gen_mi(): - # a MultiIndex used to test the general functionality of this object - - # See Also: tests.multi.conftest.idx - major_axis = Index(["foo", "bar", "baz", "qux"]) - minor_axis = Index(["one", "two"]) + The covered (inferred) types are: + * 'string' + * 'empty' + * 'bytes' + * 'mixed' + * 'mixed-integer' + * 'mixed-integer-float' + * 'floating' + * 'integer' + * 'decimal' + * 'boolean' + * 'datetime64' + * 'datetime' + * 'date' + * 'timedelta' + * 'time' + * 'period' + * 'interval' - major_codes = np.array([0, 0, 1, 2, 3, 3]) - minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ["first", "second"] - mi = MultiIndex( - levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, - verify_integrity=False, - ) - return mi + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype + ... # will pass + ... 
assert lib.infer_dtype(values, skipna=True) == inferred_dtype + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting -indices_dict = { - "unicode": tm.makeUnicodeIndex(100), - "string": tm.makeStringIndex(100), - "datetime": tm.makeDateIndex(100), - "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), - "period": tm.makePeriodIndex(100), - "timedelta": tm.makeTimedeltaIndex(100), - "int": tm.makeIntIndex(100), - "uint": tm.makeUIntIndex(100), - "range": tm.makeRangeIndex(100), - "float": tm.makeFloatIndex(100), - "bool": tm.makeBoolIndex(10), - "categorical": tm.makeCategoricalIndex(100), - "interval": tm.makeIntervalIndex(100), - "empty": Index([]), - "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), - "multi": _gen_mi(), - "repeats": Index([0, 0, 1, 1, 2, 2]), -} + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values -@pytest.fixture(params=indices_dict.keys()) -def indices(request): +# ---------------------------------------------------------------- +# Misc +# ---------------------------------------------------------------- +@pytest.fixture +def ip(): """ - Fixture for many "simple" kinds of indices. + Get an instance of IPython.InteractiveShell. - These indices are unlikely to cover corner cases, e.g. - - no names - - no NaTs/NaNs - - no values near implementation bounds - - ... + Will raise a skip if IPython is not installed. """ - # copy to avoid mutation, e.g. setting .name - return indices_dict[request.param].copy() - - -def _create_series(index): - """ Helper for the _series dict """ - size = len(index) - data = np.random.randn(size) - return pd.Series(data, index=index, name="a") - + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.interactiveshell import InteractiveShell -_series = { - f"series-with-{index_id}-index": _create_series(index) - for index_id, index in indices_dict.items() -} + return InteractiveShell() -@pytest.fixture -def series_with_simple_index(indices): +@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) +def spmatrix(request): """ - Fixture for tests on series with changing types of indices. + Yields scipy sparse matrix classes. """ - return _create_series(indices) + from scipy import sparse + return getattr(sparse, request.param + "_matrix") -_narrow_dtypes = [ - np.float16, - np.float32, - np.int8, - np.int16, - np.int32, - np.uint8, - np.uint16, - np.uint32, -] -_narrow_series = { - f"{dtype.__name__}-series": tm.makeFloatSeries(name="a").astype(dtype) - for dtype in _narrow_dtypes -} +_cython_table = pd.core.base.SelectionMixin._cython_table.items() -@pytest.fixture(params=_narrow_series.keys()) -def narrow_series(request): + +@pytest.fixture(params=list(_cython_table)) +def cython_table_items(request): """ - Fixture for Series with low precision data types + Yields a tuple of a function and its corresponding name. Correspond to + the list of aggregator "Cython functions" used on selected table items. """ - # copy to avoid mutation, e.g. setting .name - return _narrow_series[request.param].copy() + return request.param -_index_or_series_objs = {**indices_dict, **_series, **_narrow_series} +def _get_cython_table_params(ndframe, func_names_and_expected): + """ + Combine frame, functions from SelectionMixin._cython_table + keys and expected result. 
+ Parameters + ---------- + ndframe : DataFrame or Series + func_names_and_expected : Sequence of two items + The first item is a name of a NDFrame method ('sum', 'prod') etc. + The second item is the expected return value. -@pytest.fixture(params=_index_or_series_objs.keys()) -def index_or_series_obj(request): - """ - Fixture for tests on indexes, series and series with a narrow dtype - copy to avoid mutation, e.g. setting .name + Returns + ------- + list + List of three items (DataFrame, function, expected result) """ - return _index_or_series_objs[request.param].copy(deep=True) + results = [] + for func_name, expected in func_names_and_expected: + results.append((ndframe, func_name, expected)) + results += [ + (ndframe, func, expected) + for func, name in _cython_table + if name == func_name + ] + return results -@pytest.fixture -def multiindex_year_month_day_dataframe_random_data(): +@pytest.fixture( + params=[ + getattr(pd.offsets, o) + for o in pd.offsets.__all__ + if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) + ] +) +def tick_classes(request): """ - DataFrame with 3 level MultiIndex (year, month, day) covering - first 100 business days from 2000-01-01 with random data + Fixture for Tick based datetime offsets available for a time series. """ - tdf = tm.makeTimeDataFrame(100) - ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() - # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) - ymd.index.set_names(["year", "month", "day"], inplace=True) - return ymd + return request.param diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f9059054ba59f..5b324bc5753ec 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,6 +11,7 @@ from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT +from pandas._typing import AnyArrayLike from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -45,10 +46,14 @@ is_unsigned_integer_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.missing import isna, na_value_for_dtype -import pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices @@ -384,7 +389,7 @@ def unique(values): unique1d = unique -def isin(comps, values) -> np.ndarray: +def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: """ Compute the isin boolean array. 
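A minimal sketch of the annotated signature in use (``isin`` here is the
internal helper behind ``Series.isin``; the data below is illustrative):

    import numpy as np
    import pandas as pd
    from pandas.core.algorithms import isin

    # comps and values may each be an Index, Series, ExtensionArray, or ndarray
    mask = isin(pd.Series(["a", "b", "c"]), np.array(["b", "z"]))
    # mask -> array([False,  True, False])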
@@ -409,15 +414,14 @@ def isin(comps, values) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps._values.isin(values) - - comps = com.values_from_object(comps) + return comps.isin(values) # type: ignore comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) @@ -2021,9 +2025,7 @@ def sort_mixed(values): ) codes = ensure_platform_int(np.asarray(codes)) - from pandas import Index - - if not assume_unique and not Index(values).is_unique: + if not assume_unique and not len(unique(values)) == len(values): raise ValueError("values should be unique if codes is not None") if sorter is None: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 70e0a129c055f..ceb45bc71326e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -179,7 +179,7 @@ def get_result(self): return self.apply_empty_result() # raw - elif self.raw and not self.obj._is_mixed_type: + elif self.raw: return self.apply_raw() return self.apply_standard() diff --git a/pandas/core/array_algos/__init__.py b/pandas/core/array_algos/__init__.py new file mode 100644 index 0000000000000..a7655a013c6cf --- /dev/null +++ b/pandas/core/array_algos/__init__.py @@ -0,0 +1,9 @@ +""" +core.array_algos is for algorithms that operate on ndarray and ExtensionArray. +These should: + +- Assume that any Index, Series, or DataFrame objects have already been unwrapped. +- Assume that any list arguments have already been cast to ndarray/EA. +- Not depend on Index, Series, or DataFrame, nor import any of these. +- May dispatch to ExtensionArray methods, but should not import from core.arrays. +""" diff --git a/pandas/core/array_algos/transforms.py b/pandas/core/array_algos/transforms.py new file mode 100644 index 0000000000000..f775b6d733d9c --- /dev/null +++ b/pandas/core/array_algos/transforms.py @@ -0,0 +1,33 @@ +""" +transforms.py is for shape-preserving functions. 
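+
+For example, ``shift`` moves values along an axis without changing the
+array's shape, filling the vacated positions with ``fill_value``
+(illustrative call):
+
+    shift(np.array([1, 2, 3, 4]), periods=1, axis=0, fill_value=-1)
+    # -> array([-1,  1,  2,  3])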
+""" + +import numpy as np + +from pandas.core.dtypes.common import ensure_platform_int + + +def shift(values: np.ndarray, periods: int, axis: int, fill_value) -> np.ndarray: + new_values = values + + # make sure array sent to np.roll is c_contiguous + f_ordered = values.flags.f_contiguous + if f_ordered: + new_values = new_values.T + axis = new_values.ndim - axis - 1 + + if np.prod(new_values.shape): + new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) + + axis_indexer = [slice(None)] * values.ndim + if periods > 0: + axis_indexer[axis] = slice(None, periods) + else: + axis_indexer[axis] = slice(periods, None) + new_values[tuple(axis_indexer)] = fill_value + + # restore original order + if f_ordered: + new_values = new_values.T + + return new_values diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b5da6d4c11616..67e3807c477fb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -21,7 +21,7 @@ from pandas.core.dtypes.common import is_array_like, is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -93,7 +93,6 @@ class ExtensionArray: _from_factorized _from_sequence _from_sequence_of_strings - _ndarray_values _reduce _values_for_argsort _values_for_factorize @@ -356,7 +355,9 @@ def __iter__(self): for i in range(len(self)): yield self[i] - def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: """ Convert to a NumPy ndarray. @@ -407,6 +408,13 @@ def shape(self) -> Tuple[int, ...]: """ return (len(self),) + @property + def size(self) -> int: + """ + The number of elements in the array. + """ + return np.prod(self.shape) + @property def ndim(self) -> int: """ @@ -583,7 +591,7 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": """ Shift values by desired number. @@ -720,7 +728,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: """ Encode the extension array as an enumerated type. @@ -825,7 +833,7 @@ def repeat(self, repeats, axis=None): def take( self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> ABCExtensionArray: + ) -> "ExtensionArray": """ Take elements from an array. @@ -914,7 +922,7 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self) -> ABCExtensionArray: + def copy(self) -> "ExtensionArray": """ Return a copy of the array. @@ -924,7 +932,7 @@ def copy(self) -> ABCExtensionArray: """ raise AbstractMethodError(self) - def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: + def view(self, dtype=None) -> ArrayLike: """ Return a view on the array. @@ -935,8 +943,8 @@ def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: Returns ------- - ExtensionArray - A view of the :class:`ExtensionArray`. 
+ ExtensionArray or np.ndarray + A view on the :class:`ExtensionArray`'s data. """ # NB: # - This must return a *new* object referencing the same data, not self. @@ -994,7 +1002,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: # Reshaping # ------------------------------------------------------------------------ - def ravel(self, order="C") -> ABCExtensionArray: + def ravel(self, order="C") -> "ExtensionArray": """ Return a flattened view on this array. @@ -1015,8 +1023,8 @@ def ravel(self, order="C") -> ABCExtensionArray: @classmethod def _concat_same_type( - cls, to_concat: Sequence[ABCExtensionArray] - ) -> ABCExtensionArray: + cls, to_concat: Sequence["ExtensionArray"] + ) -> "ExtensionArray": """ Concatenate multiple array. @@ -1037,22 +1045,6 @@ def _concat_same_type( # of objects _can_hold_na = True - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - Returns - ------- - array : ndarray - """ - return np.array(self) - def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 40a169d03f39c..bfccc6f244219 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -15,6 +15,7 @@ Substitution, cache_readonly, deprecate_kwarg, + doc, ) from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs @@ -450,10 +451,6 @@ def dtype(self) -> CategoricalDtype: """ return self._dtype - @property - def _ndarray_values(self) -> np.ndarray: - return self.codes - @property def _constructor(self) -> Type["Categorical"]: return Categorical @@ -1317,7 +1314,7 @@ def __setstate__(self, state): setattr(self, k, v) @property - def T(self): + def T(self) -> "Categorical": """ Return transposed numpy array. """ @@ -1352,8 +1349,7 @@ def memory_usage(self, deep=False): """ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) - @Substitution(klass="Categorical") - @Appender(_shared_docs["searchsorted"]) + @doc(_shared_docs["searchsorted"], klass="Categorical") def searchsorted(self, value, side="left", sorter=None): # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. @@ -1409,12 +1405,6 @@ def notna(self): notnull = notna - def put(self, *args, **kwargs): - """ - Replace specific elements in the Categorical with given values. - """ - raise NotImplementedError(("'put' is not yet implemented for Categorical")) - def dropna(self): """ Return the Categorical without null values. @@ -1494,7 +1484,7 @@ def check_for_ordered(self, op): ) def _values_for_argsort(self): - return self._codes.copy() + return self._codes def argsort(self, ascending=True, kind="quicksort", **kwargs): """ @@ -1681,6 +1671,12 @@ def to_dense(self): ------- dense : array """ + warn( + "Categorical.to_dense is deprecated and will be removed in " + "a future version. 
Use np.asarray(cat) instead.",
+            FutureWarning,
+            stacklevel=2,
+        )
         return np.asarray(self)
 
     def fillna(self, value=None, method=None, limit=None):
@@ -1728,7 +1724,8 @@ def fillna(self, value=None, method=None, limit=None):
 
         # pad / bfill
         if method is not None:
-            values = self.to_dense().reshape(-1, len(self))
+            # TODO: dispatch when self.categories is EA-dtype
+            values = np.asarray(self).reshape(-1, len(self))
             values = interpolate_2d(values, method, 0, None, value).astype(
                 self.categories.dtype
             )[0]
@@ -1738,12 +1735,17 @@ def fillna(self, value=None, method=None, limit=None):
 
             # If value is a dict or a Series (a dict value has already
             # been converted to a Series)
-            if isinstance(value, ABCSeries):
-                if not value[~value.isin(self.categories)].isna().all():
+            if isinstance(value, (np.ndarray, Categorical, ABCSeries)):
+                # We get ndarray or Categorical if called via Series.fillna,
+                # where it will unwrap another aligned Series before getting here
+
+                mask = ~algorithms.isin(value, self.categories)
+                if not isna(value[mask]).all():
                     raise ValueError("fill value must be in categories")
 
                 values_codes = _get_codes_for_values(value, self.categories)
                 indexer = np.where(codes == -1)
+                codes = codes.copy()
                 codes[indexer] = values_codes[indexer]
 
         # If value is not a dict or Series it should be a scalar
@@ -2561,12 +2563,7 @@ def _get_codes_for_values(values, categories):
     """
     dtype_equal = is_dtype_equal(values.dtype, categories.dtype)
 
-    if dtype_equal:
-        # To prevent erroneous dtype coercion in _get_data_algo, retrieve
-        # the underlying numpy array. gh-22702
-        values = getattr(values, "_ndarray_values", values)
-        categories = getattr(categories, "_ndarray_values", categories)
-    elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values):
+    if is_extension_array_dtype(categories.dtype) and is_object_dtype(values):
         # Support inferring the correct extension dtype from an array of
         # scalar objects. e.g.
         # Categorical(array[Period, Period], categories=PeriodIndex(...))
@@ -2576,7 +2573,7 @@ def _get_codes_for_values(values, categories):
         # exception raised in _from_sequence
         values = ensure_object(values)
         categories = ensure_object(categories)
-    else:
+    elif not dtype_equal:
         values = ensure_object(values)
         categories = ensure_object(categories)
 
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 8c870c6255200..c3e79f40e7451 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -40,6 +40,7 @@
 
 from pandas.core import missing, nanops, ops
 from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
+from pandas.core.array_algos.transforms import shift
 from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array
@@ -130,7 +131,7 @@ class AttributesMixin:
     _data: np.ndarray
 
     @classmethod
-    def _simple_new(cls, values, **kwargs):
+    def _simple_new(cls, values: np.ndarray, **kwargs):
         raise AbstractMethodError(cls)
 
     @property
@@ -201,7 +202,7 @@ def _check_compatible_with(
         ----------
         other
         setitem : bool, default False
-            For __setitem__ we may have stricter compatiblity resrictions than
+            For __setitem__ we may have stricter compatibility restrictions than
             for comparisons.
 
Raises @@ -395,6 +396,34 @@ def floor(self, freq, ambiguous="raise", nonexistent="raise"): def ceil(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + def _with_freq(self, freq): + """ + Helper to set our freq in-place, returning self to allow method chaining. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + + Returns + ------- + self + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaArray case, we assume this + # is a Tick offset. + pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = frequencies.to_offset(self.inferred_freq) + + self._freq = freq + return self + class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray): """ @@ -455,10 +484,6 @@ def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self._data.view("i8") - @property - def _ndarray_values(self): - return self._data - # ---------------------------------------------------------------- # Rendering Methods @@ -745,6 +770,38 @@ def _from_factorized(cls, values, original): def _values_for_argsort(self): return self._data + @Appender(ExtensionArray.shift.__doc__) + def shift(self, periods=1, fill_value=None, axis=0): + if not self.size or periods == 0: + return self.copy() + + if is_valid_nat_for_dtype(fill_value, self.dtype): + fill_value = NaT + elif not isinstance(fill_value, self._recognized_scalars): + # only warn if we're not going to raise + if self._scalar_type is Period and lib.is_integer(fill_value): + # kludge for #31971 since Period(integer) tries to cast to str + new_fill = Period._from_ordinal(fill_value, freq=self.freq) + else: + new_fill = self._scalar_type(fill_value) + + # stacklevel here is chosen to be correct when called from + # DataFrame.shift or Series.shift + warnings.warn( + f"Passing {type(fill_value)} to shift is deprecated and " + "will raise in a future version, pass " + f"{self._scalar_type.__name__} instead.", + FutureWarning, + stacklevel=7, + ) + fill_value = new_fill + + fill_value = self._unbox_scalar(fill_value) + + new_values = shift(self._data, periods, axis, fill_value) + + return type(self)._simple_new(new_values, dtype=self.dtype) + # ------------------------------------------------------------------ # Additional array methods # These are not part of the EA API, but we implement them because @@ -1100,56 +1157,46 @@ def _sub_period(self, other): def _add_offset(self, offset): raise AbstractMethodError(self) - def _add_delta(self, other): + def _add_timedeltalike_scalar(self, other): """ - Add a timedelta-like, Tick or TimedeltaIndex-like object - to self, yielding an int64 numpy array - - Parameters - ---------- - delta : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} + Add a delta of a timedeltalike Returns ------- - result : ndarray[int64] - - Notes - ----- - The result's name is set outside of _add_delta by the calling - method (__add__ or __sub__), if necessary (i.e. for Indexes). 
-        """
-        if isinstance(other, (Tick, timedelta, np.timedelta64)):
-            new_values = self._add_timedeltalike_scalar(other)
-        elif is_timedelta64_dtype(other):
-            # ndarray[timedelta64] or TimedeltaArray/index
-            new_values = self._add_delta_tdi(other)
-
-        return new_values
-
-    def _add_timedeltalike_scalar(self, other):
-        """
-        Add a delta of a timedeltalike
-        return the i8 result view
+        Same type as self
         """
         if isna(other):
             # i.e. np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
             new_values = np.empty(self.shape, dtype="i8")
             new_values[:] = iNaT
-            return new_values
+            return type(self)(new_values, dtype=self.dtype)
 
         inc = delta_to_nanoseconds(other)
         new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view(
             "i8"
         )
         new_values = self._maybe_mask_results(new_values)
-        return new_values.view("i8")
 
-    def _add_delta_tdi(self, other):
+        new_freq = None
+        if isinstance(self.freq, Tick) or is_period_dtype(self.dtype):
+            # adding a scalar preserves freq
+            new_freq = self.freq
+
+        if new_freq is not None:
+            # fastpath that doesn't require inference
+            return type(self)(new_values, dtype=self.dtype, freq=new_freq)
+        return type(self)(new_values, dtype=self.dtype)._with_freq("infer")
+
+    def _add_timedelta_arraylike(self, other):
         """
         Add a delta of a TimedeltaIndex
-        return the i8 result view
+
+        Returns
+        -------
+        Same type as self
         """
+        # overridden by PeriodArray
+
         if len(self) != len(other):
             raise ValueError("cannot add indices of unequal length")
 
@@ -1167,7 +1214,8 @@ def _add_delta_tdi(self, other):
         if self._hasnans or other._hasnans:
             mask = (self._isnan) | (other._isnan)
             new_values[mask] = iNaT
-        return new_values.view("i8")
+
+        return type(self)(new_values, dtype=self.dtype)._with_freq("infer")
 
     def _add_nat(self):
         """
@@ -1309,7 +1357,7 @@ def __add__(self, other):
         if other is NaT:
             result = self._add_nat()
         elif isinstance(other, (Tick, timedelta, np.timedelta64)):
-            result = self._add_delta(other)
+            result = self._add_timedeltalike_scalar(other)
         elif isinstance(other, DateOffset):
             # specifically _not_ a Tick
             result = self._add_offset(other)
@@ -1325,7 +1373,7 @@ def __add__(self, other):
         # array-like others
         elif is_timedelta64_dtype(other):
             # TimedeltaIndex, ndarray[timedelta64]
-            result = self._add_delta(other)
+            result = self._add_timedelta_arraylike(other)
         elif is_object_dtype(other):
             # e.g. Array/Index of DateOffset objects
             result = self._addsub_object_array(other, operator.add)
@@ -1361,7 +1409,7 @@ def __sub__(self, other):
         if other is NaT:
             result = self._sub_nat()
         elif isinstance(other, (Tick, timedelta, np.timedelta64)):
-            result = self._add_delta(-other)
+            result = self._add_timedeltalike_scalar(-other)
         elif isinstance(other, DateOffset):
             # specifically _not_ a Tick
             result = self._add_offset(-other)
@@ -1380,7 +1428,7 @@ def __sub__(self, other):
         # array-like others
         elif is_timedelta64_dtype(other):
             # TimedeltaIndex, ndarray[timedelta64]
-            result = self._add_delta(-other)
+            result = self._add_timedelta_arraylike(-other)
         elif is_object_dtype(other):
             # e.g.
Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.sub) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 56939cda6d21c..e2a13df069ae2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -23,6 +23,7 @@ from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, + is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype, @@ -587,6 +588,8 @@ def astype(self, dtype, copy=True): if getattr(self.dtype, "tz", None) is None: return self.tz_localize(new_tz) result = self.tz_convert(new_tz) + if copy: + result = result.copy() if new_tz is None: # Do we want .astype('datetime64[ns]') to be an ndarray. # The astype in Block._astype expects this to return an @@ -694,7 +697,7 @@ def _add_offset(self, offset): # GH#30336 _from_sequence won't be able to infer self.tz return type(self)._from_sequence(result).tz_localize(self.tz) - return type(self)._from_sequence(result, freq="infer") + return type(self)._from_sequence(result)._with_freq("infer") def _sub_datetimelike_scalar(self, other): # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] @@ -715,23 +718,6 @@ def _sub_datetimelike_scalar(self, other): result = self._maybe_mask_results(result) return result.view("timedelta64[ns]") - def _add_delta(self, delta): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new DatetimeArray - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : DatetimeArray - """ - new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, tz=self.tz, freq="infer") - # ----------------------------------------------------------------- # Timezone Conversion and Localization Methods @@ -988,7 +974,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timestamp methods - def to_pydatetime(self): + def to_pydatetime(self) -> np.ndarray: """ Return Datetime Array/Index as object ndarray of datetime.datetime objects. @@ -1045,7 +1031,7 @@ def normalize(self): new_values[not_null] = new_values[not_null] - adjustment else: new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) - return type(self)._from_sequence(new_values, freq="infer").tz_localize(self.tz) + return type(self)(new_values)._with_freq("infer").tz_localize(self.tz) def to_period(self, freq=None): """ @@ -1901,7 +1887,11 @@ def maybe_convert_dtype(data, copy): ------ TypeError : PeriodDType data is passed """ - if is_float_dtype(data): + if not hasattr(data, "dtype"): + # e.g. collections.deque + return data, copy + + if is_float_dtype(data.dtype): # Note: we must cast to datetime64[ns] here in order to treat these # as wall-times instead of UTC timestamps. data = data.astype(_NS_DTYPE) @@ -1909,24 +1899,24 @@ def maybe_convert_dtype(data, copy): # TODO: deprecate this behavior to instead treat symmetrically # with integer dtypes. 
See discussion in GH#23675 - elif is_timedelta64_dtype(data): + elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype): # GH#29794 enforcing deprecation introduced in GH#23539 raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") - elif is_period_dtype(data): + elif is_period_dtype(data.dtype): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails raise TypeError( "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" ) - elif is_categorical_dtype(data): + elif is_categorical_dtype(data.dtype): # GH#18664 preserve tz in going DTI->Categorical->DTI # TODO: cases where we need to do another pass through this func, # e.g. the categories are timedelta64s data = data.categories.take(data.codes, fill_value=NaT)._values copy = False - elif is_extension_array_dtype(data) and not is_datetime64tz_dtype(data): + elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype): # Includes categorical # TODO: We have no tests for these data = np.array(data, dtype=np.object_) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e2b66b1a006e4..f2880c5cbee42 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, + is_datetime64_dtype, is_float, is_float_dtype, is_integer, @@ -469,24 +470,14 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if is_float_dtype(dtype): # In astype, we consider dtype=float to also mean na_value=np.nan kwargs = dict(na_value=np.nan) + elif is_datetime64_dtype(dtype): + kwargs = dict(na_value=np.datetime64("NaT")) else: kwargs = {} data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - """ - return self._data - def _values_for_factorize(self) -> Tuple[np.ndarray, float]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than object-dtype / np.nan. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 51c94d5059f8b..d852ea4f584c9 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -29,7 +29,6 @@ ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, - ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries, @@ -529,7 +528,7 @@ def __setitem__(self, key, value): value_left, value_right = value, value # scalar interval - elif is_interval_dtype(value) or isinstance(value, ABCInterval): + elif is_interval_dtype(value) or isinstance(value, Interval): self._check_closed_matches(value, name="value") value_left, value_right = value.left, value.right @@ -642,7 +641,7 @@ def fillna(self, value=None, method=None, limit=None): if limit is not None: raise TypeError("limit is not supported for IntervalArray.") - if not isinstance(value, ABCInterval): + if not isinstance(value, Interval): msg = ( "'IntervalArray.fillna' only supports filling with a " f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." 
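A hedged sketch of the casting behavior the integer.py and datetimes.py hunks above are aiming for (illustrative only, not part of the patch; the repr of the first call is elided with ``# doctest: +SKIP``):

>>> import numpy as np
>>> import pandas as pd
>>> # nullable Int64 -> datetime64[ns]: pd.NA should now come through as NaT
>>> pd.array([1, None], dtype="Int64").astype("datetime64[ns]")  # doctest: +SKIP
>>> # bool ndarrays are rejected by the patched maybe_convert_dtype (GH#29794)
>>> pd.DatetimeIndex(np.array([True, False]))
Traceback (most recent call last):
    ...
TypeError: dtype bool cannot be converted to datetime64[ns]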
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0e64967ce93a6..3058e1d6073f3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -6,7 +6,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -435,7 +435,10 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: result = np.asarray(self._ndarray, dtype=dtype) if (copy or na_value is not lib.no_default) and result is self._ndarray: @@ -446,7 +449,7 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): return result - @Appender(ExtensionArray.searchsorted.__doc__) + @doc(ExtensionArray.searchsorted) def searchsorted(self, value, side="left", sorter=None): return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8141e2c78a7e2..c24b0b5fa64b8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -31,13 +31,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.generic import ( - ABCIndexClass, - ABCPeriod, - ABCPeriodArray, - ABCPeriodIndex, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos @@ -48,7 +42,7 @@ from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick -def _field_accessor(name, alias, docstring=None): +def _field_accessor(name: str, alias: int, docstring=None): def f(self): base, mult = libfrequencies.get_freq_code(self.freq) result = get_period_field_arr(alias, self.asi8, base) @@ -170,7 +164,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values: np.ndarray, freq=None, **kwargs): + def _simple_new(cls, values: np.ndarray, freq=None, **kwargs) -> "PeriodArray": # alias for PeriodArray.__init__ assert isinstance(values, np.ndarray) and values.dtype == "i8" return cls(values, freq=freq, **kwargs) @@ -181,7 +175,7 @@ def _from_sequence( scalars: Sequence[Optional[Period]], dtype: Optional[PeriodDtype] = None, copy: bool = False, - ) -> ABCPeriodArray: + ) -> "PeriodArray": if dtype: freq = dtype.freq else: @@ -191,6 +185,7 @@ def _from_sequence( validate_dtype_freq(scalars.dtype, freq) if copy: scalars = scalars.copy() + assert isinstance(scalars, PeriodArray) # for mypy return scalars periods = np.asarray(scalars, dtype=object) @@ -202,11 +197,13 @@ def _from_sequence( return cls(ordinals, freq=freq) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings( + cls, strings, dtype=None, copy=False + ) -> "PeriodArray": return cls._from_sequence(strings, dtype, copy) @classmethod - def _from_datetime64(cls, data, freq, tz=None): + def _from_datetime64(cls, data, freq, tz=None) -> "PeriodArray": """ Construct a PeriodArray from a datetime64 array @@ -270,19 +267,24 @@ def _check_compatible_with(self, 
other, setitem: bool = False): # Data / Attributes @cache_readonly - def dtype(self): + def dtype(self) -> PeriodDtype: return self._dtype # error: Read-only property cannot override read-write property [misc] @property # type: ignore - def freq(self): + def freq(self) -> DateOffset: """ Return the frequency object for this PeriodArray. """ return self.dtype.freq def __array__(self, dtype=None) -> np.ndarray: - # overriding DatetimelikeArray + if dtype == "i8": + return self.asi8 + elif dtype == bool: + return ~self._isnan + + # This will raise TypeError for non-object dtypes return np.array(list(self), dtype=object) def __arrow_array__(self, type=None): @@ -397,7 +399,7 @@ def __arrow_array__(self, type=None): daysinmonth = days_in_month @property - def is_leap_year(self): + def is_leap_year(self) -> np.ndarray: """ Logical indicating if the date belongs to a leap year. """ @@ -451,13 +453,7 @@ def to_timestamp(self, freq=None, how="start"): new_data = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) - return DatetimeArray._from_sequence(new_data, freq="infer") - - # -------------------------------------------------------------------- - # Array-like / EA-Interface Methods - - def _values_for_argsort(self): - return self._data + return DatetimeArray(new_data)._with_freq("infer") # -------------------------------------------------------------------- @@ -490,7 +486,7 @@ def _time_shift(self, periods, freq=None): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) - def asfreq(self, freq=None, how="E"): + def asfreq(self, freq=None, how="E") -> "PeriodArray": """ Convert the Period Array/Index to the specified frequency `freq`. @@ -552,7 +548,7 @@ def asfreq(self, freq=None, how="E"): # ------------------------------------------------------------------ # Rendering Methods - def _formatter(self, boxed=False): + def _formatter(self, boxed: bool = False): if boxed: return str return "'{}'".format @@ -579,7 +575,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ------------------------------------------------------------------ - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): # We handle Period[T] -> Period[U] # Our parent handles everything else. dtype = pandas_dtype(dtype) @@ -652,10 +648,11 @@ def _add_timedeltalike_scalar(self, other): Returns ------- - result : ndarray[int64] + PeriodArray """ - assert isinstance(self.freq, Tick) # checked by calling function - assert isinstance(other, (timedelta, np.timedelta64, Tick)) + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) if notna(other): # special handling for np.timedelta64("NaT"), avoid calling @@ -665,10 +662,9 @@ def _add_timedeltalike_scalar(self, other): # Note: when calling parent class's _add_timedeltalike_scalar, # it will call delta_to_nanoseconds(delta). Because delta here # is an integer, delta_to_nanoseconds will return it unchanged.
- ordinals = super()._add_timedeltalike_scalar(other) - return ordinals + return super()._add_timedeltalike_scalar(other) - def _add_delta_tdi(self, other): + def _add_timedelta_arraylike(self, other): """ Parameters ---------- @@ -678,7 +674,9 @@ def _add_delta_tdi(self, other): ------- result : ndarray[int64] """ - assert isinstance(self.freq, Tick) # checked by calling function + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) if not np.all(isna(other)): delta = self._check_timedeltalike_freq_compat(other) @@ -686,28 +684,8 @@ def _add_delta_tdi(self, other): # all-NaT TimedeltaIndex is equivalent to a single scalar td64 NaT return self + np.timedelta64("NaT") - return self._addsub_int_array(delta, operator.add).asi8 - - def _add_delta(self, other): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new PeriodArray - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : PeriodArray - """ - if not isinstance(self.freq, Tick): - # We cannot add timedelta-like to non-tick PeriodArray - raise raise_on_incompatible(self, other) - - new_ordinals = super()._add_delta(other) - return type(self)(new_ordinals, freq=self.freq) + ordinals = self._addsub_int_array(delta, operator.add).asi8 + return type(self)(ordinals, dtype=self.dtype) def _check_timedeltalike_freq_compat(self, other): """ @@ -960,8 +938,8 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): if end is not None: end = Period(end, freq) - is_start_per = isinstance(start, ABCPeriod) - is_end_per = isinstance(end, ABCPeriod) + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) if is_start_per and is_end_per and start.freq != end.freq: raise ValueError("start and end must have same freq") diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 92c05f44d677c..787407060c7f1 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -228,14 +228,29 @@ def from_spmatrix(cls, data, index=None, columns=None): 2 0.0 0.0 1.0 """ from pandas import DataFrame + from pandas._libs.sparse import IntIndex data = data.tocsc() index, columns = cls._prep_index(data, index, columns) - sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] - data = dict(enumerate(sparrays)) - result = DataFrame(data, index=index) - result.columns = columns - return result + n_rows, n_columns = data.shape + # We need to make sure indices are sorted, as we create + # IntIndex with no input validation (i.e. check_integrity=False ). + # Indices may already be sorted in scipy in which case this adds + # a small overhead. 
+ data.sort_indices() + indices = data.indices + indptr = data.indptr + array_data = data.data + dtype = SparseDtype(array_data.dtype, 0) + arrays = [] + for i in range(n_columns): + sl = slice(indptr[i], indptr[i + 1]) + idx = IntIndex(n_rows, indices[sl], check_integrity=False) + arr = SparseArray._simple_new(array_data[sl], idx, dtype) + arrays.append(arr) + return DataFrame._from_arrays( + arrays, columns=columns, index=index, verify_integrity=False + ) def to_dense(self): """ @@ -314,12 +329,17 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): import pandas.core.indexes.base as ibase + from pandas.core.indexes.api import ensure_index N, K = data.shape if index is None: index = ibase.default_index(N) + else: + index = ensure_index(index) if columns is None: columns = ibase.default_index(K) + else: + columns = ensure_index(columns) if len(columns) != K: raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}") diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 549606795f528..8021e0babe4e0 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -27,6 +27,7 @@ is_array_like, is_bool_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_integer, is_object_dtype, @@ -42,7 +43,7 @@ from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import sanitize_array +from pandas.core.construction import extract_array, sanitize_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d import pandas.core.ops as ops @@ -267,7 +268,6 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): Indices: array([2, 3], dtype=int32) """ - _pandas_ftype = "sparse" _subtyp = "sparse_array" # register ABCSparseArray _deprecations = PandasObject._deprecations | frozenset(["get_values"]) _sparse_index: SparseIndex @@ -312,7 +312,7 @@ def __init__( dtype = dtype.subtype if index is not None and not is_scalar(data): - raise Exception("must only pass scalars with an index ") + raise Exception("must only pass scalars with an index") if is_scalar(data): if index is not None: @@ -367,6 +367,19 @@ def __init__( sparse_index = data._sparse_index sparse_values = np.asarray(data.sp_values, dtype=dtype) elif sparse_index is None: + data = extract_array(data, extract_numpy=True) + if not isinstance(data, np.ndarray): + # EA + if is_datetime64tz_dtype(data.dtype): + warnings.warn( + f"Creating SparseArray from {data.dtype} data " + "loses timezone information. Cast to object before " + "sparse to retain timezone information.", + UserWarning, + stacklevel=2, + ) + data = np.asarray(data, dtype="datetime64[ns]") + data = np.asarray(data) sparse_values, sparse_index, fill_value = make_sparse( data, kind=kind, fill_value=fill_value, dtype=dtype ) @@ -385,7 +398,7 @@ def __init__( def _simple_new( cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype ) -> "SparseArray": - new = cls([]) + new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array new._dtype = dtype @@ -1296,14 +1309,14 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - def transpose(self, *axes): + def transpose(self, *axes) -> "SparseArray": """ Returns the SparseArray. 
""" return self @property - def T(self): + def T(self) -> "SparseArray": """ Returns the SparseArray. """ @@ -1497,7 +1510,7 @@ def _formatter(self, boxed=False): SparseArray._add_unary_ops() -def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): +def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format @@ -1513,7 +1526,7 @@ def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): ------- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) """ - arr = com.values_from_object(arr) + assert isinstance(arr, np.ndarray) if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fcccd8cc14d6b..f82790ac4c3d9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -281,7 +281,7 @@ def value_counts(self, dropna=False): return value_counts(self._ndarray, dropna=dropna).astype("Int64") - # Overrride parent because we have different return types. + # Override parent because we have different return types. @classmethod def _create_arithmetic_method(cls, op): # Note: this handles both arithmetic and comparison methods. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 749489a0a04fb..a25426c5c99cc 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -400,23 +400,6 @@ def _add_offset(self, other): f"cannot add the type {type(other).__name__} to a {type(self).__name__}" ) - def _add_delta(self, delta): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new TimedeltaArray. - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : TimedeltaArray - """ - new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, freq="infer") - def _add_datetime_arraylike(self, other): """ Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray. @@ -825,7 +808,7 @@ def total_seconds(self): """ return self._maybe_mask_results(1e-9 * self.asi8, fill_value=None) - def to_pytimedelta(self): + def to_pytimedelta(self) -> np.ndarray: """ Return Timedelta Array/Index as object ndarray of datetime.timedelta objects. diff --git a/pandas/core/base.py b/pandas/core/base.py index f55d9f905945d..148be3f50c0e7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,16 +4,15 @@ import builtins import textwrap -from typing import Dict, FrozenSet, List, Optional, Union +from typing import Any, Dict, FrozenSet, List, Optional, Union import numpy as np import pandas._libs.lib as lib -from pandas._typing import T from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, Substitution, cache_readonly, doc +from pandas.util._decorators import cache_readonly, doc from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import is_nested_object @@ -50,6 +49,8 @@ class PandasObject(DirNamesMixin): Baseclass for various pandas objects. """ + _cache: Dict[str, Any] + @property def _constructor(self): """ @@ -64,7 +65,7 @@ def __repr__(self) -> str: # Should be overwritten by base classes return object.__repr__(self) - def _reset_cache(self, key=None): + def _reset_cache(self, key: Optional[str] = None) -> None: """ Reset cached properties. 
If ``key`` is passed, only clears that key. """ @@ -87,15 +88,6 @@ def __sizeof__(self): # no memory_usage attribute, so fall back to object's 'sizeof' return super().__sizeof__() - def _ensure_type(self: T, obj) -> T: - """ - Ensure that an object has same type as self. - - Used by type checkers. - """ - assert isinstance(obj, type(self)), type(obj) - return obj - class NoNewAttributesMixin: """ @@ -364,7 +356,8 @@ def _aggregate(self, arg, *args, **kwargs): if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys) ) != len(keys): - raise SpecificationError("nested renamer is not supported") + cols = sorted(set(keys) - set(obj.columns.intersection(keys))) + raise SpecificationError(f"Column(s) {cols} do not exist") from pandas.core.reshape.concat import concat @@ -539,7 +532,7 @@ def _aggregate_multiple_funcs(self, arg, _axis): # raised directly in _aggregate_named pass elif "no results" in str(err): - # raised direcly in _aggregate_multiple_funcs + # raised directly in _aggregate_multiple_funcs pass else: raise @@ -865,23 +858,6 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): result[self.isna()] = na_value return result - @property - def _ndarray_values(self) -> np.ndarray: - """ - The data as an ndarray, possibly losing information. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - - categorical -> codes - """ - if is_extension_array_dtype(self): - return self.array._ndarray_values - # As a mixin, we depend on the mixing class having values. - # Special mixin syntax may be developed in the future: - # https://github.com/python/typing/issues/246 - return self.values # type: ignore - @property def empty(self): return not self.size @@ -895,6 +871,9 @@ def max(self, axis=None, skipna=True, *args, **kwargs): axis : int, optional For compatibility with NumPy. Only 0 or None are allowed. skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. Returns ------- @@ -927,16 +906,17 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_max(args, kwargs) return nanops.nanmax(self._values, skipna=skipna) + @doc(op="max", oppose="min", value="largest") def argmax(self, axis=None, skipna=True, *args, **kwargs): """ - Return int position of the largest value in the Series. + Return int position of the {value} value in the Series. - If the maximum is achieved in multiple locations, + If the {op}imum is achieved in multiple locations, the first row position is returned. Parameters ---------- - axis : {None} + axis : {{None}} Dummy argument for consistency with Series. skipna : bool, default True Exclude NA/null values when showing the result. @@ -946,12 +926,13 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): Returns ------- int - Row position of the maximum values. + Row position of the {op}imum value. See Also -------- - numpy.ndarray.argmax : Equivalent method for numpy arrays. - Series.argmin : Similar method, but returning the minimum. + Series.arg{op} : Return position of the {op}imum value. + Series.arg{oppose} : Return position of the {oppose}imum value. + numpy.ndarray.arg{op} : Equivalent method for numpy arrays. Series.idxmax : Return index label of the maximum values. Series.idxmin : Return index label of the minimum values. 
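For orientation on the ``_aggregate`` hunk above: aggregating a DataFrame on labels it does not have used to raise the opaque ``nested renamer is not supported``; the patch names the missing columns instead. A rough sketch of the patched behavior (illustrative, not part of the diff):

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.agg({"A": "sum"})
A    3
dtype: int64
>>> # under the patch this raises SpecificationError: Column(s) ['B'] do not exist
>>> df.agg({"B": "sum"})  # doctest: +SKIP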
@@ -959,8 +940,8 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): -------- Consider dataset containing cereal calories - >>> s = pd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, - ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) + >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0, + ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}}) >>> s Corn Flakes 100.0 Almond Delight 110.0 @@ -970,8 +951,11 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): >>> s.argmax() 2 + >>> s.argmin() + 0 - The maximum cereal calories is in the third element, + The maximum cereal calories is the third element and + the minimum cereal calories is the first element, since series is zero-indexed. """ nv.validate_minmax_axis(axis) @@ -987,6 +971,9 @@ def min(self, axis=None, skipna=True, *args, **kwargs): axis : {None} Dummy argument for consistency with Series. skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. Returns ------- @@ -1019,25 +1006,8 @@ def min(self, axis=None, skipna=True, *args, **kwargs): nv.validate_min(args, kwargs) return nanops.nanmin(self._values, skipna=skipna) + @doc(argmax, op="min", oppose="max", value="smallest") def argmin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return a ndarray of the minimum argument indexer. - - Parameters - ---------- - axis : {None} - Dummy argument for consistency with Series. - skipna : bool, default True - - Returns - ------- - numpy.ndarray - - See Also - -------- - numpy.ndarray.argmin : Return indices of the minimum values along - the given axis. - """ nv.validate_minmax_axis(axis) nv.validate_argmax_with_skipna(skipna, args, kwargs) return nanops.nanargmin(self._values, skipna=skipna) @@ -1187,8 +1157,14 @@ def _map_values(self, mapper, na_action=None): def map_f(values, f): return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) - else: + elif na_action is None: map_f = lib.map_infer + else: + msg = ( + "na_action must either be 'ignore' or None, " + f"{na_action} was passed" + ) + raise ValueError(msg) # mapper is a function new_values = map_f(values, mapper) @@ -1441,13 +1417,13 @@ def factorize(self, sort=False, na_sentinel=-1): ] = """ Find indices where elements should be inserted to maintain order. - Find the indices into a sorted %(klass)s `self` such that, if the + Find the indices into a sorted {klass} `self` such that, if the corresponding elements in `value` were inserted before the indices, the order of `self` would be preserved. .. note:: - The %(klass)s *must* be monotonically sorted, otherwise + The {klass} *must* be monotonically sorted, otherwise wrong locations will likely be returned. Pandas does *not* check this for you. @@ -1455,7 +1431,7 @@ def factorize(self, sort=False, na_sentinel=-1): ---------- value : array_like Values to insert into `self`. - side : {'left', 'right'}, optional + side : {{'left', 'right'}}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). 
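The ``_map_values`` hunk above turns a previously unvalidated ``na_action`` into an explicit error. A sketch against the patched behavior (illustrative, not part of the diff):

>>> import pandas as pd
>>> s = pd.Series([1, 2, None])
>>> s.map(str, na_action="ignore")  # doctest: +SKIP  (NaN is passed through unmapped)
>>> # anything other than "ignore" or None now raises, e.g.:
>>> # ValueError: na_action must either be 'ignore' or None, True was passed
>>> s.map(str, na_action=True)  # doctest: +SKIP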
@@ -1485,46 +1461,53 @@ def factorize(self, sort=False, na_sentinel=-1): Examples -------- - >>> x = pd.Series([1, 2, 3]) - >>> x + >>> ser = pd.Series([1, 2, 3]) + >>> ser 0 1 1 2 2 3 dtype: int64 - >>> x.searchsorted(4) + >>> ser.searchsorted(4) 3 - >>> x.searchsorted([0, 4]) + >>> ser.searchsorted([0, 4]) array([0, 3]) - >>> x.searchsorted([1, 3], side='left') + >>> ser.searchsorted([1, 3], side='left') array([0, 2]) - >>> x.searchsorted([1, 3], side='right') + >>> ser.searchsorted([1, 3], side='right') array([1, 3]) - >>> x = pd.Categorical(['apple', 'bread', 'bread', - 'cheese', 'milk'], ordered=True) + >>> ser = pd.Categorical( + ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True + ... ) + >>> ser [apple, bread, bread, cheese, milk] Categories (4, object): [apple < bread < cheese < milk] - >>> x.searchsorted('bread') + >>> ser.searchsorted('bread') 1 - >>> x.searchsorted(['bread'], side='right') + >>> ser.searchsorted(['bread'], side='right') array([3]) If the values are not monotonically sorted, wrong locations may be returned: - >>> x = pd.Series([2, 1, 3]) - >>> x.searchsorted(1) + >>> ser = pd.Series([2, 1, 3]) + >>> ser + 0 2 + 1 1 + 2 3 + dtype: int64 + + >>> ser.searchsorted(1)  # doctest: +SKIP 0 # wrong result, correct would be 1 """ - @Substitution(klass="Index") - @Appender(_shared_docs["searchsorted"]) + @doc(_shared_docs["searchsorted"], klass="Index") def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) diff --git a/pandas/core/common.py b/pandas/core/common.py index 6230ee34bcd50..fd7b4fd80bc5e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -15,6 +15,7 @@ from pandas._libs import lib, tslibs from pandas._typing import T +from pandas.compat.numpy import _np_version_under1p17 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -87,9 +88,6 @@ def maybe_box_datetimelike(value, dtype=None): return value -values_from_object = lib.values_from_object - - def is_bool_indexer(key: Any) -> bool: """ Check whether `key` is a valid boolean indexer. @@ -395,18 +393,30 @@ def random_state(state=None): Parameters ---------- - state : int, np.random.RandomState, None. - If receives an int, passes to np.random.RandomState() as seed. + state : int, array-like, BitGenerator (NumPy>=1.17), np.random.RandomState, None. + If receives an int, array-like, or BitGenerator, passes to + np.random.RandomState() as seed. If receives an np.random.RandomState object, just returns object. If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. + + .. versionchanged:: 1.1.0 + + array-like and BitGenerator (for NumPy>=1.17) objects are now passed to + np.random.RandomState() as seed + + Default None.
Returns ------- np.random.RandomState + """ - if is_integer(state): + if ( + is_integer(state) + or is_array_like(state) + or (not _np_version_under1p17 and isinstance(state, np.random.BitGenerator)) + ): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state @@ -414,7 +424,10 @@ def random_state(state=None): return np.random else: raise ValueError( - "random_state must be an integer, a numpy RandomState, or None" + ( + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" + ) ) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index fdc299ccdfde8..7f93472c766d7 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -121,12 +121,12 @@ def _evaluate_numexpr(op, op_str, a, b): def _where_standard(cond, a, b): - # Caller is responsible for calling values_from_object if necessary + # Caller is responsible for extracting ndarray if necessary return np.where(cond, a, b) def _where_numexpr(cond, a, b): - # Caller is responsible for calling values_from_object if necessary + # Caller is responsible for extracting ndarray if necessary result = None if _can_use_numexpr(None, "where", a, b, "where"): diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 418fc7d38d08f..c7c7103654a65 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -116,7 +116,7 @@ def clean_column_name(name: str) -> str: If this name was used in the query string (this makes the query call impossible) an error will be raised by :func:`tokenize_backtick_quoted_string` instead, - which is not catched and propogates to the user level. + which is not caught and propagates to the user level. """ try: tokenized = tokenize_string(f"`{name}`") diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 653d014775386..15d9987310f18 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -17,6 +17,7 @@ from pandas.core.computation.common import _ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term +from pandas.core.construction import extract_array from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded @@ -202,7 +203,7 @@ def stringify(value): v = Timedelta(v, unit="s").value return TermValue(int(v), v, kind) elif meta == "category": - metadata = com.values_from_object(self.metadata) + metadata = extract_array(self.metadata, extract_numpy=True) result = metadata.searchsorted(v, side="left") # result returns 0 if v is first element or if v is not in metadata diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f947a1fda49f1..c9754ff588896 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,6 +4,8 @@ These should not depend on core.internals. 
""" + +from collections import abc from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast import numpy as np @@ -200,12 +202,12 @@ def array( >>> pd.array([1, 2, np.nan]) - [1, 2, NaN] + [1, 2, ] Length: 3, dtype: Int64 >>> pd.array(["a", None, "c"]) - ['a', nan, 'c'] + ['a', , 'c'] Length: 3, dtype: string >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) @@ -445,6 +447,8 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) + elif isinstance(data, abc.Set): + raise TypeError("Set type is unordered") else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7dac36b53fce5..97c02428cbdf9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1049,7 +1049,8 @@ def convert_dtypes( dtype new dtype """ - if convert_string or convert_integer or convert_boolean: + is_extension = is_extension_array_dtype(input_array.dtype) + if (convert_string or convert_integer or convert_boolean) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1062,9 +1063,7 @@ def convert_dtypes( if convert_integer: target_int_dtype = "Int64" - if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype( - input_array.dtype - ): + if is_integer_dtype(input_array.dtype): from pandas.core.arrays.integer import _dtypes inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) @@ -1078,9 +1077,7 @@ def convert_dtypes( inferred_dtype = input_array.dtype if convert_boolean: - if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype( - input_array.dtype - ): + if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" else: if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index df5bac1071985..f5997a13e785d 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import algos, lib +from pandas._libs import algos from pandas._libs.tslibs import conversion from pandas._typing import ArrayLike, DtypeObj @@ -19,18 +19,12 @@ PeriodDtype, registry, ) -from pandas.core.dtypes.generic import ( - ABCCategorical, - ABCDatetimeIndex, - ABCIndexClass, - ABCPeriodArray, - ABCPeriodIndex, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCCategorical, ABCIndexClass from pandas.core.dtypes.inference import ( # noqa:F401 is_array_like, is_bool, is_complex, + is_dataclass, is_decimal, is_dict_like, is_file_like, @@ -606,71 +600,6 @@ def is_excluded_dtype(dtype) -> bool: return _is_dtype(arr_or_dtype, condition) -def is_period_arraylike(arr) -> bool: - """ - Check whether an array-like is a periodical array-like or PeriodIndex. - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a periodical array-like or - PeriodIndex instance. 
- - Examples - -------- - >>> is_period_arraylike([1, 2, 3]) - False - >>> is_period_arraylike(pd.Index([1, 2, 3])) - False - >>> is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D")) - True - """ - if isinstance(arr, (ABCPeriodIndex, ABCPeriodArray)): - return True - elif isinstance(arr, (np.ndarray, ABCSeries)): - return is_period_dtype(arr.dtype) - return getattr(arr, "inferred_type", None) == "period" - - -def is_datetime_arraylike(arr) -> bool: - """ - Check whether an array-like is a datetime array-like or DatetimeIndex. - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a datetime array-like or - DatetimeIndex. - - Examples - -------- - >>> is_datetime_arraylike([1, 2, 3]) - False - >>> is_datetime_arraylike(pd.Index([1, 2, 3])) - False - >>> is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) - True - """ - if isinstance(arr, ABCDatetimeIndex): - return True - elif isinstance(arr, (np.ndarray, ABCSeries)): - return ( - is_object_dtype(arr.dtype) - and lib.infer_dtype(arr, skipna=False) == "datetime" - ) - return getattr(arr, "inferred_type", None) == "datetime" - - def is_dtype_equal(source, target) -> bool: """ Check if two dtypes are equal. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 181f0c8906853..d29102cbd4604 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -558,7 +558,7 @@ def validate_categories(categories, fastpath: bool = False): if not fastpath: if categories.hasnans: - raise ValueError("Categorial categories cannot be null") + raise ValueError("Categorical categories cannot be null") if not categories.is_unique: raise ValueError("Categorical categories must be unique") diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 435d80b2c4dfb..2e83e6b32a51b 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -63,12 +63,11 @@ def _check(cls, inst) -> bool: "ABCTimedeltaArray", "_typ", ("timedeltaarray") ) ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)) -ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",)) ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) -ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval",)) ABCExtensionArray = create_pandas_abc_type( "ABCExtensionArray", "_typ", + # Note: IntervalArray and SparseArray are included bc they have _typ="extension" ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"), ) ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 56b880dca1241..d1607b5ede6c3 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -386,3 +386,39 @@ def is_sequence(obj) -> bool: return not isinstance(obj, (str, bytes)) except (TypeError, AttributeError): return False + + +def is_dataclass(item): + """ + Checks if the object is a data-class instance + + Parameters + ---------- + item : object + + Returns + ------- + is_dataclass : bool + True if the item is an instance of a data-class, + will return False if you pass the data class itself + + Examples + -------- + >>> from dataclasses import dataclass + >>> @dataclass + ... class Point: + ... x: int + ...
y: int + + >>> is_dataclass(Point) + False + >>> is_dataclass(Point(0, 2)) + True + + """ + try: + from dataclasses import is_dataclass + + return is_dataclass(item) and not isinstance(item, type) + except ImportError: + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 61641bfb24293..6c36c7e71759c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,6 +77,8 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, + is_dataclass, + is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -88,6 +90,7 @@ is_list_like, is_named_tuple, is_object_dtype, + is_period_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -115,6 +118,7 @@ from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( arrays_to_mgr, + dataclasses_to_dicts, get_names_from_index, init_dict, init_ndarray, @@ -472,6 +476,8 @@ def __init__( if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields @@ -890,7 +896,7 @@ def style(self) -> "Styler": """ @Appender(_shared_docs["items"]) - def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def items(self) -> Iterable[Tuple[Label, Series]]: if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) @@ -899,10 +905,10 @@ def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield k, self._ixs(i, axis=1) @Appender(_shared_docs["items"]) - def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def iteritems(self) -> Iterable[Tuple[Label, Series]]: yield from self.items() - def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def iterrows(self) -> Iterable[Tuple[Label, Series]]: """ Iterate over DataFrame rows as (index, Series) pairs. @@ -1059,7 +1065,7 @@ def dot(self, other): ------- Series or DataFrame If other is a Series, return the matrix product between self and - other as a Serie. If other is a DataFrame or a numpy.array, return + other as a Series. If other is a DataFrame or a numpy.array, return the matrix product of self and other in a DataFrame of a np.array. See Also @@ -1247,7 +1253,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False) -> np.ndarray: + def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1399,11 +1405,45 @@ def to_dict(self, orient="dict", into=dict): ) # GH16122 into_c = com.standardize_mapping(into) - if orient.lower().startswith("d"): + + orient = orient.lower() + # GH32515 + if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { + "dict", + "list", + "series", + "split", + "records", + "index", + }: + warnings.warn( + "Using short name for 'orient' is deprecated. Only the " + "options: ('dict', 'list', 'series', 'split', 'records', 'index') " + "will be used in a future version.
Use one of the above " + "to silence this warning.", + FutureWarning, + ) + + if orient.startswith("d"): + orient = "dict" + elif orient.startswith("l"): + orient = "list" + elif orient.startswith("sp"): + orient = "split" + elif orient.startswith("s"): + orient = "series" + elif orient.startswith("r"): + orient = "records" + elif orient.startswith("i"): + orient = "index" + + if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) - elif orient.lower().startswith("l"): + + elif orient == "list": return into_c((k, v.tolist()) for k, v in self.items()) - elif orient.lower().startswith("sp"): + + elif orient == "split": return into_c( ( ("index", self.index.tolist()), @@ -1417,9 +1457,11 @@ def to_dict(self, orient="dict", into=dict): ), ) ) - elif orient.lower().startswith("s"): + + elif orient == "series": return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) - elif orient.lower().startswith("r"): + + elif orient == "records": columns = self.columns.tolist() rows = ( dict(zip(columns, row)) @@ -1429,13 +1471,15 @@ def to_dict(self, orient="dict", into=dict): into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] - elif orient.lower().startswith("i"): + + elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") return into_c( (t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples(name=None) ) + else: raise ValueError(f"orient '{orient}' not understood") @@ -1771,7 +1815,9 @@ def to_records( else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns] + arrays = ix_vals + [ + np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) + ] count = 0 index_names = list(self.index.names) @@ -1786,7 +1832,7 @@ def to_records( names = [str(name) for name in itertools.chain(index_names, self.columns)] else: - arrays = [self[c]._internal_get_values() for c in self.columns] + arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] names = [str(c) for c in self.columns] index_names = [] @@ -1843,8 +1889,41 @@ def to_records( return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + def _from_arrays( + cls, arrays, columns, index, dtype=None, verify_integrity=True + ) -> "DataFrame": + """ + Create DataFrame from a list of arrays corresponding to the columns. + + Parameters + ---------- + arrays : list-like of arrays + Each array in the list corresponds to one column, in order. + columns : list-like, Index + The column names for the resulting DataFrame. + index : list-like, Index + The row labels for the resulting DataFrame. + dtype : dtype, optional + Optional dtype to enforce for all arrays. + verify_integrity : bool, default True + Validate and homogenize all input. If set to False, it is assumed + that all elements of `arrays` are actual arrays as they will be + stored in a block (numpy ndarray or ExtensionArray), have the same + length as and are aligned with the index, and that `columns` and + `index` are ensured to be an Index object.
+ + Returns + ------- + DataFrame + """ + mgr = arrays_to_mgr( + arrays, + columns, + index, + columns, + dtype=dtype, + verify_integrity=verify_integrity, + ) return cls(mgr) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") @@ -2443,7 +2522,9 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": return result.__finalize__(self) - T = property(transpose) + @property + def T(self) -> "DataFrame": + return self.transpose() # ---------------------------------------------------------------------- # Indexing Methods @@ -2683,6 +2764,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: + self.loc._ensure_listlike_indexer(key, axis=1) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] @@ -2706,6 +2788,20 @@ def _setitem_frame(self, key, value): self._check_setitem_copy() self._where(-key, value, inplace=True) + def _iset_item(self, loc: int, value): + self._ensure_valid_index(value) + + # technically _sanitize_column expects a label, not a position, + # but the behavior is the same as long as we pass broadcast=False + value = self._sanitize_column(loc, value, broadcast=False) + NDFrame._iset_item(self, loc, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + def _set_item(self, key, value): """ Add series to DataFrame in specified column. @@ -3619,7 +3715,7 @@ def reindex(self, *args, **kwargs) -> "DataFrame": # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) - return self._ensure_type(super().reindex(**kwargs)) + return super().reindex(**kwargs) def drop( self, @@ -3937,8 +4033,8 @@ def replace( @Appender(_shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": - return self._ensure_type( - super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value ) def set_index( @@ -4043,7 +4139,7 @@ def set_index( "one-dimensional arrays." ) - missing: List[Optional[Hashable]] = [] + missing: List[Label] = [] for col in keys: if isinstance( col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) @@ -4082,7 +4178,7 @@ def set_index( else: arrays.append(self.index) - to_remove: List[Optional[Hashable]] = [] + to_remove: List[Label] = [] for col in keys: if isinstance(col, ABCMultiIndex): for n in range(col.nlevels): @@ -4137,7 +4233,7 @@ def reset_index( drop: bool = False, inplace: bool = False, col_level: Hashable = 0, - col_fill: Optional[Hashable] = "", + col_fill: Label = "", ) -> Optional["DataFrame"]: """ Reset the index, or a level of it. 
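Two user-visible effects of the frame.py hunks above, sketched here for reference (illustrative, not part of the patch):

>>> from dataclasses import dataclass
>>> import pandas as pd
>>> @dataclass
... class Point:
...     x: int
...     y: int
>>> pd.DataFrame([Point(0, 0), Point(1, 3)])  # lists of dataclasses now construct
   x  y
0  0  0
1  1  3
>>> # short orient names still work but emit a FutureWarning (GH32515):
>>> pd.DataFrame({"A": [1, 2]}).to_dict(orient="r")  # doctest: +SKIP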
@@ -5212,20 +5308,6 @@ def _arith_op(left, right): return new_data - def _combine_match_index(self, other: Series, func): - # at this point we have `self.index.equals(other.index)` - - if ops.should_series_dispatch(self, other, func): - # operate column-wise; avoid costly object-casting in `.values` - new_data = ops.dispatch_to_series(self, other, func) - else: - # fastpath --> operate directly on values - other_vals = other.values.reshape(-1, 1) - with np.errstate(all="ignore"): - new_data = func(self.values, other_vals) - new_data = dispatch_fill_zeros(func, self.values, other_vals, new_data) - return new_data - def _construct_result(self, result) -> "DataFrame": """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -5899,7 +5981,8 @@ def pivot(self, index=None, columns=None, values=None) -> "DataFrame": If dict is passed, the key is column to aggregate and value is function or list of functions. fill_value : scalar, default None - Value to replace missing values with. + Value to replace missing values with (in the resulting pivot table, + after aggregation). margins : bool, default False Add all row / columns (e.g. for subtotal / grand totals). dropna : bool, default True @@ -6459,7 +6542,7 @@ def melt( # ---------------------------------------------------------------------- # Time series-related - def diff(self, periods=1, axis=0) -> "DataFrame": + def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": """ First discrete difference of element. @@ -7803,11 +7886,15 @@ def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): - dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M") + assert filter_type is None or filter_type == "bool", filter_type + + dtype_is_dt = self.dtypes.apply( + lambda x: is_datetime64_any_dtype(x) or is_period_dtype(x) + ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): warnings.warn( "DataFrame.mean and DataFrame.median with numeric_only=None " - "will include datetime64 and datetime64tz columns in a " + "will include datetime64, datetime64tz, and PeriodDtype columns in a " "future version.", FutureWarning, stacklevel=3, @@ -7828,7 +7915,7 @@ def f(x): return op(x, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): - if filter_type is None or filter_type == "numeric": + if filter_type is None: data = self._get_numeric_data() elif filter_type == "bool": if axis_matters: @@ -7868,18 +7955,18 @@ def blk_func(values): assert len(res) == max(list(res.keys())) + 1, res.keys() out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns + if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): + # FIXME: needs_i8_conversion check is kludge, not sure + # why it is necessary in this case and this case alone + out[:] = coerce_to_dtypes(out.values, df.dtypes) return out if numeric_only is None: - values = self.values + data = self + values = data.values try: result = f(values) - if filter_type == "bool" and is_object_dtype(values) and axis is None: - # work around https://github.com/numpy/numpy/issues/10489 - # TODO: combine with hasattr(result, 'dtype') further down - # hard since we don't have `values` down there. - result = np.bool_(result) except TypeError: # e.g. in nanops trying to convert strs to float @@ -7905,30 +7992,36 @@ def blk_func(values): # TODO: why doesnt axis matter here? 
data = _get_data(axis_matters=False) - with np.errstate(all="ignore"): - result = f(data.values) labels = data._get_agg_axis(axis) + + values = data.values + with np.errstate(all="ignore"): + result = f(values) else: if numeric_only: data = _get_data(axis_matters=True) + labels = data._get_agg_axis(axis) values = data.values - labels = data._get_agg_axis(axis) else: - values = self.values + data = self + values = data.values result = f(values) - if hasattr(result, "dtype") and is_object_dtype(result.dtype): + if filter_type == "bool" and is_object_dtype(values) and axis is None: + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: can we de-duplicate parts of this with the next block? + result = np.bool_(result) + elif hasattr(result, "dtype") and is_object_dtype(result.dtype): try: - if filter_type is None or filter_type == "numeric": + if filter_type is None: result = result.astype(np.float64) elif filter_type == "bool" and notna(result).all(): result = result.astype(np.bool_) except (ValueError, TypeError): - # try to coerce to the original dtypes item by item if we can if axis == 0: - result = coerce_to_dtypes(result, self.dtypes) + result = coerce_to_dtypes(result, data.dtypes) if constructor is not None: result = self._constructor_sliced(result, index=labels) @@ -8005,6 +8098,35 @@ def idxmin(self, axis=0, skipna=True) -> Series: Notes ----- This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object """ axis = self._get_axis_number(axis) indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) @@ -8043,6 +8165,35 @@ def idxmax(self, axis=0, skipna=True) -> Series: Notes ----- This method is the DataFrame version of ``ndarray.argmax``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``.
+ + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object """ axis = self._get_axis_number(axis) indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) @@ -8370,14 +8521,12 @@ def isin(self, values) -> "DataFrame": from pandas.core.reshape.concat import concat values = collections.defaultdict(list, values) - return self._ensure_type( - concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), - axis=1, - ) + return concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, ) elif isinstance(values, Series): if not values.index.is_unique: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f0147859cae97..8c6a5c9d020b4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -30,7 +30,7 @@ from pandas._config import config -from pandas._libs import Timestamp, iNaT, lib +from pandas._libs import Timestamp, lib from pandas._typing import ( Axis, FilePathOrBuffer, @@ -72,7 +72,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_period_arraylike, is_re_compilable, is_scalar, is_timedelta64_dtype, @@ -355,7 +354,7 @@ def _get_axis_number(cls, axis): return cls._AXIS_NUMBERS[axis] except KeyError: pass - raise ValueError(f"No axis named {axis} for object type {cls}") + raise ValueError(f"No axis named {axis} for object type {cls.__name__}") @classmethod def _get_axis_name(cls, axis): @@ -368,7 +367,7 @@ def _get_axis_name(cls, axis): return cls._AXIS_NAMES[axis] except KeyError: pass - raise ValueError(f"No axis named {axis} for object type {cls}") + raise ValueError(f"No axis named {axis} for object type {cls.__name__}") def _get_axis(self, axis): name = self._get_axis_name(axis) @@ -968,7 +967,6 @@ def rename( continue ax = self._get_axis(axis_no) - baxis = self._get_block_manager_axis(axis_no) f = com.get_rename_function(replacements) if level is not None: @@ -985,9 +983,8 @@ def rename( ] raise KeyError(f"{missing_labels} not found in axis") - result._data = result._data.rename_axis( - f, axis=baxis, copy=copy, level=level - ) + new_index = ax._transform_index(f, level) + result.set_axis(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: @@ -1213,7 +1210,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): >>> df.index = pd.MultiIndex.from_product( ... [["mammal"], ['dog', 'cat', 'monkey']]) >>> df._set_axis_name(["type", "name"]) - legs + num_legs type name mammal dog 4 cat 4 @@ -1342,7 +1339,7 @@ def __neg__(self): def __pos__(self): values = self._values - if is_bool_dtype(values) or is_period_arraylike(values): + if is_bool_dtype(values): arr = values elif ( is_numeric_dtype(values) @@ -1915,117 +1912,7 @@ def _repr_data_resource_(self): %(klass)s in Markdown-friendly format. """ - _shared_docs[ - "to_excel" - ] = """ - Write %(klass)s to an Excel sheet. - - To write a single %(klass)s to an Excel .xlsx file it is only necessary to - specify a target file name. To write to multiple sheets it is necessary to - create an `ExcelWriter` object with a target file name, and specify a sheet - in the file to write to. - - Multiple sheets may be written to by specifying unique `sheet_name`. - With all data written to the file it is necessary to save the changes. - Note that creating an `ExcelWriter` object with a file name that already - exists will result in the contents of the existing file being erased. 
- - Parameters - ---------- - excel_writer : str or ExcelWriter object - File path or existing ExcelWriter. - sheet_name : str, default 'Sheet1' - Name of sheet which will contain DataFrame. - na_rep : str, default '' - Missing data representation. - float_format : str, optional - Format string for floating point numbers. For example - ``float_format="%%.2f"`` will format 0.1234 to 0.12. - columns : sequence or list of str, optional - Columns to write. - header : bool or list of str, default True - Write out the column names. If a list of string is given it is - assumed to be aliases for the column names. - index : bool, default True - Write row names (index). - index_label : str or sequence, optional - Column label for index column(s) if desired. If not specified, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the DataFrame uses MultiIndex. - startrow : int, default 0 - Upper left cell row to dump data frame. - startcol : int, default 0 - Upper left cell column to dump data frame. - engine : str, optional - Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this - via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and - ``io.excel.xlsm.writer``. - merge_cells : bool, default True - Write MultiIndex and Hierarchical Rows as merged cells. - encoding : str, optional - Encoding of the resulting excel file. Only necessary for xlwt, - other writers support unicode natively. - inf_rep : str, default 'inf' - Representation for infinity (there is no native representation for - infinity in Excel). - verbose : bool, default True - Display more information in the error logs. - freeze_panes : tuple of int (length 2), optional - Specifies the one-based bottommost row and rightmost column that - is to be frozen. - - See Also - -------- - to_csv : Write DataFrame to a comma-separated values (csv) file. - ExcelWriter : Class for writing DataFrame objects into excel sheets. - read_excel : Read an Excel file into a pandas DataFrame. - read_csv : Read a comma-separated values (csv) file into DataFrame. - - Notes - ----- - For compatibility with :meth:`~DataFrame.to_csv`, - to_excel serializes lists and dicts to strings before writing. - - Once a workbook has been saved it is not possible write further data - without rewriting the whole workbook. - - Examples - -------- - - Create, write to and save a workbook: - - >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) - >>> df1.to_excel("output.xlsx") # doctest: +SKIP - - To specify the sheet name: - - >>> df1.to_excel("output.xlsx", - ... sheet_name='Sheet_name_1') # doctest: +SKIP - - If you wish to write to more than one sheet in the workbook, it is - necessary to specify an ExcelWriter object: - - >>> df2 = df1.copy() - >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP - ... df1.to_excel(writer, sheet_name='Sheet_name_1') - ... df2.to_excel(writer, sheet_name='Sheet_name_2') - - ExcelWriter can also be used to append to an existing Excel file: - - >>> with pd.ExcelWriter('output.xlsx', - ... mode='a') as writer: # doctest: +SKIP - ... 
df.to_excel(writer, sheet_name='Sheet_name_3')
-
-    To set the library that is used to write the Excel file,
-    you can pass the `engine` keyword (the default engine is
-    automatically chosen depending on the file extension):
-
-    >>> df1.to_excel('output1.xlsx', engine='xlsxwriter')  # doctest: +SKIP
-    """
-
-    @Appender(_shared_docs["to_excel"] % dict(klass="object"))
+    @doc(klass="object")
     def to_excel(
         self,
         excel_writer,
@@ -2045,6 +1932,114 @@ def to_excel(
         verbose=True,
         freeze_panes=None,
     ) -> None:
+        """
+        Write {klass} to an Excel sheet.
+
+        To write a single {klass} to an Excel .xlsx file it is only necessary to
+        specify a target file name. To write to multiple sheets it is necessary to
+        create an `ExcelWriter` object with a target file name, and specify a sheet
+        in the file to write to.
+
+        Multiple sheets may be written to by specifying unique `sheet_name`.
+        With all data written to the file it is necessary to save the changes.
+        Note that creating an `ExcelWriter` object with a file name that already
+        exists will result in the contents of the existing file being erased.
+
+        Parameters
+        ----------
+        excel_writer : str or ExcelWriter object
+            File path or existing ExcelWriter.
+        sheet_name : str, default 'Sheet1'
+            Name of sheet which will contain DataFrame.
+        na_rep : str, default ''
+            Missing data representation.
+        float_format : str, optional
+            Format string for floating point numbers. For example
+            ``float_format="%.2f"`` will format 0.1234 to 0.12.
+        columns : sequence or list of str, optional
+            Columns to write.
+        header : bool or list of str, default True
+            Write out the column names. If a list of string is given it is
+            assumed to be aliases for the column names.
+        index : bool, default True
+            Write row names (index).
+        index_label : str or sequence, optional
+            Column label for index column(s) if desired. If not specified, and
+            `header` and `index` are True, then the index names are used. A
+            sequence should be given if the DataFrame uses MultiIndex.
+        startrow : int, default 0
+            Upper left cell row to dump data frame.
+        startcol : int, default 0
+            Upper left cell column to dump data frame.
+        engine : str, optional
+            Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
+            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
+            ``io.excel.xlsm.writer``.
+        merge_cells : bool, default True
+            Write MultiIndex and Hierarchical Rows as merged cells.
+        encoding : str, optional
+            Encoding of the resulting excel file. Only necessary for xlwt,
+            other writers support unicode natively.
+        inf_rep : str, default 'inf'
+            Representation for infinity (there is no native representation for
+            infinity in Excel).
+        verbose : bool, default True
+            Display more information in the error logs.
+        freeze_panes : tuple of int (length 2), optional
+            Specifies the one-based bottommost row and rightmost column that
+            is to be frozen.
+
+        See Also
+        --------
+        to_csv : Write DataFrame to a comma-separated values (csv) file.
+        ExcelWriter : Class for writing DataFrame objects into excel sheets.
+        read_excel : Read an Excel file into a pandas DataFrame.
+        read_csv : Read a comma-separated values (csv) file into DataFrame.
+
+        Notes
+        -----
+        For compatibility with :meth:`~DataFrame.to_csv`,
+        to_excel serializes lists and dicts to strings before writing.
+
+        Once a workbook has been saved it is not possible to write further data
+        without rewriting the whole workbook.
+ + Examples + -------- + + Create, write to and save a workbook: + + >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df1.to_excel("output.xlsx") # doctest: +SKIP + + To specify the sheet name: + + >>> df1.to_excel("output.xlsx", + ... sheet_name='Sheet_name_1') # doctest: +SKIP + + If you wish to write to more than one sheet in the workbook, it is + necessary to specify an ExcelWriter object: + + >>> df2 = df1.copy() + >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name='Sheet_name_1') + ... df2.to_excel(writer, sheet_name='Sheet_name_2') + + ExcelWriter can also be used to append to an existing Excel file: + + >>> with pd.ExcelWriter('output.xlsx', + ... mode='a') as writer: # doctest: +SKIP + ... df.to_excel(writer, sheet_name='Sheet_name_3') + + To set the library that is used to write the Excel file, + you can pass the `engine` keyword (the default engine is + automatically chosen depending on the file extension): + + >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP + """ + df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter @@ -2185,45 +2180,141 @@ def to_json( Examples -------- - >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) - >>> df.to_json(orient='split') - '{"columns":["col 1","col 2"], - "index":["row 1","row 2"], - "data":[["a","b"],["c","d"]]}' + >>> import json + >>> df = pd.DataFrame( + ... [["a", "b"], ["c", "d"]], + ... index=["row 1", "row 2"], + ... columns=["col 1", "col 2"], + ... ) + + >>> result = df.to_json(orient="split") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "columns": [ + "col 1", + "col 2" + ], + "index": [ + "row 1", + "row 2" + ], + "data": [ + [ + "a", + "b" + ], + [ + "c", + "d" + ] + ] + } Encoding/decoding a Dataframe using ``'records'`` formatted JSON. Note that index labels are not preserved with this encoding. 
- >>> df.to_json(orient='records') - '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + >>> result = df.to_json(orient="records") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + [ + { + "col 1": "a", + "col 2": "b" + }, + { + "col 1": "c", + "col 2": "d" + } + ] Encoding/decoding a Dataframe using ``'index'`` formatted JSON: - >>> df.to_json(orient='index') - '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + >>> result = df.to_json(orient="index") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "row 1": { + "col 1": "a", + "col 2": "b" + }, + "row 2": { + "col 1": "c", + "col 2": "d" + } + } Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: - >>> df.to_json(orient='columns') - '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}' + >>> result = df.to_json(orient="columns") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "col 1": { + "row 1": "a", + "row 2": "c" + }, + "col 2": { + "row 1": "b", + "row 2": "d" + } + } Encoding/decoding a Dataframe using ``'values'`` formatted JSON: - >>> df.to_json(orient='values') - '[["a","b"],["c","d"]]' - - Encoding with Table Schema + >>> result = df.to_json(orient="values") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + [ + [ + "a", + "b" + ], + [ + "c", + "d" + ] + ] - >>> df.to_json(orient='table') - '{"schema": {"fields": [{"name": "index", "type": "string"}, - {"name": "col 1", "type": "string"}, - {"name": "col 2", "type": "string"}], - "primaryKey": "index", - "pandas_version": "0.20.0"}, - "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, - {"index": "row 2", "col 1": "c", "col 2": "d"}]}' + Encoding with Table Schema: + + >>> result = df.to_json(orient="table") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "schema": { + "fields": [ + { + "name": "index", + "type": "string" + }, + { + "name": "col 1", + "type": "string" + }, + { + "name": "col 2", + "type": "string" + } + ], + "primaryKey": [ + "index" + ], + "pandas_version": "0.20.0" + }, + "data": [ + { + "index": "row 1", + "col 1": "a", + "col 2": "b" + }, + { + "index": "row 2", + "col 1": "c", + "col 2": "d" + } + ] + } """ from pandas.io import json @@ -2650,7 +2741,8 @@ def to_clipboard( Copy the contents of a DataFrame to the clipboard. >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) - >>> df.to_clipboard(sep=',') + + >>> df.to_clipboard(sep=',') # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # ,A,B,C ... # 0,1,2,3 @@ -2659,7 +2751,7 @@ def to_clipboard( We can omit the index by passing the keyword `index` and setting it to false. - >>> df.to_clipboard(sep=',', index=False) + >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # A,B,C ... # 1,2,3 @@ -3483,6 +3575,10 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result + def _iset_item(self, loc: int, value) -> None: + self._data.iset(loc, value) + self._clear_item_cache() + def _set_item(self, key, value) -> None: self._data.set(key, value) self._clear_item_cache() @@ -4460,6 +4556,10 @@ def filter( >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), ... index=['mouse', 'rabbit'], ... 
columns=['one', 'two', 'three'])
+        >>> df
+                one  two  three
+        mouse     1    2      3
+        rabbit    4    5      6

         >>> # select columns by name
         >>> df.filter(items=['one', 'three'])
@@ -4692,9 +4792,16 @@ def sample(
             If weights do not sum to 1, they will be normalized to sum to 1.
             Missing values in the weights column will be treated as zero.
             Infinite values not allowed.
-        random_state : int or numpy.random.RandomState, optional
-            Seed for the random number generator (if int), or numpy RandomState
-            object.
+        random_state : int, array-like, BitGenerator, np.random.RandomState, optional
+            If int, array-like, or BitGenerator (NumPy>=1.17), seed for
+            random number generator.
+            If np.random.RandomState, use as numpy RandomState object.
+
+            .. versionchanged:: 1.1.0
+
+                array-like and BitGenerator (for NumPy>=1.17) objects are now
+                passed to np.random.RandomState() as seed.
+
         axis : {0 or ‘index’, 1 or ‘columns’, None}, default None
             Axis to sample. Accepts axis number or name. Default is stat axis
             for given data type (0 for Series and DataFrames).
@@ -4884,18 +4991,17 @@ def sample(

         Notes
         -----
-
         Use ``.pipe`` when chaining together functions that expect
         Series, DataFrames or GroupBy objects. Instead of writing

-        >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
+        >>> func(g(h(df), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

         You can write

         >>> (df.pipe(h)
         ...    .pipe(g, arg1=a)
-        ...    .pipe(f, arg2=b, arg3=c)
-        ... )
+        ...    .pipe(func, arg2=b, arg3=c)
+        ... )  # doctest: +SKIP

         If you have a function that takes the data as (say) the second
         argument, pass a tuple indicating which keyword expects the
@@ -4903,8 +5009,8 @@ def sample(

         >>> (df.pipe(h)
         ...    .pipe(g, arg1=a)
-        ...    .pipe((f, 'arg2'), arg1=a, arg3=c)
-        ... )
+        ...    .pipe((func, 'arg2'), arg1=a, arg3=c)
+        ... )  # doctest: +SKIP
         """

     @Appender(_shared_docs["pipe"] % _shared_doc_kwargs)
@@ -5253,7 +5359,7 @@ def values(self) -> np.ndarray:
         dtype: object
         >>> df.values
         array([[  3,  94,  31],
-               [ 29, 170, 115]], dtype=int64)
+               [ 29, 170, 115]])

         A DataFrame with mixed type columns(e.g., str/object, int64, float32)
         results in an ndarray of the broadest type that accommodates these
@@ -5281,26 +5387,6 @@ def _values(self) -> np.ndarray:
         """internal implementation"""
         return self.values

-    def _internal_get_values(self) -> np.ndarray:
-        """
-        Return an ndarray after converting sparse values to dense.
-
-        This is the same as ``.values`` for non-sparse data. For sparse
-        data contained in a `SparseArray`, the data are first
-        converted to a dense representation.
-
-        Returns
-        -------
-        numpy.ndarray
-            Numpy representation of DataFrame.
-
-        See Also
-        --------
-        values : Numpy representation of DataFrame.
-        SparseArray : Container for sparse data.
-        """
-        return self.values
-
     @property
     def dtypes(self):
         """
@@ -5980,14 +6066,13 @@ def fillna(
                 downcast=downcast,
             )
         else:
-            if len(self._get_axis(axis)) == 0:
-                return self
-
             if self.ndim == 1:
                 if isinstance(value, (dict, ABCSeries)):
                     value = create_series_with_explicit_dtype(
                         value, dtype_if_empty=object
                     )
+                    value = value.reindex(self.index, copy=False)
+                    value = value._values
                 elif not is_list_like(value):
                     pass
                 else:
@@ -7390,6 +7475,7 @@ def asfreq(
         Parameters
         ----------
         freq : DateOffset or str
+            Frequency DateOffset or string.
         method : {'backfill'/'bfill', 'pad'/'ffill'}, default None
             Method to use for filling holes in reindexed Series (note this
             does not fill NaNs that already were present):
@@ -7407,11 +7493,12 @@ def asfreq(

         Returns
         -------
-        converted : same type as caller
+        Same type as caller
+            Object converted to the specified frequency.
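The reworded ``asfreq`` return description above is easiest to see with a tiny upsampling run. A sketch, assuming nothing beyond an ordinary datetime-indexed Series (the data here is illustrative, not from the patch):

    import pandas as pd

    # Two points two days apart, upsampled to daily frequency; the new
    # midpoint is NaN unless a fill method is supplied.
    s = pd.Series([1.0, 3.0], index=pd.date_range("2020-01-01", periods=2, freq="2D"))
    s.asfreq("D")                  # 2020-01-02 is NaN
    s.asfreq("D", method="ffill")  # 2020-01-02 filled with 1.0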
See Also -------- - reindex + reindex : Conform DataFrame to new index with optional filling logic. Notes ----- @@ -7919,15 +8006,21 @@ def resample( def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ - Method to subset initial periods of time series data based on a date offset. + Select initial periods of time series data based on a date offset. + + When having a DataFrame with dates as index, this function can + select the first few rows based on a date offset. Parameters ---------- - offset : str, DateOffset, dateutil.relativedelta + offset : str, DateOffset or dateutil.relativedelta + The offset length of the data that will be selected. For instance, + '1M' will display all the rows having their index within the first month. Returns ------- - subset : same type as caller + Series or DataFrame + A subset of the caller. Raises ------ @@ -7943,7 +8036,7 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: Examples -------- >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i) + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 1 @@ -7958,7 +8051,7 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: 2018-04-09 1 2018-04-11 2 - Notice the data for 3 first calender days were returned, not the first + Notice the data for 3 first calendar days were returned, not the first 3 days observed in the dataset, and therefore data for 2018-04-13 was not returned. """ @@ -8020,7 +8113,7 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: 2018-04-13 3 2018-04-15 4 - Notice the data for 3 last calender days were returned, not the last + Notice the data for 3 last calendar days were returned, not the last 3 observed days in the dataset, and therefore data for 2018-04-11 was not returned. """ @@ -8337,9 +8430,9 @@ def _align_frame( ) if method is not None: - left = self._ensure_type( - left.fillna(method=method, axis=fill_axis, limit=limit) - ) + _left = left.fillna(method=method, axis=fill_axis, limit=limit) + assert _left is not None # needed for mypy + left = _left right = right.fillna(method=method, axis=fill_axis, limit=limit) # if DatetimeIndex have different tz, convert to UTC @@ -8476,12 +8569,15 @@ def _where( for dt in cond.dtypes: if not is_bool_dtype(dt): raise ValueError(msg.format(dtype=dt)) + else: + # GH#21947 we have an empty DataFrame, could be object-dtype + cond = cond.astype(bool) cond = -cond if inplace else cond # try to align with other try_quick = True - if hasattr(other, "align"): + if isinstance(other, NDFrame): # align with me if other.ndim <= self.ndim: @@ -8556,12 +8652,7 @@ def _where( self._check_inplace_setting(other) new_data = self._data.putmask( - mask=cond, - new=other, - align=align, - inplace=True, - axis=block_axis, - transpose=self._AXIS_REVERSED, + mask=cond, new=other, align=align, axis=block_axis, ) self._update_inplace(new_data) @@ -9522,12 +9613,13 @@ def describe( ... np.datetime64("2010-01-01") ... ]) >>> s.describe() - count 3 - unique 2 - top 2010-01-01 00:00:00 - freq 2 - first 2000-01-01 00:00:00 - last 2010-01-01 00:00:00 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields @@ -9550,11 +9642,11 @@ def describe( Describing all columns of a ``DataFrame`` regardless of data type. 
- >>> df.describe(include='all') - categorical numeric object + >>> df.describe(include='all') # doctest: +SKIP + categorical numeric object count 3 3.0 3 unique 3 NaN 3 - top f NaN c + top f NaN a freq 1 NaN 1 mean NaN 2.0 NaN std NaN 1.0 NaN @@ -9593,11 +9685,11 @@ def describe( Including only string columns in a ``DataFrame`` description. - >>> df.describe(include=[np.object]) + >>> df.describe(include=[np.object]) # doctest: +SKIP object count 3 unique 3 - top c + top a freq 1 Including only categorical columns from a ``DataFrame`` description. @@ -9611,16 +9703,16 @@ def describe( Excluding numeric columns from a ``DataFrame`` description. - >>> df.describe(exclude=[np.number]) + >>> df.describe(exclude=[np.number]) # doctest: +SKIP categorical object count 3 3 unique 3 3 - top f c + top f a freq 1 1 Excluding object columns from a ``DataFrame`` description. - >>> df.describe(exclude=[np.object]) + >>> df.describe(exclude=[np.object]) # doctest: +SKIP categorical numeric count 3 3.0 unique 3 NaN @@ -9729,7 +9821,7 @@ def describe_1d(data): ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows - names: List[Optional[Hashable]] = [] + names: List[Label] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: @@ -9871,9 +9963,9 @@ def pct_change( if fill_method is None: data = self else: - data = self._ensure_type( - self.fillna(method=fill_method, axis=axis, limit=limit) - ) + _data = self.fillna(method=fill_method, axis=axis, limit=limit) + assert _data is not None # needed for mypy + data = _data rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 if freq is not None: @@ -9996,8 +10088,6 @@ def mad(self, axis=None, skipna=None, level=None): desc="minimum", accum_func=np.minimum.accumulate, accum_func_name="min", - mask_a=np.inf, - mask_b=np.nan, examples=_cummin_examples, ) cls.cumsum = _make_cum_function( @@ -10009,8 +10099,6 @@ def mad(self, axis=None, skipna=None, level=None): desc="sum", accum_func=np.cumsum, accum_func_name="sum", - mask_a=0.0, - mask_b=np.nan, examples=_cumsum_examples, ) cls.cumprod = _make_cum_function( @@ -10022,8 +10110,6 @@ def mad(self, axis=None, skipna=None, level=None): desc="product", accum_func=np.cumprod, accum_func_name="prod", - mask_a=1.0, - mask_b=np.nan, examples=_cumprod_examples, ) cls.cummax = _make_cum_function( @@ -10035,8 +10121,6 @@ def mad(self, axis=None, skipna=None, level=None): desc="maximum", accum_func=np.maximum.accumulate, accum_func_name="max", - mask_a=-np.inf, - mask_b=np.nan, examples=_cummax_examples, ) @@ -11076,8 +11160,6 @@ def _make_cum_function( desc: str, accum_func: Callable, accum_func_name: str, - mask_a: float, - mask_b: float, examples: str, ) -> Callable: @Substitution( @@ -11099,61 +11181,15 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T - def na_accum_func(blk_values): - # We will be applying this function to block values - if blk_values.dtype.kind in ["m", "M"]: - # GH#30460, GH#29058 - # numpy 1.18 started sorting NaTs at the end instead of beginning, - # so we need to work around to maintain backwards-consistency. 
- orig_dtype = blk_values.dtype - - # We need to define mask before masking NaTs - mask = isna(blk_values) - - if accum_func == np.minimum.accumulate: - # Note: the accum_func comparison fails as an "is" comparison - y = blk_values.view("i8") - y[mask] = np.iinfo(np.int64).max - changed = True - else: - y = blk_values - changed = False - - result = accum_func(y.view("i8"), axis) - if skipna: - np.putmask(result, mask, iNaT) - elif accum_func == np.minimum.accumulate: - # Restore NaTs that we masked previously - nz = (~np.asarray(mask)).nonzero()[0] - if len(nz): - # everything up to the first non-na entry stays NaT - result[: nz[0]] = iNaT - - if changed: - # restore NaT elements - y[mask] = iNaT # TODO: could try/finally for this? - - if isinstance(blk_values, np.ndarray): - result = result.view(orig_dtype) - else: - # DatetimeArray - result = type(blk_values)._from_sequence(result, dtype=orig_dtype) - - elif skipna and not issubclass( - blk_values.dtype.type, (np.integer, np.bool_) - ): - vals = blk_values.copy().T - mask = isna(vals) - np.putmask(vals, mask, mask_a) - result = accum_func(vals, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(blk_values.T, axis) + def block_accum_func(blk_values): + values = blk_values.T if hasattr(blk_values, "T") else blk_values + + result = nanops.na_accum_func(values, accum_func, skipna=skipna) - # transpose back for ndarray, not for EA - return result.T if hasattr(result, "T") else result + result = result.T if hasattr(result, "T") else result + return result - result = self._data.apply(na_accum_func) + result = self._data.apply(block_accum_func) d = self._construct_axes_dict() d["copy"] = False diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 700d8d503d086..363286704ba95 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -98,6 +98,7 @@ def _gotitem(self, key, ndim, subset=None): [ "all", "any", + "corrwith", "count", "first", "idxmax", @@ -132,7 +133,6 @@ def _gotitem(self, key, ndim, subset=None): [ "backfill", "bfill", - "corrwith", "cumcount", "cummax", "cummin", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fb935c9065b83..4102b8527b6aa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -49,7 +49,7 @@ is_scalar, needs_i8_conversion, ) -from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna +from pandas.core.dtypes.missing import isna, notna from pandas.core.aggregation import ( is_multi_agg_with_relabel, @@ -589,7 +589,7 @@ def nunique(self, dropna: bool = True) -> Series: """ ids, _, _ = self.grouper.group_info - val = self.obj._internal_get_values() + val = self.obj._values codes, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((codes, ids)) @@ -657,7 +657,7 @@ def value_counts( ) ids, _, _ = self.grouper.group_info - val = self.obj._internal_get_values() + val = self.obj._values # groupby removes null keys from groupings mask = ids != -1 @@ -774,7 +774,7 @@ def count(self) -> Series: Count of values within each group. 
""" ids, _, ngroups = self.grouper.group_info - val = self.obj._internal_get_values() + val = self.obj._values mask = (ids != -1) & ~isna(val) ids = ensure_platform_int(ids) @@ -955,9 +955,11 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - result.columns = Index( - result.columns.levels[0], name=self._selected_obj.columns.name - ) + # select everything except for the last level, which is the one + # containing the name of the function(s), see GH 32040 + result.columns = result.columns.rename( + [self._selected_obj.columns.name] * result.columns.nlevels + ).droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1083,7 +1085,7 @@ def _cython_agg_blocks( result = type(block.values)._from_sequence( result.ravel(), dtype=block.values.dtype ) - except ValueError: + except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) @@ -1772,10 +1774,8 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - vals = ( - (mask & ~_isna_ndarraylike(np.atleast_2d(blk.get_values()))) - for blk in data.blocks - ) + # TODO(2DEA): reshape would not be necessary with 2D EAs + vals = ((mask & ~isna(blk.values).reshape(blk.shape)) for blk in data.blocks) locs = (blk.mgr_locs for blk in data.blocks) counted = ( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6362f11a3e032..19e51d05feb92 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1896,7 +1896,7 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: inference = np.int64 elif is_datetime64_dtype(vals): inference = "datetime64[ns]" - vals = vals.astype(np.float) + vals = np.asarray(vals).astype(np.float) return vals, inference @@ -2271,7 +2271,7 @@ def _get_cythonized_result( for idx, obj in enumerate(self._iterate_slices()): name = obj.name - values = obj._data._values + values = obj._values if aggregate: result_sz = ngroups diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7259268ac3f2b..577c874c9cbbe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -217,7 +217,7 @@ def indices(self): return self.groupings[0].indices else: codes_list = [ping.codes for ping in self.groupings] - keys = [com.values_from_object(ping.group_index) for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) @property diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 3858e750326b4..3d0e3699264a8 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -11,6 +11,7 @@ is_array_like, is_bool_dtype, is_extension_array_dtype, + is_integer, is_integer_dtype, is_list_like, ) @@ -20,6 +21,34 @@ # Indexer Identification +def is_valid_positional_slice(slc: slice) -> bool: + """ + Check if a slice object can be interpreted as a positional indexer. + + Parameters + ---------- + slc : slice + + Returns + ------- + bool + + Notes + ----- + A valid positional slice may also be interpreted as a label-based slice + depending on the index being sliced. + """ + + def is_int_or_none(val): + return val is None or is_integer(val) + + return ( + is_int_or_none(slc.start) + and is_int_or_none(slc.stop) + and is_int_or_none(slc.step) + ) + + def is_list_like_indexer(key) -> bool: """ Check if we have a list-like indexer that is *not* a NamedTuple. 
@@ -36,18 +65,26 @@ def is_list_like_indexer(key) -> bool: return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) -def is_scalar_indexer(indexer, arr_value) -> bool: +def is_scalar_indexer(indexer, ndim: int) -> bool: """ Return True if we are all scalar indexers. + Parameters + ---------- + indexer : object + ndim : int + Number of dimensions in the object being indexed. + Returns ------- bool """ - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + if isinstance(indexer, tuple): + if len(indexer) == ndim: + return all( + is_integer(x) or (isinstance(x, np.ndarray) and x.ndim == len(x) == 1) + for x in indexer + ) return False diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index db774a03c02f8..8cfe1f4ac469c 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -1,16 +1,17 @@ """ datetimelike delegation """ +from typing import TYPE_CHECKING + import numpy as np from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetime_arraylike, is_integer_dtype, is_list_like, - is_period_arraylike, + is_period_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCSeries @@ -21,9 +22,12 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex +if TYPE_CHECKING: + from pandas import Series # noqa:F401 + class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): - def __init__(self, data, orig): + def __init__(self, data: "Series", orig): if not isinstance(data, ABCSeries): raise TypeError( f"cannot convert an object of type {type(data)} to a datetimelike index" @@ -45,12 +49,8 @@ def _get_values(self): elif is_timedelta64_dtype(data.dtype): return TimedeltaIndex(data, copy=False, name=self.name) - else: - if is_period_arraylike(data): - # TODO: use to_period_array - return PeriodArray(data, copy=False) - if is_datetime_arraylike(data): - return DatetimeIndex(data, copy=False, name=self.name) + elif is_period_dtype(data): + return PeriodArray(data, copy=False) raise TypeError( f"cannot convert an object of type {type(data)} to a datetimelike index" @@ -137,7 +137,7 @@ class DatetimeProperties(Properties): Raises TypeError if the Series does not contain datetimelike values. """ - def to_pydatetime(self): + def to_pydatetime(self) -> np.ndarray: """ Return the data as an array of native Python datetime objects. @@ -209,7 +209,7 @@ class TimedeltaProperties(Properties): Raises TypeError if the Series does not contain datetimelike values. """ - def to_pytimedelta(self): + def to_pytimedelta(self) -> np.ndarray: """ Return an array of native `datetime.timedelta` objects. @@ -271,7 +271,7 @@ def components(self): 2 0 0 0 2 0 0 0 3 0 0 0 3 0 0 0 4 0 0 0 4 0 0 0 - """ # noqa: E501 + """ return self._get_values().components.set_index(self._parent.index) @property @@ -303,7 +303,7 @@ class PeriodProperties(Properties): class CombinedDatetimelikeProperties( DatetimeProperties, TimedeltaProperties, PeriodProperties ): - def __new__(cls, data): + def __new__(cls, data: "Series"): # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is # appropriate. 
Since we're checking the dtypes anyway, we'll just @@ -330,9 +330,7 @@ def __new__(cls, data): return DatetimeProperties(data, orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(data, orig) - elif is_period_arraylike(data): + elif is_period_dtype(data): return PeriodProperties(data, orig) - elif is_datetime_arraylike(data): - return DatetimeProperties(data, orig) raise AttributeError("Can only use .dt accessor with datetimelike values") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6f44b5abf5b04..83064fe22eaff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Optional, Union +from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Union import warnings import numpy as np @@ -29,7 +29,6 @@ ensure_platform_int, is_bool, is_bool_dtype, - is_categorical, is_categorical_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -464,10 +463,10 @@ def _simple_new(cls, values, name: Label = None): # _index_data is a (temporary?) fix to ensure that the direct data # manipulation we do in `_libs/reduction.pyx` continues to work. # We need access to the actual ndarray, since we're messing with - # data buffers and strides. We don't re-use `_ndarray_values`, since - # we actually set this value too. + # data buffers and strides. result._index_data = values result._name = name + result._cache = {} return result._reset_identity() @@ -498,11 +497,13 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - + cache = self._cache.copy() if values is None else {} if values is None: values = self.values - return self._simple_new(values, name=name) + result = self._simple_new(values, name=name) + result._cache = cache + return result def _shallow_copy_with_infer(self, values, **kwargs): """ @@ -528,6 +529,9 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._constructor(values, **attributes) except (TypeError, ValueError): pass + + # Remove tz so Index will try non-DatetimeIndex inference + attributes.pop("tz", None) return Index(values, **attributes) def _update_inplace(self, result, **kwargs): @@ -567,10 +571,10 @@ def _cleanup(self): def _engine(self): # property, for now, slow to look up - # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so + # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
- _ndarray_values = self._ndarray_values - return self._engine_type(lambda: _ndarray_values, len(self)) + target_values = self._get_engine_target() + return self._engine_type(lambda: target_values, len(self)) # -------------------------------------------------------------------- # Array-Like Methods @@ -619,7 +623,8 @@ def ravel(self, order="C"): -------- numpy.ndarray.ravel """ - return self._ndarray_values.ravel(order=order) + values = self._get_engine_target() + return values.ravel(order=order) def view(self, cls=None): @@ -2163,7 +2168,7 @@ def dropna(self, how="any"): Returns ------- - valid : Index + Index """ if how not in ("any", "all"): raise ValueError(f"invalid how option: {how}") @@ -2972,7 +2977,7 @@ def get_indexer( "backfill or nearest reindexing" ) - indexer = self._engine.get_indexer(target._ndarray_values) + indexer = self._engine.get_indexer(target._get_engine_target()) return ensure_platform_int(indexer) @@ -2986,19 +2991,20 @@ def _convert_tolerance(self, tolerance, target): def _get_fill_indexer( self, target: "Index", method: str_t, limit=None, tolerance=None ) -> np.ndarray: + + target_values = target._get_engine_target() + if self.is_monotonic_increasing and target.is_monotonic_increasing: engine_method = ( self._engine.get_pad_indexer if method == "pad" else self._engine.get_backfill_indexer ) - indexer = engine_method(target._ndarray_values, limit) + indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance( - target._ndarray_values, indexer, tolerance - ) + indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer def _get_fill_indexer_searchsorted( @@ -3137,8 +3143,18 @@ def is_int(v): pass if com.is_null_slice(key): + # It doesn't matter if we are positional or label based indexer = key elif is_positional: + if kind == "loc": + # GH#16121, GH#24612, GH#31810 + warnings.warn( + "Slicing a positional slice with .loc is not supported, " + "and will raise TypeError in a future version. 
" + "Use .loc with labels or .iloc with positions instead.", + FutureWarning, + stacklevel=6, + ) indexer = key else: indexer = self.slice_indexer(start, stop, step, kind=kind) @@ -3266,13 +3282,11 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = _ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - attrs = self._get_attributes_dict() - attrs.pop("freq", None) # don't preserve freq if isinstance(self, ABCRangeIndex): values = range(0) else: values = self._data[:0] # appropriately-dtyped empty array - target = self._simple_new(values, **attrs) + target = self._simple_new(values, name=self.name) else: target = ensure_index(target) @@ -3385,6 +3399,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) ------- join_index, (left_indexer, right_indexer) """ + other = ensure_index(other) self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) @@ -3404,8 +3419,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) other, level, how=how, return_indexers=return_indexers ) - other = ensure_index(other) - if len(other) == 0 and how in ("left", "outer"): join_index = self._shallow_copy() if return_indexers: @@ -3567,16 +3580,26 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how="left", return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers + # We only get here if dtypes match + assert self.dtype == other.dtype + + if is_extension_array_dtype(self.dtype): + lvalues = self._data._values_for_argsort() + rvalues = other._data._values_for_argsort() + else: + lvalues = self._values + rvalues = other._values + left_idx, right_idx = _get_join_indexers( - [self._ndarray_values], [other._ndarray_values], how=how, sort=True + [lvalues], [rvalues], how=how, sort=True ) left_idx = ensure_platform_int(left_idx) right_idx = ensure_platform_int(right_idx) - join_index = np.asarray(self._ndarray_values.take(left_idx)) + join_index = np.asarray(lvalues.take(left_idx)) mask = left_idx == -1 - np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + np.putmask(join_index, mask, rvalues.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3727,6 +3750,9 @@ def _get_leaf_sorter(labels): return join_index def _join_monotonic(self, other, how="left", return_indexers=False): + # We only get here with matching dtypes + assert other.dtype == self.dtype + if self.equals(other): ret_index = other if how == "right" else self if return_indexers: @@ -3734,8 +3760,12 @@ def _join_monotonic(self, other, how="left", return_indexers=False): else: return ret_index - sv = self._ndarray_values - ov = other._ndarray_values + if is_extension_array_dtype(self.dtype): + sv = self._data._values_for_argsort() + ov = other._data._values_for_argsort() + else: + sv = self._values + ov = other._values if self.is_unique and other.is_unique: # We can perform much better than the general case @@ -3815,75 +3845,32 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: """ The best array representation. - This is an ndarray or ExtensionArray. This differs from - ``_ndarray_values``, which always returns an ndarray. + This is an ndarray or ExtensionArray. 
-        Both ``_values`` and ``_ndarray_values`` are consistent between
-        ``Series`` and ``Index`` (except for datetime64[ns], which returns
-        a DatetimeArray for _values on the Index, but ndarray[M8ns] on the
-        Series).
+        ``_values`` are consistent between ``Series`` and ``Index``.

         It may differ from the public '.values' method.

-        index             | values          | _values       | _ndarray_values |
-        ----------------- | --------------- | ------------- | --------------- |
-        Index             | ndarray         | ndarray       | ndarray         |
-        CategoricalIndex  | Categorical     | Categorical   | ndarray[int]    |
-        DatetimeIndex     | ndarray[M8ns]   | DatetimeArray | ndarray[M8ns]   |
-        DatetimeIndex[tz] | ndarray[M8ns]   | DatetimeArray | ndarray[M8ns]   |
-        PeriodIndex       | ndarray[object] | PeriodArray   | ndarray[int]    |
-        IntervalIndex     | IntervalArray   | IntervalArray | ndarray[object] |
+        index             | values          | _values       |
+        ----------------- | --------------- | ------------- |
+        Index             | ndarray         | ndarray       |
+        CategoricalIndex  | Categorical     | Categorical   |
+        DatetimeIndex     | ndarray[M8ns]   | DatetimeArray |
+        DatetimeIndex[tz] | ndarray[M8ns]   | DatetimeArray |
+        PeriodIndex       | ndarray[object] | PeriodArray   |
+        IntervalIndex     | IntervalArray   | IntervalArray |

         See Also
         --------
         values
-        _ndarray_values
         """
         return self._data

-    def _internal_get_values(self) -> np.ndarray:
+    def _get_engine_target(self) -> np.ndarray:
         """
-        Return `Index` data as an `numpy.ndarray`.
-
-        Returns
-        -------
-        numpy.ndarray
-            A one-dimensional numpy array of the `Index` values.
-
-        See Also
-        --------
-        Index.values : The attribute that _internal_get_values wraps.
-
-        Examples
-        --------
-        Getting the `Index` values of a `DataFrame`:
-
-        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-        ...                   index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
-        >>> df
-           A  B  C
-        a  1  2  3
-        b  4  5  6
-        c  7  8  9
-        >>> df.index._internal_get_values()
-        array(['a', 'b', 'c'], dtype=object)
-
-        Standalone `Index` values:
-
-        >>> idx = pd.Index(['1', '2', '3'])
-        >>> idx._internal_get_values()
-        array(['1', '2', '3'], dtype=object)
-
-        `MultiIndex` arrays also have only one dimension:
-
-        >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']],
-        ...                                  names=('number', 'letter'))
-        >>> midx._internal_get_values()
-        array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object)
-        >>> midx._internal_get_values().ndim
-        1
+        Get the ndarray that we can pass to the IndexEngine constructor.
""" - return self.values + return self._values @Appender(IndexOpsMixin.memory_usage.__doc__) def memory_usage(self, deep: bool = False) -> int: @@ -4094,7 +4081,6 @@ def __getitem__(self, key): if com.is_bool_indexer(key): key = np.asarray(key, dtype=bool) - key = com.values_from_object(key) result = getitem(key) if not is_scalar(result): if np.ndim(result) > 1: @@ -4219,19 +4205,19 @@ def equals(self, other) -> bool: if not isinstance(other, Index): return False - if is_object_dtype(self) and not is_object_dtype(other): + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) if isinstance(other, ABCMultiIndex): # d-level MultiIndex can equal d-tuple Index - if not is_object_dtype(self.dtype): - if self.nlevels != other.nlevels: - return False + return other.equals(self) - return array_equivalent( - com.values_from_object(self), com.values_from_object(other) - ) + if is_extension_array_dtype(other.dtype): + # All EA-backed Index subclasses override equals + return other.equals(self) + + return array_equivalent(self._values, other._values) def identical(self, other) -> bool: """ @@ -4623,12 +4609,10 @@ def get_indexer_non_unique(self, target): if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if is_categorical(target): + if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) - elif self.is_all_dates and target.is_all_dates: # GH 30399 - tgt_values = target.asi8 else: - tgt_values = target._ndarray_values + tgt_values = target._get_engine_target() indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), missing @@ -4728,6 +4712,27 @@ def map(self, mapper, na_action=None): return Index(new_values, **attributes) + # TODO: De-duplicate with map, xref GH#32349 + def _transform_index(self, func, level=None) -> "Index": + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + Only apply function to one level of the MultiIndex if level is specified. + """ + if isinstance(self, ABCMultiIndex): + if level is not None: + items = [ + tuple(func(y) if i == level else y for i, y in enumerate(x)) + for x in self + ] + else: + items = [tuple(func(y) for y in x) for x in self] + return type(self).from_tuples(items, names=self.names) + else: + items = [func(x) for x in self] + return Index(items, name=self.name, tupleize_cols=False) + def isin(self, values, level=None): """ Return a boolean array where the index values are in `values`. @@ -5140,9 +5145,11 @@ def insert(self, loc: int, item): ------- new_index : Index """ - _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._ndarray_values - idx = np.concatenate((_self[:loc], item, _self[loc:])) + # Note: this method is overridden by all ExtensionIndex subclasses, + # so self is never backed by an EA. + arr = np.asarray(self) + item = self._coerce_scalar_to_index(item)._values + idx = np.concatenate((arr[:loc], item, arr[loc:])) return self._shallow_copy_with_infer(idx) def drop(self, labels, errors: str_t = "raise"): @@ -5546,7 +5553,7 @@ def default_index(n): return RangeIndex(0, n, name=None) -def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: +def maybe_extract_name(name, obj, cls) -> Label: """ If no name is passed, then extract it from data, validating hashability. 
""" diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d43ae8eb54818..52423c4008399 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -233,6 +233,7 @@ def _simple_new(cls, values: Categorical, name: Label = None): result._data = values result.name = name + result._cache = {} result._reset_identity() result._no_setting_name = False @@ -242,14 +243,9 @@ def _simple_new(cls, values: Categorical, name: Label = None): @Appender(Index._shallow_copy.__doc__) def _shallow_copy(self, values=None, name: Label = no_default): - name = self.name if name is no_default else name - - if values is None: - values = self.values - - cat = Categorical(values, dtype=self.dtype) - - return type(self)._simple_new(cat, name=name) + if values is not None: + values = Categorical(values, dtype=self.dtype) + return super()._shallow_copy(values=values, name=name) def _is_dtype_compat(self, other) -> bool: """ @@ -364,10 +360,6 @@ def __contains__(self, key: Any) -> bool: hash(key) return contains(self, key, container=self._engine) - def __array__(self, dtype=None) -> np.ndarray: - """ the array interface, return my values """ - return np.array(self._data, dtype=dtype) - @Appender(Index.astype.__doc__) def astype(self, dtype, copy=True): if is_interval_dtype(dtype): @@ -404,7 +396,7 @@ def _engine(self): def unique(self, level=None): if level is not None: self._validate_index_level(level) - result = self.values.unique() + result = self._values.unique() # Use _simple_new instead of _shallow_copy to ensure we keep dtype # of result, not self. return type(self)._simple_new(result, name=self.name) @@ -431,7 +423,7 @@ def where(self, cond, other=None): # 3. Rebuild CategoricalIndex. if other is None: other = self._na_value - values = np.where(cond, self.values, other) + values = np.where(cond, self._values, other) cat = Categorical(values, dtype=self.dtype) return type(self)._simple_new(cat, name=self.name) @@ -540,13 +532,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - if isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target): - if self.values.equals(target.values): + if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): + if self._values.equals(target._values): # we have the same codes codes = target.codes else: codes = _recode_for_categories( - target.codes, target.categories, self.values.categories + target.codes, target.categories, self._values.categories ) else: if isinstance(target, CategoricalIndex): @@ -568,7 +560,7 @@ def get_indexer_non_unique(self, target): target = target.codes indexer, missing = self._engine.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing - target = target.values + target = target._values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) @@ -687,7 +679,7 @@ def map(self, mapper): >>> idx.map({'a': 'first', 'b': 'second'}) Index(['first', 'second', nan], dtype='object') """ - return self._shallow_copy_with_infer(self.values.map(mapper)) + return self._shallow_copy_with_infer(self._values.map(mapper)) def delete(self, loc): """ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 894e1d95a17bc..25333b3a08dce 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -8,13 +8,14 @@ from pandas._libs import NaT, iNaT, join as libjoin, 
lib from pandas._libs.tslibs import timezones -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_int64, + ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_dtype_equal, @@ -22,7 +23,6 @@ is_list_like, is_period_dtype, is_scalar, - needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries @@ -31,9 +31,9 @@ from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -from pandas.core.base import _shared_docs +from pandas.core.base import IndexOpsMixin import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.core.indexes.extension import ( ExtensionIndex, inherit_names, @@ -43,7 +43,7 @@ from pandas.core.ops import get_op_result_name from pandas.core.tools.timedeltas import to_timedelta -from pandas.tseries.frequencies import DateOffset, to_offset +from pandas.tseries.frequencies import DateOffset _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -102,6 +102,12 @@ class DatetimeIndexOpsMixin(ExtensionIndex): def is_all_dates(self) -> bool: return True + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + raise AbstractMethodError(self) + # ------------------------------------------------------------------------ # Abstract data attributes @@ -173,7 +179,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 + # NB: using asi8 instead of _data matters in numpy 1.18 # because the treatment of NaT has been changed to put NaT last # instead of first. sorted_values = np.sort(self.asi8) @@ -206,7 +212,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): self, indices, axis, allow_fill, fill_value, **kwargs ) - @Appender(_shared_docs["searchsorted"]) + @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") def searchsorted(self, value, side="left", sorter=None): if isinstance(value, str): raise TypeError( @@ -427,6 +433,21 @@ def _partial_date_slice( # try to find the dates return (lhs_mask & rhs_mask).nonzero()[0] + @Appender(Index.get_indexer_non_unique.__doc__) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + + tgt_values = target.asi8 + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + # -------------------------------------------------------------------- __add__ = make_wrapped_arith_op("__add__") @@ -484,7 +505,7 @@ def where(self, cond, other=None): if is_categorical_dtype(other): # e.g. 
we have a Categorical holding self.dtype - if needs_i8_conversion(other.categories): + if is_dtype_equal(other.categories.dtype, self.dtype): other = other._internal_get_values() if not is_dtype_equal(self.dtype, other.dtype): @@ -602,41 +623,22 @@ def _set_freq(self, freq): freq : DateOffset, None, or "infer" """ # GH#29843 - if freq is None: - # Always valid - pass - elif len(self) == 0 and isinstance(freq, DateOffset): - # Always valid. In the TimedeltaIndex case, we assume this - # is a Tick offset. - pass - else: - # As an internal method, we can ensure this assertion always holds - assert freq == "infer" - freq = to_offset(self.inferred_freq) - - self._data._freq = freq + self._data._with_freq(freq) def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name + cache = self._cache.copy() if values is None else {} if values is None: values = self._data - if isinstance(values, type(self)): - values = values._data if isinstance(values, np.ndarray): # TODO: We would rather not get here values = type(self._data)(values, dtype=self.dtype) - attributes = self._get_attributes_dict() - - if self.freq is not None: - if isinstance(values, (DatetimeArray, TimedeltaArray)): - if values.freq is None: - del attributes["freq"] - - attributes["name"] = name - return type(self)._simple_new(values, **attributes) + result = type(self)._simple_new(values, name=name) + result._cache = cache + return result # -------------------------------------------------------------------- # Set Operation Methods @@ -778,7 +780,10 @@ def _fast_union(self, other, sort=None): loc = right.searchsorted(left_start, side="left") right_chunk = right.values[:loc] dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) + result = self._shallow_copy(dates) + result._set_freq("infer") + # TODO: can we infer that it has self.freq? + return result else: left, right = other, self @@ -790,7 +795,10 @@ def _fast_union(self, other, sort=None): loc = right.searchsorted(left_end, side="right") right_chunk = right.values[loc:] dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) + result = self._shallow_copy(dates) + result._set_freq("infer") + # TODO: can we infer that it has self.freq? 
+ return result else: return left diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c9fefd46e55c7..ca1995adc1ea9 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -7,17 +7,21 @@ from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts from pandas._libs.tslibs import fields, parsing, timezones +from pandas._typing import DtypeObj, Label from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.common import ( + _NS_DTYPE, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_scalar, +) from pandas.core.dtypes.missing import is_valid_nat_for_dtype -from pandas.core.arrays.datetimes import ( - DatetimeArray, - tz_to_dtype, - validate_tz_from_dtype, -) +from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype import pandas.core.common as com from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin @@ -36,7 +40,20 @@ def _new_DatetimeIndex(cls, d): if "data" in d and not isinstance(d["data"], DatetimeIndex): # Avoid need to verify integrity by calling simple_new directly data = d.pop("data") - result = cls._simple_new(data, **d) + if not isinstance(data, DatetimeArray): + # For backward compat with older pickles, we may need to construct + # a DatetimeArray to adapt to the newer _simple_new signature + tz = d.pop("tz") + freq = d.pop("freq") + dta = DatetimeArray._simple_new(data, dtype=tz_to_dtype(tz), freq=freq) + else: + dta = data + for key in ["tz", "freq"]: + # These are already stored in our DatetimeArray; if they are + # also in the pickle and don't match, we have a problem. + if key in d: + assert d.pop(key) == getattr(dta, key) + result = cls._simple_new(dta, **d) else: with warnings.catch_warnings(): # TODO: If we knew what was going in to **d, we might be able to @@ -78,21 +95,26 @@ def _new_DatetimeIndex(cls, d): ) class DatetimeIndex(DatetimeTimedeltaMixin): """ - Immutable ndarray of datetime64 data, represented internally as int64, and - which can be boxed to Timestamp objects that are subclasses of datetime and - carry metadata such as frequency information. + Immutable ndarray-like of datetime64 data. + + Represented internally as int64, and which can be boxed to Timestamp objects + that are subclasses of datetime and carry metadata. Parameters ---------- data : array-like (1-dimensional), optional Optional datetime-like data to construct index with. - copy : bool - Make a copy of input ndarray. freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation. - tz : pytz.timezone or dateutil.tz.tzfile + tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + Set the Timezone of the data. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + closed : {'left', 'right'}, optional + Set whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. 
For example in Central European Time (UTC+01), when going from 03:00 @@ -107,12 +129,16 @@ class DatetimeIndex(DatetimeTimedeltaMixin): times) - 'NaT' will return NaT where there are ambiguous times - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. - name : object - Name to be stored in the index. dayfirst : bool, default False If True, parse dates in `data` with the day first order. yearfirst : bool, default False If True parse dates in `data` with the year first order. + dtype : numpy.dtype or DatetimeTZDtype or str, default None + Note that the only NumPy dtype allowed is ‘datetime64[ns]’. + copy : bool, default False + Make a copy of input ndarray. + name : label, default None + Name to be stored in the index. Attributes ---------- @@ -235,41 +261,21 @@ def __new__( return subarr @classmethod - def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): - """ - We require the we have a dtype compat for the values - if we are passed a non-dtype compat, then coerce using the constructor - """ - if isinstance(values, DatetimeArray): - if tz: - tz = validate_tz_from_dtype(dtype, tz) - dtype = DatetimeTZDtype(tz=tz) - elif dtype is None: - dtype = _NS_DTYPE - - values = DatetimeArray(values, freq=freq, dtype=dtype) - tz = values.tz - freq = values.freq - values = values._data - - dtype = tz_to_dtype(tz) - dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype) - assert isinstance(dtarr, DatetimeArray) + def _simple_new(cls, values: DatetimeArray, name: Label = None): + assert isinstance(values, DatetimeArray), type(values) result = object.__new__(cls) - result._data = dtarr + result._data = values result.name = name + result._cache = {} result._no_setting_name = False # For groupby perf. See note in indexes/base about _index_data - result._index_data = dtarr._data + result._index_data = values._data result._reset_identity() return result # -------------------------------------------------------------------- - def __array__(self, dtype=None) -> np.ndarray: - return np.asarray(self._data, dtype=dtype) - @cache_readonly def _is_dates_only(self) -> bool: """ @@ -300,6 +306,18 @@ def _convert_for_op(self, value): return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? 
+ """ + if not is_datetime64_any_dtype(dtype): + return False + if self.tz is not None: + # If we have tz, we can compare to tzaware + return is_datetime64tz_dtype(dtype) + # if we dont have tz, we can only compare to tznaive + return is_datetime64_dtype(dtype) + # -------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index daccb35864e98..f38a4fb83c64f 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -6,7 +6,8 @@ import numpy as np from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, cache_readonly +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -224,11 +225,13 @@ def __iter__(self): # --------------------------------------------------------------------- - @property - def _ndarray_values(self) -> np.ndarray: - return self._data._ndarray_values + def __array__(self, dtype=None) -> np.ndarray: + return np.asarray(self._data, dtype=dtype) - @Appender(Index.dropna.__doc__) + def _get_engine_target(self) -> np.ndarray: + return self._data._values_for_argsort() + + @doc(Index.dropna) def dropna(self, how="any"): if how not in ("any", "all"): raise ValueError(f"invalid how option: {how}") @@ -242,11 +245,15 @@ def repeat(self, repeats, axis=None): result = self._data.repeat(repeats, axis=axis) return self._shallow_copy(result) + def insert(self, loc: int, item): + # ExtensionIndex subclasses must override Index.insert + raise AbstractMethodError(self) + def _concat_same_dtype(self, to_concat, name): arr = type(self._data)._concat_same_type(to_concat) return type(self)._simple_new(arr, name=name) - @Appender(Index.take.__doc__) + @doc(Index.take) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -276,7 +283,7 @@ def _get_unique_index(self, dropna=False): result = result[~result.isna()] return self._shallow_copy(result) - @Appender(Index.map.__doc__) + @doc(Index.map) def map(self, mapper, na_action=None): # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -293,7 +300,7 @@ def map(self, mapper, na_action=None): except Exception: return self.astype(object).map(mapper) - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype) and copy is False: # Ensure that self.astype(self.dtype) is self diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6968837fb13e6..f4942b72a6ad4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -39,6 +39,7 @@ from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com +from pandas.core.indexers import is_valid_positional_slice import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -242,6 +243,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): result = IntervalMixin.__new__(cls) result._data = array result.name = name + result._cache = {} result._no_setting_name = False result._reset_identity() return result @@ -331,12 +333,15 @@ def from_tuples( # -------------------------------------------------------------------- 
 
     @Appender(Index._shallow_copy.__doc__)
-    def _shallow_copy(self, values=None, **kwargs):
+    def _shallow_copy(self, values=None, name: Label = lib.no_default):
+        name = self.name if name is lib.no_default else name
+        cache = self._cache.copy() if values is None else {}
         if values is None:
             values = self._data
-        attributes = self._get_attributes_dict()
-        attributes.update(kwargs)
-        return self._simple_new(values, **attributes)
+
+        result = self._simple_new(values, name=name)
+        result._cache = cache
+        return result
 
     @cache_readonly
     def _isnan(self):
@@ -404,7 +409,7 @@ def __reduce__(self):
     @Appender(Index.astype.__doc__)
     def astype(self, dtype, copy=True):
         with rewrite_exception("IntervalArray", type(self).__name__):
-            new_values = self.values.astype(dtype, copy=copy)
+            new_values = self._values.astype(dtype, copy=copy)
         if is_interval_dtype(new_values):
             return self._shallow_copy(new_values)
         return Index.astype(self, dtype, copy=copy)
@@ -421,7 +426,7 @@ def memory_usage(self, deep: bool = False) -> int:
         return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep)
 
     # IntervalTree doesn't have a is_monotonic_decreasing, so have to override
-    # the Index implemenation
+    # the Index implementation
     @cache_readonly
     def is_monotonic_decreasing(self) -> bool:
         """
@@ -866,14 +871,23 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray:
 
     def _convert_slice_indexer(self, key: slice, kind: str):
         if not (key.step is None or key.step == 1):
-            raise ValueError("cannot support not-default step in a slice")
+            # GH#31658 if label-based, we require step == 1,
+            # if positional, we disallow float start/stop
+            msg = "label-based slicing with step!=1 is not supported for IntervalIndex"
+            if kind == "loc":
+                raise ValueError(msg)
+            elif kind == "getitem":
+                if not is_valid_positional_slice(key):
+                    # i.e. this cannot be interpreted as a positional slice
+                    raise ValueError(msg)
+
         return super()._convert_slice_indexer(key, kind)
 
     @Appender(Index.where.__doc__)
     def where(self, cond, other=None):
         if other is None:
             other = self._na_value
-        values = np.where(cond, self.values, other)
+        values = np.where(cond, self._values, other)
         result = IntervalArray(values)
         return self._shallow_copy(result)
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c1efa512f326a..1bcda72e77f2f 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -18,7 +18,7 @@
 
 from pandas._libs import algos as libalgos, index as libindex, lib
 from pandas._libs.hashtable import duplicated_int64
-from pandas._typing import AnyArrayLike, ArrayLike, Scalar
+from pandas._typing import AnyArrayLike, Scalar
 from pandas.compat.numpy import function as nv
 from pandas.errors import PerformanceWarning, UnsortedIndexError
 from pandas.util._decorators import Appender, cache_readonly
@@ -52,6 +52,7 @@
     ensure_index,
 )
 from pandas.core.indexes.frozen import FrozenList
+from pandas.core.indexes.numeric import Int64Index
 import pandas.core.missing as missing
 from pandas.core.sorting import (
     get_group_index,
@@ -275,6 +276,7 @@ def __new__(
             raise ValueError("Must pass non-zero number of levels/codes")
 
         result = object.__new__(MultiIndex)
+        result._cache = {}
 
         # we've already validated levels and codes, so shortcut here
         result._set_levels(levels, copy=copy, validate=False)
@@ -563,6 +565,7 @@ def from_product(cls, iterables, sortorder=None, names=lib.no_default):
         if names is lib.no_default:
             names = [getattr(it, "name", None) for it in iterables]
 
+        # codes are all ndarrays, so cartesian_product is lossless
         codes = cartesian_product(codes)
         return MultiIndex(levels, codes, sortorder=sortorder, names=names)
 
@@ -984,13 +987,41 @@ def _constructor(self):
         return MultiIndex.from_tuples
 
     @Appender(Index._shallow_copy.__doc__)
-    def _shallow_copy(self, values=None, **kwargs):
+    def _shallow_copy(
+        self,
+        values=None,
+        name=lib.no_default,
+        levels=None,
+        codes=None,
+        dtype=None,
+        sortorder=None,
+        names=lib.no_default,
+        _set_identity: bool = True,
+    ):
+        if names is not lib.no_default and name is not lib.no_default:
+            raise TypeError("Can only provide one of `names` and `name`")
+        elif names is lib.no_default:
+            names = name if name is not lib.no_default else self.names
+
         if values is not None:
-            names = kwargs.pop("names", kwargs.pop("name", self.names))
-            # discards freq
-            kwargs.pop("freq", None)
-            return MultiIndex.from_tuples(values, names=names, **kwargs)
-        return self.copy(**kwargs)
+            assert levels is None and codes is None and dtype is None
+            return MultiIndex.from_tuples(values, sortorder=sortorder, names=names)
+
+        levels = levels if levels is not None else self.levels
+        codes = codes if codes is not None else self.codes
+
+        result = MultiIndex(
+            levels=levels,
+            codes=codes,
+            dtype=dtype,
+            sortorder=sortorder,
+            names=names,
+            verify_integrity=False,
+            _set_identity=_set_identity,
+        )
+        result._cache = self._cache.copy()
+        result._cache.pop("levels", None)  # GH32669
+        return result
 
     def _shallow_copy_with_infer(self, values, **kwargs):
         # On equal MultiIndexes the difference is empty.
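The `_cache` handling added across these `_shallow_copy` implementations follows one pattern: cached properties survive a copy only when the underlying values are unchanged, and MultiIndex additionally drops its "levels" entry (GH 32669) because that entry holds objects tied to the parent. A minimal sketch of the pattern with a hypothetical class (not the pandas internals):

class CachedIndexSketch:
    # hypothetical stand-in for an Index subclass that memoizes into _cache
    def __init__(self, values, name=None):
        self._values = values
        self.name = name
        self._cache = {}  # filled lazily by cache_readonly-style properties

    def _shallow_copy(self, values=None, name=None):
        # unchanged values -> memoized results are still valid
        cache = self._cache.copy() if values is None else {}
        result = CachedIndexSketch(
            self._values if values is None else values,
            self.name if name is None else name,
        )
        result._cache = cache
        result._cache.pop("levels", None)  # entries tied to parent identity (GH 32669)
        return result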
@@ -1047,17 +1078,13 @@ def copy(
                 levels = deepcopy(self.levels)
             if codes is None:
                 codes = deepcopy(self.codes)
-        else:
-            if levels is None:
-                levels = self.levels
-            if codes is None:
-                codes = self.codes
-        return MultiIndex(
+
+        return self._shallow_copy(
             levels=levels,
             codes=codes,
             names=names,
+            dtype=dtype,
             sortorder=self.sortorder,
-            verify_integrity=False,
             _set_identity=_set_identity,
         )
 
@@ -1180,7 +1207,7 @@ def _format_native_types(self, na_rep="nan", **kwargs):
             sortorder=self.sortorder,
             verify_integrity=False,
         )
-        return mi.values
+        return mi._values
 
     def format(
         self,
@@ -1419,7 +1446,7 @@ def is_monotonic_increasing(self) -> bool:
         except TypeError:
 
             # we have mixed types and np.lexsort is not happy
-            return Index(self.values).is_monotonic
+            return Index(self._values).is_monotonic
 
     @cache_readonly
     def is_monotonic_decreasing(self) -> bool:
@@ -1612,7 +1639,7 @@ def to_flat_index(self):
                ('bar', 'baz'),
                ('bar', 'qux')],
               dtype='object')
        """
-        return Index(self.values, tupleize_cols=False)
+        return Index(self._values, tupleize_cols=False)
 
     @property
     def is_all_dates(self) -> bool:
@@ -1914,7 +1941,7 @@ def append(self, other):
                 arrays.append(label.append(appended))
             return MultiIndex.from_arrays(arrays, names=self.names)
 
-        to_concat = (self.values,) + tuple(k._values for k in other)
+        to_concat = (self._values,) + tuple(k._values for k in other)
         new_tuples = np.concatenate(to_concat)
 
         # if all(isinstance(x, MultiIndex) for x in other):
@@ -1924,7 +1951,7 @@ def append(self, other):
             return Index(new_tuples)
 
     def argsort(self, *args, **kwargs) -> np.ndarray:
-        return self.values.argsort(*args, **kwargs)
+        return self._values.argsort(*args, **kwargs)
 
     @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
     def repeat(self, repeats, axis=None):
@@ -2368,7 +2395,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             # let's instead try with a straight Index
             if method is None:
-                return Index(self.values).get_indexer(
+                return Index(self._values).get_indexer(
                     target, method=method, limit=limit, tolerance=tolerance
                 )
 
@@ -2831,7 +2858,8 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
                     mapper = Series(indexer)
                     indexer = codes.take(ensure_platform_int(indexer))
                     result = Series(Index(indexer).isin(r).nonzero()[0])
-                    m = result.map(mapper)._ndarray_values
+                    m = result.map(mapper)
+                    m = np.asarray(m)
 
                 else:
                     m = np.zeros(len(codes), dtype=bool)
@@ -2949,7 +2977,7 @@ def get_locs(self, seq):
         n = len(self)
         indexer = None
 
-        def _convert_to_indexer(r):
+        def _convert_to_indexer(r) -> Int64Index:
             # return an indexer
             if isinstance(r, slice):
                 m = np.zeros(n, dtype=bool)
@@ -3026,13 +3054,16 @@ def _update_indexer(idxr, indexer=indexer):
 
         if indexer is None:
             return np.array([], dtype=np.int64)
 
+        assert isinstance(indexer, Int64Index), type(indexer)
         indexer = self._reorder_indexer(seq, indexer)
-        return indexer._ndarray_values
+        return indexer._values
 
     def _reorder_indexer(
-        self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike
-    ) -> ArrayLike:
+        self,
+        seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...],
+        indexer: Int64Index,
+    ) -> Int64Index:
         """
         Reorder an indexer of a MultiIndex (self) so that the label are in the same
         order as given in seq
@@ -3136,11 +3167,10 @@ def equals(self, other) -> bool:
         if not isinstance(other, MultiIndex):
             # d-level MultiIndex can equal d-tuple Index
             if not is_object_dtype(other.dtype):
-                if self.nlevels != other.nlevels:
-                    return False
+                # other cannot contain tuples, so cannot match self
+                return False
 
-            other_vals = com.values_from_object(ensure_index(other))
-            return array_equivalent(self._ndarray_values, other_vals)
+            return array_equivalent(self._values, other._values)
 
         if self.nlevels != other.nlevels:
             return False
@@ -3231,9 +3261,13 @@ def union(self, other, sort=None):
 
         # TODO: Index.union returns other when `len(self)` is 0.
 
-        uniq_tuples = lib.fast_unique_multiple(
-            [self._ndarray_values, other._ndarray_values], sort=sort
-        )
+        if not is_object_dtype(other.dtype):
+            raise NotImplementedError(
+                "Can only union MultiIndex with MultiIndex or Index of tuples, "
+                "try mi.to_flat_index().union(other) instead."
+            )
+
+        uniq_tuples = lib.fast_unique_multiple([self._values, other._values], sort=sort)
 
         return MultiIndex.from_arrays(
             zip(*uniq_tuples), sortorder=0, names=result_names
@@ -3267,10 +3301,20 @@ def intersection(self, other, sort=False):
         if self.equals(other):
             return self
 
-        lvals = self._ndarray_values
-        rvals = other._ndarray_values
+        if not is_object_dtype(other.dtype):
+            # The intersection is empty
+            # TODO: we have no tests that get here
+            return MultiIndex(
+                levels=self.levels,
+                codes=[[]] * self.nlevels,
+                names=result_names,
+                verify_integrity=False,
+            )
+
+        lvals = self._values
+        rvals = other._values
 
-        uniq_tuples = None  # flag whether _inner_indexer was succesful
+        uniq_tuples = None  # flag whether _inner_indexer was successful
         if self.is_monotonic and other.is_monotonic:
             try:
                 uniq_tuples = self._inner_indexer(lvals, rvals)[0]
@@ -3342,7 +3386,7 @@ def difference(self, other, sort=None):
             indexer = indexer.take((indexer != -1).nonzero()[0])
 
         label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
-        difference = this.values.take(label_diff)
+        difference = this._values.take(label_diff)
         if sort is None:
             difference = sorted(difference)
 
@@ -3359,7 +3403,8 @@ def difference(self, other, sort=None):
     def _convert_can_do_setop(self, other):
         result_names = self.names
 
-        if not hasattr(other, "names"):
+        if not isinstance(other, Index):
+
             if len(other) == 0:
                 other = MultiIndex(
                     levels=[[]] * self.nlevels,
@@ -3456,8 +3501,8 @@ def _wrap_joined_index(self, joined, other):
     @Appender(Index.isin.__doc__)
     def isin(self, values, level=None):
         if level is None:
-            values = MultiIndex.from_tuples(values, names=self.names).values
-            return algos.isin(self.values, values)
+            values = MultiIndex.from_tuples(values, names=self.names)._values
+            return algos.isin(self._values, values)
         else:
             num = self._get_level_number(level)
             levs = self.get_level_values(num)
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 6c250ccd09a51..3a6f3630c19e7 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -104,15 +104,11 @@ def _maybe_cast_slice_bound(self, label, side, kind):
 
     @Appender(Index._shallow_copy.__doc__)
     def _shallow_copy(self, values=None, name: Label = lib.no_default):
-        name = name if name is not lib.no_default else self.name
-
         if values is not None and not self._can_hold_na and values.dtype.kind == "f":
+            name = self.name if name is lib.no_default else name
             # Ensure we are not returning an Int64Index with float data:
             return Float64Index._simple_new(values, name=name)
-
-        if values is None:
-            values = self.values
-        return type(self)._simple_new(values, name=name)
+        return super()._shallow_copy(values=values, name=name)
 
     def _convert_for_op(self, value):
         """
@@ -252,7 +248,7 @@ def inferred_type(self) -> str:
     @property
     def asi8(self) -> np.ndarray:
         # do not cache or you'll create a memory leak
-        return self.values.view(self._default_dtype)
+        return self._values.view(self._default_dtype)
 
 
 class Int64Index(IntegerIndex):
@@ -372,7 +368,7 @@ def astype(self, dtype, copy=True):
         elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype):
             # TODO(jreback); this can change once we have an EA Index type
             # GH 13149
-            arr = astype_nansafe(self.values, dtype=dtype)
+            arr = astype_nansafe(self._values, dtype=dtype)
             return Int64Index(arr)
         return super().astype(dtype, copy=copy)
 
@@ -399,7 +395,7 @@ def _format_native_types(
         from pandas.io.formats.format import FloatArrayFormatter
 
         formatter = FloatArrayFormatter(
-            self.values,
+            self._values,
             na_rep=na_rep,
             float_format=float_format,
             decimal=decimal,
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index ebf69c49c029a..f6bf02b6df676 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -9,7 +9,7 @@
 from pandas._libs.tslibs import frequencies as libfrequencies, resolution
 from pandas._libs.tslibs.parsing import parse_time_string
 from pandas._libs.tslibs.period import Period
-from pandas._typing import Label
+from pandas._typing import DtypeObj, Label
 from pandas.util._decorators import Appender, cache_readonly
 
 from pandas.core.dtypes.common import (
@@ -19,11 +19,11 @@
     is_dtype_equal,
     is_float,
     is_integer,
-    is_integer_dtype,
     is_object_dtype,
     is_scalar,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import PeriodDtype
 
 from pandas.core.arrays.period import (
     PeriodArray,
@@ -234,6 +234,7 @@ def _simple_new(cls, values: PeriodArray, name: Label = None):
         # For groupby perf. See note in indexes/base about _index_data
         result._index_data = values._data
         result.name = name
+        result._cache = {}
         result._reset_identity()
         return result
 
@@ -251,11 +252,13 @@ def _has_complex_internals(self):
 
     def _shallow_copy(self, values=None, name: Label = no_default):
         name = name if name is not no_default else self.name
-
+        cache = self._cache.copy() if values is None else {}
         if values is None:
             values = self._data
 
-        return self._simple_new(values, name=name)
+        result = self._simple_new(values, name=name)
+        result._cache = cache
+        return result
 
     def _maybe_convert_timedelta(self, other):
         """
@@ -296,6 +299,14 @@ def _maybe_convert_timedelta(self, other):
             # raise when input doesn't have freq
             raise raise_on_incompatible(self, None)
 
+    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
+        """
+        Can we compare values of the given dtype to our own?
+        """
+        if not isinstance(dtype, PeriodDtype):
+            return False
+        return dtype.freq == self.freq
+
     # ------------------------------------------------------------------------
     # Rendering Methods
 
@@ -338,12 +349,6 @@ def _int64index(self) -> Int64Index:
     # ------------------------------------------------------------------------
     # Index Methods
 
-    def __array__(self, dtype=None) -> np.ndarray:
-        if is_integer_dtype(dtype):
-            return self.asi8
-        else:
-            return self.astype(object).values
-
     def __array_wrap__(self, result, context=None):
         """
         Gets called after a ufunc. Needs additional handling as
@@ -377,27 +382,26 @@ def __array_wrap__(self, result, context=None):
         # cannot pass _simple_new as it is
         return type(self)(result, freq=self.freq, name=self.name)
 
-    def asof_locs(self, where, mask):
+    def asof_locs(self, where, mask: np.ndarray) -> np.ndarray:
         """
         where : array of timestamps
         mask : array of booleans where data is not NA
-
         """
         where_idx = where
         if isinstance(where_idx, DatetimeIndex):
             where_idx = PeriodIndex(where_idx.values, freq=self.freq)
+        elif not isinstance(where_idx, PeriodIndex):
+            raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex")
+        elif where_idx.freq != self.freq:
+            raise raise_on_incompatible(self, where_idx)
 
-        locs = self._ndarray_values[mask].searchsorted(
-            where_idx._ndarray_values, side="right"
-        )
+        locs = self.asi8[mask].searchsorted(where_idx.asi8, side="right")
 
         locs = np.where(locs > 0, locs - 1, 0)
         result = np.arange(len(self))[mask].take(locs)
 
         first = mask.argmax()
-        result[
-            (locs == 0) & (where_idx._ndarray_values < self._ndarray_values[first])
-        ] = -1
+        result[(locs == 0) & (where_idx.asi8 < self.asi8[first])] = -1
 
         return result
 
@@ -459,12 +463,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
     def get_indexer_non_unique(self, target):
         target = ensure_index(target)
 
-        if isinstance(target, PeriodIndex):
-            if target.freq != self.freq:
-                no_matches = -1 * np.ones(self.shape, dtype=np.intp)
-                return no_matches, no_matches
+        if not self._is_comparable_dtype(target.dtype):
+            no_matches = -1 * np.ones(self.shape, dtype=np.intp)
+            return no_matches, no_matches
 
-            target = target.asi8
+        target = target.asi8
 
         indexer, missing = self._int64index.get_indexer_non_unique(target)
         return ensure_platform_int(indexer), missing
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index f621a3c153adf..2c038564f4e6f 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -141,7 +141,7 @@ def _simple_new(cls, values: range, name: Label = None) -> "RangeIndex":
 
         result._range = values
         result.name = name
-
+        result._cache = {}
         result._reset_identity()
         return result
 
@@ -168,7 +168,7 @@ def _data(self):
         return self._cached_data
 
     @cache_readonly
-    def _int64index(self):
+    def _int64index(self) -> Int64Index:
         return Int64Index._simple_new(self._data, name=self.name)
 
     def _get_data_as_items(self):
@@ -391,7 +391,9 @@ def _shallow_copy(self, values=None, name: Label = no_default):
         name = self.name if name is no_default else name
 
         if values is None:
-            return self._simple_new(self._range, name=name)
+            result = self._simple_new(self._range, name=name)
+            result._cache = self._cache.copy()
+            return result
         else:
             return Int64Index._simple_new(values, name=name)
 
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index 5e4a8e83bd95b..588cb3e37bced 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -1,6 +1,7 @@
 """ implement the TimedeltaIndex """
 
 from pandas._libs import NaT, Timedelta, index as libindex
+from pandas._typing import DtypeObj, Label
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import (
@@ -154,7 +155,7 @@ def __new__(
         if isinstance(data, TimedeltaArray) and freq is None:
             if copy:
                 data = data.copy()
-            return cls._simple_new(data, name=name, freq=freq)
+            return cls._simple_new(data, name=name)
 
         if isinstance(data, TimedeltaIndex) and freq is None and name is None:
             if copy:
@@ -170,16 +171,13 @@ def __new__(
 
         return cls._simple_new(tdarr, name=name)
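With `_is_comparable_dtype` in place, `get_indexer_non_unique` can report "no matches" for an incompatible target instead of attempting a lossy comparison (note the hunks above size the no-match array from `self`, mirroring the code they replace). A small usage sketch of the observable behavior:

import pandas as pd

pi = pd.period_range("2020-01", periods=3, freq="M")
daily = pd.period_range("2020-01-01", periods=3, freq="D")

indexer, missing = pi.get_indexer_non_unique(daily)
print(indexer)  # all -1: a D-frequency target cannot match an M-frequency index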
 
     @classmethod
-    def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE):
-        # `dtype` is passed by _shallow_copy in corner cases, should always
-        #  be timedelta64[ns] if present
-        assert dtype == _TD_DTYPE, dtype
+    def _simple_new(cls, values: TimedeltaArray, name: Label = None):
         assert isinstance(values, TimedeltaArray)
-        assert freq is None or values.freq == freq
 
         result = object.__new__(cls)
         result._data = values
         result._name = name
+        result._cache = {}
         # For groupby perf. See note in indexes/base about _index_data
         result._index_data = values._data
 
@@ -215,6 +213,12 @@ def _maybe_promote(self, other):
             other = TimedeltaIndex(other)
         return self, other
 
+    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
+        """
+        Can we compare values of the given dtype to our own?
+        """
+        return is_timedelta64_dtype(dtype)
+
     def get_loc(self, key, method=None, tolerance=None):
         """
         Get integer location for requested label
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 9a671c7fc170a..8038bba8b6448 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -5,9 +5,10 @@
 from pandas._libs.indexing import _NDFrameIndexerBase
 from pandas._libs.lib import item_from_zerodim
 from pandas.errors import AbstractMethodError
-from pandas.util._decorators import Appender
+from pandas.util._decorators import doc
 
 from pandas.core.dtypes.common import (
+    is_hashable,
     is_integer,
     is_iterator,
     is_list_like,
@@ -581,6 +582,9 @@ def _get_setitem_indexer(self, key):
         """
         Convert a potentially-label-based key into a positional indexer.
         """
+        if self.name == "loc":
+            self._ensure_listlike_indexer(key)
+
         if self.axis is not None:
             return self._convert_tuple(key, is_setter=True)
 
@@ -611,6 +615,42 @@ def _get_setitem_indexer(self, key):
                 raise
             raise IndexingError(key) from e
 
+    def _ensure_listlike_indexer(self, key, axis=None):
+        """
+        Ensure that the column labels in a list-like key are all present,
+        adding any that do not already exist.
+
+        Parameters
+        ----------
+        key : _LocIndexer key or list-like of column labels
+            Target labels.
+        axis : key axis if known
+        """
+        column_axis = 1
+
+        # column only exists in 2-dimensional DataFrame
+        if self.ndim != 2:
+            return
+
+        if isinstance(key, tuple):
+            # key may be a tuple if key is a _LocIndexer key
+            # in that case, set key to the column part of key
+            key = key[column_axis]
+            axis = column_axis
+
+        if (
+            axis == column_axis
+            and not isinstance(self.obj.columns, ABCMultiIndex)
+            and is_list_like_indexer(key)
+            and not com.is_bool_indexer(key)
+            and all(is_hashable(k) for k in key)
+        ):
+            for k in key:
+                try:
+                    self.obj[k]
+                except KeyError:
+                    self.obj[k] = np.nan
+
     def __setitem__(self, key, value):
         if isinstance(key, tuple):
             key = tuple(com.apply_if_callable(x, self.obj) for x in key)
@@ -847,7 +887,7 @@ def _getbool_axis(self, key, axis: int):
         return self.obj._take_with_is_copy(inds, axis=axis)
 
 
-@Appender(IndexingMixin.loc.__doc__)
+@doc(IndexingMixin.loc)
 class _LocIndexer(_LocationIndexer):
     _takeable: bool = False
     _valid_types = (
@@ -859,7 +899,7 @@ class _LocIndexer(_LocationIndexer):
     # -------------------------------------------------------------------
     # Key Checks
 
-    @Appender(_LocationIndexer._validate_key.__doc__)
+    @doc(_LocationIndexer._validate_key)
     def _validate_key(self, key, axis: int):
 
         # valid for a collection of labels (we check their presence later)
@@ -1289,7 +1329,7 @@ def _validate_read_indexer(
             )
 
 
-@Appender(IndexingMixin.iloc.__doc__)
+@doc(IndexingMixin.iloc)
 class _iLocIndexer(_LocationIndexer):
     _valid_types = (
         "integer, integer slice (START point is INCLUDED, END "
@@ -1615,6 +1655,12 @@ def _setitem_with_indexer(self, indexer, value):
                     info_idx = [info_idx]
                 labels = item_labels[info_idx]
 
+            # Ensure we have something we can iterate over
+            ilocs = info_idx
+            if isinstance(info_idx, slice):
+                ri = Index(range(len(self.obj.columns)))
+                ilocs = ri[info_idx]
+
             plane_indexer = indexer[:1]
             lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index)
             # lplane_indexer gives the expected length of obj[indexer[0]]
@@ -1632,9 +1678,11 @@ def _setitem_with_indexer(self, indexer, value):
                         "length than the value"
                     )
 
-                def setter(item, v):
-                    ser = self.obj[item]
-                    pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer
+                pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer
+
+                def isetter(loc, v):
+                    # positional setting on column loc
+                    ser = self.obj._ixs(loc, axis=1)
 
                     # perform the equivalent of a setitem on the info axis
                     # as we have a null slice or a slice with full bounds
@@ -1654,7 +1702,7 @@ def setter(item, v):
                     ser._maybe_update_cacher(clear=True)
 
                     # reset the sliced object if unique
-                    self.obj[item] = ser
+                    self.obj._iset_item(loc, ser)
 
                 # we need an iterable, with a ndim of at least 1
                 # eg. don't pass through np.array(0)
@@ -1664,8 +1712,10 @@ def setter(item, v):
                     if isinstance(value, ABCDataFrame):
                         sub_indexer = list(indexer)
                         multiindex_indexer = isinstance(labels, ABCMultiIndex)
+                        # TODO: we are implicitly assuming value.columns is unique
 
-                        for item in labels:
+                        for loc in ilocs:
+                            item = item_labels[loc]
                             if item in value:
                                 sub_indexer[info_axis] = item
                                 v = self._align_series(
@@ -1674,7 +1724,7 @@ def setter(item, v):
                             else:
                                 v = np.nan
 
-                            setter(item, v)
+                            isetter(loc, v)
 
                 # we have an equal len ndarray/convertible to our labels
                 # hasattr first, to avoid coercing to ndarray without reason.
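`_ensure_listlike_indexer` runs before the positional machinery below, so a `.loc` assignment naming unseen column labels now adds them (filled with NaN) up front instead of raising KeyError. A small sketch of the intended behavior:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df.loc[:, ["a", "b"]] = [[10, 20], [30, 40]]
print(df)  # "b" was created first, then both columns were set positionally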
@@ -1685,16 +1735,15 @@ def setter(item, v):
                     # note that this coerces the dtype if we are mixed
                     # GH 7551
                     value = np.array(value, dtype=object)
-                    if len(labels) != value.shape[1]:
+                    if len(ilocs) != value.shape[1]:
                         raise ValueError(
                             "Must have equal len keys and value "
                             "when setting with an ndarray"
                         )
 
-                    for i, item in enumerate(labels):
-
+                    for i, loc in enumerate(ilocs):
                         # setting with a list, re-coerces
-                        setter(item, value[:, i].tolist())
+                        isetter(loc, value[:, i].tolist())
 
                 elif (
                     len(labels) == 1
@@ -1702,7 +1751,8 @@ def setter(item, v):
                     and not is_scalar(plane_indexer[0])
                 ):
                     # we have an equal len list/ndarray
-                    setter(labels[0], value)
+                    # We only get here with len(labels) == len(ilocs) == 1
+                    isetter(ilocs[0], value)
 
                 elif lplane_indexer == 0 and len(value) == len(self.obj.index):
                     # We get here in one case via .loc with a all-False mask
@@ -1710,19 +1760,19 @@ def setter(item, v):
 
                 else:
                     # per-label values
-                    if len(labels) != len(value):
+                    if len(ilocs) != len(value):
                         raise ValueError(
                             "Must have equal len keys and value "
                             "when setting with an iterable"
                         )
 
-                    for item, v in zip(labels, value):
-                        setter(item, v)
+                    for loc, v in zip(ilocs, value):
+                        isetter(loc, v)
 
             else:
 
-                # scalar
-                for item in labels:
-                    setter(item, value)
+                # scalar value
+                for loc in ilocs:
+                    isetter(loc, value)
 
         else:
             if isinstance(indexer, tuple):
@@ -1998,7 +2048,7 @@ def __setitem__(self, key, value):
         self.obj._set_value(*key, value=value, takeable=self._takeable)
 
 
-@Appender(IndexingMixin.at.__doc__)
+@doc(IndexingMixin.at)
 class _AtIndexer(_ScalarAccessIndexer):
     _takeable = False
 
@@ -2024,7 +2074,7 @@ def __getitem__(self, key):
             return obj.index._get_values_for_loc(obj, loc, key)
 
 
-@Appender(IndexingMixin.iat.__doc__)
+@doc(IndexingMixin.iat)
 class _iAtIndexer(_ScalarAccessIndexer):
     _takeable = True
 
diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py
index 37a3405554745..e70652b81c42f 100644
--- a/pandas/core/internals/__init__.py
+++ b/pandas/core/internals/__init__.py
@@ -17,7 +17,6 @@
 from pandas.core.internals.managers import (
     BlockManager,
     SingleBlockManager,
-    _transform_index,
     concatenate_block_managers,
     create_block_manager_from_arrays,
     create_block_manager_from_blocks,
@@ -40,7 +39,6 @@
     "_block_shape",
     "BlockManager",
     "SingleBlockManager",
-    "_transform_index",
     "concatenate_block_managers",
     "create_block_manager_from_arrays",
     "create_block_manager_from_blocks",
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 70fd3ecdc2098..935ff09585b17 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -11,7 +11,7 @@
 import pandas._libs.internals as libinternals
 from pandas._libs.tslibs import Timedelta, conversion
 from pandas._libs.tslibs.timezones import tz_compare
-from pandas._typing import DtypeObj
+from pandas._typing import ArrayLike
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
@@ -30,7 +30,6 @@
 from pandas.core.dtypes.common import (
     _NS_DTYPE,
     _TD_DTYPE,
-    ensure_platform_int,
     is_bool_dtype,
     is_categorical,
     is_categorical_dtype,
@@ -67,6 +66,7 @@
 )
 
 import pandas.core.algorithms as algos
+from pandas.core.array_algos.transforms import shift
 from pandas.core.arrays import (
     Categorical,
     DatetimeArray,
@@ -111,7 +111,6 @@ class Block(PandasObject):
     _can_consolidate = True
     _verify_integrity = True
     _validate_ndim = True
-    _ftype = "dense"
     _concatenator = staticmethod(np.concatenate)
 
     def __init__(self, values, placement, ndim=None):
@@ -231,14 +230,12 @@ def get_values(self, dtype=None):
             return self.values.astype(object)
         return self.values
 
-    def get_block_values(self, dtype=None):
+    def get_block_values_for_json(self) -> np.ndarray:
         """
-        This is used in the JSON C code
+        This is used in the JSON C code.
         """
-        return self.get_values(dtype=dtype)
-
-    def to_dense(self):
-        return self.values.view()
+        # TODO(2DEA): reshape will be unnecessary with 2D EAs
+        return np.asarray(self.values).reshape(self.shape)
 
     @property
     def fill_value(self):
@@ -255,14 +252,6 @@ def mgr_locs(self, new_mgr_locs):
 
         self._mgr_locs = new_mgr_locs
 
-    @property
-    def array_dtype(self) -> DtypeObj:
-        """
-        the dtype to return if I want to construct this block as an
-        array
-        """
-        return self.dtype
-
     def make_block(self, values, placement=None) -> "Block":
         """
         Create a new block, with type inference propagate any values that are
@@ -333,14 +322,6 @@ def shape(self):
     def dtype(self):
         return self.values.dtype
 
-    @property
-    def ftype(self) -> str:
-        if getattr(self.values, "_pandas_ftype", False):
-            dtype = self.dtype.subtype
-        else:
-            dtype = self.dtype
-        return f"{dtype}:{self._ftype}"
-
     def merge(self, other):
         return _merge_blocks([self, other])
 
@@ -360,11 +341,12 @@ def iget(self, i):
 
     def set(self, locs, values):
         """
-        Modify Block in-place with new item value
+        Modify block values in-place with new item value.
 
-        Returns
-        -------
-        None
+        Notes
+        -----
+        `set` never creates a new array or new Block, whereas `setitem`
+        _may_ create a new array and always creates a new Block.
         """
         self.values[locs] = values
 
@@ -402,7 +384,9 @@ def _split_op_result(self, result) -> List["Block"]:
 
         return [result]
 
-    def fillna(self, value, limit=None, inplace: bool = False, downcast=None):
+    def fillna(
+        self, value, limit=None, inplace: bool = False, downcast=None
+    ) -> List["Block"]:
         """
         fillna on the block with the value. If we fail, then convert to
         ObjectBlock and try again
@@ -416,9 +400,9 @@ def fillna(self, value, limit=None, inplace: bool = False, downcast=None):
 
         if not self._can_hold_na:
             if inplace:
-                return self
+                return [self]
             else:
-                return self.copy()
+                return [self.copy()]
 
         if self._can_hold_element(value):
             # equivalent: _try_coerce_args(value) would not raise
@@ -427,7 +411,7 @@ def fillna(self, value, limit=None, inplace: bool = False, downcast=None):
 
         # we can't process the value, but nothing to do
         if not mask.any():
-            return self if inplace else self.copy()
+            return [self] if inplace else [self.copy()]
 
         # operate column-by-column
         def f(mask, val, idx):
@@ -441,7 +425,7 @@ def f(mask, val, idx):
 
         return self.split_and_operate(None, f, inplace)
 
-    def split_and_operate(self, mask, f, inplace: bool):
+    def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
         """
         split the block per-column, and apply the callable f
         per-column, return a new block for each. Handle
@@ -611,7 +595,9 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
 
             # astype formatting
             else:
-                values = self.get_values()
+                # Because we have neither is_extension nor is_datelike,
+                #  self.values already has the correct shape
+                values = self.values
 
         else:
             values = self.get_values(dtype=dtype)
@@ -669,7 +655,7 @@ def _can_hold_element(self, element: Any) -> bool:
 
     def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
         """ convert to our native types format, slicing if desired """
-        values = self.get_values()
+        values = self.values
 
         if slicer is not None:
             values = values[:, slicer]
@@ -809,7 +795,7 @@ def _replace_single(self, *args, **kwargs):
 
     def setitem(self, indexer, value):
         """
-        Set the value inplace, returning a a maybe different typed block.
+        Attempt self.values[indexer] = value, possibly creating a new array.
 
         Parameters
         ----------
@@ -890,7 +876,7 @@ def setitem(self, indexer, value):
             # GH#8669 empty indexers
             pass
 
-        elif is_scalar_indexer(indexer, arr_value):
+        elif is_scalar_indexer(indexer, self.ndim):
             # setting a single element for each dim and with a rhs that could
             # be e.g. a list; see GH#6043
             values[indexer] = value
@@ -908,12 +894,10 @@ def setitem(self, indexer, value):
         # if we are an exact match (ex-broadcasting),
         # then use the resultant dtype
         elif exact_match:
+            # We are setting _all_ of the array's values, so can cast to new dtype
             values[indexer] = value
 
-            try:
-                values = values.astype(arr_value.dtype)
-            except ValueError:
-                pass
+            values = values.astype(arr_value.dtype, copy=False)
 
         # set
         else:
@@ -925,33 +909,27 @@ def setitem(self, indexer, value):
         return block
 
     def putmask(
-        self,
-        mask,
-        new,
-        align: bool = True,
-        inplace: bool = False,
-        axis: int = 0,
-        transpose: bool = False,
-    ):
+        self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False,
+    ) -> List["Block"]:
         """
         putmask the data to the block; it is possible that we may create a
         new dtype of block
 
-        return the resulting block(s)
+        Return the resulting block(s).
 
         Parameters
         ----------
-        mask  : the condition to respect
+        mask : the condition to respect
         new : a ndarray/object
-        align : boolean, perform alignment on other/cond, default is True
-        inplace : perform inplace modification, default is False
+        inplace : bool, default False
+            Perform inplace modification.
         axis : int
-        transpose : boolean
-            Set to True if self is stored with axes reversed
+        transpose : bool, default False
+            Set to True if self is stored with axes reversed.
 
         Returns
         -------
-        a list of new blocks, the result of the putmask
+        List[Block]
         """
         new_values = self.values if inplace else self.values.copy()
@@ -1190,7 +1168,7 @@ def _interpolate_with_fill(
         fill_value=None,
         coerce=False,
         downcast=None,
-    ):
+    ) -> List["Block"]:
         """ fillna but using the interpolate machinery """
         inplace = validate_bool_kwarg(inplace, "inplace")
 
@@ -1232,7 +1210,7 @@ def _interpolate(
         inplace=False,
         downcast=None,
         **kwargs,
-    ):
+    ) -> List["Block"]:
         """ interpolate using scipy wrappers """
         inplace = validate_bool_kwarg(inplace, "inplace")
         data = self.values if inplace else self.values.copy()
@@ -1240,7 +1218,7 @@ def _interpolate(
         # only deal with floats
         if not self.is_float:
             if not self.is_integer:
-                return self
+                return [self]
             data = data.astype(np.float64)
 
         if fill_value is None:
@@ -1323,36 +1301,12 @@ def shift(self, periods, axis: int = 0, fill_value=None):
         # that, handle boolean etc also
         new_values, fill_value = maybe_upcast(self.values, fill_value)
 
-        # make sure array sent to np.roll is c_contiguous
-        f_ordered = new_values.flags.f_contiguous
-        if f_ordered:
-            new_values = new_values.T
-            axis = new_values.ndim - axis - 1
-
-        if np.prod(new_values.shape):
-            new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis)
-
-        axis_indexer = [slice(None)] * self.ndim
-        if periods > 0:
-            axis_indexer[axis] = slice(None, periods)
-        else:
-            axis_indexer[axis] = slice(periods, None)
-        new_values[tuple(axis_indexer)] = fill_value
-
-        # restore original order
-        if f_ordered:
-            new_values = new_values.T
+        new_values = shift(new_values, periods, axis, fill_value)
 
         return [self.make_block(new_values)]
 
     def where(
-        self,
-        other,
-        cond,
-        align: bool = True,
-        errors="raise",
-        try_cast: bool = False,
-        axis: int = 0,
+        self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0,
     ) -> List["Block"]:
         """
         evaluate the block; return result block(s) from the result
@@ -1361,8 +1315,6 @@ def where(
         ----------
         other : a ndarray/object
         cond : the condition to respect
-        align : bool, default True
-            Perform alignment on other/cond.
         errors : str, {'raise', 'ignore'}, default 'raise'
             - ``raise`` : allow exceptions to be raised
             - ``ignore`` : suppress exceptions. On error return original object
@@ -1428,12 +1380,7 @@ def where_func(cond, values, other):
             # we are explicitly ignoring errors
             block = self.coerce_to_target_dtype(other)
             blocks = block.where(
-                orig_other,
-                cond,
-                align=align,
-                errors=errors,
-                try_cast=try_cast,
-                axis=axis,
+                orig_other, cond, errors=errors, try_cast=try_cast, axis=axis,
             )
             return self._maybe_downcast(blocks, "infer")
@@ -1602,12 +1549,22 @@ def _replace_coerce(
         return self
 
 
-class NonConsolidatableMixIn:
-    """ hold methods for the nonconsolidatable blocks """
+class ExtensionBlock(Block):
+    """
+    Block for holding extension types.
+
+    Notes
+    -----
+    This holds all 3rd-party extension array types. It's also the immediate
+    parent class for our internal extension types' blocks, CategoricalBlock.
+
+    ExtensionArrays are limited to 1-D.
+    """
 
     _can_consolidate = False
     _verify_integrity = False
     _validate_ndim = False
+    is_extension = True
 
     def __init__(self, values, placement, ndim=None):
         """
@@ -1618,6 +1575,8 @@ def __init__(self, values, placement, ndim=None):
         This will call continue to call __init__ for the other base classes
         mixed in with this Mixin.
         """
+        values = self._maybe_coerce_values(values)
+
         # Placement must be converted to BlockPlacement so that we can check
         # its length
         if not isinstance(placement, libinternals.BlockPlacement):
@@ -1631,6 +1590,10 @@ def __init__(self, values, placement, ndim=None):
             ndim = 2
         super().__init__(values, placement, ndim=ndim)
 
+        if self.ndim == 2 and len(self.mgr_locs) != 1:
+            # TODO(2DEA): check unnecessary with 2D EAs
+            raise AssertionError("block.size != values.size")
+
     @property
     def shape(self):
         if self.ndim == 1:
@@ -1653,32 +1616,21 @@ def iget(self, col):
             raise IndexError(f"{self} only contains one item")
         return self.values
 
-    def should_store(self, value):
+    def should_store(self, value: ArrayLike) -> bool:
+        """
+        Can we set the given array-like value inplace?
+        """
         return isinstance(value, self._holder)
 
-    def set(self, locs, values, check=False):
+    def set(self, locs, values):
         assert locs.tolist() == [0]
-        self.values = values
+        self.values[:] = values
 
     def putmask(
-        self, mask, new, align=True, inplace=False, axis=0, transpose=False,
-    ):
+        self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False,
+    ) -> List["Block"]:
         """
-        putmask the data to the block; we must be a single block and not
-        generate other blocks
-
-        return the resulting block
-
-        Parameters
-        ----------
-        mask  : the condition to respect
-        new : a ndarray/object
-        align : boolean, perform alignment on other/cond, default is True
-        inplace : perform inplace modification, default is False
-
-        Returns
-        -------
-        a new block, the result of the putmask
+        See Block.putmask.__doc__
         """
         inplace = validate_bool_kwarg(inplace, "inplace")
 
@@ -1726,25 +1678,6 @@ def _get_unstack_items(self, unstacker, new_columns):
             mask = mask.any(0)
         return new_placement, new_values, mask
 
-
-class ExtensionBlock(NonConsolidatableMixIn, Block):
-    """
-    Block for holding extension types.
-
-    Notes
-    -----
-    This holds all 3rd-party extension array types. It's also the immediate
-    parent class for our internal extension types' blocks, CategoricalBlock.
-
-    ExtensionArrays are limited to 1-D.
-    """
-
-    is_extension = True
-
-    def __init__(self, values, placement, ndim=None):
-        values = self._maybe_coerce_values(values)
-        super().__init__(values, placement, ndim)
-
     def _maybe_coerce_values(self, values):
         """
         Unbox to an extension array.
@@ -1788,7 +1721,7 @@ def is_numeric(self):
 
     def setitem(self, indexer, value):
         """
-        Set the value inplace, returning a same-typed block.
+        Attempt self.values[indexer] = value, possibly creating a new array.
 
         This differs from Block.setitem by not allowing setitem to change
         the dtype of the Block.
@@ -1827,9 +1760,6 @@ def get_values(self, dtype=None):
     def array_values(self) -> ExtensionArray:
         return self.values
 
-    def to_dense(self):
-        return np.asarray(self.values)
-
     def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
         """override to use ExtensionArray astype for the conversion"""
         values = self.values
@@ -1916,10 +1846,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]:
         return super().diff(n, axis)
 
     def shift(
-        self,
-        periods: int,
-        axis: libinternals.BlockPlacement = 0,
-        fill_value: Any = None,
+        self, periods: int, axis: int = 0, fill_value: Any = None,
     ) -> List["ExtensionBlock"]:
         """
         Shift the block by `periods`.
@@ -1936,13 +1863,7 @@ def shift(
         ]
 
     def where(
-        self,
-        other,
-        cond,
-        align=True,
-        errors="raise",
-        try_cast: bool = False,
-        axis: int = 0,
+        self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0,
     ) -> List["Block"]:
         if isinstance(other, ABCDataFrame):
             # ExtensionArrays are 1-D, so if we get here then
@@ -1990,10 +1911,6 @@ def where(
 
         return [self.make_block_same_class(result, placement=self.mgr_locs)]
 
-    @property
-    def _ftype(self):
-        return getattr(self.values, "_pandas_ftype", Block._ftype)
-
     def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
         # ExtensionArray-safe unstack.
         # We override ObjectBlock._unstack, which unstacks directly on the
@@ -2104,7 +2021,7 @@ def to_native_types(
         )
         return formatter.get_result_as_array()
 
-    def should_store(self, value) -> bool:
+    def should_store(self, value: ArrayLike) -> bool:
         # when inserting a column should not coerce integers to floats
         # unnecessarily
         return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype
@@ -2122,7 +2039,7 @@ def _can_hold_element(self, element: Any) -> bool:
             element, (float, int, complex, np.float_, np.int_)
         ) and not isinstance(element, (bool, np.bool_))
 
-    def should_store(self, value) -> bool:
+    def should_store(self, value: ArrayLike) -> bool:
         return issubclass(value.dtype.type, np.complexfloating)
 
 
@@ -2141,7 +2058,7 @@ def _can_hold_element(self, element: Any) -> bool:
             )
         return is_integer(element)
 
-    def should_store(self, value) -> bool:
+    def should_store(self, value: ArrayLike) -> bool:
        return is_integer_dtype(value) and value.dtype == self.dtype
 
 
@@ -2152,6 +2069,9 @@ class DatetimeLikeBlockMixin:
     def _holder(self):
         return DatetimeArray
 
+    def should_store(self, value):
+        return is_dtype_equal(self.dtype, value.dtype)
+
     @property
     def fill_value(self):
         return np.datetime64("NaT", "ns")
@@ -2172,7 +2092,7 @@ def internal_values(self):
 
     def iget(self, key):
         # GH#31649 we need to wrap scalars in Timestamp/Timedelta
-        # TODO: this can be removed if we ever have 2D EA
+        # TODO(EA2D): this can be removed if we ever have 2D EA
         result = super().iget(key)
         if isinstance(result, np.datetime64):
             result = Timestamp(result)
@@ -2180,6 +2100,12 @@ def iget(self, key):
             result = Timedelta(result)
         return result
 
+    def shift(self, periods, axis=0, fill_value=None):
+        # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
+        values = self.array_values()
+        new_values = values.shift(periods, fill_value=fill_value, axis=axis)
+        return self.make_block_same_class(new_values)
+
 
 class DatetimeBlock(DatetimeLikeBlockMixin, Block):
     __slots__ = ()
@@ -2228,6 +2154,9 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
         # if we are passed a datetime64[ns, tz]
         if is_datetime64tz_dtype(dtype):
             values = self.values
+            if copy:
+                # this should be the only copy
+                values = values.copy()
             if getattr(values, "tz", None) is None:
                 values = DatetimeArray(values).tz_localize("UTC")
             values = values.tz_convert(dtype.tz)
@@ -2279,20 +2208,9 @@ def to_native_types(
         ).reshape(i8values.shape)
         return np.atleast_2d(result)
 
-    def should_store(self, value) -> bool:
-        return (
-            issubclass(value.dtype.type, np.datetime64)
-            and not is_datetime64tz_dtype(value)
-            and not is_extension_array_dtype(value)
-        )
-
     def set(self, locs, values):
         """
-        Modify Block in-place with new item value
-
-        Returns
-        -------
-        None
+        See Block.set.__doc__
         """
         values = conversion.ensure_datetime64ns(values, copy=False)
 
@@ -2316,6 +2234,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
     _can_hold_element = DatetimeBlock._can_hold_element
     to_native_types = DatetimeBlock.to_native_types
     fill_value = np.datetime64("NaT", "ns")
+    should_store = DatetimeBlock.should_store
 
     @property
     def _holder(self):
@@ -2383,12 +2302,6 @@ def get_values(self, dtype=None):
             values = values.reshape(1, -1)
 
         return values
 
-    def to_dense(self):
-        # we request M8[ns] dtype here, even though it discards tzinfo,
-        # as lots of code (e.g. anything using values_from_object)
-        # expects that behavior.
-        return np.asarray(self.values, dtype=_NS_DTYPE)
-
     def _slice(self, slicer):
         """ return a slice of my values """
         if isinstance(slicer, tuple):
@@ -2531,11 +2444,6 @@ def fillna(self, value, **kwargs):
             )
         return super().fillna(value, **kwargs)
 
-    def should_store(self, value) -> bool:
-        return issubclass(
-            value.dtype.type, np.timedelta64
-        ) and not is_extension_array_dtype(value)
-
     def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
         """ convert to our native types format, slicing if desired """
         values = self.values
@@ -2577,7 +2485,7 @@ def _can_hold_element(self, element: Any) -> bool:
             return issubclass(tipo.type, np.bool_)
         return isinstance(element, (bool, np.bool_))
 
-    def should_store(self, value) -> bool:
+    def should_store(self, value: ArrayLike) -> bool:
         return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype(
             value
         )
@@ -2669,7 +2577,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]
     def _can_hold_element(self, element: Any) -> bool:
         return True
 
-    def should_store(self, value) -> bool:
+    def should_store(self, value: ArrayLike) -> bool:
         return not (
             issubclass(
                 value.dtype.type,
@@ -2918,19 +2826,8 @@ def __init__(self, values, placement, ndim=None):
     def _holder(self):
         return Categorical
 
-    @property
-    def array_dtype(self):
-        """
-        the dtype to return if I want to construct this block as an
-        array
-        """
-        return np.object_
-
-    def to_dense(self):
-        # Categorical.get_values returns a DatetimeIndex for datetime
-        # categories, so we can't simply use `np.asarray(self.values)` like
-        # other types.
-        return self.values._internal_get_values()
+    def should_store(self, arr: ArrayLike):
+        return isinstance(arr, self._holder) and is_dtype_equal(self.dtype, arr.dtype)
 
     def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs):
         """ convert to our native types format, slicing if desired """
@@ -3144,14 +3041,15 @@ def _safe_reshape(arr, new_shape):
     return arr
 
 
-def _putmask_smart(v, mask, n):
+def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray:
     """
     Return a new ndarray, try to preserve dtype if possible.
 
     Parameters
    ----------
-    v : `values`, updated in-place (array like)
-    mask : np.ndarray
+    v : np.ndarray
+        `values`, updated in-place.
+    mask : np.ndarray[bool]
         Applies to both sides (array like).
     n : `new values` either scalar or an array like aligned with `values`
 
@@ -3218,9 +3116,6 @@ def _putmask_preserve(nv, n):
 
     # change the dtype if needed
     dtype, _ = maybe_promote(n.dtype)
 
-    if is_extension_array_dtype(v.dtype) and is_object_dtype(dtype):
-        v = v._internal_get_values(dtype)
-    else:
-        v = v.astype(dtype)
+    v = v.astype(dtype)
 
     return _putmask_preserve(v, n)
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 7570f6eddbd9c..6839d138fbf73 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -217,7 +217,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
             else:
                 # No dtype upcasting is done here, it will be performed during
                 # concatenation itself.
-                values = self.block.get_values()
+                values = self.block.values
 
         if not self.indexers:
             # If there's no indexing to be done, we want to signal outside
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 57ed2555761be..3e0fb8455884a 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -36,7 +36,7 @@
 
 from pandas.core import algorithms, common as com
 from pandas.core.arrays import Categorical
-from pandas.core.construction import sanitize_array
+from pandas.core.construction import extract_array, sanitize_array
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import (
     Index,
@@ -53,23 +53,26 @@
 # BlockManager Interface
 
 
-def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
+def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, verify_integrity=True):
     """
     Segregate Series based on type and coerce into matrices.
 
     Needs to handle a lot of exceptional cases.
     """
-    # figure out the index, if necessary
-    if index is None:
-        index = extract_index(arrays)
-    else:
-        index = ensure_index(index)
+    if verify_integrity:
+        # figure out the index, if necessary
+        if index is None:
+            index = extract_index(arrays)
+        else:
+            index = ensure_index(index)
+
+        # don't force copy because getting jammed in an ndarray anyway
+        arrays = _homogenize(arrays, index, dtype)
 
-    # don't force copy because getting jammed in an ndarray anyway
-    arrays = _homogenize(arrays, index, dtype)
+    columns = ensure_index(columns)
 
     # from BlockManager perspective
-    axes = [ensure_index(columns), index]
+    axes = [columns, index]
 
     return create_block_manager_from_arrays(arrays, arr_names, axes)
 
@@ -429,6 +432,33 @@ def _get_axes(N, K, index, columns):
     return index, columns
 
 
+def dataclasses_to_dicts(data):
+    """
+    Converts a list of dataclass instances to a list of dictionaries.
+
+    Parameters
+    ----------
+    data : List[Type[dataclass]]
+
+    Returns
+    -------
+    list_dict : List[dict]
+
+    Examples
+    --------
+    >>> @dataclass
+    ... class Point:
+    ...     x: int
+    ...     y: int
+
+    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
+    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]
+
+    """
+    from dataclasses import asdict
+
+    return list(map(asdict, data))
+
+
 # ---------------------------------------------------------------------
 # Conversion of Inputs to Arrays
 
@@ -519,7 +549,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
         else:
             indexer = indexer_cache[id(index)] = index.get_indexer(columns)
 
-        values = com.values_from_object(s)
+        values = extract_array(s, extract_numpy=True)
         aligned_values.append(algorithms.take_1d(values, indexer))
 
     values = np.vstack(aligned_values)
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 7320a6407b248..b245ac09029a2 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -3,12 +3,12 @@
 import itertools
 import operator
 import re
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, TypeVar, Union
 
 import numpy as np
 
 from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib
-from pandas._typing import DtypeObj, Label
+from pandas._typing import ArrayLike, DtypeObj, Label
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
@@ -24,17 +24,17 @@
     is_list_like,
     is_numeric_v_string_like,
     is_scalar,
-    is_sparse,
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 from pandas.core.dtypes.missing import isna
 
 import pandas.core.algorithms as algos
+from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.base import PandasObject
 from pandas.core.indexers import maybe_convert_indices
-from pandas.core.indexes.api import Index, MultiIndex, ensure_index
+from pandas.core.indexes.api import Index, ensure_index
 from pandas.core.internals.blocks import (
     Block,
     CategoricalBlock,
@@ -58,6 +58,8 @@
 
 # TODO: flexible with index=None and/or items=None
 
+T = TypeVar("T", bound="BlockManager")
+
 
 class BlockManager(PandasObject):
     """
@@ -122,6 +124,9 @@ class BlockManager(PandasObject):
         "_blklocs",
     ]
 
+    _blknos: np.ndarray
+    _blklocs: np.ndarray
+
     def __init__(
         self,
         blocks: Sequence[Block],
@@ -146,6 +151,13 @@ def __init__(
             self._blknos = None
             self._blklocs = None
 
+    @classmethod
+    def from_blocks(cls, blocks: List[Block], axes: List[Index]):
+        """
+        Constructor for BlockManager and SingleBlockManager with same signature.
+ """ + return cls(blocks, axes, do_integrity_check=False) + @property def blknos(self): """ @@ -173,7 +185,7 @@ def blklocs(self): return self._blklocs - def make_empty(self, axes=None) -> "BlockManager": + def make_empty(self: T, axes=None) -> T: """ return an empty BlockManager with the items axis of len 0 """ if axes is None: axes = [Index([])] + self.axes[1:] @@ -181,10 +193,13 @@ def make_empty(self, axes=None) -> "BlockManager": # preserve dtype if possible if self.ndim == 1: assert isinstance(self, SingleBlockManager) # for mypy - blocks = np.array([], dtype=self.array_dtype) + blk = self.blocks[0] + arr = blk.values[:0] + nb = blk.make_block_same_class(arr, placement=slice(0, 0), ndim=1) + blocks = [nb] else: blocks = [] - return type(self)(blocks, axes) + return type(self).from_blocks(blocks, axes) def __nonzero__(self) -> bool: return True @@ -213,23 +228,6 @@ def set_axis(self, axis: int, new_labels: Index) -> None: self.axes[axis] = new_labels - def rename_axis( - self, mapper, axis: int, copy: bool = True, level=None - ) -> "BlockManager": - """ - Rename one of axes. - - Parameters - ---------- - mapper : unary callable - axis : int - copy : bool, default True - level : int or None, default None - """ - obj = self.copy(deep=copy) - obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) - return obj - @property def _is_single_block(self) -> bool: if self.ndim == 1: @@ -377,7 +375,7 @@ def reduce(self, func, *args, **kwargs): return res - def apply(self, f, filter=None, **kwargs) -> "BlockManager": + def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: """ Iterate over the blocks, collect and create a new BlockManager. @@ -392,7 +390,9 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager": ------- BlockManager """ + align_keys = align_keys or [] result_blocks = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned # filter kwarg is used in replace-* family of methods if filter is not None: @@ -405,33 +405,14 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager": self._consolidate_inplace() + align_copy = False if f == "where": align_copy = True - if kwargs.get("align", True): - align_keys = ["other", "cond"] - else: - align_keys = ["cond"] - elif f == "putmask": - align_copy = False - if kwargs.get("align", True): - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - elif f == "fillna": - # fillna internally does putmask, maybe it's better to do this - # at mgr, not block level? - align_copy = False - align_keys = ["value"] - else: - align_keys = [] - # TODO(EA): may interfere with ExtensionBlock.setitem for blocks - # with a .values attribute. 
aligned_args = { k: kwargs[k] for k in align_keys - if not isinstance(kwargs[k], ABCExtensionArray) - and hasattr(kwargs[k], "values") + if isinstance(kwargs[k], (ABCSeries, ABCDataFrame)) } for b in self.blocks: @@ -455,8 +436,8 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager": if len(result_blocks) == 0: return self.make_empty(self.axes) - bm = type(self)(result_blocks, self.axes, do_integrity_check=False) - return bm + + return type(self).from_blocks(result_blocks, self.axes) def quantile( self, @@ -567,16 +548,38 @@ def isna(self, func) -> "BlockManager": return self.apply("apply", func=func) def where(self, **kwargs) -> "BlockManager": - return self.apply("where", **kwargs) + if kwargs.pop("align", True): + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] - def setitem(self, **kwargs) -> "BlockManager": - return self.apply("setitem", **kwargs) + return self.apply("where", align_keys=align_keys, **kwargs) - def putmask(self, **kwargs): - return self.apply("putmask", **kwargs) + def setitem(self, indexer, value) -> "BlockManager": + return self.apply("setitem", indexer=indexer, value=value) - def diff(self, **kwargs) -> "BlockManager": - return self.apply("diff", **kwargs) + def putmask( + self, mask, new, align: bool = True, axis: int = 0, + ): + transpose = self.ndim == 2 + + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + + return self.apply( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + inplace=True, + axis=axis, + transpose=transpose, + ) + + def diff(self, n: int, axis: int) -> "BlockManager": + return self.apply("diff", n=n, axis=axis) def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) @@ -587,16 +590,30 @@ def shift(self, **kwargs) -> "BlockManager": def fillna(self, **kwargs) -> "BlockManager": return self.apply("fillna", **kwargs) - def downcast(self, **kwargs) -> "BlockManager": - return self.apply("downcast", **kwargs) + def downcast(self) -> "BlockManager": + return self.apply("downcast") def astype( self, dtype, copy: bool = False, errors: str = "raise" ) -> "BlockManager": return self.apply("astype", dtype=dtype, copy=copy, errors=errors) - def convert(self, **kwargs) -> "BlockManager": - return self.apply("convert", **kwargs) + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ) -> "BlockManager": + return self.apply( + "convert", + copy=copy, + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + ) def replace(self, value, **kwargs) -> "BlockManager": assert np.ndim(value) == 0, value @@ -655,7 +672,7 @@ def comp(s, regex=False): rb = new_rb result_blocks.extend(rb) - bm = type(self)(result_blocks, self.axes) + bm = type(self).from_blocks(result_blocks, self.axes) bm._consolidate_inplace() return bm @@ -668,8 +685,8 @@ def is_consolidated(self) -> bool: return self._is_consolidated def _consolidate_check(self) -> None: - ftypes = [blk.ftype for blk in self.blocks] - self._is_consolidated = len(ftypes) == len(set(ftypes)) + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True @property @@ -744,7 +761,7 @@ def combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": axes = list(self.axes) axes[0] = self.items.take(indexer) - return type(self)(new_blocks, axes, do_integrity_check=False) + return 
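The switch to `lib.is_integer(loc)` above matters because NumPy integer scalars are not Python ints, so the previous `isinstance(loc, int)` check missed them; roughly:

```python
import numpy as np

loc = np.int64(3)
print(isinstance(loc, int))                # False: np.int64 is not a Python int
print(isinstance(loc, (int, np.integer)))  # True: approximates lib.is_integer
```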
type(self).from_blocks(new_blocks, axes) def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": @@ -771,7 +788,7 @@ def __contains__(self, item) -> bool: def nblocks(self) -> int: return len(self.blocks) - def copy(self, deep=True) -> "BlockManager": + def copy(self: T, deep=True) -> T: """ Make deep or shallow copy of BlockManager @@ -817,17 +834,15 @@ def as_array(self, transpose: bool = False) -> np.ndarray: arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr - mgr = self - - if self._is_single_block and mgr.blocks[0].is_datetimetz: + if self._is_single_block and self.blocks[0].is_datetimetz: # TODO(Block.get_values): Make DatetimeTZBlock.get_values # always be object dtype. Some callers seem to want the # DatetimeArray (previously DTI) - arr = mgr.blocks[0].get_values(dtype=object) + arr = self.blocks[0].get_values(dtype=object) elif self._is_single_block or not self.is_mixed_type: - arr = np.asarray(mgr.blocks[0].get_values()) + arr = np.asarray(self.blocks[0].get_values()) else: - arr = mgr._interleave() + arr = self._interleave() return arr.transpose() if transpose else arr @@ -840,8 +855,8 @@ def _interleave(self) -> np.ndarray: # TODO: https://github.com/pandas-dev/pandas/issues/22791 # Give EAs some input on what happens here. Sparse needs this. - if is_sparse(dtype): - dtype = dtype.subtype # type: ignore + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = "object" @@ -1083,7 +1098,10 @@ def value_getitem(placement): "Shape of new values must be compatible with manager shape" ) - if isinstance(loc, int): + if lib.is_integer(loc): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) loc = [loc] # Accessing public blknos ensures the public versions are initialized @@ -1135,7 +1153,7 @@ def value_getitem(placement): # one item. 
new_blocks.extend( make_block( - values=value.copy(), + values=value, ndim=self.ndim, placement=slice(mgr_loc, mgr_loc + 1), ) @@ -1241,14 +1259,14 @@ def reindex_axis( ) def reindex_indexer( - self, + self: T, new_axis, indexer, axis: int, fill_value=None, - allow_dups=False, + allow_dups: bool = False, copy: bool = True, - ): + ) -> T: """ Parameters ---------- @@ -1296,7 +1314,8 @@ def reindex_indexer( new_axes = list(self.axes) new_axes[axis] = new_axis - return type(self)(new_blocks, new_axes) + + return type(self).from_blocks(new_blocks, new_axes) def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): """ @@ -1497,6 +1516,8 @@ def __init__( do_integrity_check: bool = False, fastpath: bool = False, ): + assert isinstance(block, Block), type(block) + if isinstance(axis, list): if len(axis) != 1: raise ValueError( @@ -1507,38 +1528,29 @@ def __init__( # passed from constructor, single block, single axis if fastpath: self.axes = [axis] - if isinstance(block, list): - - # empty block - if len(block) == 0: - block = [np.array([])] - elif len(block) != 1: - raise ValueError( - "Cannot create SingleBlockManager with more than 1 block" - ) - block = block[0] else: self.axes = [ensure_index(axis)] - # create the block here - if isinstance(block, list): - - # provide consolidation to the interleaved_dtype - if len(block) > 1: - dtype = _interleaved_dtype(block) - block = [b.astype(dtype) for b in block] - block = _consolidate(block) - - if len(block) != 1: - raise ValueError( - "Cannot create SingleBlockManager with more than 1 block" - ) - block = block[0] + self.blocks = tuple([block]) - if not isinstance(block, Block): - block = make_block(block, placement=slice(0, len(axis)), ndim=1) + @classmethod + def from_blocks( + cls, blocks: List[Block], axes: List[Index] + ) -> "SingleBlockManager": + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + assert len(blocks) == 1 + assert len(axes) == 1 + return cls(blocks[0], axes[0], do_integrity_check=False, fastpath=True) - self.blocks = tuple([block]) + @classmethod + def from_array(cls, array: ArrayLike, index: Index) -> "SingleBlockManager": + """ + Constructor for if we have an array that is not yet a Block. 
+ """ + block = make_block(array, placement=slice(0, len(index)), ndim=1) + return cls(block, index, fastpath=True) def _post_setstate(self): pass @@ -1547,10 +1559,6 @@ def _post_setstate(self): def _block(self) -> Block: return self.blocks[0] - @property - def _values(self): - return self._block.values - @property def _blknos(self): """ compat with BlockManager """ @@ -1565,7 +1573,10 @@ def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager": if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True) + blk = self._block + array = blk._slice(slobj) + block = blk.make_block_same_class(array, placement=range(len(array))) + return type(self)(block, self.index[slobj], fastpath=True) @property def index(self) -> Index: @@ -1575,10 +1586,6 @@ def index(self) -> Index: def dtype(self) -> DtypeObj: return self._block.dtype - @property - def array_dtype(self) -> DtypeObj: - return self._block.array_dtype - def get_dtype_counts(self): return {self.dtype.name: 1} @@ -1593,10 +1600,6 @@ def internal_values(self): """The array that Series._values returns""" return self._block.internal_values() - def get_values(self) -> np.ndarray: - """ return a dense type view """ - return np.array(self._block.to_dense(), copy=False) - @property def _can_hold_na(self) -> bool: return self._block._can_hold_na @@ -1627,7 +1630,7 @@ def fast_xs(self, loc): """ raise NotImplementedError("Use series._values[loc] instead") - def concat(self, to_concat, new_axis) -> "SingleBlockManager": + def concat(self, to_concat, new_axis: Index) -> "SingleBlockManager": """ Concatenate a list of SingleBlockManagers into a single SingleBlockManager. @@ -1771,7 +1774,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=[i]) + make_block(array, klass=DatetimeTZBlock, placement=i) for i, _, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1786,7 +1789,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=[i]) + make_block(array, klass=CategoricalBlock, placement=i) for i, _, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) @@ -1794,7 +1797,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["ExtensionBlock"]): external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=[i]) + make_block(array, klass=ExtensionBlock, placement=i) for i, _, array in items_dict["ExtensionBlock"] ] @@ -1802,7 +1805,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=[i]) + make_block(array, klass=ObjectValuesExtensionBlock, placement=i) for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1963,28 +1966,6 @@ def _compare_or_regex_search(a, b, regex=False): return result -def _transform_index(index, func, level=None): - """ - Apply function to all values found in index. - - This includes transforming multiindex entries separately. - Only apply function to one level of the MultiIndex if level is specified. 
- - """ - if isinstance(index, MultiIndex): - if level is not None: - items = [ - tuple(func(y) if i == level else y for i, y in enumerate(x)) - for x in index - ] - else: - items = [tuple(func(y) for y in x) for x in index] - return MultiIndex.from_tuples(items, names=index.names) - else: - items = [func(x) for x in index] - return Index(items, name=index.name, tupleize_cols=False) - - def _fast_count_smallints(arr): """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4398a1569ac56..87f937f9e7087 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -7,8 +7,8 @@ from pandas._config import get_option -from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib -from pandas._typing import Dtype, Scalar +from pandas._libs import NaT, Period, Timedelta, Timestamp, iNaT, lib +from pandas._typing import ArrayLike, Dtype, Scalar from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask @@ -17,9 +17,7 @@ is_any_int_dtype, is_bool_dtype, is_complex, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, + is_datetime64_any_dtype, is_float, is_float_dtype, is_integer, @@ -28,10 +26,14 @@ is_object_dtype, is_scalar, is_timedelta64_dtype, + needs_i8_conversion, pandas_dtype, ) +from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna +from pandas.core.construction import extract_array + bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -132,10 +134,8 @@ def f( def _bn_ok_dtype(dtype: Dtype, name: str) -> bool: - # Bottleneck chokes on datetime64 - if not is_object_dtype(dtype) and not ( - is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype) - ): + # Bottleneck chokes on datetime64, PeriodDtype (or and EA) + if not is_object_dtype(dtype) and not needs_i8_conversion(dtype): # GH 15507 # bottleneck does not properly upcast during the sum @@ -281,23 +281,16 @@ def _get_values( # with scalar fill_value. This guarantee is important for the # maybe_upcast_putmask call below assert is_scalar(fill_value) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) - if is_datetime64tz_dtype(values): - # lib.values_from_object returns M8[ns] dtype instead of tz-aware, - # so this case must be handled separately from the rest - dtype = values.dtype - values = getattr(values, "_values", values) - else: - values = lib.values_from_object(values) - dtype = values.dtype + dtype = values.dtype - if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): + if needs_i8_conversion(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above - values = getattr(values, "asi8", values) - values = values.view(np.int64) + values = np.asarray(values.view("i8")) dtype_ok = _na_ok_dtype(dtype) @@ -311,7 +304,8 @@ def _get_values( if skipna and copy: values = values.copy() - if dtype_ok: + assert mask is not None # for mypy + if dtype_ok and mask.any(): np.putmask(values, mask, fill_value) # promote if needed @@ -329,13 +323,14 @@ def _get_values( def _na_ok_dtype(dtype) -> bool: - # TODO: what about datetime64tz? PeriodDtype? 
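With `needs_i8_conversion`, all datetimelike dtypes are reduced on their int64 view, and the NaT mask has to be computed before reinterpreting the buffer, since NaT and `iNaT` share a bit pattern. A small standalone illustration of the `view("i8")` step:

```python
import numpy as np

arr = np.array(["2020-01-01", "NaT", "2020-01-03"], dtype="datetime64[ns]")
mask = np.isnat(arr)     # locate NaT *before* reinterpreting the buffer
i8 = arr.view("i8")      # zero-copy reinterpret: datetime64[ns] -> int64 ns
print(i8[~mask].min())   # reduce on the integer view, skipping NaT slots
```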
- return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) + if needs_i8_conversion(dtype): + return False + return not issubclass(dtype.type, np.integer) def _wrap_results(result, dtype: Dtype, fill_value=None): """ wrap our results if needed """ - if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + if is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT @@ -346,7 +341,8 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): result = np.nan result = Timestamp(result, tz=tz) else: - result = result.view(dtype) + # If we have float dtype, taking a view will give the wrong result + result = result.astype(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): if result == fill_value: @@ -360,6 +356,14 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): else: result = result.astype("m8[ns]").view(dtype) + elif isinstance(dtype, PeriodDtype): + if is_float(result) and result.is_integer(): + result = int(result) + if is_integer(result): + result = Period._from_ordinal(result, freq=dtype.freq) + else: + raise NotImplementedError(type(result), result) + return result @@ -546,12 +550,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): ) dtype_sum = dtype_max dtype_count = np.float64 - if ( - is_integer_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - ): + if is_integer_dtype(dtype) or needs_i8_conversion(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype @@ -758,7 +757,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanvar(s) 1.0 """ - values = lib.values_from_object(values) + values = extract_array(values, extract_numpy=True) dtype = values.dtype mask = _maybe_get_mask(values, skipna, mask) if is_any_int_dtype(values): @@ -981,11 +980,11 @@ def nanskew( Examples -------- >>> import pandas.core.nanops as nanops - >>> s = pd.Series([1,np.nan, 1, 2]) + >>> s = pd.Series([1, np.nan, 1, 2]) >>> nanops.nanskew(s) 1.7320508075688787 """ - values = lib.values_from_object(values) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") @@ -1065,11 +1064,11 @@ def nankurt( Examples -------- >>> import pandas.core.nanops as nanops - >>> s = pd.Series([1,np.nan, 1, 3, 2]) + >>> s = pd.Series([1, np.nan, 1, 3, 2]) >>> nanops.nankurt(s) -1.2892561983471076 """ - values = lib.values_from_object(values) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") @@ -1314,7 +1313,7 @@ def get_corr_func(method): return method else: raise ValueError( - f"Unkown method '{method}', expected one of 'kendall', 'spearman'" + f"Unknown method '{method}', expected one of 'kendall', 'spearman'" ) def _pearson(a, b): @@ -1501,3 +1500,75 @@ def nanpercentile( return result else: return np.percentile(values, q, axis=axis, interpolation=interpolation) + + +def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: + """ + Cumulative function with skipna support. 
+ + Parameters + ---------- + values : np.ndarray or ExtensionArray + accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate} + skipna : bool + + Returns + ------- + np.ndarray or ExtensionArray + """ + mask_a, mask_b = { + np.cumprod: (1.0, np.nan), + np.maximum.accumulate: (-np.inf, np.nan), + np.cumsum: (0.0, np.nan), + np.minimum.accumulate: (np.inf, np.nan), + }[accum_func] + + # We will be applying this function to block values + if values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. + orig_dtype = values.dtype + + # We need to define mask before masking NaTs + mask = isna(values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = values + changed = False + + result = accum_func(y.view("i8"), axis=0) + if skipna: + result[mask] = iNaT + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? + + if isinstance(values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): + vals = values.copy() + mask = isna(vals) + vals[mask] = mask_a + result = accum_func(vals, axis=0) + result[mask] = mask_b + else: + result = accum_func(values, axis=0) + + return result diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d0adf2da04db3..10dcb59977cdd 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -3,18 +3,17 @@ This is not a public API. """ -import datetime import operator -from typing import TYPE_CHECKING, Optional, Set, Tuple +from typing import TYPE_CHECKING, Optional, Set import numpy as np -from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas._typing import ArrayLike, Level from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype +from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -152,65 +151,6 @@ def _maybe_match_name(a, b): return None -def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): - """ - Cast non-pandas objects to pandas types to unify behavior of arithmetic - and comparison operations. - - Parameters - ---------- - obj: object - shape : tuple[int] - - Returns - ------- - out : object - - Notes - ----- - Be careful to call this *after* determining the `name` attribute to be - attached to the result of the arithmetic operation. 
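`na_accum_func` above pairs each accumulator with an identity fill (`mask_a`) and a restore value (`mask_b`). A float-only sketch of that pattern for `np.cumsum`:

```python
import numpy as np

def cumsum_skipna(values: np.ndarray) -> np.ndarray:
    mask = np.isnan(values)
    filled = np.where(mask, 0.0, values)  # 0.0 is the cumsum identity (mask_a)
    out = np.cumsum(filled)
    out[mask] = np.nan                    # restore missing slots (mask_b)
    return out

print(cumsum_skipna(np.array([1.0, np.nan, 2.0])))  # [ 1. nan  3.]
```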
- """ - from pandas.core.arrays import DatetimeArray, TimedeltaArray - - if type(obj) is datetime.timedelta: - # GH#22390 cast up to Timedelta to rely on Timedelta - # implementation; otherwise operation against numeric-dtype - # raises TypeError - return Timedelta(obj) - elif isinstance(obj, np.datetime64): - # GH#28080 numpy casts integer-dtype to datetime64 when doing - # array[int] + datetime64, which we do not allow - if isna(obj): - # Avoid possible ambiguities with pd.NaT - obj = obj.astype("datetime64[ns]") - right = np.broadcast_to(obj, shape) - return DatetimeArray(right) - - return Timestamp(obj) - - elif isinstance(obj, np.timedelta64): - if isna(obj): - # wrapping timedelta64("NaT") in Timedelta returns NaT, - # which would incorrectly be treated as a datetime-NaT, so - # we broadcast and wrap in a TimedeltaArray - obj = obj.astype("timedelta64[ns]") - right = np.broadcast_to(obj, shape) - return TimedeltaArray(right) - - # In particular non-nanosecond timedelta64 needs to be cast to - # nanoseconds, or else we get undesired behavior like - # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') - return Timedelta(obj) - - elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): - # GH#22390 Unfortunately we need to special-case right-hand - # timedelta64 dtypes because numpy casts integer dtypes to - # timedelta64 when operating with timedelta64 - return TimedeltaArray._from_sequence(obj) - return obj - - # ----------------------------------------------------------------------------- @@ -585,7 +525,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _combine_series_frame(left, right, func, axis: int): +def _combine_series_frame(left, right, func, axis: int, str_rep: str): """ Apply binary operator `func` to self, other using alignment and fill conventions determined by the axis argument. @@ -596,6 +536,7 @@ def _combine_series_frame(left, right, func, axis: int): right : Series func : binary operator axis : {0, 1} + str_rep : str Returns ------- @@ -603,7 +544,17 @@ def _combine_series_frame(left, right, func, axis: int): """ # We assume that self.align(other, ...) has already been called if axis == 0: - new_data = left._combine_match_index(right, func) + values = right._values + if isinstance(values, np.ndarray): + # We can operate block-wise + values = values.reshape(-1, 1) + + array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=values.T) + return type(left)(bm) + + new_data = dispatch_to_series(left, right, func) + else: new_data = dispatch_to_series(left, right, func, axis="columns") @@ -674,7 +625,8 @@ def to_series(right): elif right.ndim > 2: raise ValueError( - f"Unable to coerce to Series/DataFrame, dim must be <= 2: {right.shape}" + "Unable to coerce to Series/DataFrame, " + f"dimension must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): @@ -700,13 +652,17 @@ def to_series(right): def _should_reindex_frame_op( - left: "DataFrame", right, axis, default_axis: int, fill_value, level + left: "DataFrame", right, op, axis, default_axis: int, fill_value, level ) -> bool: """ Check if this is an operation between DataFrames that will need to reindex. 
""" assert isinstance(left, ABCDataFrame) + if op is operator.pow or op is rpow: + # GH#32685 pow has special semantics for operating with null values + return False + if not isinstance(right, ABCDataFrame): return False @@ -768,7 +724,9 @@ def _arith_method_FRAME(cls, op, special): @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): - if _should_reindex_frame_op(self, other, axis, default_axis, fill_value, level): + if _should_reindex_frame_op( + self, other, op, axis, default_axis, fill_value, level + ): return _frame_arith_method_with_reindex(self, other, op) self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) @@ -791,7 +749,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): raise NotImplementedError(f"fill_value {fill_value} not supported.") axis = self._get_axis_number(axis) if axis is not None else 1 - return _combine_series_frame(self, other, pass_op, axis=axis) + return _combine_series_frame( + self, other, pass_op, axis=axis, str_rep=str_rep + ) else: # in this case we always have `np.ndim(other) == 0` if fill_value is not None: @@ -826,7 +786,7 @@ def f(self, other, axis=default_axis, level=None): elif isinstance(other, ABCSeries): axis = self._get_axis_number(axis) if axis is not None else 1 - return _combine_series_frame(self, other, op, axis=axis) + return _combine_series_frame(self, other, op, axis=axis, str_rep=str_rep) else: # in this case we always have `np.ndim(other) == 0` new_data = dispatch_to_series(self, other, op, str_rep) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 2c9105c52cf9b..c7f58d738b578 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,9 +2,10 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. """ +from datetime import timedelta from functools import partial import operator -from typing import Any, Optional +from typing import Any, Optional, Tuple import numpy as np @@ -24,18 +25,11 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ( - ABCDatetimeArray, - ABCExtensionArray, - ABCIndex, - ABCIndexClass, - ABCSeries, - ABCTimedeltaArray, -) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core.ops import missing -from pandas.core.ops.dispatch import dispatch_to_extension_op, should_extension_dispatch +from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison from pandas.core.ops.roperator import rpow @@ -53,13 +47,15 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, (ABCSeries, ABCIndex)): y = y.values - result = libops.vec_compare(x.ravel(), y, op) + if x.shape != y.shape: + raise ValueError("Shapes must match", x.shape, y.shape) + result = libops.vec_compare(x.ravel(), y.ravel(), op) else: result = libops.scalar_compare(x.ravel(), y, op) return result.reshape(x.shape) -def masked_arith_op(x, y, op): +def masked_arith_op(x: np.ndarray, y, op): """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). 
@@ -78,10 +74,22 @@ def masked_arith_op(x, y, op): dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) + if len(x) != len(y): + if not _can_broadcast(x, y): + raise ValueError(x.shape, y.shape) + + # Call notna on pre-broadcasted y for performance + ymask = notna(y) + y = np.broadcast_to(y, x.shape) + ymask = np.broadcast_to(ymask, x.shape) + + else: + ymask = notna(y) + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex # we would get int64 dtype, see GH#19956 yrav = y.ravel() - mask = notna(xrav) & notna(yrav) + mask = notna(xrav) & ymask.ravel() if yrav.shape != mask.shape: # FIXME: GH#5284, GH#5035, GH#19448 @@ -186,23 +194,15 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): ndarrray or ExtensionArray Or a 2-tuple of these in the case of divmod or rdivmod. """ - from pandas.core.ops import maybe_upcast_for_op # NB: We assume that extract_array has already been called # on `left` and `right`. - lvalues = left - rvalues = right - - rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) + lvalues = maybe_upcast_datetimelike_array(left) + rvalues = maybe_upcast_for_op(right, lvalues.shape) - if should_extension_dispatch(left, rvalues) or isinstance( - rvalues, (ABCTimedeltaArray, ABCDatetimeArray, Timestamp, Timedelta) - ): - # TimedeltaArray, DatetimeArray, and Timestamp are included here - # because they have `freq` attribute which is handled correctly - # by dispatch_to_extension_op. + if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): # Timedelta is included because numexpr will fail on it, see GH#31457 - res_values = dispatch_to_extension_op(op, lvalues, rvalues) + res_values = op(lvalues, rvalues) else: with np.errstate(all="ignore"): @@ -211,6 +211,51 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): return res_values +def _broadcast_comparison_op(lvalues, rvalues, op) -> np.ndarray: + """ + Broadcast a comparison operation between two 2D arrays. + + Parameters + ---------- + lvalues : np.ndarray or ExtensionArray + rvalues : np.ndarray or ExtensionArray + + Returns + ------- + np.ndarray[bool] + """ + if isinstance(rvalues, np.ndarray): + rvalues = np.broadcast_to(rvalues, lvalues.shape) + result = comparison_op(lvalues, rvalues, op) + else: + result = np.empty(lvalues.shape, dtype=bool) + for i in range(len(lvalues)): + result[i, :] = comparison_op(lvalues[i], rvalues[:, 0], op) + return result + + +def _can_broadcast(lvalues, rvalues) -> bool: + """ + Check if we can broadcast rvalues to match the shape of lvalues. + + Parameters + ---------- + lvalues : np.ndarray or ExtensionArray + rvalues : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + # We assume that lengths dont match + if lvalues.ndim == rvalues.ndim == 2: + # See if we can broadcast unambiguously + if lvalues.shape[1] == rvalues.shape[-1]: + if rvalues.shape[0] == 1: + return True + return False + + def comparison_op( left: ArrayLike, right: Any, op, str_rep: Optional[str] = None, ) -> ArrayLike: @@ -229,7 +274,7 @@ def comparison_op( ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right - lvalues = left + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right rvalues = lib.item_from_zerodim(rvalues) @@ -237,15 +282,20 @@ def comparison_op( # TODO: same for tuples? 
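`_can_broadcast` above accepts only the unambiguous case: two 2-D operands of equal width where the right-hand side has a single row. In plain NumPy terms:

```python
import numpy as np

lvalues = np.arange(6).reshape(3, 2)  # (3, 2) block values
rvalues = np.array([[0, 3]])          # (1, 2): one row, matching width

# the shape test _can_broadcast encodes, then the explicit broadcast
assert lvalues.shape[1] == rvalues.shape[-1] and rvalues.shape[0] == 1
print(lvalues > np.broadcast_to(rvalues, lvalues.shape))
```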
rvalues = np.asarray(rvalues) - if isinstance(rvalues, (np.ndarray, ABCExtensionArray, ABCIndexClass)): + if isinstance(rvalues, (np.ndarray, ABCExtensionArray)): # TODO: make this treatment consistent across ops and classes. # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. See GH#27803 if len(lvalues) != len(rvalues): - raise ValueError("Lengths must match to compare") + if _can_broadcast(lvalues, rvalues): + return _broadcast_comparison_op(lvalues, rvalues, op) + raise ValueError( + "Lengths must match to compare", lvalues.shape, rvalues.shape + ) if should_extension_dispatch(lvalues, rvalues): - res_values = dispatch_to_extension_op(op, lvalues, rvalues) + # Call the method on lvalues + res_values = op(lvalues, rvalues) elif is_scalar(rvalues) and isna(rvalues): # numpy does not like comparisons vs None @@ -344,11 +394,12 @@ def fill_bool(x, left=None): right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right - lvalues = left + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right if should_extension_dispatch(lvalues, rvalues): - res_values = dispatch_to_extension_op(op, lvalues, rvalues) + # Call the method on lvalues + res_values = op(lvalues, rvalues) else: if isinstance(rvalues, np.ndarray): @@ -391,3 +442,87 @@ def get_array_op(op, str_rep: Optional[str] = None): return partial(logical_op, op=op) else: return partial(arithmetic_op, op=op, str_rep=str_rep) + + +def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike: + """ + If we have an ndarray that is either datetime64 or timedelta64, wrap in EA. + + Parameters + ---------- + obj : ndarray or ExtensionArray + + Returns + ------- + ndarray or ExtensionArray + """ + if isinstance(obj, np.ndarray): + if obj.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(obj) + if obj.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(obj) + + return obj + + +def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): + """ + Cast non-pandas objects to pandas types to unify behavior of arithmetic + and comparison operations. + + Parameters + ---------- + obj: object + shape : tuple[int] + + Returns + ------- + out : object + + Notes + ----- + Be careful to call this *after* determining the `name` attribute to be + attached to the result of the arithmetic operation. 
+ """ + from pandas.core.arrays import DatetimeArray, TimedeltaArray + + if type(obj) is timedelta: + # GH#22390 cast up to Timedelta to rely on Timedelta + # implementation; otherwise operation against numeric-dtype + # raises TypeError + return Timedelta(obj) + elif isinstance(obj, np.datetime64): + # GH#28080 numpy casts integer-dtype to datetime64 when doing + # array[int] + datetime64, which we do not allow + if isna(obj): + # Avoid possible ambiguities with pd.NaT + obj = obj.astype("datetime64[ns]") + right = np.broadcast_to(obj, shape) + return DatetimeArray(right) + + return Timestamp(obj) + + elif isinstance(obj, np.timedelta64): + if isna(obj): + # wrapping timedelta64("NaT") in Timedelta returns NaT, + # which would incorrectly be treated as a datetime-NaT, so + # we broadcast and wrap in a TimedeltaArray + obj = obj.astype("timedelta64[ns]") + right = np.broadcast_to(obj, shape) + return TimedeltaArray(right) + + # In particular non-nanosecond timedelta64 needs to be cast to + # nanoseconds, or else we get undesired behavior like + # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') + return Timedelta(obj) + + elif isinstance(obj, np.ndarray) and obj.dtype.kind == "m": + # GH#22390 Unfortunately we need to special-case right-hand + # timedelta64 dtypes because numpy casts integer dtypes to + # timedelta64 when operating with timedelta64 + return TimedeltaArray._from_sequence(obj) + return obj diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index 61a3032c7a02c..2463a1f58a447 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,48 +1,33 @@ """ Functions for defining unary operations. """ -from typing import Any, Union +from typing import Any -import numpy as np +from pandas._typing import ArrayLike from pandas.core.dtypes.common import ( is_datetime64_dtype, - is_extension_array_dtype, is_integer_dtype, is_object_dtype, - is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray -from pandas.core.construction import array - -def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: +def should_extension_dispatch(left: ArrayLike, right: Any) -> bool: """ - Identify cases where Series operation should use dispatch_to_extension_op. + Identify cases where Series operation should dispatch to ExtensionArray method. Parameters ---------- - left : Series + left : np.ndarray or ExtensionArray right : object Returns ------- bool """ - if ( - is_extension_array_dtype(left.dtype) - or is_datetime64_dtype(left.dtype) - or is_timedelta64_dtype(left.dtype) - ): - return True - - if not is_scalar(right) and is_extension_array_dtype(right): - # GH#22378 disallow scalar to exclude e.g. "category", "Int64" - return True - - return False + return isinstance(left, ABCExtensionArray) or isinstance(right, ABCExtensionArray) def should_series_dispatch(left, right, op): @@ -91,36 +76,3 @@ def should_series_dispatch(left, right, op): return True return False - - -def dispatch_to_extension_op( - op, left: Union[ABCExtensionArray, np.ndarray], right: Any, -): - """ - Assume that left or right is a Series backed by an ExtensionArray, - apply the operator defined by op. 
- - Parameters - ---------- - op : binary operator - left : ExtensionArray or np.ndarray - right : object - - Returns - ------- - ExtensionArray or np.ndarray - 2-tuple of these if op is divmod or rdivmod - """ - # NB: left and right should already be unboxed, so neither should be - # a Series or Index. - - if left.dtype.kind in "mM" and isinstance(left, np.ndarray): - # We need to cast datetime64 and timedelta64 ndarrays to - # DatetimeArray/TimedeltaArray. But we avoid wrapping others in - # PandasArray as that behaves poorly with e.g. IntegerArray. - left = array(left) - - # The op calls will raise TypeError if the op is not defined - # on the ExtensionArray - res_values = op(left, right) - return res_values diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 06a180d4a096e..091129707228f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,11 +2,11 @@ Concat routines. """ -from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload +from typing import Iterable, List, Mapping, Union, overload import numpy as np -from pandas._typing import FrameOrSeriesUnion +from pandas._typing import FrameOrSeriesUnion, Label from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -32,7 +32,7 @@ @overload def concat( - objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]], + objs: Union[Iterable["DataFrame"], Mapping[Label, "DataFrame"]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -48,9 +48,7 @@ def concat( @overload def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: Union[Iterable[FrameOrSeriesUnion], Mapping[Label, FrameOrSeriesUnion]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -65,9 +63,7 @@ def concat( def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: Union[Iterable[FrameOrSeriesUnion], Mapping[Label, FrameOrSeriesUnion]], axis=0, join="outer", ignore_index: bool = False, @@ -536,7 +532,7 @@ def _get_concat_axis(self) -> Index: idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names: List[Optional[Hashable]] = [None] * len(self.objs) + names: List[Label] = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c301d6e7c7155..acd4a68e3fd09 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -45,8 +45,9 @@ import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc -from pandas.core.internals import _transform_index, concatenate_block_managers +from pandas.core.internals import concatenate_block_managers from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -92,9 +93,7 @@ def merge( merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge( - by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True -): +def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_pieces): """ groupby & merge; we are always performing a left-by type operation @@ -102,11 +101,9 @@ def _groupby_and_merge( ---------- by: field to group on: duplicates field - left: left frame - right: right frame - _merge_pieces: function for merging 
- check_duplicates: bool, default True - should we check & clean duplicates + left: DataFrame + right: DataFrame + merge_pieces: function for merging """ pieces = [] if not isinstance(by, (list, tuple)): @@ -118,18 +115,6 @@ def _groupby_and_merge( # if we can groupby the rhs # then we can get vastly better perf - # we will check & remove duplicates if indicated - if check_duplicates: - if on is None: - on = [] - elif not isinstance(on, (list, tuple)): - on = [on] - - if right.duplicated(by + on).any(): - _right = right.drop_duplicates(by + on, keep="last") - # TODO: use overload to refine return type of drop_duplicates - assert _right is not None # needed for mypy - right = _right try: rby = right.groupby(by, sort=False) except KeyError: @@ -151,16 +136,13 @@ def _groupby_and_merge( pieces.append(merged) continue - merged = _merge_pieces(lhs, rhs) + merged = merge_pieces(lhs, rhs) # make sure join keys are in the merged - # TODO, should _merge_pieces do this? + # TODO, should merge_pieces do this? for k in by: - try: - if k in merged: - merged[k] = key - except KeyError: - pass + if k in merged: + merged[k] = key pieces.append(merged) @@ -235,8 +217,8 @@ def merge_ordered( See Also -------- - merge - merge_asof + merge : Merge with a database-style join. + merge_asof : Merge on nearest keys. Examples -------- @@ -287,16 +269,11 @@ def _merger(x, y): raise ValueError("Can only group either left or right frames") elif left_by is not None: result, _ = _groupby_and_merge( - left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False + left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: result, _ = _groupby_and_merge( - right_by, - on, - right, - left, - lambda x, y: _merger(y, x), - check_duplicates=False, + right_by, on, right, left, lambda x, y: _merger(y, x) ) else: result = _merger(left, right) @@ -387,8 +364,8 @@ def merge_asof( See Also -------- - merge - merge_ordered + merge : Merge with a database-style join. + merge_ordered : Merge with optional filling/interpolation. 
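Both the `left_by` and `right_by` branches above funnel into the simplified `_groupby_and_merge`; a small usage example of the public entry point, with illustrative data:

```python
import pandas as pd

left = pd.DataFrame({"key": ["a", "c", "a", "c"],
                     "lvalue": [1, 2, 3, 4],
                     "group": ["g1", "g1", "g2", "g2"]})
right = pd.DataFrame({"key": ["b", "c"], "rvalue": [10, 20]})

# merge within each "group" of the left frame, keeping keys ordered
print(pd.merge_ordered(left, right, on="key", left_by="group",
                       fill_method="ffill"))
```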
Examples -------- @@ -1605,11 +1582,6 @@ def _validate_specification(self): if self.direction not in ["backward", "forward", "nearest"]: raise MergeError(f"direction invalid: {self.direction}") - @property - def _asof_key(self): - """ This is our asof key, the 'on' """ - return self.left_on[-1] - def _get_merge_keys(self): # note this function has side effects @@ -1685,10 +1657,13 @@ def _get_merge_keys(self): def _get_join_indexers(self): """ return the join indexers """ - def flip(xs): + def flip(xs) -> np.ndarray: """ unlike np.transpose, this returns an array of tuples """ xs = [ - x if not is_extension_array_dtype(x) else x._ndarray_values for x in xs + x + if not is_extension_array_dtype(x) + else extract_array(x)._values_for_argsort() + for x in xs ] labels = list(string.ascii_lowercase[: len(xs)]) dtypes = [x.dtype for x in xs] @@ -1849,9 +1824,14 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): # Some pre-processing for non-ndarray lk / rk - if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = getattr(lk, "_values", lk)._data - rk = getattr(rk, "_values", rk)._data + lk = extract_array(lk, extract_numpy=True) + rk = extract_array(rk, extract_numpy=True) + + if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): + # Extract the ndarray (UTC-localized) values + # Note: we dont need the dtypes to match, as these can still be compared + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() elif ( is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) @@ -1866,11 +1846,7 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif ( - is_extension_array_dtype(lk.dtype) - and is_extension_array_dtype(rk.dtype) - and lk.dtype == rk.dtype - ): + elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() @@ -1878,15 +1854,15 @@ def _factorize_keys(lk, rk, sort=True): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) - elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( - rk.dtype.type, (np.timedelta64, np.datetime64) - ): + lk = ensure_int64(np.asarray(lk)) + rk = ensure_int64(np.asarray(rk)) + + elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) + lk = ensure_int64(np.asarray(lk, dtype=np.int64)) + rk = ensure_int64(np.asarray(rk, dtype=np.int64)) + else: klass = libhashtable.Factorizer lk = ensure_object(lk) @@ -2022,4 +1998,4 @@ def renamer(x, suffix): lrenamer = partial(renamer, suffix=lsuffix) rrenamer = partial(renamer, suffix=rsuffix) - return (_transform_index(left, lrenamer), _transform_index(right, rrenamer)) + return (left._transform_index(lrenamer), right._transform_index(rrenamer)) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 61aa34f724307..a8801d8ab3f6e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -150,7 +150,9 @@ def pivot_table( table = table.sort_index(axis=1) if fill_value is not None: - table = table._ensure_type(table.fillna(fill_value, downcast="infer")) + _table 
= table.fillna(fill_value, downcast="infer") + assert _table is not None # needed for mypy + table = _table if margins: if dropna: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 359e5b956f8a5..145cf43112be3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -985,12 +985,7 @@ def get_empty_frame(data) -> DataFrame: if prefix is None: dummy_cols = levels else: - - # PY2 embedded unicode, gh-22084 - def _make_col_name(prefix, prefix_sep, level) -> str: - return f"{prefix}{prefix_sep}{level}" - - dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] + dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] index: Optional[Index] if isinstance(data, Series): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 86417faf6cd11..b9eb89b4d14c6 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -286,7 +286,7 @@ def qcut( Parameters ---------- x : 1d ndarray or Series - q : int or list-like of int + q : int or list-like of float Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. labels : array or False, default None diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index d8652c9b4fac9..7abb14303f8cc 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -2,8 +2,6 @@ from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com - def cartesian_product(X): """ @@ -51,9 +49,20 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) - return [ - np.tile( - np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i]) - ) - for i, x in enumerate(X) - ] + return [_tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] + + +def _tile_compat(arr, num: int): + """ + Index compat for np.tile. + + Notes + ----- + Does not support multi-dimensional `num`. + """ + if isinstance(arr, np.ndarray): + return np.tile(arr, num) + + # Otherwise we have an Index + taker = np.tile(np.arange(len(arr)), num) + return arr.take(taker) diff --git a/pandas/core/series.py b/pandas/core/series.py index 12164a4b8ff6b..1e1c9963ab3f1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -47,7 +47,6 @@ ABCMultiIndex, ABCPeriodIndex, ABCSeries, - ABCSparseArray, ) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( @@ -206,7 +205,7 @@ def __init__( # data is an ndarray, index is defined if not isinstance(data, SingleBlockManager): - data = SingleBlockManager(data, index, fastpath=True) + data = SingleBlockManager.from_array(data, index) if copy: data = data.copy() if index is None: @@ -289,9 +288,6 @@ def __init__( pass elif isinstance(data, (set, frozenset)): raise TypeError(f"'{type(data).__name__}' type is unordered") - elif isinstance(data, ABCSparseArray): - # handle sparse passed here (and force conversion) - data = data.to_dense() else: data = com.maybe_iterable_to_list(data) @@ -321,7 +317,7 @@ def __init__( else: data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True) - data = SingleBlockManager(data, index, fastpath=True) + data = SingleBlockManager.from_array(data, index) generic.NDFrame.__init__(self, data) self.name = name @@ -439,6 +435,52 @@ def dtypes(self) -> DtypeObj: @property def name(self) -> Label: + """ + Return the name of the Series. 
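Returning to the `cartesian_product` change above: `_tile_compat` emulates `np.tile` for Index objects with a `take`, which preserves the index type (e.g. CategoricalIndex, DatetimeIndex) where a raw ndarray tile would not:

```python
import numpy as np
import pandas as pd

idx = pd.Index(["a", "b"])
taker = np.tile(np.arange(len(idx)), 3)  # [0 1 0 1 0 1]
print(idx.take(taker))                   # Index-preserving analogue of np.tile
print(np.tile(idx.to_numpy(), 3))        # the plain-ndarray equivalent
```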
+ + The name of a Series becomes its index or column name if it is used + to form a DataFrame. It is also used whenever displaying the Series + using the interpreter. + + Returns + ------- + label (hashable object) + The name of the Series, also the column name if part of a DataFrame. + + See Also + -------- + Series.rename : Sets the Series name when given a scalar input. + Index.name : Corresponding Index property. + + Examples + -------- + The Series name can be set initially when calling the constructor. + + >>> s = pd.Series([1, 2, 3], dtype=np.int64, name='Numbers') + >>> s + 0 1 + 1 2 + 2 3 + Name: Numbers, dtype: int64 + >>> s.name = "Integers" + >>> s + 0 1 + 1 2 + 2 3 + Name: Integers, dtype: int64 + + The name of a Series within a DataFrame is its column name. + + >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], + ... columns=["Odd Numbers", "Even Numbers"]) + >>> df + Odd Numbers Even Numbers + 0 1 2 + 1 3 4 + 2 5 6 + >>> df["Even Numbers"].name + 'Even Numbers' + """ return self._name @name.setter @@ -508,21 +550,17 @@ def _values(self): timedelta64 dtypes), while ``.array`` ensures to always return an ExtensionArray. - Differs from ``._ndarray_values``, as that ensures to always return a - numpy array (it will call ``_ndarray_values`` on the ExtensionArray, if - the Series was backed by an ExtensionArray). - Overview: - dtype | values | _values | array | _ndarray_values | - ----------- | ------------- | ------------- | ------------- | --------------- | - Numeric | ndarray | ndarray | PandasArray | ndarray | - Category | Categorical | Categorical | Categorical | ndarray[int] | - dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | ndarray[m8ns] | - Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | - Nullable | EA | EA | EA | ndarray | + dtype | values | _values | array | + ----------- | ------------- | ------------- | ------------- | + Numeric | ndarray | ndarray | PandasArray | + Category | Categorical | Categorical | Categorical | + dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | + Period | ndarray[obj] | PeriodArray | PeriodArray | + Nullable | EA | EA | EA | """ return self._data.internal_values() @@ -532,17 +570,6 @@ def _values(self): def array(self) -> ExtensionArray: return self._data._block.array_values() - def _internal_get_values(self): - """ - Same as values (but handles sparseness conversions); is a view. - - Returns - ------- - numpy.ndarray - Data of the Series. 
- """ - return self._data.get_values() - # ops def ravel(self, order="C"): """ @@ -692,11 +719,7 @@ def __array_ufunc__( inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) - name: Label - if len(set(names)) == 1: - name = names[0] - else: - name = None + name = names[0] if len(set(names)) == 1 else None def construct_return(result): if lib.is_scalar(result): @@ -879,6 +902,7 @@ def __getitem__(self, key): if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) return self._get_values(key) return self._get_with(key) @@ -886,7 +910,7 @@ def __getitem__(self, key): def _get_with(self, key): # other: fancy integer or otherwise if isinstance(key, slice): - # _convert_slice_indexer to determing if this slice is positional + # _convert_slice_indexer to determin if this slice is positional # or label based, and if the latter, convert to positional slobj = self.index._convert_slice_indexer(key, kind="getitem") return self._slice(slobj) @@ -898,6 +922,10 @@ def _get_with(self, key): elif isinstance(key, tuple): return self._get_values_tuple(key) + elif not is_list_like(key): + # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684 + return self.loc[key] + if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)): key = list(key) @@ -978,14 +1006,15 @@ def __setitem__(self, key, value): key = com.apply_if_callable(key, self) cacher_needs_updating = self._check_is_chained_assignment_possible() + if key is Ellipsis: + key = slice(None) + try: self._set_with_engine(key, value) except (KeyError, ValueError): values = self._values if is_integer(key) and not self.index.inferred_type == "integer": values[key] = value - elif key is Ellipsis: - self[:] = value else: self.loc[key] = value @@ -999,13 +1028,15 @@ def __setitem__(self, key, value): if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) try: self._where(~key, value, inplace=True) return except InvalidIndexError: - pass + self._set_values(key.astype(np.bool_), value) - self._set_with(key, value) + else: + self._set_with(key, value) if cacher_needs_updating: self._maybe_update_cacher() @@ -1046,13 +1077,13 @@ def _set_with(self, key, value): else: key_type = lib.infer_dtype(key, skipna=False) + # Note: key_type == "boolean" should not occur because that + # should be caught by the is_bool_indexer check in __setitem__ if key_type == "integer": if self.index.inferred_type == "integer": self._set_labels(key, value) else: return self._set_values(key, value) - elif key_type == "boolean": - self._set_values(key.astype(np.bool_), value) else: self._set_labels(key, value) @@ -1661,6 +1692,10 @@ def count(self, level=None): int or Series (if level specified) Number of non-null values in the Series. + See Also + -------- + DataFrame.count : Count non-NA cells for each column or row. 
+ Examples -------- >>> s = pd.Series([0.0, 1.0, np.nan]) @@ -1990,7 +2025,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): nan """ skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) - i = nanops.nanargmin(com.values_from_object(self), skipna=skipna) + i = nanops.nanargmin(self._values, skipna=skipna) if i == -1: return np.nan return self.index[i] @@ -2061,7 +2096,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): nan """ skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) - i = nanops.nanargmax(com.values_from_object(self), skipna=skipna) + i = nanops.nanargmax(self._values, skipna=skipna) if i == -1: return np.nan return self.index[i] @@ -2099,7 +2134,7 @@ def round(self, decimals=0, *args, **kwargs) -> "Series": dtype: float64 """ nv.validate_round(args, kwargs) - result = com.values_from_object(self).round(decimals) + result = self._values.round(decimals) result = self._constructor(result, index=self.index).__finalize__(self) return result @@ -2191,6 +2226,12 @@ def corr(self, other, method="pearson", min_periods=None) -> float: float Correlation with other. + See Also + -------- + DataFrame.corr : Compute pairwise correlation between columns. + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Examples -------- >>> def histogram_intersection(a, b): @@ -2233,6 +2274,10 @@ def cov(self, other, min_periods=None) -> float: Covariance between Series and other normalized by N-1 (unbiased estimator). + See Also + -------- + DataFrame.cov : Compute pairwise covariance of columns. + Examples -------- >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) @@ -2245,7 +2290,7 @@ def cov(self, other, min_periods=None) -> float: return np.nan return nanops.nancov(this.values, other.values, min_periods=min_periods) - def diff(self, periods=1) -> "Series": + def diff(self, periods: int = 1) -> "Series": """ First discrete difference of element. @@ -2449,8 +2494,7 @@ def __rmatmul__(self, other): """ return self.dot(np.transpose(other)) - @Substitution(klass="Series") - @Appender(base._shared_docs["searchsorted"]) + @doc(base.IndexOpsMixin.searchsorted, klass="Series") def searchsorted(self, value, side="left", sorter=None): return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) @@ -2671,9 +2715,9 @@ def combine(self, other, func, fill_value=None) -> "Series": new_values = [func(lv, other) for lv in self._values] new_name = self.name - if is_categorical_dtype(self.values): + if is_categorical_dtype(self.dtype): pass - elif is_extension_array_dtype(self.values): + elif is_extension_array_dtype(self.dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. new_values = try_cast_to_ea(self._values, new_values) @@ -2768,7 +2812,7 @@ def update(self, other) -> None: other = other.reindex_like(self) mask = notna(other) - self._data = self._data.putmask(mask=mask, new=other, inplace=True) + self._data = self._data.putmask(mask=mask, new=other) self._maybe_update_cacher() # ---------------------------------------------------------------------- @@ -3914,7 +3958,7 @@ def rename( Parameters ---------- axis : {0 or "index"} - Unused. Accepted for compatability with DataFrame method only. + Unused. Accepted for compatibility with DataFrame method only. index : scalar, hashable sequence, dict-like or function, optional Functions or dict-like are transformations to apply to the index. 
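To see the Series.__setitem__ changes above in action, here is a minimal sketch (toy data of my own; not part of the patch): Ellipsis keys are now normalized to slice(None) up front, and boolean indexers are coerced to an ndarray of bool before dispatch.

    import pandas as pd

    s = pd.Series([1, 2, 3], name="Numbers")

    # key is Ellipsis -> key = slice(None), so this now takes the
    # same path as s[:] = 0 instead of a late fallback branch:
    s[...] = 0

    # boolean indexers go through check_bool_indexer and then
    # np.asarray(key, dtype=bool):
    s[[True, False, True]] = 9
    print(s.tolist())  # [9, 0, 9]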
diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3be9c5fcdfb26..7f26c7a26d4d8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -446,7 +446,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): stacklevel=3, ) - f = lambda x: bool(regex.search(x)) + f = lambda x: regex.search(x) is not None else: if case: f = lambda x: pat in x @@ -572,7 +572,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): r""" Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or - :func:`re.sub`. + :func:`re.sub`, depending on the regex value. Parameters ---------- @@ -775,6 +775,8 @@ def scalar_rep(x): else: def rep(x, r): + if x is libmissing.NA: + return x try: return bytes.__mul__(x, r) except TypeError: @@ -816,7 +818,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): regex = re.compile(pat, flags=flags) dtype = bool - f = lambda x: bool(regex.match(x)) + f = lambda x: regex.match(x) is not None return _na_map(f, arr, na, dtype=dtype) @@ -2496,7 +2498,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Limit number of splits in output. ``None``, 0 and -1 will be interpreted as return all splits. expand : bool, default False - Expand the splitted strings into separate columns. + Expand the split strings into separate columns. * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 5580146b37d25..7414165ab5711 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -323,15 +323,13 @@ def _convert_listlike_datetimes( # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): - # Explicitly pass NaT mask to array_with_unit_to_datetime - mask = arg.isna() - arg = arg._ndarray_values + result = arg.astype(f"datetime64[{unit}]") + tz_parsed = None else: - mask = None - result, tz_parsed = tslib.array_with_unit_to_datetime( - arg, mask, unit, errors=errors - ) + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, unit, errors=errors + ) if errors == "ignore": from pandas import Index @@ -361,7 +359,18 @@ def _convert_listlike_datetimes( # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg - arg, _ = maybe_convert_dtype(arg, copy=False) + try: + arg, _ = maybe_convert_dtype(arg, copy=False) + except TypeError: + if errors == "coerce": + result = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) + return DatetimeIndex(result, name=name) + elif errors == "ignore": + from pandas import Index + + result = Index(arg, name=name) + return result + raise arg = ensure_object(arg) require_iso8601 = False @@ -556,7 +565,7 @@ def to_datetime( Parameters ---------- - arg : int, float, str, datetime, list, tuple, 1-d array, Series DataFrame/dict-like + arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like The object to convert to a datetime. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. 
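For context on the to_datetime hunks above, a short sketch of the IntegerArray fast path (illustrative values; behavior assumed from the hunk, where unit-based conversion of an IntegerArray now goes through astype rather than array_with_unit_to_datetime):

    import pandas as pd

    # Nullable NA comes out as NaT without the explicit mask plumbing
    # that the old code passed to array_with_unit_to_datetime:
    arr = pd.array([1_577_836_800, None], dtype="Int64")
    print(pd.to_datetime(arr, unit="s"))
    # expected: DatetimeIndex(['2020-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)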
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 40f376724bd39..a6198f8b752ae 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -35,6 +35,7 @@ def to_numeric(arg, errors="raise", downcast=None): Parameters ---------- arg : scalar, list, tuple, 1-d array, or Series + Argument to be converted. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. @@ -61,7 +62,8 @@ def to_numeric(arg, errors="raise", downcast=None): Returns ------- - ret : numeric if parsing succeeded. + ret + Numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray. See Also @@ -160,7 +162,7 @@ def to_numeric(arg, errors="raise", downcast=None): if downcast in ("integer", "signed"): typecodes = np.typecodes["Integer"] - elif downcast == "unsigned" and np.min(values) >= 0: + elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0): typecodes = np.typecodes["UnsignedInteger"] elif downcast == "float": typecodes = np.typecodes["Float"] diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py new file mode 100644 index 0000000000000..e4debab2c22ee --- /dev/null +++ b/pandas/core/util/numba_.py @@ -0,0 +1,58 @@ +"""Common utilities for Numba operations""" +import types +from typing import Callable, Dict, Optional + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + + +def check_kwargs_and_nopython( + kwargs: Optional[Dict] = None, nopython: Optional[bool] = None +): + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + +def get_jit_arguments(engine_kwargs: Optional[Dict[str, bool]] = None): + """ + Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. + """ + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + return nopython, nogil, parallel + + +def jit_user_function(func: Callable, nopython: bool, nogil: bool, parallel: bool): + """ + JIT the user's function given the configurable arguments. + """ + numba = import_optional_dependency("numba") + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(data, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(data, *_args): + return jf(data, *_args) + + return impl + + return numba_func diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index c6096c24ecbc9..0ec876583dcde 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -29,19 +29,25 @@ class EWM(_Rolling): r""" - Provide exponential weighted functions. + Provide exponential weighted (EW) functions. + + Available EW functions: ``mean()``, ``var()``, ``std()``, ``corr()``, ``cov()``. + + Exactly one parameter: ``com``, ``span``, ``halflife``, or ``alpha`` must be + provided. Parameters ---------- com : float, optional Specify decay in terms of center of mass, - :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. 
+ :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`. span : float, optional Specify decay in terms of span, - :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. + :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. halflife : float, optional Specify decay in terms of half-life, - :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. + :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for + :math:`halflife > 0`. alpha : float, optional Specify smoothing factor :math:`\alpha` directly, :math:`0 < \alpha \leq 1`. @@ -50,11 +56,39 @@ class EWM(_Rolling): (otherwise result is NA). adjust : bool, default True Divide by decaying adjustment factor in beginning periods to account - for imbalance in relative weightings - (viewing EWMA as a moving average). + for imbalance in relative weightings (viewing EWMA as a moving average). + + - When ``adjust=True`` (default), the EW function is calculated using weights + :math:`w_i = (1 - \alpha)^i`. For example, the EW moving average of the series + [:math:`x_0, x_1, ..., x_t`] would be: + + .. math:: + y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + (1 - + \alpha)^t x_0}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t} + + - When ``adjust=False``, the exponentially weighted function is calculated + recursively: + + .. math:: + \begin{split} + y_0 &= x_0\\ + y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, + \end{split} ignore_na : bool, default False - Ignore missing values when calculating weights; - specify True to reproduce pre-0.15.0 behavior. + Ignore missing values when calculating weights; specify ``True`` to reproduce + pre-0.15.0 behavior. + + - When ``ignore_na=False`` (default), weights are based on absolute positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in calculating + the final weighted average of [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``. + + - When ``ignore_na=True`` (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of :math:`x_0` and :math:`x_2` + used in calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if + ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. The value 0 identifies the rows, and 1 identifies the columns. @@ -71,30 +105,9 @@ class EWM(_Rolling): Notes ----- - Exactly one of center of mass, span, half-life, and alpha must be provided. - Allowed values and relationship between the parameters are specified in the - parameter descriptions above; see the link at the end of this section for - a detailed explanation. - - When adjust is True (default), weighted averages are calculated using - weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. - - When adjust is False, weighted averages are calculated recursively as: - weighted_average[0] = arg[0]; - weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. - - When ignore_na is False (default), weights are based on absolute positions. - For example, the weights of x and y used in calculating the final weighted - average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and - (1-alpha)**2 and alpha (if adjust is False). - - When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based - on relative positions. 
For example, the weights of x and y used in
-    calculating the final weighted average of [x, None, y] are 1-alpha and 1
-    (if adjust is True), and 1-alpha and alpha (if adjust is False).
-
-    More details can be found at
-    https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows
+
+    More details can be found at:
+    :ref:`Exponentially weighted windows <stats.moments.exponentially_weighted>`.

     Examples
     --------
diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py
index d6e8194c861fa..5d35ec7457ab0 100644
--- a/pandas/core/window/numba_.py
+++ b/pandas/core/window/numba_.py
@@ -1,4 +1,3 @@
-import types
 from typing import Any, Callable, Dict, Optional, Tuple

 import numpy as np
@@ -6,35 +5,49 @@
 from pandas._typing import Scalar
 from pandas.compat._optional import import_optional_dependency

+from pandas.core.util.numba_ import (
+    check_kwargs_and_nopython,
+    get_jit_arguments,
+    jit_user_function,
+)

-def make_rolling_apply(
-    func: Callable[..., Scalar],
+
+def generate_numba_apply_func(
     args: Tuple,
-    nogil: bool,
-    parallel: bool,
-    nopython: bool,
+    kwargs: Dict[str, Any],
+    func: Callable[..., Scalar],
+    engine_kwargs: Optional[Dict[str, bool]],
 ):
     """
-    Creates a JITted rolling apply function with a JITted version of
-    the user's function.
+    Generate a numba jitted apply function specified by values from engine_kwargs.
+
+    1. jit the user's function
+    2.

Return a rolling apply function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. - - Parameters - ---------- - args : tuple - *args to be passed into the function - kwargs : dict - **kwargs to be passed into the function - func : function - function to be applied to each window and will be JITed - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - - Returns - ------- - Numba function - """ - if engine_kwargs is None: - engine_kwargs = {} - - nopython = engine_kwargs.get("nopython", True) - nogil = engine_kwargs.get("nogil", False) - parallel = engine_kwargs.get("parallel", False) - - if kwargs and nopython: - raise ValueError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) - - return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3784989de10ab..aeab51149ec4e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -196,7 +196,7 @@ def _dir_additions(self): def _get_win_type(self, kwargs: Dict): """ - Exists for compatibility, overriden by subclass Window. + Exists for compatibility, overridden by subclass Window. Parameters ---------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d2f9dd285582f..d1139f640cef4 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,5 @@ import abc -from datetime import date, datetime, timedelta +import datetime from io import BytesIO import os from textwrap import fill @@ -28,7 +28,6 @@ _pop_header_name, get_writer, ) -from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser _read_excel_doc = ( @@ -367,6 +366,9 @@ def _workbook_class(self): def load_workbook(self, filepath_or_buffer): pass + def close(self): + pass + @property @abc.abstractmethod def sheet_names(self): @@ -742,11 +744,11 @@ def _value_with_fmt(self, val): val = float(val) elif is_bool(val): val = bool(val) - elif isinstance(val, datetime): + elif isinstance(val, datetime.datetime): fmt = self.datetime_format - elif isinstance(val, date): + elif isinstance(val, datetime.date): fmt = self.date_format - elif isinstance(val, timedelta): + elif isinstance(val, datetime.timedelta): val = val.total_seconds() / float(86400) fmt = "0" else: @@ -763,9 +765,7 @@ def check_extension(cls, ext): if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = "Invalid extension for engine" - f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" - raise ValueError(msg) + raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") else: return True @@ -898,14 +898,7 @@ def sheet_names(self): def close(self): """close io if necessary""" - if self.engine == "openpyxl": - # https://stackoverflow.com/questions/31416842/ - # openpyxl-does-not-close-excel-workbook-in-read-only-mode - wb = self.book - wb._archive.close() - - if hasattr(self.io, "close"): - self.io.close() + self._reader.close() def __enter__(self): return self diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a96c0f814e2d8..0696d82e51f34 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -492,6 +492,11 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): filepath_or_buffer, read_only=True, data_only=True, keep_links=False 
) + def close(self): + # https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode + self.book.close() + @property def sheet_names(self) -> List[str]: return self.book.sheetnames diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2a528781f8c93..17cc897136aad 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -58,11 +58,8 @@ ) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, - ABCIndexClass, ABCMultiIndex, ABCPeriodIndex, - ABCSeries, - ABCSparseArray, ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, notna @@ -71,6 +68,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexes.api import Index, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -283,9 +281,7 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = series._ensure_type( - concat((series.iloc[:row_num], series.iloc[-row_num:])) - ) + series = concat((series.iloc[:row_num], series.iloc[-row_num:])) self.tr_row_num = row_num else: self.tr_row_num = None @@ -1230,11 +1226,7 @@ def _format(x): # object dtype return str(formatter(x)) - vals = self.values - if isinstance(vals, Index): - vals = vals._values - elif isinstance(vals, ABCSparseArray): - vals = vals.values + vals = extract_array(self.values, extract_numpy=True) is_float_type = lib.map_infer(vals, is_float) & notna(vals) leading_space = self.leading_space @@ -1352,8 +1344,6 @@ def format_values_with(float_format): values = self.values is_complex = is_complex_dtype(values) mask = isna(values) - if hasattr(values, "to_dense"): # sparse numpy ndarray - values = values.to_dense() values = np.array(values, dtype="object") values[mask] = na_rep imask = (~mask).ravel() @@ -1461,9 +1451,7 @@ def _format_strings(self) -> List[str]: class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: - values = self.values - if isinstance(values, (ABCIndexClass, ABCSeries)): - values = values._values + values = extract_array(self.values, extract_numpy=True) formatter = values._formatter(boxed=True) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 585e1af3dbc01..1be0f977f9b20 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -101,7 +101,7 @@ def write_th( self, s: Any, header: bool = False, indent: int = 0, tags: Optional[str] = None ) -> None: """ - Method for writting a formatted cell. + Method for writing a formatted cell. If col_space is set on the formatter then that is used for the value of min-width. 
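Tying together the ExcelFile and openpyxl close() changes from the hunks above, a usage sketch (the file name is hypothetical):

    import pandas as pd

    # ExcelFile.close() now just delegates to the reader's close();
    # for openpyxl that calls Workbook.close(), replacing the old
    # _archive workaround noted in the removed StackOverflow comment.
    with pd.ExcelFile("report.xlsx", engine="openpyxl") as xls:
        frames = {name: xls.parse(name) for name in xls.sheet_names}
    # __exit__ -> close() -> self._reader.close()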
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 018441dacd9a8..718534e42ec25 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -27,7 +27,7 @@ from pandas._libs import lib from pandas._typing import Axis, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.core.dtypes.common import is_float @@ -35,7 +35,7 @@ from pandas.api.types import is_dict_like, is_list_like import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.generic import _shared_docs +from pandas.core.generic import NDFrame from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") @@ -192,18 +192,7 @@ def _repr_html_(self) -> str: """ return self.render() - @Appender( - _shared_docs["to_excel"] - % dict( - axes="index, columns", - klass="Styler", - axes_single_arg="{0 or 'index', 1 or 'columns'}", - optional_by=""" - by : str or list of str - Name or list of names which refer to the axis items.""", - versionadded_to_excel="\n .. versionadded:: 0.20", - ) - ) + @doc(NDFrame.to_excel, klass="Styler") def to_excel( self, excel_writer, @@ -1003,19 +992,27 @@ def hide_columns(self, subset) -> "Styler": def _highlight_null(v, null_color: str) -> str: return f"background-color: {null_color}" if pd.isna(v) else "" - def highlight_null(self, null_color: str = "red") -> "Styler": + def highlight_null( + self, + null_color: str = "red", + subset: Optional[Union[Label, Sequence[Label]]] = None, + ) -> "Styler": """ Shade the background ``null_color`` for missing values. Parameters ---------- - null_color : str + null_color : str, default 'red' + subset : label or list of labels, default None + A valid slice for ``data`` to limit the style application to. + + .. versionadded:: 1.1.0 Returns ------- self : Styler """ - self.applymap(self._highlight_null, null_color=null_color) + self.applymap(self._highlight_null, null_color=null_color, subset=subset) return self def background_gradient( diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 77a0c2f99496b..d6b90ae99973e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -973,9 +973,9 @@ def _try_convert_to_date(self, data): # ignore numbers that are out of range if issubclass(new_data.dtype.type, np.number): in_range = ( - isna(new_data.values) + isna(new_data._values) | (new_data > self.min_stamp) - | (new_data.values == iNaT) + | (new_data._values == iNaT) ) if not in_range.all(): return data, False diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 4b153d3cb69bf..6e68c1cf5e27e 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs.writers import convert_json_to_lines +from pandas._typing import Scalar from pandas.util._decorators import deprecate import pandas as pd @@ -226,14 +227,28 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. 
""" - def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + def _pull_field( + js: Dict[str, Any], spec: Union[List, str] + ) -> Union[Scalar, Iterable]: + """Internal function to pull field""" result = js # type: ignore if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + return result + + def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + """ + Interal function to pull field for records, and similar to + _pull_field, but require to return Iterable. And will raise error + if has non iterable value. + """ + result = _pull_field(js, spec) + # GH 31507 GH 30145, if result is not Iterable, raise TypeError if not + # null, otherwise return an empty list if not isinstance(result, Iterable): if pd.isnull(result): result = [] # type: ignore @@ -242,7 +257,6 @@ def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: f"{js} has non iterable value {result} for path {spec}. " "Must be iterable or null." ) - return result if isinstance(data, list) and not data: @@ -292,7 +306,7 @@ def _recursive_extract(data, path, seen_meta, level=0): _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: - recs = _pull_field(obj, path[0]) + recs = _pull_records(obj, path[0]) recs = [ nested_to_record(r, sep=sep, max_level=max_level) if isinstance(r, dict) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bc2fb9f0f41bc..2df81ba0aa51a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,11 +5,12 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from io import StringIO, TextIOWrapper +import itertools import re import sys from textwrap import fill -from typing import Any, Dict, Set +from typing import Any, Dict, Iterable, List, Set import warnings import numpy as np @@ -34,6 +35,7 @@ ensure_str, is_bool_dtype, is_categorical_dtype, + is_dict_like, is_dtype_equal, is_extension_array_dtype, is_file_like, @@ -1421,6 +1423,54 @@ def __init__(self, kwds): # keep references to file handles opened by the parser itself self.handles = [] + def _validate_parse_dates_presence(self, columns: List[str]) -> None: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. 
+ + """ + cols_needed: Iterable + if is_dict_like(self.parse_dates): + cols_needed = itertools.chain(*self.parse_dates.values()) + elif is_list_like(self.parse_dates): + # a column in parse_dates could be represented + # ColReference = Union[int, str] + # DateGroups = List[ColReference] + # ParseDates = Union[DateGroups, List[DateGroups], + # Dict[ColReference, DateGroups]] + cols_needed = itertools.chain.from_iterable( + col if is_list_like(col) else [col] for col in self.parse_dates + ) + else: + cols_needed = [] + + # get only columns that are references using names (str), not by index + missing_cols = ", ".join( + sorted( + { + col + for col in cols_needed + if isinstance(col, str) and col not in columns + } + ) + ) + if missing_cols: + raise ValueError( + f"Missing column provided to 'parse_dates': '{missing_cols}'" + ) + def close(self): for f in self.handles: f.close() @@ -1870,7 +1920,7 @@ def __init__(self, src, **kwds): # Handle the file object with universal line mode enabled. # We will handle the newline character ourselves later on. - if isinstance(src, (BufferedIOBase, RawIOBase)): + if hasattr(src, "read") and not hasattr(src, "encoding"): src = TextIOWrapper(src, encoding=encoding, newline="") kwds["encoding"] = "utf-8" @@ -1940,6 +1990,7 @@ def __init__(self, src, **kwds): if len(self.names) < len(usecols): _validate_usecols_names(usecols, self.names) + self._validate_parse_dates_presence(self.names) self._set_noconvert_columns() self.orig_names = self.names @@ -2273,11 +2324,15 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - ( - self.columns, - self.num_original_columns, - self.unnamed_cols, - ) = self._infer_columns() + try: + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + except (TypeError, ValueError): + self.close() + raise # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. 
@@ -2310,6 +2365,7 @@ def __init__(self, f, **kwds): if self.index_names is None: self.index_names = index_names + self._validate_parse_dates_presence(self.columns) if self.parse_dates: self._no_thousands_columns = self._set_no_thousands_columns() else: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 168666ea21f45..8c213803170a3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -8,17 +8,7 @@ import itertools import os import re -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Hashable, - List, - Optional, - Tuple, - Type, - Union, -) +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -27,7 +17,7 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones -from pandas._typing import ArrayLike, FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries, Label from pandas.compat._optional import import_optional_dependency from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly @@ -2212,7 +2202,7 @@ def __eq__(self, other: Any) -> bool: for a in ["name", "cname", "dtype", "pos"] ) - def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): + def set_data(self, data: ArrayLike): assert data is not None assert self.dtype is None @@ -2392,7 +2382,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): mask = isna(categories) if mask.any(): categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values + codes[codes != -1] -= mask.astype(int).cumsum()._values converted = Categorical.from_codes( codes, categories=categories, ordered=ordered @@ -2811,7 +2801,7 @@ def read_multi_index( levels = [] codes = [] - names: List[Optional[Hashable]] = [] + names: List[Label] = [] for i in range(nlevels): level_key = f"{key}_level{i}" node = getattr(self.group, level_key) @@ -2976,7 +2966,7 @@ class SeriesFixed(GenericFixed): pandas_kind = "series" attributes = ["name"] - name: Optional[Hashable] + name: Label @property def shape(self): @@ -4692,7 +4682,7 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index raise TypeError("MultiIndex not supported here!") inferred_type = lib.infer_dtype(index, skipna=False) - # we wont get inferred_type of "datetime64" or "timedelta64" as these + # we won't get inferred_type of "datetime64" or "timedelta64" as these # would go through the DatetimeIndex/TimedeltaIndex paths above values = np.asarray(index) @@ -4836,7 +4826,9 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) + Series(data.ravel()) + .str.encode(encoding, errors) + ._values.reshape(data.shape) ) # create the sized dtype @@ -4875,7 +4867,7 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data).str.decode(encoding, errors=errors).values + data = Series(data).str.decode(encoding, errors=errors)._values else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -4969,11 +4961,11 @@ def _dtype_to_kind(dtype_str: str) -> str: return kind -def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): +def _get_data_and_dtype_name(data: ArrayLike): """ Convert the passed data into a storable form and a dtype string. 
""" - if is_categorical_dtype(data.dtype): + if isinstance(data, Categorical): data = data.codes # For datetime64tz we need to drop the TZ in tests TODO: why? diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 40fea0aaf0d07..11171af1e0c82 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -120,7 +120,7 @@ cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) cdef: uint8_t cmd - uint16_t ctrl_bits, ctrl_mask = 0, ofs, cnt + uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt int rpos = 0, k uint8_t[:] outbuff = np.zeros(result_length, dtype=np.uint8) Py_ssize_t ipos = 0, length = len(inbuff) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9a53e7cd241e1..560e7e4781cbb 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -705,8 +705,16 @@ def insert_data(self): else: # convert to microsecond resolution for datetime.datetime d = b.values.astype("M8[us]").astype(object) + elif b.is_timedelta: + # numpy converts this to an object array of integers, + # whereas b.astype(object).values would convert to + # object array of Timedeltas + d = b.values.astype(object) else: - d = np.array(b.get_values(), dtype=object) + # TODO(2DEA): astype-first can be avoided with 2D EAs + # astype on the block instead of values to ensure we + # get the right shape + d = b.astype(object).values # replace NaN with None if b._can_hold_na: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6e79f5890f76d..8f3aa60b7a9cc 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -351,10 +351,10 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series: def parse_dates_safe(dates, delta=False, year=False, days=False): d = {} - if is_datetime64_dtype(dates.values): + if is_datetime64_dtype(dates.dtype): if delta: time_delta = dates - stata_epoch - d["delta"] = time_delta.values.astype(np.int64) // 1000 # microseconds + d["delta"] = time_delta._values.astype(np.int64) // 1000 # microseconds if days or year: # ignore since mypy reports that DatetimeIndex has no year/month date_index = DatetimeIndex(dates) @@ -368,7 +368,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): elif infer_dtype(dates, skipna=False) == "datetime": if delta: - delta = dates.values - stata_epoch + delta = dates._values - stata_epoch def f(x: datetime.timedelta) -> float: return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds @@ -377,8 +377,8 @@ def f(x: datetime.timedelta) -> float: d["delta"] = v(delta) if year: year_month = dates.apply(lambda x: 100 * x.year + x.month) - d["year"] = year_month.values // 100 - d["month"] = year_month.values - d["year"] * 100 + d["year"] = year_month._values // 100 + d["month"] = year_month._values - d["year"] * 100 if days: def g(x: datetime.datetime) -> int: @@ -1956,7 +1956,7 @@ def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: if dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? 
- itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max_len_string_array(ensure_object(column._values)) return max(itemsize, 1) elif dtype == np.float64: return 255 @@ -1998,7 +1998,7 @@ def _dtype_to_default_stata_fmt( if force_strl: return "%9s" if dtype.type == np.object_: - itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max_len_string_array(ensure_object(column._values)) if itemsize > max_str_len: if dta_version >= 117: return "%9s" @@ -2151,7 +2151,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame: "It is not possible to export " "int64-based categorical data to Stata." ) - values = data[col].cat.codes.values.copy() + values = data[col].cat.codes._values.copy() # Upcast if needed so that correct missing values can be set if values.max() >= get_base_missing_value(dtype): @@ -2384,7 +2384,7 @@ def _encode_strings(self) -> None: encoded = self.data[col].str.encode(self._encoding) # If larger than _max_string_length do nothing if ( - max_len_string_array(ensure_object(encoded.values)) + max_len_string_array(ensure_object(encoded._values)) <= self._max_string_length ): self.data[col] = encoded @@ -2650,7 +2650,7 @@ def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) if dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? - itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max_len_string_array(ensure_object(column._values)) itemsize = max(itemsize, 1) if itemsize <= 2045: return itemsize diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index e7855068334f7..f2c5032112bc9 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -20,3 +20,4 @@ def inner(): _mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) _mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) _mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) +_mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 0720f544203f7..7319e8de3ec6e 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -260,6 +260,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): import matplotlib.pyplot as plt + # TODO: is the failure mentioned below still relevant? 
# random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 5743288982da4..08d945f679810 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -9,6 +9,8 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.plotting._matplotlib import compat + def format_date_labels(ax, rot): # mini version of autofmt_xdate @@ -288,6 +290,12 @@ def _remove_labels_from_axis(axis): def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): if nplots > 1: + if compat._mpl_ge_3_2_0(): + row_num = lambda x: x.get_subplotspec().rowspan.start + col_num = lambda x: x.get_subplotspec().colspan.start + else: + row_num = lambda x: x.rowNum + col_num = lambda x: x.colNum if nrows > 1: try: @@ -295,13 +303,13 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): # so that we can correctly handle 'gaps" layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool) for ax in axarr: - layout[ax.rowNum, ax.colNum] = ax.get_visible() + layout[row_num(ax), col_num(ax)] = ax.get_visible() for ax in axarr: # only the last row of subplots should get x labels -> all # other off layout handles the case that the subplot is # the last in the column, because below is no subplot/gap. - if not layout[ax.rowNum + 1, ax.colNum]: + if not layout[row_num(ax) + 1, col_num(ax)]: continue if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) diff --git a/pandas/tests/arithmetic/test_array_ops.py b/pandas/tests/arithmetic/test_array_ops.py index d8aaa3183a1c6..53cb10ba9fc5e 100644 --- a/pandas/tests/arithmetic/test_array_ops.py +++ b/pandas/tests/arithmetic/test_array_ops.py @@ -4,7 +4,7 @@ import pytest import pandas._testing as tm -from pandas.core.ops.array_ops import na_logical_op +from pandas.core.ops.array_ops import comparison_op, na_logical_op def test_na_logical_op_2d(): @@ -19,3 +19,18 @@ def test_na_logical_op_2d(): result = na_logical_op(left, right, operator.or_) expected = right tm.assert_numpy_array_equal(result, expected) + + +def test_object_comparison_2d(): + left = np.arange(9).reshape(3, 3).astype(object) + right = left.T + + result = comparison_op(left, right, operator.eq) + expected = np.eye(3).astype(bool) + tm.assert_numpy_array_equal(result, expected) + + # Ensure that cython doesn't raise on non-writeable arg, which + # we can get from np.broadcast_to + right.flags.writeable = False + result = comparison_op(left, right, operator.ne) + tm.assert_numpy_array_equal(result, ~expected) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 3f85ac8c190db..d7c312b2fda1b 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -100,7 +100,7 @@ def interval_constructor(self, request): def elementwise_comparison(self, op, array, other): """ - Helper that performs elementwise comparisions between `array` and `other` + Helper that performs elementwise comparisons between `array` and `other` """ other = other if is_list_like(other) else [other] * len(array) return np.array([op(x, y) for x, y in zip(array, other)]) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index d4baf2f374cdf..202e30287881f 
100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -913,13 +913,13 @@ def test_frame_operators(self, float_frame): # TODO: taken from tests.series.test_operators; needs cleanup def test_series_operators(self): - def _check_op(series, other, op, pos_only=False, check_dtype=True): + def _check_op(series, other, op, pos_only=False): left = np.abs(series) if pos_only else series right = np.abs(other) if pos_only else other cython_or_numpy = op(left, right) python = left.combine(right, op) - tm.assert_series_equal(cython_or_numpy, python, check_dtype=check_dtype) + tm.assert_series_equal(cython_or_numpy, python) def check(series, other): simple_ops = ["add", "sub", "mul", "truediv", "floordiv", "mod"] @@ -942,15 +942,15 @@ def check(series, other): check(tser, tser[::2]) check(tser, 5) - def check_comparators(series, other, check_dtype=True): - _check_op(series, other, operator.gt, check_dtype=check_dtype) - _check_op(series, other, operator.ge, check_dtype=check_dtype) - _check_op(series, other, operator.eq, check_dtype=check_dtype) - _check_op(series, other, operator.lt, check_dtype=check_dtype) - _check_op(series, other, operator.le, check_dtype=check_dtype) + def check_comparators(series, other): + _check_op(series, other, operator.gt) + _check_op(series, other, operator.ge) + _check_op(series, other, operator.eq) + _check_op(series, other, operator.lt) + _check_op(series, other, operator.le) check_comparators(tser, 5) - check_comparators(tser, tser + 1, check_dtype=False) + check_comparators(tser, tser + 1) # TODO: taken from tests.series.test_operators; needs cleanup def test_divmod(self): diff --git a/pandas/tests/arrays/boolean/__init__.py b/pandas/tests/arrays/boolean/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py new file mode 100644 index 0000000000000..df4c218cbf9bf --- /dev/null +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -0,0 +1,42 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.tests.extension.base import BaseOpsUtil + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +class TestArithmeticOps(BaseOpsUtil): + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + with pytest.raises(TypeError): + ops("foo") + with pytest.raises(TypeError): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # TODO(extension) numpy's mul with object array sees booleans as numbers + with pytest.raises(TypeError): + ops(pd.Series("foo", index=s.index)) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + with pytest.raises(NotImplementedError): + opa(np.arange(len(s)).reshape(-1, len(s))) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py new file mode 100644 index 0000000000000..90fe9a6905d40 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_astype(): + # with missing values + arr = pd.array([True, False, None], dtype="boolean") + + with pytest.raises(ValueError, match="cannot 
convert NA to integer"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert float NaN to"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("str") + expected = np.array(["True", "False", ""], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_comparison.py b/pandas/tests/arrays/boolean/test_comparison.py new file mode 100644 index 0000000000000..726b78fbd43bd --- /dev/null +++ b/pandas/tests/arrays/boolean/test_comparison.py @@ -0,0 +1,94 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.tests.extension.base import BaseOpsUtil + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = expected.astype("boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + self._compare_other(data, op_name, True) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, op_name, other) + + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None], dtype="boolean") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + 
tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + + result = op(a, b) + + values = op(a._data, b._data) + mask = a._mask | b._mask + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py new file mode 100644 index 0000000000000..bf1aba190f3e2 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -0,0 +1,376 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values must be a 1D array"): + BooleanArray(values.reshape(1, -1), mask) + + with pytest.raises(ValueError, match="mask must be a 1D array"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) + + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + 
result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), + ([True, np.nan], [True, None]), + ([True, pd.NA], [True, None]), + ([np.nan, np.nan], [None, None]), + (np.array([np.nan, np.nan], dtype=float), [None, None]), + ], +) +def test_to_boolean_array_missing_indicators(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + np.array([1, 2]), + np.array([1.0, 2.0]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + msg = "Need to pass bool-like value" + with pytest.raises(TypeError, match=msg): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_from_integer_array(): + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_float_array(): + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, 
True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + with pytest.raises(ValueError, match="values must be a 1D list-like"): + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="mask must be a 1D list-like"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + msg = ( + "cannot convert to 'bool'-dtype NumPy array with missing values. " + "Specify an appropriate 'na_value' for this dtype." 
+ ) + with pytest.raises(ValueError, match=msg): + np.array(arr, dtype="bool") + + +def test_to_boolean_array_from_strings(): + result = BooleanArray._from_sequence_of_strings( + np.array(["True", "False", np.nan], dtype=object) + ) + expected = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_strings_invalid_string(): + with pytest.raises(ValueError, match="cannot be cast"): + BooleanArray._from_sequence_of_strings(["donkey"]) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype="<U5") + tm.assert_numpy_array_equal(result, expected) + + # no missing values -> can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + +# FIXME: don't leave commented out +# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion +# manually in the indexing code +# def test_indexing_boolean_mask(): +# arr = pd.array([1, 2, 3, 4], dtype="Int64") +# mask = pd.array([True, False, True, False], dtype="boolean") +# result = arr[mask] +# expected = pd.array([1, 3], dtype="Int64") +# 
tm.assert_extension_array_equal(result, expected) + +# # missing values -> error +# mask = pd.array([True, False, True, None], dtype="boolean") +# with pytest.raises(IndexError): +# result = arr[mask] + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + + # TODO use to_numpy(na_value=None) here + data_object = np.array(data, dtype=object) + data_object[data.isna()] = None + expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array([True, False, None], dtype="boolean") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "bool" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.BooleanDtype) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py new file mode 100644 index 0000000000000..c2987dc37b960 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_function.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + with pytest.raises(TypeError): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="boolean") + with 
pytest.raises(NotImplementedError): + np.add.reduce(a) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_diff(): + a = pd.array( + [True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = s.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_indexing.py b/pandas/tests/arrays/boolean/test_indexing.py new file mode 100644 index 0000000000000..6a7daea16963c --- /dev/null +++ b/pandas/tests/arrays/boolean/test_indexing.py @@ -0,0 +1,13 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) +def test_setitem_missing_values(na): + arr = pd.array([True, False, None], dtype="boolean") + expected = pd.array([True, None, None], dtype="boolean") + arr[1] = na + tm.assert_extension_array_equal(arr, expected) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py new file mode 100644 index 0000000000000..6cfe19e2fe3eb --- /dev/null +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -0,0 +1,230 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.tests.extension.base import BaseOpsUtil + + +class TestLogicalOps(BaseOpsUtil): + def test_numpy_scalars_ok(self, all_logical_operators): + a = pd.array([True, False, None], dtype="boolean") + op = getattr(a, all_logical_operators) + + tm.assert_extension_array_equal(op(True), op(np.bool(True))) + tm.assert_extension_array_equal(op(False), op(np.bool(False))) + + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def test_empty_ok(self, all_logical_operators): + a = pd.array([], dtype="boolean") + op_name = all_logical_operators + result = getattr(a, op_name)(True) + tm.assert_extension_array_equal(a, result) + + result = getattr(a, op_name)(False) + tm.assert_extension_array_equal(a, result) + + # TODO: pd.NA + # result = getattr(a, op_name)(pd.NA) + # tm.assert_extension_array_equal(a, result) + + def test_logical_length_mismatch_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Lengths must match to compare" + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)([True, False]) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(np.array([True, False])) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + + def test_logical_nan_raises(self, 
all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Got float instead" + + with pytest.raises(TypeError, match=msg): + getattr(a, op_name)(np.nan) + + @pytest.mark.parametrize("other", ["a", 1]) + def test_non_bool_or_na_other_raises(self, other, all_logical_operators): + a = pd.array([True, False], dtype="boolean") + with pytest.raises(TypeError, match=str(type(other).__name__)): + getattr(a, all_logical_operators)(other) + + def test_kleene_or(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a | b + expected = pd.array( + [True, True, True, True, False, None, True, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [True, None, None]), + (True, [True, True, True]), + (np.bool_(True), [True, True, True]), + (False, [True, False, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_or_scalar(self, other, expected): + # TODO: test True & False + a = pd.array([True, False, None], dtype="boolean") + result = a | other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_and(self): + # A clear test of behavior. 
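+ # Kleene logic: False & anything is False, while True & NA and NA & NA + # stay NA because the missing operand could still decide the result, + # e.g. pd.NA & False is False but pd.NA & True is pd.NA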
+ a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a & b + expected = pd.array( + [True, False, None, False, False, False, None, False, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, False, None]), + (True, [True, False, None]), + (False, [False, False, False]), + (np.bool_(True), [True, False, None]), + (np.bool_(False), [False, False, False]), + ], + ) + def test_kleene_and_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a & other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_xor(self): + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a ^ b + expected = pd.array( + [False, True, None, True, False, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, None, None]), + (True, [False, True, None]), + (np.bool_(True), [False, True, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_xor_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a ^ other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + @pytest.mark.parametrize( + "other", [True, False, pd.NA, [True, False, None] * 3], + ) + def test_no_masked_assumptions(self, other, all_logical_operators): + # The logical operations should not assume that masked values are False! 
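+ # "a" hides arbitrary values behind its mask; it must give the same + # answers as "b", whose missing entries come from None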
+ a = pd.arrays.BooleanArray( + np.array([True, True, True, False, False, False, True, False, True]), + np.array([False] * 6 + [True, True, True]), + ) + b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + if isinstance(other, list): + other = pd.array(other, dtype="boolean") + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + if isinstance(other, BooleanArray): + other._data[other._mask] = True + a._data[a._mask] = False + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_ops.py b/pandas/tests/arrays/boolean/test_ops.py new file mode 100644 index 0000000000000..52f602258a049 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_ops.py @@ -0,0 +1,20 @@ +import pandas as pd +import pandas._testing as tm + + +class TestUnaryOps: + def test_invert(self): + a = pd.array([True, False, None], dtype="boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(~a, expected) + + expected = pd.Series(expected, index=["a", "b", "c"], name="name") + result = ~pd.Series(a, index=["a", "b", "c"], name="name") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"]) + result = ~df + expected = pd.DataFrame( + {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py new file mode 100644 index 0000000000000..7a8146ef14de0 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -0,0 +1,55 @@ +import numpy as np +import pytest + +import pandas as pd + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + ], +) +def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip + + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op in ("sum", "prod"): + assert isinstance(getattr(s, op)(), np.int64) + elif op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) diff --git 
a/pandas/tests/arrays/boolean/test_repr.py b/pandas/tests/arrays/boolean/test_repr.py new file mode 100644 index 0000000000000..0ee904b18cc9e --- /dev/null +++ b/pandas/tests/arrays/boolean/test_repr.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 <NA>" + assert repr(df) == expected + + expected = "0 True\n1 False\n2 <NA>\nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index f49f70f5acf77..b99e172674f66 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -247,7 +247,7 @@ def test_set_categories(self): tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.to_dense(), exp) + tm.assert_numpy_array_equal(np.asarray(c), exp) # all "pointers" to '4' must be changed from 3 to 0,... c = c.set_categories([4, 3, 2, 1]) @@ -260,7 +260,7 @@ def test_set_categories(self): # output is the same exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.to_dense(), exp) + tm.assert_numpy_array_equal(np.asarray(c), exp) assert c.min() == 4 assert c.max() == 1 @@ -268,13 +268,19 @@ def test_set_categories(self): c2 = c.set_categories([4, 3, 2, 1], ordered=False) assert not c2.ordered - tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) # set_categories should pass thru the ordering c2 = c.set_ordered(False).set_categories([4, 3, 2, 1]) assert not c2.ordered - tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) + + def test_to_dense_deprecated(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + + with tm.assert_produces_warning(FutureWarning): + cat.to_dense() @pytest.mark.parametrize( "values, categories, new_categories", diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index c6b4c4904735c..3e31c1acbe09d 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -252,7 +252,7 @@ def test_constructor_not_sequence(self): def test_constructor_with_null(self): # Cannot have NaN in categories - msg = "Categorial categories cannot be null" + msg = "Categorical categories cannot be null" with pytest.raises(ValueError, match=msg): Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) @@ -500,7 +500,7 @@ def test_from_codes_non_unique_categories(self): Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) def test_from_codes_nan_cat_included(self): - with pytest.raises(ValueError, match="Categorial categories cannot be null"): + with pytest.raises(ValueError, match="Categorical categories cannot be null"): Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) def test_from_codes_too_negative(self): diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 3d9469c252914..1cbf64a1529c2 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -146,7 +146,7 @@ def test_periodindex(self): 
tm.assert_numpy_array_equal(cat3._codes, exp_arr) tm.assert_index_equal(cat3.categories, exp_idx) - def test_categories_assigments(self): + def test_categories_assignments(self): s = Categorical(["a", "b", "c", "a"]) exp = np.array([1, 2, 3, 1], dtype=np.int64) s.categories = [1, 2, 3] @@ -154,7 +154,7 @@ tm.assert_index_equal(s.categories, Index([1, 2, 3])) @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) - def test_categories_assigments_wrong_length_raises(self, new_categories): + def test_categories_assignments_wrong_length_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"]) msg = ( "new categories need to have the same number of items " diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 8889f45a84237..9eb3c8b3a8c48 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -82,3 +82,18 @@ def test_fillna_iterable_category(self, named): expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) tm.assert_categorical_equal(result, expected) + + def test_fillna_array(self): + # accept Categorical or ndarray value if it holds appropriate values + cat = Categorical(["A", "B", "C", None, None]) + + other = cat.fillna("C") + result = cat.fillna(other) + tm.assert_categorical_equal(result, other) + assert isna(cat[-1]) # didn't modify original inplace + + other = np.array(["A", "B", "C", "B", "A"]) + result = cat.fillna(other) + expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype) + tm.assert_categorical_equal(result, expected) + assert isna(cat[-1]) # didn't modify original inplace diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 52530123bd52f..b9ac3ce9a37ae --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas as pd @@ -5,44 +6,46 @@ @pytest.mark.parametrize( - "to_replace,value,expected,check_types,check_categorical", + "to_replace,value,expected,flip_categories", [ # one-to-one - (1, 2, [2, 2, 3], True, True), - (1, 4, [4, 2, 3], True, True), - (4, 1, [1, 2, 3], True, True), - (5, 6, [1, 2, 3], True, True), + (1, 2, [2, 2, 3], False), + (1, 4, [4, 2, 3], False), + (4, 1, [1, 2, 3], False), + (5, 6, [1, 2, 3], False), # many-to-one - ([1], 2, [2, 2, 3], True, True), - ([1, 2], 3, [3, 3, 3], True, True), - ([1, 2], 4, [4, 4, 3], True, True), - ((1, 2, 4), 5, [5, 5, 3], True, True), - ((5, 6), 2, [1, 2, 3], True, True), + ([1], 2, [2, 2, 3], False), + ([1, 2], 3, [3, 3, 3], False), + ([1, 2], 4, [4, 4, 3], False), + ((1, 2, 4), 5, [5, 5, 3], False), + ((5, 6), 2, [1, 2, 3], False), # many-to-many, handled outside of Categorical and results in separate dtype - ([1], [2], [2, 2, 3], False, False), - ([1, 4], [5, 2], [5, 2, 3], False, False), + ([1], [2], [2, 2, 3], True), + ([1, 4], [5, 2], [5, 2, 3], True), # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], True, False), - ([1, 2, "3"], "5", ["5", "5", 3], True, False), + (3, "4", [1, 2, "4"], False), + ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) -def test_replace(to_replace, value, expected, check_types, check_categorical): +def test_replace(to_replace, value, expected, flip_categories): # GH 31720 + stays_categorical = not isinstance(value, list) + + s = pd.Series([1, 2, 3], dtype="category") result 
= s.replace(to_replace, value) expected = pd.Series(expected, dtype="category") s.replace(to_replace, value, inplace=True) + + if flip_categories: + expected = expected.cat.set_categories(expected.cat.categories[::-1]) + + if not stays_categorical: + # the replace call loses categorical dtype + expected = pd.Series(np.asarray(expected)) + tm.assert_series_equal( - expected, - result, - check_dtype=check_types, - check_categorical=check_categorical, - check_category_order=False, + expected, result, check_category_order=False, ) tm.assert_series_equal( - expected, - s, - check_dtype=check_types, - check_categorical=check_categorical, - check_category_order=False, + expected, s, check_category_order=False, ) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index baca18239b929..cb3a70e934dcb --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -96,6 +96,22 @@ def test_constructor_na_dtype(self, dtype): with pytest.raises(ValueError, match="Cannot convert"): SparseArray([0, 1, np.nan], dtype=dtype) + def test_constructor_warns_when_losing_timezone(self): + # GH#32501 warn when losing timezone information + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(dti) + + tm.assert_sp_array_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(pd.Series(dti)) + + tm.assert_sp_array_equal(result, expected) + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the @@ -1102,7 +1118,7 @@ def test_nbytes_block(self): arr = SparseArray([1, 2, 0, 0, 0], kind="block") result = arr.nbytes # (2 * 8) + 4 + 4 - # sp_values, blocs, blenghts + # sp_values, blocs, blengths assert result == 24 def test_asarray_datetime64(self): diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index f42b16cf18f20..ad6e6e4a98057 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -222,6 +222,8 @@ def test_array_copy(): # integer ([1, 2], IntegerArray._from_sequence([1, 2])), ([1, None], IntegerArray._from_sequence([1, None])), + ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])), + ([1, np.nan], IntegerArray._from_sequence([1, np.nan])), # string (["a", "b"], StringArray._from_sequence(["a", "b"])), (["a", None], StringArray._from_sequence(["a", None])), diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py deleted file mode 100644 index f4b466f4804c7..0000000000000 --- a/pandas/tests/arrays/test_boolean.py +++ /dev/null @@ -1,936 +0,0 @@ -import operator - -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -import pandas as pd -import pandas._testing as tm -from pandas.arrays import BooleanArray -from pandas.core.arrays.boolean import coerce_to_array -from pandas.tests.extension.base import BaseOpsUtil - - -def make_data(): - return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] - - -@pytest.fixture -def dtype(): - return pd.BooleanDtype() - - -@pytest.fixture -def data(dtype): - return pd.array(make_data(), dtype=dtype) - - -def test_boolean_array_constructor(): - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, 
True], dtype="bool") - - result = BooleanArray(values, mask) - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - with pytest.raises(TypeError, match="values should be boolean numpy array"): - BooleanArray(values.tolist(), mask) - - with pytest.raises(TypeError, match="mask should be boolean numpy array"): - BooleanArray(values, mask.tolist()) - - with pytest.raises(TypeError, match="values should be boolean numpy array"): - BooleanArray(values.astype(int), mask) - - with pytest.raises(TypeError, match="mask should be boolean numpy array"): - BooleanArray(values, None) - - with pytest.raises(ValueError, match="values must be a 1D array"): - BooleanArray(values.reshape(1, -1), mask) - - with pytest.raises(ValueError, match="mask must be a 1D array"): - BooleanArray(values, mask.reshape(1, -1)) - - -def test_boolean_array_constructor_copy(): - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - - result = BooleanArray(values, mask) - assert result._data is values - assert result._mask is mask - - result = BooleanArray(values, mask, copy=True) - assert result._data is not values - assert result._mask is not mask - - -def test_to_boolean_array(): - expected = BooleanArray( - np.array([True, False, True]), np.array([False, False, False]) - ) - - result = pd.array([True, False, True], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([True, False, True]), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - expected = BooleanArray( - np.array([True, False, True]), np.array([False, False, True]) - ) - - result = pd.array([True, False, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_all_none(): - expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) - - result = pd.array([None, None, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "a, b", - [ - ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), - ([True, np.nan], [True, None]), - ([True, pd.NA], [True, None]), - ([np.nan, np.nan], [None, None]), - (np.array([np.nan, np.nan], dtype=float), [None, None]), - ], -) -def test_to_boolean_array_missing_indicators(a, b): - result = pd.array(a, dtype="boolean") - expected = pd.array(b, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "values", - [ - ["foo", "bar"], - ["1", "2"], - # "foo", - [1, 2], - [1.0, 2.0], - pd.date_range("20130101", periods=2), - np.array(["foo"]), - np.array([1, 2]), - np.array([1.0, 2.0]), - [np.nan, {"a": 1}], - ], -) -def test_to_boolean_array_error(values): - # error in converting existing arrays to BooleanArray - msg = "Need to pass bool-like value" - with pytest.raises(TypeError, match=msg): - pd.array(values, dtype="boolean") - - -def test_to_boolean_array_from_integer_array(): - result = pd.array(np.array([1, 0, 1, 0]), 
dtype="boolean") - expected = pd.array([True, False, True, False], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_from_float_array(): - result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") - expected = pd.array([True, False, True, False], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_integer_like(): - # integers of 0's and 1's - result = pd.array([1, 0, 1, 0], dtype="boolean") - expected = pd.array([True, False, True, False], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - result = pd.array([1, 0, 1, None], dtype="boolean") - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_coerce_to_array(): - # TODO this is currently not public API - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - result = BooleanArray(*coerce_to_array(values, mask=mask)) - expected = BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - assert result._data is values - assert result._mask is mask - result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) - expected = BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - assert result._data is not values - assert result._mask is not mask - - # mixed missing from values and mask - values = [True, False, None, False] - mask = np.array([False, False, False, True], dtype="bool") - result = BooleanArray(*coerce_to_array(values, mask=mask)) - expected = BooleanArray( - np.array([True, False, True, True]), np.array([False, False, True, True]) - ) - tm.assert_extension_array_equal(result, expected) - result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) - tm.assert_extension_array_equal(result, expected) - result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) - tm.assert_extension_array_equal(result, expected) - - # raise errors for wrong dimension - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - - with pytest.raises(ValueError, match="values must be a 1D list-like"): - coerce_to_array(values.reshape(1, -1)) - - with pytest.raises(ValueError, match="mask must be a 1D list-like"): - coerce_to_array(values, mask=mask.reshape(1, -1)) - - -def test_coerce_to_array_from_boolean_array(): - # passing BooleanArray to coerce_to_array - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - arr = BooleanArray(values, mask) - result = BooleanArray(*coerce_to_array(arr)) - tm.assert_extension_array_equal(result, arr) - # no copy - assert result._data is arr._data - assert result._mask is arr._mask - - result = BooleanArray(*coerce_to_array(arr), copy=True) - tm.assert_extension_array_equal(result, arr) - assert result._data is not arr._data - assert result._mask is not arr._mask - - with 
pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): - coerce_to_array(arr, mask=mask) - - -def test_coerce_to_numpy_array(): - # with missing values -> object dtype - arr = pd.array([True, False, None], dtype="boolean") - result = np.array(arr) - expected = np.array([True, False, pd.NA], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - # also with no missing values -> object dtype - arr = pd.array([True, False, True], dtype="boolean") - result = np.array(arr) - expected = np.array([True, False, True], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - # force bool dtype - result = np.array(arr, dtype="bool") - expected = np.array([True, False, True], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - # with missing values will raise error - arr = pd.array([True, False, None], dtype="boolean") - msg = ( - "cannot convert to 'bool'-dtype NumPy array with missing values. " - "Specify an appropriate 'na_value' for this dtype." - ) - with pytest.raises(ValueError, match=msg): - np.array(arr, dtype="bool") - - -def test_to_boolean_array_from_strings(): - result = BooleanArray._from_sequence_of_strings( - np.array(["True", "False", np.nan], dtype=object) - ) - expected = BooleanArray( - np.array([True, False, False]), np.array([False, False, True]) - ) - - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_from_strings_invalid_string(): - with pytest.raises(ValueError, match="cannot be cast"): - BooleanArray._from_sequence_of_strings(["donkey"]) - - -def test_repr(): - df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) - expected = " A\n0 True\n1 False\n2 " - assert repr(df) == expected - - expected = "0 True\n1 False\n2 \nName: A, dtype: boolean" - assert repr(df.A) == expected - - expected = "\n[True, False, ]\nLength: 3, dtype: boolean" - assert repr(df.A.array) == expected - - -@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) -def test_to_numpy(box): - con = pd.Series if box else pd.array - # default (with or without missing values) -> object dtype - arr = con([True, False, True], dtype="boolean") - result = arr.to_numpy() - expected = np.array([True, False, True], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - arr = con([True, False, None], dtype="boolean") - result = arr.to_numpy() - expected = np.array([True, False, pd.NA], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - arr = con([True, False, None], dtype="boolean") - result = arr.to_numpy(dtype="str") - expected = np.array([True, False, pd.NA], dtype=" can convert to bool, otherwise raises - arr = con([True, False, True], dtype="boolean") - result = arr.to_numpy(dtype="bool") - expected = np.array([True, False, True], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - arr = con([True, False, None], dtype="boolean") - with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): - result = arr.to_numpy(dtype="bool") - - # specify dtype and na_value - arr = con([True, False, None], dtype="boolean") - result = arr.to_numpy(dtype=object, na_value=None) - expected = np.array([True, False, None], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - result = arr.to_numpy(dtype=bool, na_value=False) - expected = np.array([True, False, False], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - result = arr.to_numpy(dtype="int64", na_value=-99) - expected = np.array([1, 0, -99], dtype="int64") - 
tm.assert_numpy_array_equal(result, expected) - - result = arr.to_numpy(dtype="float64", na_value=np.nan) - expected = np.array([1, 0, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - # converting to int or float without specifying na_value raises - with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): - arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") - - -def test_to_numpy_copy(): - # to_numpy can be zero-copy if no missing values - arr = pd.array([True, False, True], dtype="boolean") - result = arr.to_numpy(dtype=bool) - result[0] = False - tm.assert_extension_array_equal( - arr, pd.array([False, False, True], dtype="boolean") - ) - - arr = pd.array([True, False, True], dtype="boolean") - result = arr.to_numpy(dtype=bool, copy=True) - result[0] = False - tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) - - -def test_astype(): - # with missing values - arr = pd.array([True, False, None], dtype="boolean") - - with pytest.raises(ValueError, match="cannot convert NA to integer"): - arr.astype("int64") - - with pytest.raises(ValueError, match="cannot convert float NaN to"): - arr.astype("bool") - - result = arr.astype("float64") - expected = np.array([1, 0, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - result = arr.astype("str") - expected = np.array(["True", "False", "<NA>"], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - # no missing values - arr = pd.array([True, False, True], dtype="boolean") - result = arr.astype("int64") - expected = np.array([1, 0, 1], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - result = arr.astype("bool") - expected = np.array([True, False, True], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - -def test_astype_to_boolean_array(): - # astype to BooleanArray - arr = pd.array([True, False, None], dtype="boolean") - - result = arr.astype("boolean") - tm.assert_extension_array_equal(result, arr) - result = arr.astype(pd.BooleanDtype()) - tm.assert_extension_array_equal(result, arr) - - -def test_astype_to_integer_array(): - # astype to IntegerArray - arr = pd.array([True, False, None], dtype="boolean") - - result = arr.astype("Int64") - expected = pd.array([1, 0, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) -def test_setitem_missing_values(na): - arr = pd.array([True, False, None], dtype="boolean") - expected = pd.array([True, None, None], dtype="boolean") - arr[1] = na - tm.assert_extension_array_equal(arr, expected) - - -@pytest.mark.parametrize( - "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] -) -def test_ufuncs_binary(ufunc): - # two BooleanArrays - a = pd.array([True, False, None], dtype="boolean") - result = ufunc(a, a) - expected = pd.array(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = ufunc(s, a) - expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_series_equal(result, expected) - - # Boolean with numpy array - arr = np.array([True, True, False]) - result = ufunc(a, arr) - expected = pd.array(ufunc(a._data, arr), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - result = ufunc(arr, a) - 
expected = pd.array(ufunc(arr, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - # BooleanArray with scalar - result = ufunc(a, True) - expected = pd.array(ufunc(a._data, True), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - result = ufunc(True, a) - expected = pd.array(ufunc(True, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - # not handled types - with pytest.raises(TypeError): - ufunc(a, "test") - - -@pytest.mark.parametrize("ufunc", [np.logical_not]) -def test_ufuncs_unary(ufunc): - a = pd.array([True, False, None], dtype="boolean") - result = ufunc(a) - expected = pd.array(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = ufunc(s) - expected = pd.Series(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("values", [[True, False], [True, None]]) -def test_ufunc_reduce_raises(values): - a = pd.array(values, dtype="boolean") - with pytest.raises(NotImplementedError): - np.add.reduce(a) - - -class TestUnaryOps: - def test_invert(self): - a = pd.array([True, False, None], dtype="boolean") - expected = pd.array([False, True, None], dtype="boolean") - tm.assert_extension_array_equal(~a, expected) - - expected = pd.Series(expected, index=["a", "b", "c"], name="name") - result = ~pd.Series(a, index=["a", "b", "c"], name="name") - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"]) - result = ~df - expected = pd.DataFrame( - {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] - ) - tm.assert_frame_equal(result, expected) - - -class TestLogicalOps(BaseOpsUtil): - def test_numpy_scalars_ok(self, all_logical_operators): - a = pd.array([True, False, None], dtype="boolean") - op = getattr(a, all_logical_operators) - - tm.assert_extension_array_equal(op(True), op(np.bool(True))) - tm.assert_extension_array_equal(op(False), op(np.bool(False))) - - def get_op_from_name(self, op_name): - short_opname = op_name.strip("_") - short_opname = short_opname if "xor" in short_opname else short_opname + "_" - try: - op = getattr(operator, short_opname) - except AttributeError: - # Assume it is the reverse operator - rop = getattr(operator, short_opname[1:]) - op = lambda x, y: rop(y, x) - - return op - - def test_empty_ok(self, all_logical_operators): - a = pd.array([], dtype="boolean") - op_name = all_logical_operators - result = getattr(a, op_name)(True) - tm.assert_extension_array_equal(a, result) - - result = getattr(a, op_name)(False) - tm.assert_extension_array_equal(a, result) - - # TODO: pd.NA - # result = getattr(a, op_name)(pd.NA) - # tm.assert_extension_array_equal(a, result) - - def test_logical_length_mismatch_raises(self, all_logical_operators): - op_name = all_logical_operators - a = pd.array([True, False, None], dtype="boolean") - msg = "Lengths must match to compare" - - with pytest.raises(ValueError, match=msg): - getattr(a, op_name)([True, False]) - - with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(np.array([True, False])) - - with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(pd.array([True, False], dtype="boolean")) - - def test_logical_nan_raises(self, all_logical_operators): - op_name = all_logical_operators - a = 
pd.array([True, False, None], dtype="boolean") - msg = "Got float instead" - - with pytest.raises(TypeError, match=msg): - getattr(a, op_name)(np.nan) - - @pytest.mark.parametrize("other", ["a", 1]) - def test_non_bool_or_na_other_raises(self, other, all_logical_operators): - a = pd.array([True, False], dtype="boolean") - with pytest.raises(TypeError, match=str(type(other).__name__)): - getattr(a, all_logical_operators)(other) - - def test_kleene_or(self): - # A clear test of behavior. - a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - result = a | b - expected = pd.array( - [True, True, True, True, False, None, True, None, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - result = b | a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - @pytest.mark.parametrize( - "other, expected", - [ - (pd.NA, [True, None, None]), - (True, [True, True, True]), - (np.bool_(True), [True, True, True]), - (False, [True, False, None]), - (np.bool_(False), [True, False, None]), - ], - ) - def test_kleene_or_scalar(self, other, expected): - # TODO: test True & False - a = pd.array([True, False, None], dtype="boolean") - result = a | other - expected = pd.array(expected, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - result = other | a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - def test_kleene_and(self): - # A clear test of behavior. 
- a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - result = a & b - expected = pd.array( - [True, False, None, False, False, False, None, False, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - result = b & a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - @pytest.mark.parametrize( - "other, expected", - [ - (pd.NA, [None, False, None]), - (True, [True, False, None]), - (False, [False, False, False]), - (np.bool_(True), [True, False, None]), - (np.bool_(False), [False, False, False]), - ], - ) - def test_kleene_and_scalar(self, other, expected): - a = pd.array([True, False, None], dtype="boolean") - result = a & other - expected = pd.array(expected, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - result = other & a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - def test_kleene_xor(self): - a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - result = a ^ b - expected = pd.array( - [False, True, None, True, False, None, None, None, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - result = b ^ a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - @pytest.mark.parametrize( - "other, expected", - [ - (pd.NA, [None, None, None]), - (True, [False, True, None]), - (np.bool_(True), [False, True, None]), - (np.bool_(False), [True, False, None]), - ], - ) - def test_kleene_xor_scalar(self, other, expected): - a = pd.array([True, False, None], dtype="boolean") - result = a ^ other - expected = pd.array(expected, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - result = other ^ a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - @pytest.mark.parametrize( - "other", [True, False, pd.NA, [True, False, None] * 3], - ) - def test_no_masked_assumptions(self, other, all_logical_operators): - # The logical operations should not assume that masked values are False! 
- a = pd.arrays.BooleanArray( - np.array([True, True, True, False, False, False, True, False, True]), - np.array([False] * 6 + [True, True, True]), - ) - b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - if isinstance(other, list): - other = pd.array(other, dtype="boolean") - - result = getattr(a, all_logical_operators)(other) - expected = getattr(b, all_logical_operators)(other) - tm.assert_extension_array_equal(result, expected) - - if isinstance(other, BooleanArray): - other._data[other._mask] = True - a._data[a._mask] = False - - result = getattr(a, all_logical_operators)(other) - expected = getattr(b, all_logical_operators)(other) - tm.assert_extension_array_equal(result, expected) - - -class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, data, op_name, other): - op = self.get_op_from_name(op_name) - - # array - result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other), dtype="boolean") - # propagate NAs - expected[data._mask] = pd.NA - - tm.assert_series_equal(result, expected) - - # series - s = pd.Series(data) - result = op(s, other) - - expected = pd.Series(data._data) - expected = op(expected, other) - expected = expected.astype("boolean") - # propagate NAs - expected[data._mask] = pd.NA - - tm.assert_series_equal(result, expected) - - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, True) - - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.array([True] * len(data), dtype="boolean") - self._compare_other(data, op_name, other) - other = np.array([True] * len(data)) - self._compare_other(data, op_name, other) - other = pd.Series([True] * len(data)) - self._compare_other(data, op_name, other) - - @pytest.mark.parametrize("other", [True, False, pd.NA]) - def test_scalar(self, other, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([True, False, None], dtype="boolean") - - result = op(a, other) - - if other is pd.NA: - expected = pd.array([None, None, None], dtype="boolean") - else: - values = op(a._data, other) - expected = BooleanArray(values, a._mask, copy=True) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = None - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - def test_array(self, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - - result = op(a, b) - - values = op(a._data, b._data) - mask = a._mask | b._mask - expected = BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = None - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - -class TestArithmeticOps(BaseOpsUtil): - def test_error(self, data, all_arithmetic_operators): - # invalid ops - - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) - - # invalid scalars - with pytest.raises(TypeError): - ops("foo") - with pytest.raises(TypeError): - ops(pd.Timestamp("20180101")) - - # invalid array-likes - if op not in ("__mul__", "__rmul__"): - # 
TODO(extension) numpy's mul with object array sees booleans as numbers - with pytest.raises(TypeError): - ops(pd.Series("foo", index=s.index)) - - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - - with pytest.raises(NotImplementedError): - opa(np.arange(len(s)).reshape(-1, len(s))) - - -@pytest.mark.parametrize("dropna", [True, False]) -def test_reductions_return_types(dropna, data, all_numeric_reductions): - op = all_numeric_reductions - s = pd.Series(data) - if dropna: - s = s.dropna() - - if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int64) - elif op in ("min", "max"): - assert isinstance(getattr(s, op)(), np.bool_) - else: - # "mean", "std", "var", "median", "kurt", "skew" - assert isinstance(getattr(s, op)(), np.float64) - - -@pytest.mark.parametrize( - "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", - [ - ([True, pd.NA], True, True, True, pd.NA), - ([False, pd.NA], False, False, pd.NA, False), - ([pd.NA], False, True, pd.NA, pd.NA), - ([], False, True, False, True), - ], -) -def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): - # the methods return numpy scalars - exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) - exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) - exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) - exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) - - for con in [pd.array, pd.Series]: - a = con(values, dtype="boolean") - assert a.any() is exp_any - assert a.all() is exp_all - assert a.any(skipna=False) is exp_any_noskip - assert a.all(skipna=False) is exp_all_noskip - - assert np.any(a.any()) is exp_any - assert np.all(a.all()) is exp_all - - -# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion -# manually in the indexing code -# def test_indexing_boolean_mask(): -# arr = pd.array([1, 2, 3, 4], dtype="Int64") -# mask = pd.array([True, False, True, False], dtype="boolean") -# result = arr[mask] -# expected = pd.array([1, 3], dtype="Int64") -# tm.assert_extension_array_equal(result, expected) - -# # missing values -> error -# mask = pd.array([True, False, True, None], dtype="boolean") -# with pytest.raises(IndexError): -# result = arr[mask] - - -@td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(data): - # protocol added in 0.15.0 - import pyarrow as pa - - arr = pa.array(data) - - # TODO use to_numpy(na_value=None) here - data_object = np.array(data, dtype=object) - data_object[data.isna()] = None - expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) - assert arr.equals(expected) - - -@td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_roundtrip(): - # roundtrip possible from arrow 1.0.0 - import pyarrow as pa - - data = pd.array([True, False, None], dtype="boolean") - df = pd.DataFrame({"a": data}) - table = pa.table(df) - assert table.field("a").type == "bool" - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.BooleanDtype) - tm.assert_frame_equal(result, df) - - -def test_value_counts_na(): - arr = pd.array([True, False, pd.NA], dtype="boolean") - result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") - tm.assert_series_equal(result, expected) - - result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Int64") - tm.assert_series_equal(result, expected) - - -def test_diff(): - a = pd.array( - 
[True, True, False, False, True, None, True, None, False], dtype="boolean" - ) - result = pd.core.algorithms.diff(a, 1) - expected = pd.array( - [None, False, True, False, True, None, None, None, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = s.diff() - expected = pd.Series(expected) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 17818b6ce689f..e505917da1dc4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -65,8 +65,8 @@ def test_compare_len1_raises(self): # to the case where one has length-1, which numpy would broadcast data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.array_cls._simple_new(data, freq="D") - arr = self.index_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls(arr) with pytest.raises(ValueError, match="Lengths must match"): arr == arr[:1] @@ -240,6 +240,23 @@ def test_inplace_arithmetic(self): arr -= pd.Timedelta(days=1) tm.assert_equal(arr, expected) + def test_shift_fill_int_deprecated(self): + # GH#31971 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = arr.shift(1, fill_value=1) + + expected = arr.copy() + if self.array_cls is PeriodArray: + fill_val = PeriodArray._scalar_type._from_ordinal(1, freq=arr.freq) + else: + fill_val = arr._scalar_type(1) + expected[0] = fill_val + expected[1:] = arr[:-1] + tm.assert_equal(result, expected) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex @@ -687,10 +704,10 @@ def test_array_interface(self, period_index): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) - # to other dtypes - with pytest.raises(TypeError): - np.asarray(arr, dtype="int64") + result = np.asarray(arr, dtype="int64") + tm.assert_numpy_array_equal(result, arr.asi8) + # to other dtypes with pytest.raises(TypeError): np.asarray(arr, dtype="float64") diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index a59ed429cc404..7d80ad3d8c6be 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -89,11 +89,26 @@ def test_non_array_raises(self): with pytest.raises(ValueError, match="list"): DatetimeArray([1, 2, 3]) - def test_other_type_raises(self): + def test_bool_dtype_raises(self): + arr = np.array([1, 2, 3], dtype="bool") + with pytest.raises( ValueError, match="The dtype of 'values' is incorrect.*bool" ): - DatetimeArray(np.array([1, 2, 3], dtype="bool")) + DatetimeArray(arr) + + msg = r"dtype bool cannot be converted to datetime64\[ns\]" + with pytest.raises(TypeError, match=msg): + DatetimeArray._from_sequence(arr) + + with pytest.raises(TypeError, match=msg): + sequence_to_dt64ns(arr) + + with pytest.raises(TypeError, match=msg): + pd.DatetimeIndex(arr) + + with pytest.raises(TypeError, match=msg): + pd.to_datetime(arr) def test_incorrect_dtype_raises(self): with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): @@ -151,6 +166,18 @@ def test_astype_to_same(self): result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) assert result is arr + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "datetime64[ns, UTC]"]) + @pytest.mark.parametrize( + "other", ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, 
CET]"] + ) + def test_astype_copies(self, dtype, other): + # https://github.com/pandas-dev/pandas/pull/32490 + s = pd.Series([1, 2], dtype=dtype) + orig = s.copy() + t = s.astype(other) + t[:] = pd.NaT + tm.assert_series_equal(s, orig) + @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")]) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 0a5a2362bd290..70a029bd74bda 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -633,6 +633,15 @@ def test_astype_specific_casting(self, dtype): expected = pd.Series([1, 2, 3, None], dtype=dtype) tm.assert_series_equal(result, expected) + def test_astype_dt64(self): + # GH#32435 + arr = pd.array([1, 2, 3, pd.NA]) * 10 ** 9 + + result = arr.astype("datetime64[ns]") + + expected = np.array([1, 2, 3, "NaT"], dtype="M8[s]").astype("M8[ns]") + tm.assert_numpy_array_equal(result, expected) + def test_construct_cast_invalid(self, dtype): msg = "cannot safely" diff --git a/pandas/tests/base/common.py b/pandas/tests/base/common.py new file mode 100644 index 0000000000000..b09710a974c2a --- /dev/null +++ b/pandas/tests/base/common.py @@ -0,0 +1,9 @@ +from typing import Any + +from pandas import Index + + +def allow_na_ops(obj: Any) -> bool: + """Whether to skip test cases including NaN""" + is_bool_index = isinstance(obj, Index) and obj.is_boolean() + return not is_bool_index and obj._can_hold_na diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 46fd1551e6170..59f9103072fe9 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -220,34 +220,6 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_equal(l_values, r_values) -@pytest.mark.parametrize( - "array, expected", - [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), - (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"]), - np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), - ), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), - np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), - ), - (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), - ( - pd.PeriodIndex(["2017", "2018"], freq="D"), - np.array([17167, 17532], dtype=np.int64), - ), - ], -) -def test_ndarray_values(array, expected): - l_values = pd.Series(array)._ndarray_values - r_values = pd.Index(array)._ndarray_values - tm.assert_numpy_array_equal(l_values, r_values) - tm.assert_numpy_array_equal(l_values, expected) - - @pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): ser = pd.Series(arr) diff --git a/pandas/tests/base/test_drop_duplicates.py b/pandas/tests/base/test_drop_duplicates.py new file mode 100644 index 0000000000000..4032890b4db18 --- /dev/null +++ b/pandas/tests/base/test_drop_duplicates.py @@ -0,0 +1,30 @@ +from datetime import datetime + +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_drop_duplicates_series_vs_dataframe(): + # GH 14192 + df = pd.DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 
2, 1), + pd.NaT, + pd.NaT, + ], + } + ) + for column in df.columns: + for keep in ["first", "last", False]: + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py new file mode 100644 index 0000000000000..415a8b7e4362f --- /dev/null +++ b/pandas/tests/base/test_factorize.py @@ -0,0 +1,28 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("sort", [True, False]) +def test_factorize(index_or_series_obj, sort): + obj = index_or_series_obj + result_codes, result_uniques = obj.factorize(sort=sort) + + constructor = pd.Index + if isinstance(obj, pd.MultiIndex): + constructor = pd.MultiIndex.from_tuples + expected_uniques = constructor(obj.unique()) + + if sort: + expected_uniques = expected_uniques.sort_values() + + # construct an integer ndarray so that + # `expected_uniques.take(expected_codes)` is equal to `obj` + expected_uniques_list = list(expected_uniques) + expected_codes = [expected_uniques_list.index(val) for val in obj] + expected_codes = np.asarray(expected_codes, dtype=np.intp) + + tm.assert_numpy_array_equal(result_codes, expected_codes) + tm.assert_index_equal(result_uniques, expected_uniques) diff --git a/pandas/tests/base/test_fillna.py b/pandas/tests/base/test_fillna.py new file mode 100644 index 0000000000000..5e50a9e2d1c7f --- /dev/null +++ b/pandas/tests/base/test_fillna.py @@ -0,0 +1,70 @@ +""" +Though Index.fillna and Series.fillna have separate implementations, +test here to confirm that they work the same +""" + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.common import needs_i8_conversion +from pandas.core.dtypes.generic import ABCMultiIndex + +from pandas import Index +import pandas._testing as tm +from pandas.tests.base.common import allow_na_ops + + +def test_fillna(index_or_series_obj): + # GH 11343 + obj = index_or_series_obj + if isinstance(obj, ABCMultiIndex): + pytest.skip("MultiIndex doesn't support isna") + + # values will not be changed + fill_value = obj.values[0] if len(obj) > 0 else 0 + result = obj.fillna(fill_value) + if isinstance(obj, Index): + tm.assert_index_equal(obj, result) + else: + tm.assert_series_equal(obj, result) + + # check shallow_copied + assert obj is not result + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_fillna_null(null_obj, index_or_series_obj): + # GH 11343 + obj = index_or_series_obj + klass = type(obj) + + if not allow_na_ops(obj): + pytest.skip(f"{klass} doesn't allow for NA operations") + elif len(obj) < 1: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(obj, ABCMultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + fill_value = values[0] + expected = values.copy() + if needs_i8_conversion(obj): + values[0:2] = iNaT + expected[0:2] = fill_value + else: + values[0:2] = null_obj + expected[0:2] = fill_value + + expected = klass(expected) + obj = klass(values) + + result = obj.fillna(fill_value) + if isinstance(obj, Index): + tm.assert_index_equal(result, expected) + else: + tm.assert_series_equal(result, expected) + + # check shallow_copied + assert obj is not result diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py new file mode 100644 index 0000000000000..6bab60f05ce89 --- /dev/null +++ 
b/pandas/tests/base/test_misc.py @@ -0,0 +1,204 @@ +import sys + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_object_dtype, +) + +import pandas as pd +from pandas import DataFrame, Index, IntervalIndex, Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "op_name, op", + [ + ("add", "+"), + ("sub", "-"), + ("mul", "*"), + ("mod", "%"), + ("pow", "**"), + ("truediv", "/"), + ("floordiv", "//"), + ], +) +@pytest.mark.parametrize("klass", [Series, DataFrame]) +def test_binary_ops_docstring(klass, op_name, op): + # not using the all_arithmetic_functions fixture with _get_opstr + # as _get_opstr is used internally in the dynamic implementation of the docstring + operand1 = klass.__name__.lower() + operand2 = "other" + expected_str = " ".join([operand1, op, operand2]) + assert expected_str in getattr(klass, op_name).__doc__ + + # reverse version of the binary ops + expected_str = " ".join([operand2, op, operand1]) + assert expected_str in getattr(klass, "r" + op_name).__doc__ + + +def test_none_comparison(series_with_simple_index): + series = series_with_simple_index + if isinstance(series.index, IntervalIndex): + # IntervalIndex breaks on "series[0] = np.nan" below + pytest.skip("IntervalIndex doesn't support assignment") + if len(series) < 1: + pytest.skip("Test doesn't make sense on empty data") + + # bug brought up by #1079 + # changed from TypeError in 0.17.0 + series[0] = np.nan + + # noinspection PyComparisonWithNone + result = series == None # noqa + assert not result.iat[0] + assert not result.iat[1] + + # noinspection PyComparisonWithNone + result = series != None # noqa + assert result.iat[0] + assert result.iat[1] + + result = None == series # noqa + assert not result.iat[0] + assert not result.iat[1] + + result = None != series # noqa + assert result.iat[0] + assert result.iat[1] + + if is_datetime64_dtype(series) or is_datetime64tz_dtype(series): + # Following DatetimeIndex (and Timestamp) convention, + # inequality comparisons with Series[datetime64] raise + msg = "Invalid comparison" + with pytest.raises(TypeError, match=msg): + None > series + with pytest.raises(TypeError, match=msg): + series > None + else: + result = None > series + assert not result.iat[0] + assert not result.iat[1] + + result = series < None + assert not result.iat[0] + assert not result.iat[1] + + +def test_ndarray_compat_properties(index_or_series_obj): + obj = index_or_series_obj + + # Check that we work. 
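# That is: the ndarray-compat attributes probed below must exist on both
# Index and Series, while the ndarray-only properties were dropped from
# the pandas API. A quick standalone sketch (illustrative):
import pandas as pd

s = pd.Series([1, 2, 3])
print(s.shape, s.dtype, s.nbytes, s.ndim)  # (3,) int64 24 1
print(hasattr(s, "strides"))               # False -- use s.to_numpy().strides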
+ for p in ["shape", "dtype", "T", "nbytes"]: + assert getattr(obj, p, None) is not None + + # deprecated properties + for p in ["flags", "strides", "itemsize", "base", "data"]: + assert not hasattr(obj, p) + + msg = "can only convert an array of size 1 to a Python scalar" + with pytest.raises(ValueError, match=msg): + obj.item() # len > 1 + + assert obj.ndim == 1 + assert obj.size == len(obj) + + assert Index([1]).item() == 1 + assert Series([1]).item() == 1 + + +@pytest.mark.skipif(PYPY, reason="not relevant for PyPy") +def test_memory_usage(index_or_series_obj): + obj = index_or_series_obj + res = obj.memory_usage() + res_deep = obj.memory_usage(deep=True) + + is_object = is_object_dtype(obj) or ( + isinstance(obj, Series) and is_object_dtype(obj.index) + ) + is_categorical = is_categorical_dtype(obj) or ( + isinstance(obj, Series) and is_categorical_dtype(obj.index) + ) + + if len(obj) == 0: + assert res_deep == res == 0 + elif is_object or is_categorical: + # only deep will pick them up + assert res_deep > res + else: + assert res == res_deep + + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = res_deep - sys.getsizeof(obj) + assert abs(diff) < 100 + + +def test_memory_usage_components_series(series_with_simple_index): + series = series_with_simple_index + total_usage = series.memory_usage(index=True) + non_index_usage = series.memory_usage(index=False) + index_usage = series.index.memory_usage() + assert total_usage == non_index_usage + index_usage + + +def test_memory_usage_components_narrow_series(narrow_series): + series = narrow_series + total_usage = series.memory_usage(index=True) + non_index_usage = series.memory_usage(index=False) + index_usage = series.index.memory_usage() + assert total_usage == non_index_usage + index_usage + + +def test_searchsorted(index_or_series_obj): + # numpy.searchsorted calls obj.searchsorted under the hood. 
+ # See gh-12238 + obj = index_or_series_obj + + if isinstance(obj, pd.MultiIndex): + # See gh-14833 + pytest.skip("np.searchsorted doesn't work on pd.MultiIndex") + + max_obj = max(obj, default=0) + index = np.searchsorted(obj, max_obj) + assert 0 <= index <= len(obj) + + index = np.searchsorted(obj, max_obj, sorter=range(len(obj))) + assert 0 <= index <= len(obj) + + +def test_access_by_position(indices): + index = indices + + if len(index) == 0: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(index, pd.MultiIndex): + pytest.skip("Can't instantiate Series from MultiIndex") + + series = pd.Series(index) + assert index[0] == series.iloc[0] + assert index[5] == series.iloc[5] + assert index[-1] == series.iloc[-1] + + size = len(index) + assert index[-1] == index[size - 1] + + msg = f"index {size} is out of bounds for axis 0 with size {size}" + with pytest.raises(IndexError, match=msg): + index[size] + msg = "single positional indexer is out-of-bounds" + with pytest.raises(IndexError, match=msg): + series.iloc[size] + + +def test_get_indexer_non_unique_dtype_mismatch(): + # GH 25459 + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py deleted file mode 100644 index 8f48d0a3e8378..0000000000000 --- a/pandas/tests/base/test_ops.py +++ /dev/null @@ -1,875 +0,0 @@ -import collections -from datetime import datetime, timedelta -from io import StringIO -import sys -from typing import Any - -import numpy as np -import pytest - -from pandas._libs.tslib import iNaT -from pandas.compat import PYPY -from pandas.compat.numpy import np_array_datetime64_compat - -from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_object_dtype, - needs_i8_conversion, -) - -import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - Index, - Interval, - IntervalIndex, - Series, - Timedelta, - TimedeltaIndex, -) -import pandas._testing as tm - - -def allow_na_ops(obj: Any) -> bool: - """Whether to skip test cases including NaN""" - is_bool_index = isinstance(obj, Index) and obj.is_boolean() - return not is_bool_index and obj._can_hold_na - - -class Ops: - def setup_method(self, method): - self.bool_index = tm.makeBoolIndex(10, name="a") - self.int_index = tm.makeIntIndex(10, name="a") - self.float_index = tm.makeFloatIndex(10, name="a") - self.dt_index = tm.makeDateIndex(10, name="a") - self.dt_tz_index = tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern") - self.period_index = tm.makePeriodIndex(10, name="a") - self.string_index = tm.makeStringIndex(10, name="a") - self.unicode_index = tm.makeUnicodeIndex(10, name="a") - - arr = np.random.randn(10) - self.bool_series = Series(arr, index=self.bool_index, name="a") - self.int_series = Series(arr, index=self.int_index, name="a") - self.float_series = Series(arr, index=self.float_index, name="a") - self.dt_series = Series(arr, index=self.dt_index, name="a") - self.dt_tz_series = self.dt_tz_index.to_series() - self.period_series = Series(arr, index=self.period_index, name="a") - self.string_series = Series(arr, index=self.string_index, name="a") - self.unicode_series = Series(arr, index=self.unicode_index, name="a") - - types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] - self.indexes = [getattr(self, 
f"{t}_index") for t in types] - self.series = [getattr(self, f"{t}_series") for t in types] - - # To test narrow dtypes, we use narrower *data* elements, not *index* elements - index = self.int_index - self.float32_series = Series(arr.astype(np.float32), index=index, name="a") - - arr_int = np.random.choice(10, size=10, replace=False) - self.int8_series = Series(arr_int.astype(np.int8), index=index, name="a") - self.int16_series = Series(arr_int.astype(np.int16), index=index, name="a") - self.int32_series = Series(arr_int.astype(np.int32), index=index, name="a") - - self.uint8_series = Series(arr_int.astype(np.uint8), index=index, name="a") - self.uint16_series = Series(arr_int.astype(np.uint16), index=index, name="a") - self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a") - - nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"] - self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types] - - self.objs = self.indexes + self.series + self.narrow_series - - -@pytest.mark.parametrize( - "op_name, op", - [ - ("add", "+"), - ("sub", "-"), - ("mul", "*"), - ("mod", "%"), - ("pow", "**"), - ("truediv", "/"), - ("floordiv", "//"), - ], -) -@pytest.mark.parametrize("klass", [Series, DataFrame]) -def test_binary_ops(klass, op_name, op): - # not using the all_arithmetic_functions fixture with _get_opstr - # as _get_opstr is used internally in the dynamic implementation of the docstring - operand1 = klass.__name__.lower() - operand2 = "other" - expected_str = " ".join([operand1, op, operand2]) - assert expected_str in getattr(klass, op_name).__doc__ - - # reverse version of the binary ops - expected_str = " ".join([operand2, op, operand1]) - assert expected_str in getattr(klass, "r" + op_name).__doc__ - - -class TestTranspose: - errmsg = "the 'axes' parameter is not supported" - - def test_transpose(self, index_or_series_obj): - obj = index_or_series_obj - tm.assert_equal(obj.transpose(), obj) - - def test_transpose_non_default_axes(self, index_or_series_obj): - obj = index_or_series_obj - with pytest.raises(ValueError, match=self.errmsg): - obj.transpose(1) - with pytest.raises(ValueError, match=self.errmsg): - obj.transpose(axes=1) - - def test_numpy_transpose(self, index_or_series_obj): - obj = index_or_series_obj - tm.assert_equal(np.transpose(obj), obj) - - with pytest.raises(ValueError, match=self.errmsg): - np.transpose(obj, axes=1) - - -class TestIndexOps(Ops): - def setup_method(self, method): - super().setup_method(method) - self.is_valid_objs = self.objs - self.not_valid_objs = [] - - def test_none_comparison(self, series_with_simple_index): - series = series_with_simple_index - if isinstance(series.index, IntervalIndex): - # IntervalIndex breaks on "series[0] = np.nan" below - pytest.skip("IntervalIndex doesn't support assignment") - if len(series) < 1: - pytest.skip("Test doesn't make sense on empty data") - - # bug brought up by #1079 - # changed from TypeError in 0.17.0 - series[0] = np.nan - - # noinspection PyComparisonWithNone - result = series == None # noqa - assert not result.iat[0] - assert not result.iat[1] - - # noinspection PyComparisonWithNone - result = series != None # noqa - assert result.iat[0] - assert result.iat[1] - - result = None == series # noqa - assert not result.iat[0] - assert not result.iat[1] - - result = None != series # noqa - assert result.iat[0] - assert result.iat[1] - - if is_datetime64_dtype(series) or is_datetime64tz_dtype(series): - # Following DatetimeIndex (and Timestamp) convention, - # 
inequality comparisons with Series[datetime64] raise - msg = "Invalid comparison" - with pytest.raises(TypeError, match=msg): - None > series - with pytest.raises(TypeError, match=msg): - series > None - else: - result = None > series - assert not result.iat[0] - assert not result.iat[1] - - result = series < None - assert not result.iat[0] - assert not result.iat[1] - - def test_ndarray_compat_properties(self, index_or_series_obj): - obj = index_or_series_obj - - # Check that we work. - for p in ["shape", "dtype", "T", "nbytes"]: - assert getattr(obj, p, None) is not None - - # deprecated properties - for p in ["flags", "strides", "itemsize", "base", "data"]: - assert not hasattr(obj, p) - - msg = "can only convert an array of size 1 to a Python scalar" - with pytest.raises(ValueError, match=msg): - obj.item() # len > 1 - - assert obj.ndim == 1 - assert obj.size == len(obj) - - assert Index([1]).item() == 1 - assert Series([1]).item() == 1 - - def test_unique(self, index_or_series_obj): - obj = index_or_series_obj - obj = np.repeat(obj, range(1, len(obj) + 1)) - result = obj.unique() - - # dict.fromkeys preserves the order - unique_values = list(dict.fromkeys(obj.values)) - if isinstance(obj, pd.MultiIndex): - expected = pd.MultiIndex.from_tuples(unique_values) - expected.names = obj.names - tm.assert_index_equal(result, expected) - elif isinstance(obj, pd.Index): - expected = pd.Index(unique_values, dtype=obj.dtype) - if is_datetime64tz_dtype(obj): - expected = expected.normalize() - tm.assert_index_equal(result, expected) - else: - expected = np.array(unique_values) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("null_obj", [np.nan, None]) - def test_unique_null(self, null_obj, index_or_series_obj): - obj = index_or_series_obj - - if not allow_na_ops(obj): - pytest.skip("type doesn't allow for NA operations") - elif len(obj) < 1: - pytest.skip("Test doesn't make sense on empty data") - elif isinstance(obj, pd.MultiIndex): - pytest.skip(f"MultiIndex can't hold '{null_obj}'") - - values = obj.values - if needs_i8_conversion(obj): - values[0:2] = iNaT - else: - values[0:2] = null_obj - - klass = type(obj) - repeated_values = np.repeat(values, range(1, len(values) + 1)) - obj = klass(repeated_values, dtype=obj.dtype) - result = obj.unique() - - unique_values_raw = dict.fromkeys(obj.values) - # because np.nan == np.nan is False, but None == None is True - # np.nan would be duplicated, whereas None wouldn't - unique_values_not_null = [ - val for val in unique_values_raw if not pd.isnull(val) - ] - unique_values = [null_obj] + unique_values_not_null - - if isinstance(obj, pd.Index): - expected = pd.Index(unique_values, dtype=obj.dtype) - if is_datetime64tz_dtype(obj): - result = result.normalize() - expected = expected.normalize() - elif isinstance(obj, pd.CategoricalIndex): - expected = expected.set_categories(unique_values_not_null) - tm.assert_index_equal(result, expected) - else: - expected = np.array(unique_values, dtype=obj.dtype) - tm.assert_numpy_array_equal(result, expected) - - def test_nunique(self, index_or_series_obj): - obj = index_or_series_obj - obj = np.repeat(obj, range(1, len(obj) + 1)) - expected = len(obj.unique()) - assert obj.nunique(dropna=False) == expected - - @pytest.mark.parametrize("null_obj", [np.nan, None]) - def test_nunique_null(self, null_obj, index_or_series_obj): - obj = index_or_series_obj - - if not allow_na_ops(obj): - pytest.skip("type doesn't allow for NA operations") - elif isinstance(obj, pd.MultiIndex): - 
pytest.skip(f"MultiIndex can't hold '{null_obj}'") - - values = obj.values - if needs_i8_conversion(obj): - values[0:2] = iNaT - else: - values[0:2] = null_obj - - klass = type(obj) - repeated_values = np.repeat(values, range(1, len(values) + 1)) - obj = klass(repeated_values, dtype=obj.dtype) - - if isinstance(obj, pd.CategoricalIndex): - assert obj.nunique() == len(obj.categories) - assert obj.nunique(dropna=False) == len(obj.categories) + 1 - else: - num_unique_values = len(obj.unique()) - assert obj.nunique() == max(0, num_unique_values - 1) - assert obj.nunique(dropna=False) == max(0, num_unique_values) - - def test_value_counts(self, index_or_series_obj): - obj = index_or_series_obj - obj = np.repeat(obj, range(1, len(obj) + 1)) - result = obj.value_counts() - - counter = collections.Counter(obj) - expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) - expected.index = expected.index.astype(obj.dtype) - if isinstance(obj, pd.MultiIndex): - expected.index = pd.Index(expected.index) - - # sort_index to avoid switched order when values share the same count - result = result.sort_index() - expected = expected.sort_index() - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("null_obj", [np.nan, None]) - def test_value_counts_null(self, null_obj, index_or_series_obj): - orig = index_or_series_obj - obj = orig.copy() - - if not allow_na_ops(obj): - pytest.skip("type doesn't allow for NA operations") - elif len(obj) < 1: - pytest.skip("Test doesn't make sense on empty data") - elif isinstance(orig, pd.MultiIndex): - pytest.skip(f"MultiIndex can't hold '{null_obj}'") - - values = obj.values - if needs_i8_conversion(obj): - values[0:2] = iNaT - else: - values[0:2] = null_obj - - klass = type(obj) - repeated_values = np.repeat(values, range(1, len(values) + 1)) - obj = klass(repeated_values, dtype=obj.dtype) - - # because np.nan == np.nan is False, but None == None is True - # np.nan would be duplicated, whereas None wouldn't - counter = collections.Counter(obj.dropna()) - expected = pd.Series(dict(counter.most_common()), dtype=np.int64) - expected.index = expected.index.astype(obj.dtype) - - tm.assert_series_equal(obj.value_counts(), expected) - - # can't use expected[null_obj] = 3 as - # IntervalIndex doesn't allow assignment - new_entry = pd.Series({np.nan: 3}, dtype=np.int64) - expected = expected.append(new_entry) - tm.assert_series_equal(obj.value_counts(dropna=False), expected) - - def test_value_counts_inferred(self, index_or_series): - klass = index_or_series - s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] - s = klass(s_values) - expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(np.unique(np.array(s_values, dtype=np.object_))) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) - - assert s.nunique() == 4 - # don't sort, have to sort after the fact as not sorting is - # platform-dep - hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() - tm.assert_series_equal(hist, expected) - - # sort ascending - hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list("cdab")) - tm.assert_series_equal(hist, expected) - - # relative histogram. 
- hist = s.value_counts(normalize=True) - expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) - tm.assert_series_equal(hist, expected) - - def test_value_counts_bins(self, index_or_series): - klass = index_or_series - s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] - s = klass(s_values) - - # bins - msg = "bins argument only works with numeric data" - with pytest.raises(TypeError, match=msg): - s.value_counts(bins=1) - - s1 = Series([1, 1, 2, 3]) - res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) - tm.assert_series_equal(res1, exp1) - res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) - tm.assert_series_equal(res1n, exp1n) - - if isinstance(s1, Index): - tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) - else: - exp = np.array([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(s1.unique(), exp) - - assert s1.nunique() == 3 - - # these return the same - res4 = s1.value_counts(bins=4, dropna=True) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4 = s1.value_counts(bins=4, dropna=False) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4n, exp4n) - - # handle NA's properly - s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] - s = klass(s_values) - expected = Series([4, 3, 2], index=["b", "a", "d"]) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(["a", "b", np.nan, "d"]) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.array(["a", "b", np.nan, "d"], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) - assert s.nunique() == 3 - - s = klass({}) if klass is dict else klass({}, dtype=object) - expected = Series([], dtype=np.int64) - tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) - # returned dtype differs depending on original - if isinstance(s, Index): - tm.assert_index_equal(s.unique(), Index([]), exact=False) - else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) - - assert s.nunique() == 0 - - def test_value_counts_datetime64(self, index_or_series): - klass = index_or_series - - # GH 3002, datetime64[ns] - # don't test names though - txt = "\n".join( - [ - "xxyyzz20100101PIE", - "xxyyzz20100101GUM", - "xxyyzz20100101EGG", - "xxyyww20090101EGG", - "foofoo20080909PIE", - "foofoo20080909GUM", - ] - ) - f = StringIO(txt) - df = pd.read_fwf( - f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] - ) - - s = klass(df["dt"].copy()) - s.name = None - idx = pd.to_datetime( - ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] - ) - expected_s = Series([3, 2, 1], index=idx) - tm.assert_series_equal(s.value_counts(), expected_s) - - expected = np_array_datetime64_compat( - ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], - dtype="datetime64[ns]", - ) - if isinstance(s, Index): - tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) - else: - tm.assert_numpy_array_equal(s.unique(), expected) - - assert s.nunique() == 3 - - # with NaT - s = df["dt"].copy() - s = klass(list(s.values) + 
[pd.NaT]) - - result = s.value_counts() - assert result.index.dtype == "datetime64[ns]" - tm.assert_series_equal(result, expected_s) - - result = s.value_counts(dropna=False) - expected_s[pd.NaT] = 1 - tm.assert_series_equal(result, expected_s) - - unique = s.unique() - assert unique.dtype == "datetime64[ns]" - - # numpy_array_equal cannot compare pd.NaT - if isinstance(s, Index): - exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) - tm.assert_index_equal(unique, exp_idx) - else: - tm.assert_numpy_array_equal(unique[:3], expected) - assert pd.isna(unique[3]) - - assert s.nunique() == 3 - assert s.nunique(dropna=False) == 4 - - # timedelta64[ns] - td = df.dt - df.dt + timedelta(1) - td = klass(td, name="dt") - - result = td.value_counts() - expected_s = Series([6], index=[Timedelta("1day")], name="dt") - tm.assert_series_equal(result, expected_s) - - expected = TimedeltaIndex(["1 days"], name="dt") - if isinstance(td, Index): - tm.assert_index_equal(td.unique(), expected) - else: - tm.assert_numpy_array_equal(td.unique(), expected.values) - - td2 = timedelta(1) + (df.dt - df.dt) - td2 = klass(td2, name="dt") - result2 = td2.value_counts() - tm.assert_series_equal(result2, expected_s) - - def test_factorize(self): - for orig in self.objs: - o = orig.copy() - - if isinstance(o, Index) and o.is_boolean(): - exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp) - exp_uniques = o - exp_uniques = Index([False, True]) - else: - exp_arr = np.array(range(len(o)), dtype=np.intp) - exp_uniques = o - codes, uniques = o.factorize() - - tm.assert_numpy_array_equal(codes, exp_arr) - if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig), check_names=False) - else: - # factorize explicitly resets name - tm.assert_index_equal(uniques, exp_uniques, check_names=False) - - def test_factorize_repeated(self): - for orig in self.objs: - o = orig.copy() - - # don't test boolean - if isinstance(o, Index) and o.is_boolean(): - continue - - # sort by value, and create duplicates - if isinstance(o, Series): - o = o.sort_values() - n = o.iloc[5:].append(o) - else: - indexer = o.argsort() - o = o.take(indexer) - n = o[5:].append(o) - - exp_arr = np.array( - [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp - ) - codes, uniques = n.factorize(sort=True) - - tm.assert_numpy_array_equal(codes, exp_arr) - if isinstance(o, Series): - tm.assert_index_equal( - uniques, Index(orig).sort_values(), check_names=False - ) - else: - tm.assert_index_equal(uniques, o, check_names=False) - - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) - codes, uniques = n.factorize(sort=False) - tm.assert_numpy_array_equal(codes, exp_arr) - - if isinstance(o, Series): - expected = Index(o.iloc[5:10].append(o.iloc[:5])) - tm.assert_index_equal(uniques, expected, check_names=False) - else: - expected = o[5:10].append(o[:5]) - tm.assert_index_equal(uniques, expected, check_names=False) - - def test_duplicated_drop_duplicates_index(self): - # GH 4060 - for original in self.objs: - if isinstance(original, Index): - - # special case - if original.is_boolean(): - result = original.drop_duplicates() - expected = Index([False, True], name="a") - tm.assert_index_equal(result, expected) - continue - - # original doesn't have duplicates - expected = np.array([False] * len(original), dtype=bool) - duplicated = original.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = original.drop_duplicates() - tm.assert_index_equal(result, original) - assert 
result is not original - - # has_duplicates - assert not original.has_duplicates - - # create repeated values, 3rd and 5th values are duplicated - idx = original[list(range(len(original))) + [5, 3]] - expected = np.array([False] * len(original) + [True, True], dtype=bool) - duplicated = idx.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - tm.assert_index_equal(idx.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep="last") - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep="last") - tm.assert_index_equal(result, idx[~expected]) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep=False) - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep=False) - tm.assert_index_equal(result, idx[~expected]) - - with pytest.raises( - TypeError, - match=r"drop_duplicates\(\) got an unexpected keyword argument", - ): - idx.drop_duplicates(inplace=True) - - else: - expected = Series( - [False] * len(original), index=original.index, name="a" - ) - tm.assert_series_equal(original.duplicated(), expected) - result = original.drop_duplicates() - tm.assert_series_equal(result, original) - assert result is not original - - idx = original.index[list(range(len(original))) + [5, 3]] - values = original._values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx, name="a") - - expected = Series( - [False] * len(original) + [True, True], index=idx, name="a" - ) - tm.assert_series_equal(s.duplicated(), expected) - tm.assert_series_equal(s.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep="last"), expected) - tm.assert_series_equal( - s.drop_duplicates(keep="last"), s[~np.array(base)] - ) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep=False), expected) - tm.assert_series_equal( - s.drop_duplicates(keep=False), s[~np.array(base)] - ) - - s.drop_duplicates(inplace=True) - tm.assert_series_equal(s, original) - - def test_drop_duplicates_series_vs_dataframe(self): - # GH 14192 - df = pd.DataFrame( - { - "a": [1, 1, 1, "one", "one"], - "b": [2, 2, np.nan, np.nan, np.nan], - "c": [3, 3, np.nan, np.nan, "three"], - "d": [1, 2, 3, 4, 4], - "e": [ - datetime(2015, 1, 1), - datetime(2015, 1, 1), - datetime(2015, 2, 1), - pd.NaT, - pd.NaT, - ], - } - ) - for column in df.columns: - for keep in ["first", "last", False]: - dropped_frame = df[[column]].drop_duplicates(keep=keep) - dropped_series = df[column].drop_duplicates(keep=keep) - tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) - - def test_fillna(self): - # # GH 11343 - # though Index.fillna and Series.fillna has separate impl, - # test here to confirm these works as the same - - for orig in self.objs: - - o = orig.copy() - values = o.values - - # values will not be changed - result = o.fillna(o.astype(object).values[0]) - if isinstance(o, Index): - tm.assert_index_equal(o, result) - else: - tm.assert_series_equal(o, result) - # check shallow_copied - assert o is not result - - for null_obj in [np.nan, 
None]: - for orig in self.objs: - o = orig.copy() - klass = type(o) - - if not allow_na_ops(o): - continue - - if needs_i8_conversion(o): - - values = o.astype(object).values - fill_value = values[0] - values[0:2] = pd.NaT - else: - values = o.values.copy() - fill_value = o.values[0] - values[0:2] = null_obj - - expected = [fill_value] * 2 + list(values[2:]) - - expected = klass(expected, dtype=orig.dtype) - o = klass(values) - - # check values has the same dtype as the original - assert o.dtype == orig.dtype - - result = o.fillna(fill_value) - if isinstance(o, Index): - tm.assert_index_equal(result, expected) - else: - tm.assert_series_equal(result, expected) - # check shallow_copied - assert o is not result - - @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") - def test_memory_usage(self, index_or_series_obj): - obj = index_or_series_obj - res = obj.memory_usage() - res_deep = obj.memory_usage(deep=True) - - is_object = is_object_dtype(obj) or ( - isinstance(obj, Series) and is_object_dtype(obj.index) - ) - is_categorical = is_categorical_dtype(obj) or ( - isinstance(obj, Series) and is_categorical_dtype(obj.index) - ) - - if len(obj) == 0: - assert res_deep == res == 0 - elif is_object or is_categorical: - # only deep will pick them up - assert res_deep > res - else: - assert res == res_deep - - # sys.getsizeof will call the .memory_usage with - # deep=True, and add on some GC overhead - diff = res_deep - sys.getsizeof(obj) - assert abs(diff) < 100 - - def test_memory_usage_components_series(self, series_with_simple_index): - series = series_with_simple_index - total_usage = series.memory_usage(index=True) - non_index_usage = series.memory_usage(index=False) - index_usage = series.index.memory_usage() - assert total_usage == non_index_usage + index_usage - - def test_memory_usage_components_narrow_series(self, narrow_series): - series = narrow_series - total_usage = series.memory_usage(index=True) - non_index_usage = series.memory_usage(index=False) - index_usage = series.index.memory_usage() - assert total_usage == non_index_usage + index_usage - - def test_searchsorted(self, index_or_series_obj): - # numpy.searchsorted calls obj.searchsorted under the hood. 
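# (An aside on the memory_usage checks above -- a minimal sketch: for
# object dtype, only deep=True also walks the boxed values themselves:)
import pandas as pd

s = pd.Series(["a", "bb", "ccc"])
print(s.memory_usage(deep=True) > s.memory_usage())  # True -- strings counted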
- # See gh-12238 - obj = index_or_series_obj - - if isinstance(obj, pd.MultiIndex): - # See gh-14833 - pytest.skip("np.searchsorted doesn't work on pd.MultiIndex") - - max_obj = max(obj, default=0) - index = np.searchsorted(obj, max_obj) - assert 0 <= index <= len(obj) - - index = np.searchsorted(obj, max_obj, sorter=range(len(obj))) - assert 0 <= index <= len(obj) - - def test_access_by_position(self, indices): - index = indices - - if len(index) == 0: - pytest.skip("Test doesn't make sense on empty data") - elif isinstance(index, pd.MultiIndex): - pytest.skip("Can't instantiate Series from MultiIndex") - - series = pd.Series(index) - assert index[0] == series.iloc[0] - assert index[5] == series.iloc[5] - assert index[-1] == series.iloc[-1] - - size = len(index) - assert index[-1] == index[size - 1] - - msg = f"index {size} is out of bounds for axis 0 with size {size}" - with pytest.raises(IndexError, match=msg): - index[size] - msg = "single positional indexer is out-of-bounds" - with pytest.raises(IndexError, match=msg): - series.iloc[size] - - @pytest.mark.parametrize("indexer_klass", [list, pd.Index]) - @pytest.mark.parametrize( - "indexer", - [ - [True] * 10, - [False] * 10, - [True, False, True, True, False, False, True, True, False, True], - ], - ) - def test_bool_indexing(self, indexer_klass, indexer): - # GH 22533 - for idx in self.indexes: - exp_idx = [i for i in range(len(indexer)) if indexer[i]] - tm.assert_index_equal(idx[indexer_klass(indexer)], idx[exp_idx]) - s = pd.Series(idx) - tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) - - def test_get_indexer_non_unique_dtype_mismatch(self): - # GH 25459 - indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) - tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) diff --git a/pandas/tests/base/test_transpose.py b/pandas/tests/base/test_transpose.py new file mode 100644 index 0000000000000..5ba278368834c --- /dev/null +++ b/pandas/tests/base/test_transpose.py @@ -0,0 +1,27 @@ +import numpy as np +import pytest + +import pandas._testing as tm + + +def test_transpose(index_or_series_obj): + obj = index_or_series_obj + tm.assert_equal(obj.transpose(), obj) + + +def test_transpose_non_default_axes(index_or_series_obj): + msg = "the 'axes' parameter is not supported" + obj = index_or_series_obj + with pytest.raises(ValueError, match=msg): + obj.transpose(1) + with pytest.raises(ValueError, match=msg): + obj.transpose(axes=1) + + +def test_numpy_transpose(index_or_series_obj): + msg = "the 'axes' parameter is not supported" + obj = index_or_series_obj + tm.assert_equal(np.transpose(obj), obj) + + with pytest.raises(ValueError, match=msg): + np.transpose(obj, axes=1) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py new file mode 100644 index 0000000000000..c6225c9b5ca64 --- /dev/null +++ b/pandas/tests/base/test_unique.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.common import is_datetime64tz_dtype, needs_i8_conversion + +import pandas as pd +import pandas._testing as tm +from pandas.tests.base.common import allow_na_ops + + +def test_unique(index_or_series_obj): + obj = index_or_series_obj + obj = np.repeat(obj, range(1, len(obj) + 1)) + result = obj.unique() + + # dict.fromkeys preserves the order + unique_values = list(dict.fromkeys(obj.values)) + if isinstance(obj, pd.MultiIndex): + expected = 
pd.MultiIndex.from_tuples(unique_values) + expected.names = obj.names + tm.assert_index_equal(result, expected) + elif isinstance(obj, pd.Index): + expected = pd.Index(unique_values, dtype=obj.dtype) + if is_datetime64tz_dtype(obj): + expected = expected.normalize() + tm.assert_index_equal(result, expected) + else: + expected = np.array(unique_values) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_unique_null(null_obj, index_or_series_obj): + obj = index_or_series_obj + + if not allow_na_ops(obj): + pytest.skip("type doesn't allow for NA operations") + elif len(obj) < 1: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(obj, pd.MultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + if needs_i8_conversion(obj): + values[0:2] = iNaT + else: + values[0:2] = null_obj + + klass = type(obj) + repeated_values = np.repeat(values, range(1, len(values) + 1)) + obj = klass(repeated_values, dtype=obj.dtype) + result = obj.unique() + + unique_values_raw = dict.fromkeys(obj.values) + # because np.nan == np.nan is False, but None == None is True + # np.nan would be duplicated, whereas None wouldn't + unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)] + unique_values = [null_obj] + unique_values_not_null + + if isinstance(obj, pd.Index): + expected = pd.Index(unique_values, dtype=obj.dtype) + if is_datetime64tz_dtype(obj): + result = result.normalize() + expected = expected.normalize() + elif isinstance(obj, pd.CategoricalIndex): + expected = expected.set_categories(unique_values_not_null) + tm.assert_index_equal(result, expected) + else: + expected = np.array(unique_values, dtype=obj.dtype) + tm.assert_numpy_array_equal(result, expected) + + +def test_nunique(index_or_series_obj): + obj = index_or_series_obj + obj = np.repeat(obj, range(1, len(obj) + 1)) + expected = len(obj.unique()) + assert obj.nunique(dropna=False) == expected + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_nunique_null(null_obj, index_or_series_obj): + obj = index_or_series_obj + + if not allow_na_ops(obj): + pytest.skip("type doesn't allow for NA operations") + elif isinstance(obj, pd.MultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + if needs_i8_conversion(obj): + values[0:2] = iNaT + else: + values[0:2] = null_obj + + klass = type(obj) + repeated_values = np.repeat(values, range(1, len(values) + 1)) + obj = klass(repeated_values, dtype=obj.dtype) + + if isinstance(obj, pd.CategoricalIndex): + assert obj.nunique() == len(obj.categories) + assert obj.nunique(dropna=False) == len(obj.categories) + 1 + else: + num_unique_values = len(obj.unique()) + assert obj.nunique() == max(0, num_unique_values - 1) + assert obj.nunique(dropna=False) == max(0, num_unique_values) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py new file mode 100644 index 0000000000000..d45feaff68dde --- /dev/null +++ b/pandas/tests/base/test_value_counts.py @@ -0,0 +1,276 @@ +import collections +from datetime import timedelta +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT +from pandas.compat.numpy import np_array_datetime64_compat + +from pandas.core.dtypes.common import needs_i8_conversion + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + Interval, + IntervalIndex, + Series, + Timedelta, + TimedeltaIndex, +) +import 
pandas._testing as tm +from pandas.tests.base.common import allow_na_ops + + +def test_value_counts(index_or_series_obj): + obj = index_or_series_obj + obj = np.repeat(obj, range(1, len(obj) + 1)) + result = obj.value_counts() + + counter = collections.Counter(obj) + expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) + expected.index = expected.index.astype(obj.dtype) + if isinstance(obj, pd.MultiIndex): + expected.index = pd.Index(expected.index) + + # TODO: Order of entries with the same count is inconsistent on CI (gh-32449) + if obj.duplicated().any(): + result = result.sort_index() + expected = expected.sort_index() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_value_counts_null(null_obj, index_or_series_obj): + orig = index_or_series_obj + obj = orig.copy() + + if not allow_na_ops(obj): + pytest.skip("type doesn't allow for NA operations") + elif len(obj) < 1: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(orig, pd.MultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + if needs_i8_conversion(obj): + values[0:2] = iNaT + else: + values[0:2] = null_obj + + klass = type(obj) + repeated_values = np.repeat(values, range(1, len(values) + 1)) + obj = klass(repeated_values, dtype=obj.dtype) + + # because np.nan == np.nan is False, but None == None is True + # np.nan would be duplicated, whereas None wouldn't + counter = collections.Counter(obj.dropna()) + expected = pd.Series(dict(counter.most_common()), dtype=np.int64) + expected.index = expected.index.astype(obj.dtype) + + result = obj.value_counts() + if obj.duplicated().any(): + # TODO: + # Order of entries with the same count is inconsistent on CI (gh-32449) + expected = expected.sort_index() + result = result.sort_index() + tm.assert_series_equal(result, expected) + + # can't use expected[null_obj] = 3 as + # IntervalIndex doesn't allow assignment + new_entry = pd.Series({np.nan: 3}, dtype=np.int64) + expected = expected.append(new_entry) + + result = obj.value_counts(dropna=False) + if obj.duplicated().any(): + # TODO: + # Order of entries with the same count is inconsistent on CI (gh-32449) + expected = expected.sort_index() + result = result.sort_index() + tm.assert_series_equal(result, expected) + + +def test_value_counts_inferred(index_or_series): + klass = index_or_series + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] + s = klass(s_values) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + + assert s.nunique() == 4 + # don't sort, have to sort after the fact as not sorting is + # platform-dep + hist = s.value_counts(sort=False).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() + tm.assert_series_equal(hist, expected) + + # sort ascending + hist = s.value_counts(ascending=True) + expected = Series([1, 2, 3, 4], index=list("cdab")) + tm.assert_series_equal(hist, expected) + + # relative histogram. 
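# A standalone sketch of how normalize interacts with missing values: NaN
# is dropped by default, so frequencies sum to 1 over the non-null entries,
# while dropna=False renormalizes over everything:
import numpy as np
import pandas as pd

s = pd.Series([1.0, 1.0, np.nan])
print(s.value_counts(normalize=True))                # 1.0 -> 1.0
print(s.value_counts(normalize=True, dropna=False))  # 1.0 -> 2/3, NaN -> 1/3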
+ hist = s.value_counts(normalize=True) + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) + tm.assert_series_equal(hist, expected) + + +def test_value_counts_bins(index_or_series): + klass = index_or_series + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] + s = klass(s_values) + + # bins + msg = "bins argument only works with numeric data" + with pytest.raises(TypeError, match=msg): + s.value_counts(bins=1) + + s1 = Series([1, 1, 2, 3]) + res1 = s1.value_counts(bins=1) + exp1 = Series({Interval(0.997, 3.0): 4}) + tm.assert_series_equal(res1, exp1) + res1n = s1.value_counts(bins=1, normalize=True) + exp1n = Series({Interval(0.997, 3.0): 1.0}) + tm.assert_series_equal(res1n, exp1n) + + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + + assert s1.nunique() == 3 + + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4n = s1.value_counts(bins=4, normalize=True) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4n, exp4n) + + # handle NA's properly + s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] + s = klass(s_values) + expected = Series([4, 3, 2], index=["b", "a", "d"]) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(["a", "b", np.nan, "d"]) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(["a", "b", np.nan, "d"], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) + assert s.nunique() == 3 + + s = klass({}) if klass is dict else klass({}, dtype=object) + expected = Series([], dtype=np.int64) + tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) + # returned dtype differs depending on original + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), Index([]), exact=False) + else: + tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) + + assert s.nunique() == 0 + + +def test_value_counts_datetime64(index_or_series): + klass = index_or_series + + # GH 3002, datetime64[ns] + # don't test names though + txt = "\n".join( + [ + "xxyyzz20100101PIE", + "xxyyzz20100101GUM", + "xxyyzz20100101EGG", + "xxyyww20090101EGG", + "foofoo20080909PIE", + "foofoo20080909GUM", + ] + ) + f = StringIO(txt) + df = pd.read_fwf( + f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] + ) + + s = klass(df["dt"].copy()) + s.name = None + idx = pd.to_datetime( + ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] + ) + expected_s = Series([3, 2, 1], index=idx) + tm.assert_series_equal(s.value_counts(), expected_s) + + expected = np_array_datetime64_compat( + ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], + dtype="datetime64[ns]", + ) + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) + else: + tm.assert_numpy_array_equal(s.unique(), expected) + + assert s.nunique() == 3 + + # with NaT + s = df["dt"].copy() + s = klass(list(s.values) + [pd.NaT]) + + 
result = s.value_counts() + assert result.index.dtype == "datetime64[ns]" + tm.assert_series_equal(result, expected_s) + + result = s.value_counts(dropna=False) + expected_s[pd.NaT] = 1 + tm.assert_series_equal(result, expected_s) + + unique = s.unique() + assert unique.dtype == "datetime64[ns]" + + # numpy_array_equal cannot compare pd.NaT + if isinstance(s, Index): + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + tm.assert_index_equal(unique, exp_idx) + else: + tm.assert_numpy_array_equal(unique[:3], expected) + assert pd.isna(unique[3]) + + assert s.nunique() == 3 + assert s.nunique(dropna=False) == 4 + + # timedelta64[ns] + td = df.dt - df.dt + timedelta(1) + td = klass(td, name="dt") + + result = td.value_counts() + expected_s = Series([6], index=[Timedelta("1day")], name="dt") + tm.assert_series_equal(result, expected_s) + + expected = TimedeltaIndex(["1 days"], name="dt") + if isinstance(td, Index): + tm.assert_index_equal(td.unique(), expected) + else: + tm.assert_numpy_array_equal(td.unique(), expected.values) + + td2 = timedelta(1) + (df.dt - df.dt) + td2 = klass(td2, name="dt") + result2 = td2.value_counts() + tm.assert_series_equal(result2, expected_s) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index a240e6cef5930..08d8d5ca342b7 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -375,7 +375,8 @@ def check_pow(self, lhs, arith1, rhs): and is_scalar(rhs) and _is_py3_complex_incompat(result, expected) ): - with pytest.raises(AssertionError): + msg = "(DataFrame.columns|numpy array) are different" + with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(result, expected) else: tm.assert_almost_equal(result, expected) @@ -449,16 +450,19 @@ def test_frame_invert(self): # float always raises lhs = DataFrame(randn(5, 2)) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert_dd'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "ufunc 'invert' not supported for the input types" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) # int raises on numexpr lhs = DataFrame(randint(5, size=(5, 2))) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = ~lhs @@ -474,10 +478,11 @@ def test_frame_invert(self): # object raises lhs = DataFrame({"b": ["a", 1, 2.0], "c": rand(3) > 0.5}) if self.engine == "numexpr": - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="unknown type object"): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "bad operand type for unary ~: 'str'" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_series_invert(self): @@ -488,16 +493,19 @@ def test_series_invert(self): # float raises lhs = Series(randn(5)) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert_dd'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with 
pytest.raises(TypeError): + msg = "ufunc 'invert' not supported for the input types" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) # int raises on numexpr lhs = Series(randint(5, size=5)) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = ~lhs @@ -517,10 +525,11 @@ def test_series_invert(self): # object lhs = Series(["a", 1, 2.0]) if self.engine == "numexpr": - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="unknown type object"): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "bad operand type for unary ~: 'str'" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_frame_negate(self): @@ -541,7 +550,8 @@ def test_frame_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = DataFrame(rand(5, 2) > 0.5) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'neg_bb'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = -lhs @@ -566,7 +576,8 @@ def test_series_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = Series(rand(5) > 0.5) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'neg_bb'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = -lhs @@ -610,7 +621,8 @@ def test_series_pos(self, lhs): tm.assert_series_equal(expect, result) def test_scalar_unary(self): - with pytest.raises(TypeError): + msg = "bad operand type for unary ~: 'float'" + with pytest.raises(TypeError, match=msg): pd.eval("~1.0", engine=self.engine, parser=self.parser) assert pd.eval("-1.0", parser=self.parser, engine=self.engine) == -1.0 @@ -671,7 +683,8 @@ def test_disallow_scalar_bool_ops(self): x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) # noqa for ex in exprs: - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops|'BoolOp' nodes are not" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, engine=self.engine, parser=self.parser) def test_identical(self): @@ -772,7 +785,8 @@ def setup_ops(self): def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): ex1 = f"lhs {cmp1} mid {cmp2} rhs" - with pytest.raises(NotImplementedError): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex1, engine=self.engine, parser=self.parser) @@ -1183,7 +1197,8 @@ def test_bool_ops_with_constants(self): def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) - with pytest.raises(NotImplementedError): + msg = "N-dimensional objects, where N > 2, are not supported with eval" + with pytest.raises(NotImplementedError, match=msg): self.eval("x + y", local_dict={"x": x, "y": y}) def test_constant(self): @@ -1232,7 +1247,7 @@ def test_truediv(self): def test_failing_subscript_with_name_error(self): df = DataFrame(np.random.randn(5, 3)) # noqa - with pytest.raises(NameError): + with pytest.raises(NameError, match="name 'x' is not defined"): self.eval("df[x > 2] > 2") def 
test_lhs_expression_subscript(self): @@ -1379,7 +1394,8 @@ def test_multi_line_expression(self): assert ans is None # multi-line not valid if not all assignments - with pytest.raises(ValueError): + msg = "Multi-line expressions are only valid if all expressions contain" + with pytest.raises(ValueError, match=msg): df.eval( """ a = b + 2 @@ -1474,7 +1490,8 @@ def test_assignment_in_query(self): # GH 8664 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() - with pytest.raises(ValueError): + msg = "cannot assign without a target object" + with pytest.raises(ValueError, match=msg): df.query("a = 1") tm.assert_frame_equal(df, df_orig) @@ -1593,19 +1610,21 @@ def test_simple_in_ops(self): ) assert res else: - with pytest.raises(NotImplementedError): + msg = "'In' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): - pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): pd.eval( "[(3,)] in (1, 2, [(3,)])", engine=self.engine, parser=self.parser ) - with pytest.raises(NotImplementedError): + msg = "'NotIn' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) + with pytest.raises(NotImplementedError, match=msg): pd.eval( "[3] not in (1, 2, [[3]])", engine=self.engine, parser=self.parser ) @@ -1664,13 +1683,15 @@ def test_fails_not(self): def test_fails_ampersand(self): df = DataFrame(np.random.randn(5, 3)) # noqa ex = "(df + 2)[df > 1] > 0 & (df > 0)" - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, parser=self.parser, engine=self.engine) def test_fails_pipe(self): df = DataFrame(np.random.randn(5, 3)) # noqa ex = "(df + 2)[df > 1] > 0 | (df > 0)" - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, parser=self.parser, engine=self.engine) def test_bool_ops_with_constants(self): @@ -1679,7 +1700,8 @@ def test_bool_ops_with_constants(self): ): ex = f"{lhs} {op} {rhs}" if op in ("and", "or"): - with pytest.raises(NotImplementedError): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): self.eval(ex) else: res = self.eval(ex) @@ -1690,7 +1712,8 @@ def test_simple_bool_ops(self): for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): ex = f"lhs {op} rhs" if op in ("and", "or"): - with pytest.raises(NotImplementedError): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, engine=self.engine, parser=self.parser) else: res = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -1902,19 +1925,21 @@ def test_disallowed_nodes(engine, parser): inst = VisitorClass("x + 1", engine, parser) for ops in uns_ops: - with pytest.raises(NotImplementedError): + msg = "nodes are not implemented" + with 
pytest.raises(NotImplementedError, match=msg): getattr(inst, ops)() def test_syntax_error_exprs(engine, parser): e = "s +" - with pytest.raises(SyntaxError): + with pytest.raises(SyntaxError, match="invalid syntax"): pd.eval(e, engine=engine, parser=parser) def test_name_error_exprs(engine, parser): e = "s + t" - with pytest.raises(NameError): + msg = "name 's' is not defined" + with pytest.raises(NameError, match=msg): pd.eval(e, engine=engine, parser=parser) @@ -1973,7 +1998,8 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): ex2 = f"lhs {cmp} mid and mid {cmp} rhs" ex3 = f"(lhs {cmp} mid) & (mid {cmp} rhs)" for ex in (ex1, ex2, ex3): - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops|'BoolOp' nodes are not" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, engine=engine, parser=parser) @@ -2029,7 +2055,8 @@ def test_negate_lt_eq_le(engine, parser): tm.assert_frame_equal(result, expected) if parser == "python": - with pytest.raises(NotImplementedError): + msg = "'Not' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): df.query("not (cat > 0)", engine=engine, parser=parser) else: result = df.query("not (cat > 0)", engine=engine, parser=parser) @@ -2041,5 +2068,6 @@ def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: - with pytest.raises(ValueError): + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): pd.eval("2+2", inplace=value) diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index cc823a3d6e02c..ed272cef3e7ba 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -15,6 +15,4 @@ def test_cast_1d_array_like_from_scalar_categorical(): expected = Categorical(["a", "a"], categories=cats) result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type) - tm.assert_categorical_equal( - result, expected, check_category_order=True, check_dtype=True - ) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index bb7a7d059c7ee..f9227a4e78a79 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -12,7 +12,7 @@ def test_upcast_error(result): # GH23823 require result arg to be ndarray mask = np.array([False, True, False]) other = np.array([61, 62, 63]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="The result input must be a ndarray"): result, _ = maybe_upcast_putmask(result, mask, other) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8da2797835080..66bf696cbe912 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -281,18 +281,6 @@ def test_is_string_dtype(): assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) -def test_is_period_arraylike(): - assert not com.is_period_arraylike([1, 2, 3]) - assert not com.is_period_arraylike(pd.Index([1, 2, 3])) - assert com.is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D")) - - -def test_is_datetime_arraylike(): - assert not com.is_datetime_arraylike([1, 2, 3]) - assert not com.is_datetime_arraylike(pd.Index([1, 2, 3])) - assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) - - integer_dtypes: List = [] diff 
--git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 55b1ac819049d..658d27160e3e1 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -361,7 +361,7 @@ def test_hash_vs_equality(self, dtype): assert hash(dtype) == hash(dtype3) def test_construction(self): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid frequency: xx"): PeriodDtype("xx") for s in ["period[D]", "Period[D]", "D"]: @@ -414,21 +414,25 @@ def test_construction_from_string(self, dtype): assert is_dtype_equal(dtype, result) result = PeriodDtype.construct_from_string("period[D]") assert is_dtype_equal(dtype, result) - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("foo") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("period[foo]") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("foo[D]") - - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("datetime64[ns]") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") with pytest.raises(TypeError, match="list"): PeriodDtype.construct_from_string([1, 2, 3]) + @pytest.mark.parametrize( + "string", + [ + "foo", + "period[foo]", + "foo[D]", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) + def test_construct_dtype_from_string_invalid_raises(self, string): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + with pytest.raises(TypeError, match=re.escape(msg)): + PeriodDtype.construct_from_string(string) + def test_is_dtype(self, dtype): assert PeriodDtype.is_dtype(dtype) assert PeriodDtype.is_dtype("period[D]") @@ -475,7 +479,8 @@ def test_basic(self, dtype): def test_empty(self): dt = PeriodDtype() - with pytest.raises(AttributeError): + msg = "object has no attribute 'freqstr'" + with pytest.raises(AttributeError, match=msg): str(dt) def test_not_string(self): @@ -764,11 +769,13 @@ def test_order_hashes_different(self, v1, v2): assert c1 is not c3 def test_nan_invalid(self): - with pytest.raises(ValueError): + msg = "Categorical categories cannot be null" + with pytest.raises(ValueError, match=msg): CategoricalDtype([1, 2, np.nan]) def test_non_unique_invalid(self): - with pytest.raises(ValueError): + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): CategoricalDtype([1, 2, 1]) def test_same_categories_different_order(self): diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 2c8631ac2d71d..f9ee943d9e6bf 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -37,13 +37,10 @@ def test_abc_types(self): assert isinstance(self.df, gt.ABCDataFrame) assert isinstance(self.sparse_array, gt.ABCSparseArray) assert isinstance(self.categorical, gt.ABCCategorical) - assert isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCPeriod) assert isinstance(pd.DateOffset(), gt.ABCDateOffset) assert isinstance(pd.Period("2012", freq="A-DEC").freq, gt.ABCDateOffset) assert not isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCDateOffset) - assert isinstance(pd.Interval(0, 1.5), gt.ABCInterval) - assert not isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCInterval) assert isinstance(self.datetime_array, gt.ABCDatetimeArray) assert not isinstance(self.datetime_index, gt.ABCDatetimeArray) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 48ae1f67297af..82d6b1df19393 100644 --- 
a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -16,7 +16,7 @@ import pytest import pytz -from pandas._libs import iNaT, lib, missing as libmissing +from pandas._libs import lib, missing as libmissing import pandas.util._test_decorators as td from pandas.core.dtypes import inference @@ -50,7 +50,6 @@ Timedelta, TimedeltaIndex, Timestamp, - isna, ) import pandas._testing as tm from pandas.core.arrays import IntegerArray @@ -507,6 +506,13 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) + def test_convert_numeric_string_uint64(self): + # GH32394 + result = lib.maybe_convert_numeric( + np.array(["uint64"], dtype=object), set(), coerce_numeric=True + ) + assert np.isnan(result) + @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 @@ -568,6 +574,13 @@ def test_maybe_convert_objects_nullable_integer(self, exp): tm.assert_extension_array_equal(result, exp) + def test_maybe_convert_objects_bool_nan(self): + # GH32146 + ind = pd.Index([True, False, np.nan], dtype=object) + exp = np.array([True, False, np.nan], dtype=object) + out = lib.maybe_convert_objects(ind.values, safe=1) + tm.assert_numpy_array_equal(out, exp) + def test_mixed_dtypes_remain_object_array(self): # GH14956 array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) @@ -1423,6 +1436,7 @@ def test_is_scalar_pandas_scalars(self): assert is_scalar(Period("2014-01-01")) assert is_scalar(Interval(left=0, right=1)) assert is_scalar(DateOffset(days=1)) + assert is_scalar(pd.offsets.Minute(3)) def test_is_scalar_pandas_containers(self): assert not is_scalar(Series(dtype=object)) @@ -1431,6 +1445,11 @@ def test_is_scalar_pandas_containers(self): assert not is_scalar(DataFrame([[1]])) assert not is_scalar(Index([])) assert not is_scalar(Index([1])) + assert not is_scalar(Categorical([])) + assert not is_scalar(DatetimeIndex([])._data) + assert not is_scalar(TimedeltaIndex([])._data) + assert not is_scalar(DatetimeIndex([])._data.to_period("D")) + assert not is_scalar(pd.array([1, 2, 3])) def test_is_scalar_number(self): # Number() is not recognized by PyNumber_Check, so by extension @@ -1460,14 +1479,12 @@ def test_nan_to_nat_conversions(): dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}) ) df.iloc[3:6, :] = np.nan - result = df.loc[4, "B"].value - assert result == iNaT + result = df.loc[4, "B"] + assert result is pd.NaT s = df["B"].copy() - s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert isna(s[8]) - - assert s[8].value == np.datetime64("NaT").astype(np.int64) + s[8:9] = np.nan + assert s[8] is pd.NaT @td.skip_if_no_scipy diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index cdea96334be2a..95fb3d7439b56 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -19,6 +19,9 @@ class BaseInterfaceTests(BaseExtensionTests): def test_len(self, data): assert len(data) == 100 + def test_size(self, data): + assert data.size == 100 + def test_ndim(self, data): assert data.ndim == 1 diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a4fe89df158fa..dece8098c8542 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -1,5 +1,3 @@ -import operator - import numpy 
as np import pytest @@ -60,7 +58,7 @@ def test_setitem_sequence_broadcasts(self, data, box_in_series): def test_setitem_scalar(self, data, setter): arr = pd.Series(data) setter = getattr(arr, setter) - operator.setitem(setter, 0, data[1]) + setter[0] = data[1] assert arr[0] == data[1] def test_setitem_loc_scalar_mixed(self, data): @@ -196,7 +194,7 @@ def test_setitem_mask_aligned(self, data, as_callable, setter): # Series.__setitem__ target = ser - operator.setitem(target, mask2, data[5:7]) + target[mask2] = data[5:7] ser[mask2] = data[5:7] assert ser[0] == data[5] @@ -213,7 +211,7 @@ def test_setitem_mask_broadcast(self, data, setter): else: # __setitem__ target = ser - operator.setitem(target, mask, data[10]) + target[mask] = data[10] assert ser[0] == data[10] assert ser[1] == data[10] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9384ed5199c1f..85d8ad6ec6e38 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -79,7 +79,9 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) - def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + def to_numpy( + self, dtype=None, copy: bool = False, na_value=no_default, decimals=None + ) -> np.ndarray: result = np.asarray(self, dtype=dtype) if decimals is not None: result = np.asarray([round(x, decimals) for x in result]) diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 6311070cfe2bb..26606d7e799e8 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -2,13 +2,14 @@ import pytest import pandas as pd -from pandas.core.internals import BlockManager -from pandas.core.internals.blocks import Block, NonConsolidatableMixIn +from pandas.core.internals import BlockManager, SingleBlockManager +from pandas.core.internals.blocks import ExtensionBlock -class CustomBlock(NonConsolidatableMixIn, Block): +class CustomBlock(ExtensionBlock): _holder = np.ndarray + _can_hold_na = False def concat_same_type(self, to_concat, placement=None): """ @@ -36,7 +37,8 @@ def test_concat_series(): # GH17728 values = np.arange(3, dtype="int64") block = CustomBlock(values, placement=slice(0, 3)) - s = pd.Series(block, pd.RangeIndex(3), fastpath=True) + mgr = SingleBlockManager(block, pd.RangeIndex(3)) + s = pd.Series(mgr, pd.RangeIndex(3), fastpath=True) res = pd.concat([s, s]) assert isinstance(res._data.blocks[0], CustomBlock) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 03598b6bb5eca..e89b2c6f1fec0 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -40,8 +40,8 @@ def float_frame_with_na(): """ df = DataFrame(tm.getSeriesData()) # set some NAs - df.loc[5:10] = np.nan - df.loc[15:20, -2:] = np.nan + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan return df @@ -74,8 +74,8 @@ def bool_frame_with_na(): df = DataFrame(tm.getSeriesData()) > 0 df = df.astype(object) # set some NAs - df.loc[5:10] = np.nan - df.loc[15:20, -2:] = np.nan + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan return df diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py index 6bfcac3793584..0fd60c151b9c4 100644 --- a/pandas/tests/frame/indexing/test_datetime.py +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -40,7 +40,7 @@ def test_set_reset(self): # set/reset df = 
DataFrame({"A": [0, 1, 2]}, index=idx) result = df.reset_index() - assert result["foo"].dtype, "M8[ns, US/Eastern" + assert result["foo"].dtype == "datetime64[ns, US/Eastern]" df = result.set_index("foo") tm.assert_index_equal(df.index, idx) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index b0cb988720c25..923447889d04c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -215,6 +215,63 @@ def test_setitem_list_of_tuples(self, float_frame): expected = Series(tuples, index=float_frame.index, name="tuples") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "columns,box,expected", + [ + ( + ["A", "B", "C", "D"], + 7, + pd.DataFrame( + [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "D"], + [7, 8], + pd.DataFrame( + [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "B", "C"], + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"] + ), + ), + ( + ["B", "C", "D"], + [[7, 8, 9], [10, 11, 12], [13, 14, 15]], + pd.DataFrame( + [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "A", "D"], + np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64), + pd.DataFrame( + [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "C"], + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_setitem_list_missing_columns(self, columns, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df[columns] = box + tm.assert_frame_equal(df, expected) + def test_setitem_multi_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns @@ -459,13 +516,6 @@ def test_setitem(self, float_frame): float_frame["col6"] = series tm.assert_series_equal(series, float_frame["col6"], check_names=False) - msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the " - r"\[columns\]\"" - ) - with pytest.raises(KeyError, match=msg): - float_frame[np.random.randn(len(float_frame) + 1)] = 1 - # set ndarray arr = np.random.randn(len(float_frame)) float_frame["col9"] = arr @@ -1209,7 +1259,7 @@ def test_setitem_frame_mixed(self, float_string_frame): piece = DataFrame( [[1.0, 2.0], [3.0, 4.0]], index=f.index[0:2], columns=["A", "B"] ) - key = (slice(None, 2), ["A", "B"]) + key = (f.index[slice(None, 2)], ["A", "B"]) f.loc[key] = piece tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) @@ -1220,7 +1270,7 @@ def test_setitem_frame_mixed(self, float_string_frame): index=list(f.index[0:2]) + ["foo", "bar"], columns=["A", "B"], ) - key = (slice(None, 2), ["A", "B"]) + key = (f.index[slice(None, 2)], ["A", "B"]) f.loc[key] = piece tm.assert_almost_equal( f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2] @@ -1230,7 +1280,7 @@ def test_setitem_frame_mixed(self, float_string_frame): f = float_string_frame.copy() piece = f.loc[f.index[:2], ["A"]] piece.index = f.index[-2:] - key = (slice(-2, None), ["A", "B"]) + key = (f.index[slice(-2, None)], ["A", "B"]) f.loc[key] = piece piece["B"] = np.nan tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) @@ 
-1238,7 +1288,7 @@ def test_setitem_frame_mixed(self, float_string_frame): # ndarray f = float_string_frame.copy() piece = float_string_frame.loc[f.index[:2], ["A", "B"]] - key = (slice(-2, None), ["A", "B"]) + key = (f.index[slice(-2, None)], ["A", "B"]) f.loc[key] = piece.values tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) @@ -1605,6 +1655,17 @@ def test_reindex_methods(self, method, expected_values): actual = df[::-1].reindex(target, method=switched_method) tm.assert_frame_equal(expected, actual) + def test_reindex_subclass(self): + # https://github.com/pandas-dev/pandas/issues/31925 + class MyDataFrame(DataFrame): + pass + + expected = DataFrame() + df = MyDataFrame() + result = df.reindex_like(expected) + + tm.assert_frame_equal(result, expected) + def test_reindex_methods_nearest_special(self): df = pd.DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) @@ -1873,7 +1934,7 @@ def test_setitem_datetimelike_with_inference(self): df = DataFrame(index=date_range("20130101", periods=4)) df["A"] = np.array([1 * one_hour] * 4, dtype="m8[ns]") df.loc[:, "B"] = np.array([2 * one_hour] * 4, dtype="m8[ns]") - df.loc[:3, "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") + df.loc[df.index[:3], "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") df.loc[:, "D"] = np.array([4 * one_hour] * 4, dtype="m8[ns]") df.loc[df.index[:3], "E"] = np.array([5 * one_hour] * 3, dtype="m8[ns]") df["F"] = np.timedelta64("NaT") diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index eee754a47fb8c..bbf8ee5978e7c 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -397,7 +397,8 @@ def test_where_none(self): def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): # see gh-21947 df = pd.DataFrame(columns=["a"]) - cond = df.applymap(lambda x: x > 0) + cond = df + assert (cond.dtypes == object).all() result = df.where(cond) tm.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index e2b417972638e..70b42976c95a7 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -1,7 +1,17 @@ import numpy as np import pytest -from pandas import DataFrame, Period, Series, Timestamp, date_range, to_datetime +from pandas._libs.tslibs import IncompatibleFrequency + +from pandas import ( + DataFrame, + Period, + Series, + Timestamp, + date_range, + period_range, + to_datetime, +) import pandas._testing as tm @@ -21,7 +31,7 @@ class TestFrameAsof: def test_basic(self, date_range_frame): df = date_range_frame N = 50 - df.loc[15:30, "A"] = np.nan + df.loc[df.index[15:30], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) @@ -41,7 +51,7 @@ def test_basic(self, date_range_frame): def test_subset(self, date_range_frame): N = 10 df = date_range_frame.iloc[:N].copy() - df.loc[4:8, "A"] = np.nan + df.loc[df.index[4:8], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") # with a subset of A should be the same @@ -149,10 +159,20 @@ def test_is_copy(self, date_range_frame): # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings df = date_range_frame N = 50 - df.loc[15:30, "A"] = np.nan + df.loc[df.index[15:30], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) with tm.assert_produces_warning(None): result["C"] = 1 + + def 
test_asof_periodindex_mismatched_freq(self): + N = 50 + rng = period_range("1/1/1990", periods=N, freq="H") + df = DataFrame(np.random.randn(N), index=rng) + + # Mismatched freq + msg = "Input has different freq" + with pytest.raises(IncompatibleFrequency, match=msg): + df.asof(rng.asfreq("D")) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py new file mode 100644 index 0000000000000..e6d002369f758 --- /dev/null +++ b/pandas/tests/frame/methods/test_drop.py @@ -0,0 +1,54 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels that don't exist in the index diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index 38b9d7fd049ab..34751b565a24b 100644 --- a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -64,7 +64,6 @@ def test_duplicated_nan_none(keep, expected): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("keep", ["first", "last", False]) @pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) def test_duplicated_subset(subset, keep): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 9c52e8ec5620f..0eec30cbc5c67 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -100,13 +100,10 @@ def test_quantile_axis_parameter(self): result = df.quantile(0.5, axis="columns") tm.assert_series_equal(result, expected) - msg = "No axis named -1 for object type <class 'pandas.core.frame.DataFrame'>" + msg = "No axis named -1 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) - msg = ( - "No axis named column for object type " - "<class 'pandas.core.frame.DataFrame'>" - ) + msg = "No axis named column for object type DataFrame" with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column")
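An aside on the axis aliases the quantile hunk just above exercises: axis=0 (or "index") computes one quantile per column, axis=1 (or "columns") computes one per row, and anything else raises the ValueError asserted there. A small sketch with invented data, not part of the patch:

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 20.0, 30.0]})
per_column = df.quantile(0.5)               # Series: x -> 2.0, y -> 20.0
per_row = df.quantile(0.5, axis="columns")  # Series: 5.5, 11.0, 16.5
assert per_column["x"] == 2.0
assert per_row[0] == 5.5
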
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index cfb17de892b1c..f6c89172bbf86 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -185,3 +185,26 @@ def test_tshift(self, datetime_frame): msg = "Freq was not given and was not set in the index" with pytest.raises(ValueError, match=msg): no_freq.tshift() + + def test_shift_dt64values_int_fill_deprecated(self): + # GH#31971 + ser = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) + df = ser.to_frame() + + with tm.assert_produces_warning(FutureWarning): + result = df.shift(1, fill_value=0) + + expected = pd.Series([pd.Timestamp(0), ser[0]]).to_frame() + tm.assert_frame_equal(result, expected) + + # axis = 1 + df2 = pd.DataFrame({"A": ser, "B": ser}) + df2._consolidate_inplace() + + with tm.assert_produces_warning(FutureWarning): + result = df2.shift(1, axis=1, fill_value=0) + + expected = pd.DataFrame( + {"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 5a25d1c2c0894..3d3bb98f80ac5 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -43,7 +43,7 @@ def test_sort_values(self): sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) tm.assert_frame_equal(sorted_df, expected) - msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>" + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.sort_values(by=["A", "B"], axis=2, inplace=True) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index cd9bd169322fd..f1656b46cf356 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -70,8 +70,17 @@ def test_to_dict_invalid_orient(self): with pytest.raises(ValueError, match=msg): df.to_dict(orient="xinvalid") + @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"]) + def test_to_dict_short_orient_warns(self, orient): + # GH#32515 + df = DataFrame({"A": [0, 1]}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.to_dict(orient=orient) + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) def test_to_dict(self, mapping): + # orient= should only take the listed options + # see GH#32515 test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} # GH#16122 @@ -81,19 +90,19 @@ def test_to_dict(self, mapping): for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("l", mapping) + recons_data = DataFrame(test_data).to_dict("list", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][int(k2) - 1] - recons_data = DataFrame(test_data).to_dict("s", mapping) + recons_data = DataFrame(test_data).to_dict("series", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("sp", mapping) + recons_data = DataFrame(test_data).to_dict("split", mapping) expected_split = { "columns": ["A", "B"], "index": ["1", "2", "3"], @@ -101,7 +110,7 @@ def test_to_dict(self, mapping): } tm.assert_dict_equal(recons_data, expected_split) - recons_data = DataFrame(test_data).to_dict("r", mapping) + recons_data = DataFrame(test_data).to_dict("records", mapping) expected_records = [ {"A": 1.0, "B": "1"}, {"A": 2.0, "B": "2"}, @@ -113,7 +122,7 @@ 
tm.assert_dict_equal(l, r) # GH#10844 - recons_data = DataFrame(test_data).to_dict("i") + recons_data = DataFrame(test_data).to_dict("index") for k, v in test_data.items(): for k2, v2 in v.items(): @@ -121,7 +130,7 @@ df = DataFrame(test_data) df["duped"] = df[df.columns[0]] - recons_data = df.to_dict("i") + recons_data = df.to_dict("index") comp_data = test_data.copy() comp_data["duped"] = comp_data[df.columns[0]] for k, v in comp_data.items(): diff --git a/pandas/tests/frame/methods/test_to_period.py b/pandas/tests/frame/methods/test_to_period.py index eac78e611b008..051461b6c554d 100644 --- a/pandas/tests/frame/methods/test_to_period.py +++ b/pandas/tests/frame/methods/test_to_period.py @@ -31,6 +31,6 @@ def test_frame_to_period(self): pts = df.to_period("M", axis=1) tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) - msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>" + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.to_period(axis=2) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index d0181f0309af1..34b323e55d8cd 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -3,7 +3,14 @@ import numpy as np import pytest -from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range +from pandas import ( + CategoricalDtype, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, +) import pandas._testing as tm @@ -18,6 +25,17 @@ def test_to_records_dt64(self): result = df.to_records()["index"][0] assert expected == result + def test_to_records_dt64tz_column(self): + # GH#32535 don't lose tz in to_records + df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")}) + + result = df.to_records() + + assert result.dtype["A"] == object + val = result[0][1] + assert isinstance(val, Timestamp) + assert val == df.loc[0, "A"] + def test_to_records_with_multindex(self): # GH#3189 index = [ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 61802956addeb..3a7df29ae9091 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -331,6 +331,8 @@ def kurt(x): check_dates=True, ) + # GH#32571 check_less_precise is needed on apparently-random + # py37-npdev builds and OSX-PY36-min_version builds # mixed types (with upcasting happening) assert_stat_op_calc( "sum", @@ -875,11 +877,6 @@ def test_mean_datetimelike(self): expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - reason="casts to object-dtype and then tries to add timestamps", - raises=TypeError, - strict=True, - ) def test_mean_datetimelike_numeric_only_false(self): df = pd.DataFrame( { @@ -913,8 +910,8 @@ def test_sum_bools(self): def test_idxmin(self, float_frame, int_frame): frame = float_frame - frame.loc[5:10] = np.nan - frame.loc[15:20, -2:] = np.nan + frame.iloc[5:10] = np.nan + frame.iloc[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: for df in [frame, int_frame]: @@ -922,14 +919,14 @@ result = df.idxmin(axis=axis, skipna=skipna) expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>" + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2)
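As a reminder of the semantics exercised on both sides of this boundary: idxmin/idxmax return the index label of the first minimal/maximal value, skipping NaN by default, while skipna=False lets a NaN make the whole answer NaN (the loc-to-iloc change above is needed because these fixtures carry string row labels, where integer slices are only meaningful positionally). A small sketch with an invented frame, not one of the fixtures:

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [9.0, 2.0, 1.0]})
assert df["A"].idxmax() == 2                   # the NaN at position 1 is skipped
assert np.isnan(df["A"].idxmax(skipna=False))  # with skipna=False the NaN propagates
assert df.idxmax(axis=1).tolist() == ["B", "B", "A"]  # per-row argmax labels
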
def test_idxmax(self, float_frame, int_frame): frame = float_frame - frame.loc[5:10] = np.nan - frame.loc[15:20, -2:] = np.nan + frame.iloc[5:10] = np.nan + frame.iloc[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: for df in [frame, int_frame]: @@ -937,7 +934,7 @@ result = df.idxmax(axis=axis, skipna=skipna) expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>" + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a021dd91a7d26..91627b46c2fee 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -127,6 +127,14 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + def test_column_name_contains_unicode_surrogate(self): + # GH 25509 + colname = "\ud83d" + df = DataFrame({colname: []}) + # this should not crash + assert colname not in dir(df) + assert df.columns[0] == colname + def test_new_empty_index(self): df1 = DataFrame(np.random.randn(0, 3)) df2 = DataFrame(np.random.randn(0, 3)) @@ -363,10 +371,7 @@ def test_swapaxes(self): tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) tm.assert_frame_equal(df, df.swapaxes(0, 0)) - msg = ( - "No axis named 2 for object type " - r"<class 'pandas.core.frame.DataFrame'>" - ) + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe6abef97acc4..6dee4424f1cec 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -49,7 +49,8 @@ def test_apply(self, float_frame): # invalid axis df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - with pytest.raises(ValueError): + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): df.apply(lambda x: x, 2) # GH 9573 @@ -221,7 +222,8 @@ def test_apply_broadcast_error(self, int_frame_const_col): df = int_frame_const_col # > 1 ndim - with pytest.raises(ValueError): + msg = "too many dims to broadcast" + with pytest.raises(ValueError, match=msg): df.apply( lambda x: np.array([1, 2]).reshape(-1, 2), axis=1, @@ -229,13 +231,21 @@ ) # cannot broadcast - with pytest.raises(ValueError): + msg = "cannot broadcast result" + with pytest.raises(ValueError, match=msg): df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") - def test_apply_raw(self, float_frame): + def test_apply_raw(self, float_frame, mixed_type_frame): + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 + + float_frame.apply(_assert_raw, raw=True) + float_frame.apply(_assert_raw, axis=1, raw=True) + result0 = float_frame.apply(np.mean, raw=True) result1 = float_frame.apply(np.mean, axis=1, raw=True) @@ -250,6 +260,10 @@ def test_apply_raw(self, float_frame): expected = float_frame * 2 tm.assert_frame_equal(result, expected) + # Mixed dtype (GH-32423) + mixed_type_frame.apply(_assert_raw, raw=True) + mixed_type_frame.apply(_assert_raw, axis=1, raw=True) + def test_apply_axis1(self, float_frame): d = float_frame.index[0] tapplied = float_frame.apply(np.mean, axis=1) @@ -339,7 +353,7 @@ def 
test_apply_yield_list(self, float_frame): tm.assert_frame_equal(result, float_frame) def test_apply_reduce_Series(self, float_frame): - float_frame.loc[::2, "A"] = np.nan + float_frame["A"].iloc[::2] = np.nan expected = float_frame.mean(1) result = float_frame.apply(np.mean, axis=1) tm.assert_series_equal(result, expected) @@ -939,7 +953,11 @@ def test_result_type_error(self, result_type, int_frame_const_col): # allowed result_type df = int_frame_const_col - with pytest.raises(ValueError): + msg = ( + "invalid value for result_type, must be one of " + "{None, 'reduce', 'broadcast', 'expand'}" + ) + with pytest.raises(ValueError, match=msg): df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) @pytest.mark.parametrize( @@ -1035,14 +1053,16 @@ def test_agg_transform(self, axis, float_frame): def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg - with pytest.raises(ValueError): + msg = "transforms cannot produce aggregated results" + with pytest.raises(ValueError, match=msg): float_frame.transform(["max", "min"], axis=axis) - with pytest.raises(ValueError): + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.transform(["max", "sqrt"], axis=axis) @@ -1376,7 +1396,8 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): ) def test_agg_cython_table_raises(self, df, func, expected, axis): # GH 21224 - with pytest.raises(expected): + msg = "can't multiply sequence by non-int of type 'str'" + with pytest.raises(expected, match=msg): df.agg(func, axis=axis) @pytest.mark.parametrize("num_cols", [2, 3, 5]) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e4be8a979a70f..2150e1da9e8ad 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,6 +1,7 @@ from collections import deque from datetime import datetime import operator +import re import numpy as np import pytest @@ -46,13 +47,16 @@ def check(df, df2): ) tm.assert_frame_equal(result, expected) - with pytest.raises(TypeError): + msg = re.escape( + "Invalid comparison between dtype=datetime64[ns] and ndarray" + ) + with pytest.raises(TypeError, match=msg): x >= y - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): x > y - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): x < y - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): x <= y # GH4968 @@ -98,9 +102,13 @@ def test_timestamp_compare(self): result = right_f(pd.Timestamp("20010109"), df) tm.assert_frame_equal(result, expected) else: - with pytest.raises(TypeError): + msg = ( + "'(<|>)=?' 
not supported between " + "instances of 'Timestamp' and 'float'" + ) + with pytest.raises(TypeError, match=msg): left_f(df, pd.Timestamp("20010109")) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("20010109"), df) # nats expected = left_f(df, pd.Timestamp("nat")) @@ -348,6 +356,25 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) + @pytest.mark.slow + @pytest.mark.parametrize("opname", ["floordiv", "pow"]) + def test_floordiv_axis0_numexpr_path(self, opname): + # case that goes through numexpr and has to fall back to masked_arith_op + op = getattr(operator, opname) + + arr = np.arange(10 ** 6).reshape(100, -1) + df = pd.DataFrame(arr) + df["C"] = 1.0 + + ser = df[0] + result = getattr(df, opname)(ser, axis=0) + + expected = pd.DataFrame({col: op(df[col], ser) for col in df.columns}) + tm.assert_frame_equal(result, expected) + + result2 = getattr(df, opname)(ser.values, axis=0) + tm.assert_frame_equal(result2, expected) + def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) @@ -804,3 +831,27 @@ def test_align_frame(self): half = ts[::2] result = ts + half.take(np.random.permutation(len(half))) tm.assert_frame_equal(result, expected) + + +def test_pow_with_realignment(): + # GH#32685 pow has special semantics for operating with null values + left = pd.DataFrame({"A": [0, 1, 2]}) + right = pd.DataFrame(index=[0, 1, 2]) + + result = left ** right + expected = pd.DataFrame({"A": [np.nan, 1.0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +# TODO: move to tests.arithmetic and parametrize +def test_pow_nan_with_zero(): + left = pd.DataFrame({"A": [np.nan, np.nan, np.nan]}) + right = pd.DataFrame({"A": [0, 0, 0]}) + + expected = pd.DataFrame({"A": [1.0, 1.0, 1.0]}) + + result = left ** right + tm.assert_frame_equal(result, expected) + + result = left["A"] ** right["A"] + tm.assert_series_equal(result, expected["A"]) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 7effa98fd8213..ea21359c2f75c 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -173,13 +173,15 @@ def test_drop_api_equivalence(self): res2 = df.drop(index=["a"], columns=["d"]) tm.assert_frame_equal(res1, res2) - with pytest.raises(ValueError): + msg = "Cannot specify both 'labels' and 'index'/'columns'" + with pytest.raises(ValueError, match=msg): df.drop(labels="a", index="b") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.drop(labels="a", columns="b") - with pytest.raises(ValueError): + msg = "Need to specify at least one of 'labels', 'index' or 'columns'" + with pytest.raises(ValueError, match=msg): df.drop(axis=1) def test_merge_join_different_levels(self): @@ -616,7 +618,8 @@ def test_align_float(self, float_frame): tm.assert_index_equal(bf.index, Index([])) # Try to align DataFrame to Series along bad axis - with pytest.raises(ValueError): + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): float_frame.align(af.iloc[0, :3], join="inner", axis=2) # align dataframe to series with broadcast or not diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index a5f5e6f36cd58..e67fef9efef6d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ 
b/pandas/tests/frame/test_block_internals.py @@ -478,7 +478,7 @@ def test_convert_objects(self, float_string_frame): length = len(float_string_frame) float_string_frame["J"] = "1." float_string_frame["K"] = "1" - float_string_frame.loc[0:5, ["J", "K"]] = "garbled" + float_string_frame.loc[float_string_frame.index[0:5], ["J", "K"]] = "garbled" converted = float_string_frame._convert(datetime=True, numeric=True) assert converted["H"].dtype == "float64" assert converted["I"].dtype == "int64" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 071d2409f1be2..9f40e8c6931c8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2,6 +2,7 @@ from datetime import date, datetime, timedelta import functools import itertools +import re import numpy as np import numpy.ma as ma @@ -9,7 +10,7 @@ import pytest import pytz -from pandas.compat import is_platform_little_endian +from pandas.compat import PY37, is_platform_little_endian from pandas.compat.numpy import _is_numpy_dev from pandas.core.dtypes.common import is_integer_dtype @@ -47,15 +48,15 @@ class TestDataFrameConstructors: def test_series_with_name_not_matching_column(self): # GH#9232 - x = pd.Series(range(5), name=1) - y = pd.Series(range(5), name=0) + x = Series(range(5), name=1) + y = Series(range(5), name=0) - result = pd.DataFrame(x, columns=[0]) - expected = pd.DataFrame([], columns=[0]) + result = DataFrame(x, columns=[0]) + expected = DataFrame([], columns=[0]) tm.assert_frame_equal(result, expected) - result = pd.DataFrame(y, columns=[1]) - expected = pd.DataFrame([], columns=[1]) + result = DataFrame(y, columns=[1]) + expected = DataFrame([], columns=[1]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -126,7 +127,7 @@ def test_constructor_cast_failure(self): def test_constructor_dtype_copy(self): orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) - new_df = pd.DataFrame(orig_df, dtype=float, copy=True) + new_df = DataFrame(orig_df, dtype=float, copy=True) new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 @@ -220,10 +221,10 @@ def test_constructor_rec(self, float_frame): index = float_frame.index df = DataFrame(rec) - tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df.columns, Index(rec.dtype.names)) df2 = DataFrame(rec, index=index) - tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df2.columns, Index(rec.dtype.names)) tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] @@ -298,7 +299,7 @@ def test_constructor_dict(self): tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) - exp = pd.Series( + exp = Series( np.concatenate([[np.nan] * 5, datetime_series_short.values]), index=datetime_series.index, name="col2", @@ -325,7 +326,7 @@ def test_constructor_dict(self): # Length-one dict micro-optimization frame = DataFrame({"A": {"1": 1, "2": 2}}) - tm.assert_index_equal(frame.index, pd.Index(["1", "2"])) + tm.assert_index_equal(frame.index, Index(["1", "2"])) # empty dict plus index idx = Index([0, 1, 2]) @@ -418,8 +419,8 @@ def test_constructor_dict_order_insertion(self): def test_constructor_dict_nan_key_and_columns(self): # GH 16894 - result = pd.DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) - expected = pd.DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) + result = DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) + expected = DataFrame([[1, 2], [2, 3]], 
columns=[np.nan, 2]) tm.assert_frame_equal(result, expected) def test_constructor_multi_index(self): @@ -428,29 +429,29 @@ def test_constructor_multi_index(self): tuples = [(2, 3), (3, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) - assert pd.isna(df).values.ravel().all() + assert isna(df).values.ravel().all() tuples = [(3, 3), (2, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) - assert pd.isna(df).values.ravel().all() + assert isna(df).values.ravel().all() def test_constructor_2d_index(self): # GH 25416 # handling of 2d index in construction - df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2]) - expected = pd.DataFrame( + df = DataFrame([[1]], columns=[[1]], index=[1, 2]) + expected = DataFrame( [1, 1], index=pd.Int64Index([1, 2], dtype="int64"), - columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) - df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]]) - expected = pd.DataFrame( + df = DataFrame([[1]], columns=[[1]], index=[[1, 2]]) + expected = DataFrame( [1, 1], - index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), - columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + index=MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), + columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) @@ -471,7 +472,7 @@ def test_constructor_error_msgs(self): DataFrame( np.arange(12).reshape((4, 3)), columns=["foo", "bar", "baz"], - index=pd.date_range("2000-01-01", periods=3), + index=date_range("2000-01-01", periods=3), ) arr = np.array([[4, 5, 6]]) @@ -713,14 +714,12 @@ def test_constructor_period(self): # PeriodIndex a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") - df = pd.DataFrame({"a": a, "b": b}) + df = DataFrame({"a": a, "b": b}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype # list of periods - df = pd.DataFrame( - {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()} - ) + df = DataFrame({"a": a.astype(object).tolist(), "b": b.astype(object).tolist()}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype @@ -882,8 +881,8 @@ def test_constructor_maskedarray_nonfloat(self): def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) - expected = pd.DataFrame( + result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = DataFrame( {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, columns=["A", "B"], index=[1, 2], @@ -892,8 +891,8 @@ def test_constructor_maskedarray_hardened(self): tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) - expected = pd.DataFrame( + result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = DataFrame( {"A": [1.0, 1.0], "B": [1.0, 1.0]}, columns=["A", "B"], index=[1, 2], @@ -907,8 +906,8 @@ def test_constructor_maskedrecarray_dtype(self): np.ma.zeros(5, dtype=[("date", " 2, inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df).mask(cond=df.a > 2, inplace=value) def test_unexpected_keyword(self): @@ -222,102 +214,15 @@ def 
test_unexpected_keyword(self): ts = df["joe"].copy() ts[2] = np.nan - with pytest.raises(TypeError, match="unexpected keyword"): + msg = "unexpected keyword" + with pytest.raises(TypeError, match=msg): df.drop("joe", axis=1, in_place=True) - with pytest.raises(TypeError, match="unexpected keyword"): + with pytest.raises(TypeError, match=msg): df.reindex([1, 0], inplace=True) - with pytest.raises(TypeError, match="unexpected keyword"): + with pytest.raises(TypeError, match=msg): ca.fillna(0, inplace=True) - with pytest.raises(TypeError, match="unexpected keyword"): + with pytest.raises(TypeError, match=msg): ts.fillna(0, in_place=True) - - -class TestToXArray: - @pytest.mark.skipif( - not _XARRAY_INSTALLED - or _XARRAY_INSTALLED - and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), - reason="xarray >= 0.10.0 required", - ) - @pytest.mark.parametrize("index", tm.all_index_generator(3)) - def test_to_xarray_index_types(self, index): - from xarray import Dataset - - df = DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.Categorical(list("abc")), - "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), - } - ) - - df.index = index - df.index.name = "foo" - df.columns.name = "bar" - result = df.to_xarray() - assert result.dims["foo"] == 3 - assert len(result.coords) == 1 - assert len(result.data_vars) == 8 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, Dataset) - - # idempotency - # categoricals are not preserved - # datetimes w/tz are preserved - # column names are lost - expected = df.copy() - expected["f"] = expected["f"].astype(object) - expected.columns.name = None - tm.assert_frame_equal( - result.to_dataframe(), - expected, - check_index_type=False, - check_categorical=False, - ) - - @td.skip_if_no("xarray", min_version="0.7.0") - def test_to_xarray(self): - from xarray import Dataset - - df = DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.Categorical(list("abc")), - "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), - } - ) - - df.index.name = "foo" - result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 - assert isinstance(result, Dataset) - - # available in 0.7.1 - # MultiIndex - df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) - result = df.to_xarray() - assert result.dims["one"] == 1 - assert result.dims["two"] == 3 - assert len(result.coords) == 2 - assert len(result.data_vars) == 8 - tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, Dataset) - - result = result.to_dataframe() - expected = df.copy() - expected["f"] = expected["f"].astype(object) - expected.columns.name = None - tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 1b6cb8447c76d..1a4a0b1678aa4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -3,11 +3,14 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p17 + from pandas.core.dtypes.common import is_scalar import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range 
import pandas._testing as tm +import pandas.core.common as com # ---------------------------------------------------------------------- # Generic types test cases @@ -159,27 +162,14 @@ def test_downcast(self): o = self._construct(shape=4, value=9, dtype=np.int64) result = o.copy() - result._data = o._data.downcast(dtypes="infer") + result._data = o._data.downcast() self._compare(result, o) - o = self._construct(shape=4, value=9.0) - expected = o.astype(np.int64) - result = o.copy() - result._data = o._data.downcast(dtypes="infer") - self._compare(result, expected) - o = self._construct(shape=4, value=9.5) result = o.copy() - result._data = o._data.downcast(dtypes="infer") + result._data = o._data.downcast() self._compare(result, o) - # are close - o = self._construct(shape=4, value=9.000000000005) - result = o.copy() - result._data = o._data.downcast(dtypes="infer") - expected = o.astype(np.int64) - self._compare(result, expected) - def test_constructor_compound_dtypes(self): # see gh-5191 # Compound dtypes should raise NotImplementedError. @@ -259,14 +249,13 @@ def test_metadata_propagation(self): self.check_metadata(v1 & v2) self.check_metadata(v1 | v2) - @pytest.mark.parametrize("index", tm.all_index_generator(10)) - def test_head_tail(self, index): + def test_head_tail(self, indices): # GH5370 - o = self._construct(shape=10) + o = self._construct(shape=len(indices)) axis = o._get_axis_name(0) - setattr(o, axis, index) + setattr(o, axis, indices) o.head() @@ -282,8 +271,8 @@ def test_head_tail(self, index): self._compare(o.tail(len(o) + 1), o) # neg index - self._compare(o.head(-3), o.head(7)) - self._compare(o.tail(-3), o.tail(7)) + self._compare(o.head(-3), o.head(len(indices) - 3)) + self._compare(o.tail(-3), o.tail(len(indices) - 3)) def test_sample(self): # Fixes issue: 2419 @@ -654,6 +643,29 @@ def test_sample(sel): with pytest.raises(ValueError): df.sample(1, weights=s4) + @pytest.mark.parametrize( + "func_str,arg", + [ + ("np.array", [2, 3, 1, 0]), + pytest.param( + "np.random.MT19937", + 3, + marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"), + ), + pytest.param( + "np.random.PCG64", + 11, + marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"), + ), + ], + ) + def test_sample_random_state(self, func_str, arg): + # GH32503 + df = pd.DataFrame({"col1": range(10, 20), "col2": range(20, 30)}) + result = df.sample(n=3, random_state=eval(func_str)(arg)) + expected = df.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) + tm.assert_frame_equal(result, expected) + def test_squeeze(self): # noop for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: @@ -679,10 +691,10 @@ def test_squeeze(self): tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) assert df.squeeze() == df.iloc[0, 0] - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.squeeze(axis=2) - msg = "No axis named x for object type " + msg = "No axis named x for object type DataFrame" with pytest.raises(ValueError, match=msg): df.squeeze(axis="x") diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index f119eb422a276..20f6cda7cad60 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -1,24 +1,14 @@ -from distutils.version import LooseVersion from operator import methodcaller import numpy as np import pytest -import 
pandas.util._test_decorators as td - - import pandas as pd from pandas import MultiIndex, Series, date_range import pandas._testing as tm from .test_generic import Generic -try: - import xarray - - _XARRAY_INSTALLED = True -except ImportError: - _XARRAY_INSTALLED = False - class TestSeries(Generic): _typ = Series @@ -57,7 +47,8 @@ def test_set_axis_name_mi(self, func): def test_set_axis_name_raises(self): s = pd.Series([1]) - with pytest.raises(ValueError): + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): s._set_axis_name(name="a", axis=1) def test_get_numeric_data_preserve_dtype(self): @@ -176,6 +167,9 @@ def finalize(self, other, method=None, **kwargs): Series._metadata = _metadata Series.__finalize__ = _finalize # FIXME: use monkeypatch + +class TestSeries2: + # Separating off because it doesn't rely on parent class @pytest.mark.parametrize( "s", [ @@ -194,72 +188,3 @@ def test_datetime_shift_always_copy(self, move_by_freq): # GH22397 s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) assert s.shift(freq=move_by_freq) is not s - - -class TestSeries2: - # moved from Generic - def test_get_default(self): - - # GH#7725 - d0 = ["a", "b", "c", "d"] - d1 = np.arange(4, dtype="int64") - others = ["e", 10] - - for data, index in ((d0, d1), (d1, d0)): - s = Series(data, index=index) - for i, d in zip(index, data): - assert s.get(i) == d - assert s.get(i, d) == d - assert s.get(i, "z") == d - for other in others: - assert s.get(other, "z") == "z" - assert s.get(other, other) == other - - -class TestToXArray: - @pytest.mark.skipif( - not _XARRAY_INSTALLED - or _XARRAY_INSTALLED - and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), - reason="xarray >= 0.10.0 required", - ) - @pytest.mark.parametrize("index", tm.all_index_generator(6)) - def test_to_xarray_index_types(self, index): - from xarray import DataArray - - s = Series(range(6), index=index) - s.index.name = "foo" - result = s.to_xarray() - repr(result) - assert len(result) == 6 - assert len(result.coords) == 1 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) - - # idempotency - tm.assert_series_equal( - result.to_series(), s, check_index_type=False, check_categorical=True - ) - - @td.skip_if_no("xarray", min_version="0.7.0") - def test_to_xarray(self): - from xarray import DataArray - - s = Series([], dtype=object) - s.index.name = "foo" - result = s.to_xarray() - assert len(result) == 0 - assert len(result.coords) == 1 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) - - s = Series(range(6)) - s.index.name = "foo" - s.index = pd.MultiIndex.from_product( - [["a", "b"], range(3)], names=["one", "two"] - ) - result = s.to_xarray() - assert len(result) == 2 - tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, DataArray) - tm.assert_series_equal(result.to_series(), s) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py new file mode 100644 index 0000000000000..250fe950a05fc --- /dev/null +++ b/pandas/tests/generic/test_to_xarray.py @@ -0,0 +1,154 @@ +from distutils.version import LooseVersion + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + +try: + import xarray + + _XARRAY_INSTALLED = True +except ImportError: + _XARRAY_INSTALLED = False + + +class
TestDataFrameToXArray: + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) + def test_to_xarray_index_types(self, indices): + if isinstance(indices, pd.MultiIndex): + pytest.skip("MultiIndex is tested separately") + if len(indices) == 0: + pytest.skip("Test doesn't make sense for empty index") + + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index = indices[:3] + df.index.name = "foo" + df.columns.name = "bar" + result = df.to_xarray() + assert result.dims["foo"] == 3 + assert len(result.coords) == 1 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, Dataset) + + # idempotency + # datetimes w/tz are preserved + # column names are lost + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal( + result.to_dataframe(), expected, + ) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index.name = "foo" + result = df[0:0].to_xarray() + assert result.dims["foo"] == 0 + assert isinstance(result, Dataset) + + # available in 0.7.1 + # MultiIndex + df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) + result = df.to_xarray() + assert result.dims["one"] == 1 + assert result.dims["two"] == 3 + assert len(result.coords) == 2 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, Dataset) + + result = result.to_dataframe() + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal(result, expected, check_index_type=False) + + +class TestSeriesToXArray: + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) + def test_to_xarray_index_types(self, indices): + if isinstance(indices, pd.MultiIndex): + pytest.skip("MultiIndex is tested separately") + + from xarray import DataArray + + s = Series(range(len(indices)), index=indices) + s.index.name = "foo" + result = s.to_xarray() + repr(result) + assert len(result) == len(indices) + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + # idempotency + tm.assert_series_equal(result.to_series(), s, check_index_type=False) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import DataArray + + s = Series([], dtype=object) + s.index.name = "foo" + result = s.to_xarray() + assert len(result) == 0 + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + 
assert isinstance(result, DataArray) + + s = Series(range(6)) + s.index.name = "foo" + s.index = pd.MultiIndex.from_product( + [["a", "b"], range(3)], names=["one", "two"] + ) + result = s.to_xarray() + assert len(result) == 2 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, DataArray) + tm.assert_series_equal(result.to_series(), s) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 48f8de7e51ae4..1265547653d7b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -691,6 +691,19 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] +) +def test_multiindex_custom_func(func): + # GH 31777 + data = [[1, 4, 2], [5, 7, 1]] + df = pd.DataFrame(data, columns=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) + result = df.groupby(np.array([0, 1])).agg(func) + expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}} + expected = pd.DataFrame(expected_dict) + tm.assert_frame_equal(result, expected) + + def myfunc(s): return np.percentile(s, q=0.90) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 52ee3e652501c..264cf40dc6984 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -209,7 +209,7 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - msg = r"nested renamer is not supported" + msg = r"Column\(s\) \['r', 'r2'\] do not exist" with pytest.raises(SpecificationError, match=msg): grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) @@ -224,9 +224,11 @@ def test_agg_dict_renaming_deprecation(): {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) + msg = r"Column\(s\) \['ma'\] do not exist" with pytest.raises(SpecificationError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) + msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): df.groupby("A").B.agg({"foo": "count"}) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index ff74d374e5e3f..152086c241a52 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.common import ensure_int64 +import pandas as pd from pandas import Index, Series, isna import pandas._testing as tm @@ -51,6 +52,30 @@ def test_series_bin_grouper(): tm.assert_almost_equal(counts, exp_counts) +def assert_block_lengths(x): + assert len(x) == len(x._data.blocks[0].mgr_locs) + return 0 + + +def cumsum_max(x): + x.cumsum().max() + return 0 + + +@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths]) +def test_mgr_locs_updated(func): + # https://github.com/pandas-dev/pandas/issues/31802 + # Some operations may require creating new blocks, which requires + # valid mgr_locs + df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}) + result = df.groupby(["A", "B"]).agg(func) + expected = pd.DataFrame( + {"C": [0, 0]}, + index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( 
"binner,closed,expected", [ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9b07269811d8e..9ea5252b91e13 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1262,6 +1262,9 @@ def test_series_groupby_on_2_categoricals_unobserved( if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") + if reduction_func == "corrwith": # GH 32293 + pytest.xfail("TODO: implemented SeriesGroupBy.corrwith") + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c402ca194648f..9c33843cdcecc 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,7 +1,6 @@ import builtins import datetime as dt from io import StringIO -from itertools import product from string import ascii_lowercase import numpy as np @@ -662,7 +661,7 @@ def test_nlargest_mi_grouper(): ] expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False, check_less_precise=True) + tm.assert_series_equal(result, expected, check_exact=False) def test_nsmallest(): @@ -1296,36 +1295,32 @@ def __eq__(self, other): # -------------------------------- -def test_size(df): - grouped = df.groupby(["A", "B"]) +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +def test_size(df, by): + grouped = df.groupby(by=by) result = grouped.size() for key, group in grouped: assert result[key] == len(group) - grouped = df.groupby("A") - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - grouped = df.groupby("B") - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +@pytest.mark.parametrize("sort", [True, False]) +def test_size_sort(df, sort, by): + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("ABC")) + left = df.groupby(by=by, sort=sort).size() + right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("abc")) - for sort, key in product((False, True), ("a", "b", ["a", "b"])): - left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)["c"].apply(lambda a: a.shape[0]) - tm.assert_series_equal(left, right, check_names=False) - # GH11699 +def test_size_series_dataframe(): + # https://github.com/pandas-dev/pandas/issues/11699 df = DataFrame(columns=["A", "B"]) out = Series(dtype="int64", index=Index([], name="A")) tm.assert_series_equal(df.groupby("A").size(), out) def test_size_groupby_all_null(): - # GH23050 + # https://github.com/pandas-dev/pandas/issues/23050 # Assert no 'Value Error : Length of passed values is 2, index implies 0' df = DataFrame({"A": [None, None]}) # all-null groups result = df.groupby("A").size() @@ -1335,6 +1330,8 @@ def test_size_groupby_all_null(): # quantile # -------------------------------- + + @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @@ -1608,3 +1605,34 @@ def test_groupby_mean_no_overflow(): } ) assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 
3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], dtype=object, name="a") + expected = pd.DataFrame({"b": arr}, index=idx) + + groups = pd.DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 740103eec185a..2295eb2297fa6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -327,9 +327,9 @@ def test_transform_transformation_func(transformation_func): } ) - if transformation_func in ["pad", "backfill", "tshift", "corrwith", "cumcount"]: + if transformation_func in ["pad", "backfill", "tshift", "cumcount"]: # These transformation functions are not yet covered in this test - pytest.xfail("See GH 31269 and GH 31270") + pytest.xfail("See GH 31269") elif _is_numpy_dev and transformation_func in ["cummin"]: pytest.xfail("https://github.com/pandas-dev/pandas/issues/31992") elif transformation_func == "fillna": @@ -1093,8 +1093,10 @@ def test_transform_agg_by_name(reduction_func, obj): pytest.xfail("TODO: g.transform('ngroup') doesn't work") if func == "size": # GH#27469 pytest.xfail("TODO: g.transform('size') doesn't work") + if func == "corrwith" and isinstance(obj, Series): # GH#32293 + pytest.xfail("TODO: implement SeriesGroupBy.corrwith") - args = {"nth": [0], "quantile": [0.5]}.get(func, []) + args = {"nth": [0], "quantile": [0.5], "corrwith": [obj]}.get(func, []) result = g.transform(func, *args) diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 507e38d9acac2..1d41e17e327a8 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -65,7 +65,8 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_fill_value_datetime(self): @@ -104,7 +105,8 @@ def test_take_fill_value_datetime(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 69451501fd7bd..1473058b2a0a9 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -35,6 +35,9 @@ class Base: _holder: Optional[Type[Index]] = None _compat_props = ["shape", "ndim", "size", "nbytes"] + def create_index(self) -> Index: + raise NotImplementedError("Method not implemented") + def test_pickle_compat_construction(self): # need an object to create with msg = ( @@ -310,16 +313,11 @@ 
def test_ensure_copied_data(self, indices): result = result.tz_localize("UTC").tz_convert(indices.tz) tm.assert_index_equal(indices, result) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="copy" - ) if isinstance(indices, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) + tm.assert_numpy_array_equal(indices.asi8, result.asi8, check_same="same") elif isinstance(indices, IntervalIndex): # checked in test_interval.py pass @@ -328,9 +326,6 @@ def test_ensure_copied_data(self, indices): tm.assert_numpy_array_equal( indices.values, result.values, check_same="same" ) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) def test_memory_usage(self, indices): indices._engine.clear_mapping() @@ -916,3 +911,29 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError): {} in idx._engine + + def test_copy_copies_cache(self): + # GH32898 + idx = self.create_index() + idx.get_loc(idx[0]) # populates the _cache. + copy = idx.copy() + + # check that the copied cache is a copy of the original + assert idx._cache == copy._cache + assert idx._cache is not copy._cache + # cache values should reference the same object + for key, val in idx._cache.items(): + assert copy._cache[key] is val, key + + def test_shallow_copy_copies_cache(self): + # GH32669 + idx = self.create_index() + idx.get_loc(idx[0]) # populates the _cache. + shallow_copy = idx._shallow_copy() + + # check that the shallow_copied cache is a copy of the original + assert idx._cache == shallow_copy._cache + assert idx._cache is not shallow_copy._cache + # cache values should reference the same object + for key, val in idx._cache.items(): + assert shallow_copy._cache[key] is val, key diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index b293c008d6683..0247947ff19c5 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -415,7 +415,8 @@ def test_construction_dti_with_mixed_timezones(self): # tz mismatch affecting to tz-aware raises TypeError/ValueError - with pytest.raises(ValueError): + msg = "cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): DatetimeIndex( [ Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), @@ -424,7 +425,6 @@ def test_construction_dti_with_mixed_timezones(self): name="idx", ) - msg = "cannot be converted to datetime64" with pytest.raises(ValueError, match=msg): DatetimeIndex( [ @@ -435,7 +435,7 @@ def test_construction_dti_with_mixed_timezones(self): name="idx", ) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): DatetimeIndex( [ Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), @@ -480,7 +480,8 @@ def test_construction_outofbounds(self): # coerces to object tm.assert_index_equal(Index(dates), exp) - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): # can't create DatetimeIndex DatetimeIndex(dates) @@ -516,7 +517,8 @@ def test_constructor_coverage(self): with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") - with pytest.raises(TypeError): + msg = "DatetimeIndex\\(\\) must be called with a collection" + 
with pytest.raises(TypeError, match=msg): DatetimeIndex("1/1/2000") # generator expression @@ -664,7 +666,8 @@ def test_constructor_dtype(self): @pytest.mark.parametrize("dtype", [object, np.int32, np.int64]) def test_constructor_invalid_dtype_raises(self, dtype): # GH 23986 - with pytest.raises(ValueError): + msg = "Unexpected value for 'dtype'" + with pytest.raises(ValueError, match=msg): DatetimeIndex([1, 2], dtype=dtype) def test_constructor_name(self): @@ -681,7 +684,8 @@ def test_000constructor_resolution(self): def test_disallow_setting_tz(self): # GH 3746 dti = DatetimeIndex(["2010"], tz="UTC") - with pytest.raises(AttributeError): + msg = "Cannot directly set timezone" + with pytest.raises(AttributeError, match=msg): dti.tz = pytz.timezone("US/Pacific") @pytest.mark.parametrize( @@ -770,7 +774,8 @@ def test_construction_from_replaced_timestamps_with_dst(self): def test_construction_with_tz_and_tz_aware_dti(self): # GH 23579 dti = date_range("2016-01-01", periods=3, tz="US/Central") - with pytest.raises(TypeError): + msg = "data is already tz-aware US/Central, unable to set specified tz" + with pytest.raises(TypeError, match=msg): DatetimeIndex(dti, tz="Asia/Tokyo") def test_construction_with_nat_and_tzlocal(self): @@ -790,7 +795,8 @@ def test_constructor_no_precision_raises(self): pd.Index(["2000"], dtype="datetime64") def test_constructor_wrong_precision_raises(self): - with pytest.raises(ValueError): + msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'" + with pytest.raises(ValueError, match=msg): pd.DatetimeIndex(["2000"], dtype="datetime64[us]") def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(self): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d33351fe94a8c..9bcd1839662e5 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -153,9 +153,10 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs(self): def test_date_range_out_of_bounds(self): # GH#14187 - with pytest.raises(OutOfBoundsDatetime): + msg = "Cannot generate range" + with pytest.raises(OutOfBoundsDatetime, match=msg): date_range("2016-01-01", periods=100000, freq="D") - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): date_range(end="1763-10-12", periods=100000, freq="D") def test_date_range_gen_error(self): @@ -736,9 +737,10 @@ def test_precision_finer_than_offset(self): ) def test_mismatching_tz_raises_err(self, start, end): # issue 18488 - with pytest.raises(TypeError): + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(TypeError, match=msg): pd.date_range(start, end) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): pd.date_range(start, end, freq=BDay()) @@ -771,16 +773,17 @@ def test_misc(self): def test_date_parse_failure(self): badly_formed_date = "2007/100/1" - with pytest.raises(ValueError): + msg = "could not convert string to Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(badly_formed_date) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bdate_range(start=badly_formed_date, periods=10) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bdate_range(end=badly_formed_date, periods=10) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bdate_range(badly_formed_date, badly_formed_date) def 
test_daterange_bug_456(self): @@ -813,8 +816,9 @@ def test_bday_near_overflow(self): def test_bday_overflow_error(self): # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError + msg = "Out of bounds nanosecond timestamp" start = pd.Timestamp.max.floor("D").to_pydatetime() - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): pd.date_range(start, periods=2, freq="B") diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 6217f225d496e..12c4abe7a1b00 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -425,3 +425,11 @@ def test_index_map(self, name): ((2018,), range(1, 7)), names=[name, name] ) tm.assert_index_equal(index, exp_index) + + def test_split_non_utc(self): + # GH 14042 + indices = pd.date_range("2016-01-01 00:00:00+0200", freq="S", periods=10) + result = np.split(indices, indices_or_sections=[])[0] + expected = indices.copy() + expected._set_freq(None) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index da1bd6f091d1a..e4785e5f80256 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -17,7 +17,7 @@ class TestDatetimeIndex(DatetimeLike): def indices(self, request): return request.param - def create_index(self): + def create_index(self) -> DatetimeIndex: return date_range("20130101", periods=5) def test_shift(self): diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 554ae76979ba8..5882f5c77428b 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -312,7 +312,8 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_fill_value_with_timezone(self): @@ -348,7 +349,8 @@ def test_take_fill_value_with_timezone(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) @@ -428,7 +430,8 @@ def test_get_loc(self): tm.assert_numpy_array_equal( idx.get_loc(time(12, 30)), np.array([]), check_dtype=False ) - with pytest.raises(NotImplementedError): + msg = "cannot yet lookup inexact labels when key is a time object" + with pytest.raises(NotImplementedError, match=msg): idx.get_loc(time(12, 30), method="pad") def test_get_loc_tz_aware(self): @@ -462,7 +465,8 @@ def test_get_loc_nat(self): def test_get_loc_timedelta_invalid_key(self, key): # GH#20464 dti = pd.date_range("1970-01-01", periods=10) - with pytest.raises(TypeError): + msg = "Cannot index DatetimeIndex with [Tt]imedelta" + with pytest.raises(TypeError, match=msg): dti.get_loc(key) def test_get_loc_reasonable_key_error(self): @@ -571,9 +575,9 @@ def test_insert(self): idx.insert(3, pd.Timestamp("2000-01-04")) with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, datetime(2000, 1, 4)) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Timezones don't match"): idx.insert(3, pd.Timestamp("2000-01-04", 
tz="US/Eastern")) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Timezones don't match"): idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))) for tz in ["US/Pacific", "Asia/Singapore"]: @@ -645,7 +649,7 @@ def test_delete(self): assert result.name == expected.name assert result.freq == expected.freq - with pytest.raises((IndexError, ValueError)): + with pytest.raises((IndexError, ValueError), match="out of bounds"): # either depending on numpy version idx.delete(5) @@ -804,5 +808,5 @@ def test_get_indexer(self): ] with pytest.raises(ValueError, match="abbreviation w/o a number"): idx.get_indexer(target, "nearest", tolerance=tol_bad) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="abbreviation w/o a number"): idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 8ed98410ad9a4..cbb598286aefe 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -4,10 +4,16 @@ import numpy as np import pytest -from pandas.core.dtypes.generic import ABCDateOffset - import pandas as pd -from pandas import DatetimeIndex, Index, Series, Timestamp, bdate_range, date_range +from pandas import ( + DateOffset, + DatetimeIndex, + Index, + Series, + Timestamp, + bdate_range, + date_range, +) import pandas._testing as tm from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour @@ -363,7 +369,7 @@ def test_equals(self): assert not idx.equals(pd.Series(idx2)) # same internal, different tz - idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz="US/Pacific") + idx3 = pd.DatetimeIndex(idx.asi8, tz="US/Pacific") tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) @@ -394,7 +400,7 @@ def test_freq_setter(self, values, freq, tz): # can set to an offset, converting from string if necessary idx._data.freq = freq assert idx.freq == freq - assert isinstance(idx.freq, ABCDateOffset) + assert isinstance(idx.freq, DateOffset) # can reset to None idx._data.freq = None diff --git a/pandas/tests/indexes/datetimes/test_shift.py b/pandas/tests/indexes/datetimes/test_shift.py index 1c87995931c62..1e21404551fa8 100644 --- a/pandas/tests/indexes/datetimes/test_shift.py +++ b/pandas/tests/indexes/datetimes/test_shift.py @@ -80,7 +80,7 @@ def test_dti_shift_int(self): def test_dti_shift_no_freq(self): # GH#19147 dti = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) - with pytest.raises(NullFrequencyError): + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): dti.shift(2) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 9c1e8cb0f563f..d2f68302d4dcf 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -319,10 +319,10 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] index = DatetimeIndex(times) tz = "US/Eastern" - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): index.tz_localize(tz=tz) - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): index.tz_localize(tz=tz, nonexistent="raise") 
result = index.tz_localize(tz=tz, nonexistent="NaT") @@ -336,7 +336,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) # With repeated hours, we can infer the transition @@ -365,7 +365,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=pd.offsets.Hour()) - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): dr.tz_localize(tz) # after dst transition, it works @@ -375,7 +375,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # November 6, 2011, fall back, repeat 2 AM hour dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=pd.offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) # UTC is OK @@ -411,11 +411,11 @@ def test_dti_tz_localize(self, prefix): tm.assert_numpy_array_equal(dti3.values, dti_utc.values) dti = pd.date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dti.tz_localize(tzstr) dti = pd.date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) @pytest.mark.parametrize( @@ -441,7 +441,7 @@ def test_dti_tz_localize_utc_conversion(self, tz): # DST ambiguity, this should fail rng = date_range("3/11/2012", "3/12/2012", freq="30T") # Is this really how it should fail?? 
- with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): rng.tz_localize(tz) def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): @@ -452,7 +452,9 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): tz = tz_aware_fixture localized = idx.tz_localize(tz) # cant localize a tz-aware object - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="Already tz-aware, use tz_convert to convert" + ): localized.tz_localize(tz) reset = localized.tz_localize(None) assert reset.tzinfo is None @@ -542,7 +544,8 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): di = DatetimeIndex(times) # When the sizes are incompatible, make sure error is raised - with pytest.raises(Exception): + msg = "Length of ambiguous bool-array must be the same size as vals" + with pytest.raises(Exception, match=msg): di.tz_localize(tz, ambiguous=is_dst) # When sizes are compatible and there are repeats ('infer' won't work) @@ -564,7 +567,7 @@ def test_dti_construction_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): date_range( "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" ) @@ -588,7 +591,7 @@ def test_dti_construction_ambiguous_endpoint(self, tz): def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): # construction with an nonexistent end-point - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): date_range( "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" ) @@ -613,10 +616,15 @@ def test_dti_tz_localize_nonexistent(self, tz, method, exp): n = 60 dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") if method == "raise": - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2015-03-29 02:00:00"): dti.tz_localize(tz, nonexistent=method) elif exp == "invalid": - with pytest.raises(ValueError): + msg = ( + "The nonexistent argument must be one of " + "'raise', 'NaT', 'shift_forward', 'shift_backward' " + "or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): dti.tz_localize(tz, nonexistent=method) else: result = dti.tz_localize(tz, nonexistent=method) @@ -1082,7 +1090,8 @@ def test_with_tz(self, tz): dr = bdate_range( datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) ) - with pytest.raises(Exception): + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(Exception, match=msg): bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) @pytest.mark.parametrize("prefix", ["", "dateutil/"]) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 837c124db2bed..fa881df8139c6 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -91,7 +91,7 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", @@ -114,7 +114,7 @@ def test_constructor_empty(self, constructor, breaks, closed): assert 
result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c2b209c810af9..efdd3fc9907a2 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -147,7 +147,7 @@ def test_ensure_copied_data(self, closed): ) # by-definition make a copy - result = IntervalIndex(index._ndarray_values, copy=False) + result = IntervalIndex(np.array(index), copy=False) tm.assert_numpy_array_equal( index.left.values, result.left.values, check_same="copy" ) diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index b909025b3f2f9..6ba565f0406ab 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -139,52 +139,3 @@ def test_drop_not_lexsorted(): tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) with tm.assert_produces_warning(PerformanceWarning): tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) - - -@pytest.mark.parametrize( - "msg,labels,level", - [ - (r"labels \[4\] not found in level", 4, "a"), - (r"labels \[7\] not found in level", 7, "b"), - ], -) -def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): - # GH 8594 - mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) - s = pd.Series([10, 20, 30], index=mi) - df = pd.DataFrame([10, 20, 30], index=mi) - - with pytest.raises(KeyError, match=msg): - s.drop(labels, level=level) - with pytest.raises(KeyError, match=msg): - df.drop(labels, level=level) - - -@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) -def test_drop_errors_ignore(labels, level): - # GH 8594 - mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) - s = pd.Series([10, 20, 30], index=mi) - df = pd.DataFrame([10, 20, 30], index=mi) - - expected_s = s.drop(labels, level=level, errors="ignore") - tm.assert_series_equal(s, expected_s) - - expected_df = df.drop(labels, level=level, errors="ignore") - tm.assert_frame_equal(df, expected_df) - - -def test_drop_with_non_unique_datetime_index_and_invalid_keys(): - # GH 30399 - - # define dataframe with unique datetime index - df = pd.DataFrame( - np.random.randn(5, 3), - columns=["a", "b", "c"], - index=pd.date_range("2012", freq="H", periods=5), - ) - # create dataframe with non-unique datetime index - df = df.iloc[[0, 2, 2, 3]].copy() - - with pytest.raises(KeyError, match="not found in axis"): - df.drop(["a", "b"]) # Dropping with labels not exist in the index diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 5e17a19335c7e..433b631ab9472 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -238,7 +238,6 @@ def test_duplicated(idx_dup, keep, expected): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated_large(keep): # GH 9125 n, k = 200, 5000 diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index d7d0ff4c411aa..b24f56afee376 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -246,6 +246,7 @@ def 
test_union(idx, sort): the_union = idx.union(idx[:0], sort=sort) assert the_union is idx + # FIXME: don't leave commented-out # won't work in python 3 # tuples = _index.values # result = _index[:4] | tuples[4:] @@ -282,6 +283,7 @@ def test_intersection(idx, sort): expected = idx[:0] assert empty.equals(expected) + # FIXME: don't leave commented-out # can't do in python 3 # tuples = _index.values # result = _index & tuples @@ -351,6 +353,17 @@ def test_union_sort_other_incomparable_sort(): idx.union(idx[:1], sort=True) +def test_union_non_object_dtype_raises(): + # GH#32646 raise NotImplementedError instead of less-informative error + mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]]) + + idx = mi.levels[1] + + msg = "Can only union MultiIndex with MultiIndex or Index of tuples" + with pytest.raises(NotImplementedError, match=msg): + mi.union(idx) + + @pytest.mark.parametrize( "method", ["union", "intersection", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index b5ff83ec7514d..cb2140d0b4025 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -147,9 +147,9 @@ def test_constructor_fromarraylike(self): msg = "freq not specified and cannot be inferred" with pytest.raises(ValueError, match=msg): - PeriodIndex(idx._ndarray_values) + PeriodIndex(idx.asi8) with pytest.raises(ValueError, match=msg): - PeriodIndex(list(idx._ndarray_values)) + PeriodIndex(list(idx.asi8)) msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 077fa2a0b1c56..a4c6764d065c9 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -795,3 +795,27 @@ def test_period_index_indexer(self): tm.assert_frame_equal(df, df.loc[list(idx)]) tm.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) tm.assert_frame_equal(df, df.loc[list(idx)]) + + +class TestAsOfLocs: + def test_asof_locs_mismatched_type(self): + dti = pd.date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + pi2 = dti.to_period("H") + + mask = np.array([0, 1, 0], dtype=bool) + + msg = "must be DatetimeIndex or PeriodIndex" + with pytest.raises(TypeError, match=msg): + pi.asof_locs(pd.Int64Index(pi.asi8), mask) + + with pytest.raises(TypeError, match=msg): + pi.asof_locs(pd.Float64Index(pi.asi8), mask) + + with pytest.raises(TypeError, match=msg): + # TimedeltaIndex + pi.asof_locs(dti - dti, mask) + + msg = "Input has different freq=H" + with pytest.raises(libperiod.IncompatibleFrequency, match=msg): + pi.asof_locs(pi2, mask) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ab3e967f12360..df2f85cd7f1e2 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -35,7 +35,7 @@ class TestPeriodIndex(DatetimeLike): def indices(self, request): return request.param - def create_index(self): + def create_index(self) -> PeriodIndex: return period_range("20130101", periods=5, freq="D") def test_pickle_compat_construction(self): @@ -161,7 +161,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01", NaT], freq="M") @@ -169,7
+169,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01-01", NaT], freq="D") @@ -177,7 +177,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) def test_period_index_length(self): pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") @@ -681,3 +681,32 @@ def test_is_monotonic_with_nat(): assert not obj.is_monotonic_increasing assert not obj.is_monotonic_decreasing assert obj.is_unique + + +@pytest.mark.parametrize("array", [True, False]) +def test_dunder_array(array): + obj = PeriodIndex(["2000-01-01", "2001-01-01"], freq="D") + if array: + obj = obj._data + + expected = np.array([obj[0], obj[1]], dtype=object) + result = np.array(obj) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(obj) + tm.assert_numpy_array_equal(result, expected) + + expected = obj.asi8 + for dtype in ["i8", "int64", np.int64]: + result = np.array(obj, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(obj, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + for dtype in ["float64", "int32", "uint64"]: + msg = "argument must be" + with pytest.raises(TypeError, match=msg): + np.array(obj, dtype=dtype) + with pytest.raises(TypeError, match=msg): + np.array(obj, dtype=getattr(np, dtype)) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index c1cc23039eeaf..61ac937f5fda0 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -30,7 +30,7 @@ class TestRangeIndex(Numeric): def indices(self, request): return request.param - def create_index(self): + def create_index(self) -> RangeIndex: return RangeIndex(start=0, stop=20, step=2) def test_can_hold_identifiers(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0c4a790646a81..5bdbc18769ce5 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -59,7 +59,7 @@ def index(self, request): # copy to avoid mutation, e.g. 
setting .name return indices_dict[key].copy() - def create_index(self): + def create_index(self) -> Index: return Index(list("abcde")) def test_can_hold_identifiers(self): @@ -2277,7 +2277,7 @@ class TestMixedIntIndex(Base): def indices(self, request): return Index(request.param) - def create_index(self): + def create_index(self) -> Index: return Index([0, "a", 1, "b", 2, "c"]) def test_argsort(self): @@ -2458,6 +2458,17 @@ def test_intersect_str_dates(self): expected = Index([], dtype=object) tm.assert_index_equal(result, expected) + def test_index_repr_bool_nan(self): + # GH32146 + arr = Index([True, False, np.nan], dtype=object) + exp1 = arr.format() + out1 = ["True", "False", "NaN"] + assert out1 == exp1 + + exp2 = repr(arr) + out2 = "Index([True, False, nan], dtype='object')" + assert out2 == exp2 + class TestIndexUtils: @pytest.mark.parametrize( @@ -2598,9 +2609,47 @@ def test_convert_almost_null_slice(indices): key = slice(None, None, "foo") if isinstance(idx, pd.IntervalIndex): - with pytest.raises(ValueError, match="cannot support not-default step"): + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): idx._convert_slice_indexer(key, "loc") else: msg = "'>=' not supported between instances of 'str' and 'int'" with pytest.raises(TypeError, match=msg): idx._convert_slice_indexer(key, "loc") + + +dtlike_dtypes = [ + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + pd.DatetimeTZDtype("ns", "Asia/Tokyo"), + pd.PeriodDtype("ns"), +] + + +@pytest.mark.parametrize("ldtype", dtlike_dtypes) +@pytest.mark.parametrize("rdtype", dtlike_dtypes) +def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): + + vals = np.tile(3600 * 10 ** 9 * np.arange(3), 2) + + def construct(dtype): + if dtype is dtlike_dtypes[-1]: + # PeriodArray will try to cast ints to strings + return pd.DatetimeIndex(vals).astype(dtype) + return pd.Index(vals, dtype=dtype) + + left = construct(ldtype) + right = construct(rdtype) + + result = left.get_indexer_non_unique(right) + + if ldtype is rdtype: + ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) + ex2 = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result[0], ex1) + tm.assert_numpy_array_equal(result[1], ex2.astype(np.int64)) + + else: + no_matches = np.array([-1] * 6, dtype=np.intp) + tm.assert_numpy_array_equal(result[0], no_matches) + tm.assert_numpy_array_equal(result[1], no_matches) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index c6ba5c9d61e9e..a220ae6361b79 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -302,32 +302,65 @@ def test_pickle(self, indices): assert indices.equals(unpickled) indices.name = original_name - @pytest.mark.parametrize("keep", ["first", "last", False]) - def test_duplicated(self, indices, keep): - if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): - # MultiIndex tested separately in: - # tests/indexes/multi/test_unique_and_duplicates - pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex") - + def test_drop_duplicates(self, indices, keep): + if isinstance(indices, MultiIndex): + pytest.skip("MultiIndex is tested separately") + if isinstance(indices, RangeIndex): + pytest.skip( + "RangeIndex is tested in test_drop_duplicates_no_duplicates " + "as it cannot hold duplicates" + ) + if len(indices) == 0: + pytest.skip( + "empty index is tested in test_drop_duplicates_no_duplicates " + "as it cannot hold duplicates" + ) + + # 
make unique index holder = type(indices) + unique_values = list(set(indices)) + unique_idx = holder(unique_values) + + # make duplicated index + n = len(unique_idx) + duplicated_selection = np.random.choice(n, int(n * 1.5)) + idx = holder(unique_idx.values[duplicated_selection]) + + # Series.duplicated is tested separately + expected_duplicated = ( + pd.Series(duplicated_selection).duplicated(keep=keep).values + ) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected_duplicated) + + # Series.drop_duplicates is tested separately + expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep)) + tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped) + + def test_drop_duplicates_no_duplicates(self, indices): + if isinstance(indices, MultiIndex): + pytest.skip("MultiIndex is tested separately") - idx = holder(indices) - if idx.has_duplicates: - # We are testing the duplicated-method here, so we need to know - # exactly which indices are duplicate and how (for the result). - # This is not possible if "idx" has duplicates already, which we - # therefore remove. This is seemingly circular, as drop_duplicates - # invokes duplicated, but in the end, it all works out because we - # cross-check with Series.duplicated, which is tested separately. - idx = idx.drop_duplicates() - - n, k = len(idx), 10 - duplicated_selection = np.random.choice(n, k * n) - expected = pd.Series(duplicated_selection).duplicated(keep=keep).values - idx = holder(idx.values[duplicated_selection]) - - result = idx.duplicated(keep=keep) - tm.assert_numpy_array_equal(result, expected) + # make unique index + if isinstance(indices, RangeIndex): + # RangeIndex cannot have duplicates + unique_idx = indices + else: + holder = type(indices) + unique_values = list(set(indices)) + unique_idx = holder(unique_values) + + # check on unique index + expected_duplicated = np.array([False] * len(unique_idx), dtype="bool") + tm.assert_numpy_array_equal(unique_idx.duplicated(), expected_duplicated) + result_dropped = unique_idx.drop_duplicates() + tm.assert_index_equal(result_dropped, unique_idx) + # validate shallow copy + assert result_dropped is not unique_idx + + def test_drop_duplicates_inplace(self, indices): + msg = r"drop_duplicates\(\) got an unexpected keyword argument" + with pytest.raises(TypeError, match=msg): + indices.drop_duplicates(inplace=True) def test_has_duplicates(self, indices): holder = type(indices) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 10d57d8616cf3..23877c2c7607a 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -118,7 +118,7 @@ def mixed_index(self): def float_index(self): return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) - def create_index(self): + def create_index(self) -> Float64Index: return Float64Index(np.arange(5, dtype="float64")) def test_repr_roundtrip(self, indices): @@ -663,7 +663,7 @@ class TestInt64Index(NumericInt): def indices(self, request): return Int64Index(request.param) - def create_index(self): + def create_index(self) -> Int64Index: # return Int64Index(np.arange(5, dtype="int64")) return Int64Index(range(0, 20, 2)) @@ -801,7 +801,7 @@ def index_large(self): large = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] return UInt64Index(large) - def create_index(self): + def create_index(self) -> UInt64Index: # compat with shared Int64/Float64 tests; use index_large for UInt64 only tests return UInt64Index(np.arange(5, dtype="uint64")) diff --git 
a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 6606507dabc29..4af5df6e2cc55 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -3,13 +3,11 @@ import numpy as np import pytest -from pandas.core.dtypes.generic import ABCDateOffset - import pandas as pd from pandas import Series, TimedeltaIndex, timedelta_range import pandas._testing as tm -from pandas.tseries.offsets import Day, Hour +from pandas.tseries.offsets import DateOffset, Day, Hour class TestTimedeltaIndexOps: @@ -263,7 +261,7 @@ def test_freq_setter(self, values, freq): # can set to an offset, converting from string if necessary idx._data.freq = freq assert idx.freq == freq - assert isinstance(idx.freq, ABCDateOffset) + assert isinstance(idx.freq, DateOffset) # can reset to None idx._data.freq = None diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 44f4a2adedaad..1b86cd1df5a7a 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -69,3 +69,67 @@ def test_tdi_round(self): td.round(freq="M") with pytest.raises(ValueError, match=msg): elt.round(freq="M") + + @pytest.mark.parametrize( + "freq,msg", + [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) + def test_tdi_round_invalid(self, freq, msg): + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + + with pytest.raises(ValueError, match=msg): + t1.round(freq) + with pytest.raises(ValueError, match=msg): + # Same test for TimedeltaArray + t1._data.round(freq) + + # TODO: de-duplicate with test_tdi_round + def test_round(self): + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + t2 = -1 * t1 + t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") + t1c = TimedeltaIndex([1, 1, 1], unit="D") + + # note that negative times round DOWN! 
so don't give whole numbers + for (freq, s1, s2) in [ + ("N", t1, t2), + ("U", t1, t2), + ( + "L", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ), + ), + ( + "S", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ), + ), + ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), + ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), + ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), + ]: + + r1 = t1.round(freq) + tm.assert_index_equal(r1, s1) + r2 = t2.round(freq) + tm.assert_index_equal(r2, s2) + + def test_components(self): + rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") + rng.components + + # with nat + s = Series(rng) + s[1] = np.nan + + result = s.dt.components + assert not result.iloc[0].isna().all() + assert result.iloc[1].isna().all() diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index d4a94f8693081..971203d6fc720 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -28,7 +28,7 @@ class TestTimedeltaIndex(DatetimeLike): def indices(self): return tm.makeTimedeltaIndex(10) - def create_index(self): + def create_index(self) -> TimedeltaIndex: return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) def test_numeric_compat(self): diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 43036fbbd9844..03c3034772bc6 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -128,9 +128,6 @@ def test_loc_with_slices(self): with pytest.raises(NotImplementedError, match=msg): s[Interval(3, 4, closed="left") :] - # TODO with non-existing intervals ? 
- # s.loc[Interval(-1, 0):Interval(2, 3)] - # slice of scalar expected = s.iloc[:3] @@ -143,9 +140,32 @@ def test_loc_with_slices(self): tm.assert_series_equal(expected, s[:2.5]) tm.assert_series_equal(expected, s[0.1:2.5]) - # slice of scalar with step != 1 - with pytest.raises(ValueError): - s[0:4:2] + def test_slice_step_ne1(self): + # GH#31658 slice of scalar with step != 1 + s = self.s + expected = s.iloc[0:4:2] + + result = s[0:4:2] + tm.assert_series_equal(result, expected) + + result2 = s[0:4][::2] + tm.assert_series_equal(result2, expected) + + def test_slice_float_start_stop(self): + # GH#31658 slicing with integers is positional, with floats is not + # supported + ser = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): + ser[1.5:9.5:2] + + def test_slice_interval_step(self): + # GH#31658 allows for integer step!=1, not Interval step + s = self.s + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): + s[0 : 4 : Interval(0, 1)] def test_loc_with_overlap(self): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 683d4f2605712..9664f8d7212ad 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -349,7 +349,6 @@ def test_iloc_setitem_dups(self): df = concat([df1, df2], axis=1) expected = df.fillna(3) - expected["A"] = expected["A"].astype("float64") inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] @@ -694,3 +693,43 @@ def test_series_indexing_zerodim_np_array(self): s = Series([1, 2]) result = s.iloc[np.array(0)] assert result == 1 + + def test_iloc_setitem_categorical_updates_inplace(self): + # Mixed dtype ensures we go through take_split_path in setitem_with_indexer + cat = pd.Categorical(["A", "B", "C"]) + df = pd.DataFrame({1: cat, 2: [1, 2, 3]}) + + # This should modify our original values in-place + df.iloc[:, 0] = cat[::-1] + + expected = pd.Categorical(["C", "B", "A"]) + tm.assert_categorical_equal(cat, expected) + + +class TestILocSetItemDuplicateColumns: + def test_iloc_setitem_scalar_duplicate_columns(self): + # GH#15686, duplicate columns and mixed dtype + df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df = pd.concat([df1, df2], axis=1) + df.iloc[0, 0] = -1 + + assert df.iloc[0, 0] == -1 + assert df.iloc[0, 2] == 3 + assert df.dtypes.iloc[2] == np.int64 + + def test_iloc_setitem_list_duplicate_columns(self): + # GH#22036 setting with same-sized list + df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"]) + + df.iloc[:, 2] = ["str3"] + + expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"]) + tm.assert_frame_equal(df, expected) + + def test_iloc_setitem_series_duplicate_columns(self): + df = pd.DataFrame( + np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"] + ) + df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) + assert df.dtypes.iloc[2] == np.int64 diff --git a/pandas/tests/indexing/test_indexers.py b/pandas/tests/indexing/test_indexers.py index 173f33b19f8d5..35c0c06e86099 100644 --- a/pandas/tests/indexing/test_indexers.py +++ b/pandas/tests/indexing/test_indexers.py @@ -1,7 +1,7 @@ # Tests aimed at pandas.core.indexers import numpy as np -from pandas.core.indexers import length_of_indexer +from pandas.core.indexers import 
is_scalar_indexer, length_of_indexer def test_length_of_indexer(): @@ -9,3 +9,20 @@ def test_length_of_indexer(): arr[0] = 1 result = length_of_indexer(arr) assert result == 1 + + +def test_is_scalar_indexer(): + indexer = (0, 1) + assert is_scalar_indexer(indexer, 2) + assert not is_scalar_indexer(indexer[0], 2) + + indexer = (np.array([2]), 1) + assert is_scalar_indexer(indexer, 2) + + indexer = (np.array([2]), np.array([3])) + assert is_scalar_indexer(indexer, 2) + + indexer = (np.array([2]), np.array([3, 4])) + assert not is_scalar_indexer(indexer, 2) + + assert not is_scalar_indexer(slice(None), 1) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 8af0fe548e48a..a8a21b0610c14 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -52,9 +52,6 @@ def test_setitem_ndarray_1d(self): with pytest.raises(ValueError): df[2:5] = np.arange(1, 4) * 1j - @pytest.mark.parametrize( - "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ - ) @pytest.mark.parametrize( "obj", [ @@ -71,9 +68,9 @@ def test_setitem_ndarray_1d(self): (lambda x: x.iloc, "iloc"), ], ) - def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): + def test_getitem_ndarray_3d(self, indices, obj, idxr, idxr_id): # GH 25567 - obj = obj(index) + obj = obj(indices) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) @@ -83,16 +80,16 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): "Cannot index with multidimensional key", r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]", "Index data must be 1-dimensional", + "positional indexers are out-of-bounds", + "Indexing a MultiIndex with a multidimensional key is not implemented", ] ) - with pytest.raises(ValueError, match=msg): + potential_errors = (IndexError, ValueError, NotImplementedError) + with pytest.raises(potential_errors, match=msg): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] - @pytest.mark.parametrize( - "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ - ) @pytest.mark.parametrize( "obj", [ @@ -109,17 +106,25 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): (lambda x: x.iloc, "iloc"), ], ) - def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): + def test_setitem_ndarray_3d(self, indices, obj, idxr, idxr_id): # GH 25567 - obj = obj(index) + obj = obj(indices) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) + if ( + (len(indices) == 0) + and (idxr_id == "iloc") + and isinstance(obj, pd.DataFrame) + ): + # gh-32896 + pytest.skip("This is currently failing. 
There's an xfailed test below.") + if idxr_id == "iloc": err = ValueError msg = f"Cannot set values with ndim > {obj.ndim}" elif ( - isinstance(index, pd.IntervalIndex) + isinstance(indices, pd.IntervalIndex) and idxr_id == "setitem" and obj.ndim == 1 ): @@ -134,6 +139,17 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): with pytest.raises(err, match=msg): idxr[nd3] = 0 + @pytest.mark.xfail(reason="gh-32896") + def test_setitem_ndarray_3d_does_not_fail_for_iloc_empty_dataframe(self): + # when fixing this, please remove the pytest.skip in test_setitem_ndarray_3d + i = Index([]) + obj = DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i) + nd3 = np.random.randint(5, size=(2, 2, 2)) + + msg = f"Cannot set values with ndim > {obj.ndim}" + with pytest.raises(ValueError, match=msg): + obj.iloc[nd3] = 0 + def test_inf_upcast(self): # GH 16957 # We should be able to use np.inf as a key diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4d042af8d59b4..ee92e5a69204d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -631,6 +631,64 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): assert is_scalar(result) and result == "Z" + @pytest.mark.parametrize( + "index,box,expected", + [ + ( + ([0, 2], ["A", "B", "C", "D"]), + 7, + pd.DataFrame( + [[7, 7, 7, 7], [3, 4, np.nan, np.nan], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (1, ["C", "D"]), + [7, 8], + pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (1, ["A", "B", "C"]), + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], + columns=["A", "B", "C"], + ), + ), + ( + (slice(1, 3, None), ["B", "C", "D"]), + [[7, 8, 9], [10, 11, 12]], + pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (slice(1, 3, None), ["C", "A", "D"]), + np.array([[7, 8, 9], [10, 11, 12]], dtype=np.int64), + pd.DataFrame( + [[1, 2, np.nan, np.nan], [8, 4, 7, 9], [11, 6, 10, 12]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (slice(None, None, None), ["A", "C"]), + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_loc_setitem_missing_columns(self, index, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df.loc[index] = box + tm.assert_frame_equal(df, expected) + def test_loc_coercion(self): # 12411 @@ -863,6 +921,7 @@ def test_loc_setitem_empty_append_raises(self): data = [1, 2] df = DataFrame(columns=["x", "y"]) + df.index = df.index.astype(np.int64) msg = ( r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " r"are in the \[index\]" @@ -975,3 +1034,64 @@ def test_loc_mixed_int_float(): result = ser.loc[1] assert result == 0 + + +def test_loc_with_positional_slice_deprecation(): + # GH#31840 + ser = pd.Series(range(4), index=["A", "B", "C", "D"]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + ser.loc[:3] = 2 + + expected = pd.Series([2, 2, 2, 3], index=["A", "B", "C", "D"]) + tm.assert_series_equal(ser, expected) + + +def test_loc_slice_disallows_positional(): + # GH#16121, GH#24612, GH#31810 + dti = pd.date_range("2016-01-01", periods=3) + df = pd.DataFrame(np.random.random((3, 2)), index=dti) + + ser = df[0] + + msg = ( + "cannot do slice indexing on 
DatetimeIndex with these " + r"indexers \[1\] of type int" + ) + + for obj in [df, ser]: + with pytest.raises(TypeError, match=msg): + obj.loc[1:3] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # GH#31840 deprecated incorrect behavior + obj.loc[1:3] = 1 + + with pytest.raises(TypeError, match=msg): + df.loc[1:3, 1] + + with tm.assert_produces_warning(FutureWarning): + # GH#31840 deprecated incorrect behavior + df.loc[1:3, 1] = 2 + + +def test_loc_datetimelike_mismatched_dtypes(): + # GH#32650 dont mix and match datetime/timedelta/period dtypes + + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique DatetimeIndex + df = df.iloc[[0, 2, 2, 3]].copy() + + dti = df.index + tdi = pd.TimedeltaIndex(dti.asi8) # matching i8 values + + msg = r"None of \[TimedeltaIndex.* are in the \[index\]" + with pytest.raises(KeyError, match=msg): + df.loc[tdi] + + with pytest.raises(KeyError, match=msg): + df["a"].loc[tdi] diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py index 345ca30ec77eb..9e8ef6e6e1c22 100644 --- a/pandas/tests/indexing/test_na_indexing.py +++ b/pandas/tests/indexing/test_na_indexing.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize( "values, dtype", [ + ([], "object"), ([1, 2, 3], "int64"), ([1.0, 2.0, 3.0], "float64"), (["a", "b", "c"], "object"), @@ -22,42 +23,43 @@ @pytest.mark.parametrize( "mask", [[True, False, False], [True, True, True], [False, False, False]] ) -@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("indexer_class", [list, pd.array, pd.Index, pd.Series]) @pytest.mark.parametrize("frame", [True, False]) -def test_series_mask_boolean(values, dtype, mask, box_mask, frame): - ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) - if frame: - ser = ser.to_frame() - mask = pd.array(mask, dtype="boolean") - if box_mask: - mask = pd.Series(mask, index=ser.index) - - expected = ser[mask.astype("bool")] +def test_series_mask_boolean(values, dtype, mask, indexer_class, frame): + # In case len(values) < 3 + index = ["a", "b", "c"][: len(values)] + mask = mask[: len(values)] - result = ser[mask] - tm.assert_equal(result, expected) - - if not box_mask: - # Series.iloc[Series[bool]] isn't allowed - result = ser.iloc[mask] - tm.assert_equal(result, expected) + obj = pd.Series(values, dtype=dtype, index=index) + if frame: + if len(values) == 0: + # Otherwise obj is an empty DataFrame with shape (0, 1) + obj = pd.DataFrame(dtype=dtype) + else: + obj = obj.to_frame() + + if indexer_class is pd.array: + mask = pd.array(mask, dtype="boolean") + elif indexer_class is pd.Series: + mask = pd.Series(mask, index=obj.index, dtype="boolean") + else: + mask = indexer_class(mask) - result = ser.loc[mask] - tm.assert_equal(result, expected) + expected = obj[mask] - # empty - mask = mask[:0] - ser = ser.iloc[:0] - expected = ser[mask.astype("bool")] - result = ser[mask] + result = obj[mask] tm.assert_equal(result, expected) - if not box_mask: - # Series.iloc[Series[bool]] isn't allowed - result = ser.iloc[mask] + if indexer_class is pd.Series: + msg = "iLocation based boolean indexing cannot use an indexable as a mask" + with pytest.raises(ValueError, match=msg): + result = obj.iloc[mask] + tm.assert_equal(result, expected) + else: + result = obj.iloc[mask] tm.assert_equal(result, expected) - result = ser.loc[mask] + result = obj.loc[mask] tm.assert_equal(result, expected) diff --git 
a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a569726e9a22a..bbf968aef4a5c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -207,7 +207,6 @@ def setup_method(self, method): self.cblock = create_block("complex", [7]) self.oblock = create_block("object", [1, 3]) self.bool_block = create_block("bool", [5]) - self.int_block = create_block("int", [6]) def test_constructor(self): int32block = create_block("i4", [0]) @@ -334,13 +333,9 @@ def test_pickle(self, mgr): assert not mgr2._is_consolidated assert not mgr2._known_consolidated - def test_non_unique_pickle(self): - - mgr = create_mgr("a,a,a:f8") - mgr2 = tm.round_trip_pickle(mgr) - tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - - mgr = create_mgr("a: f8; a: i8") + @pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"]) + def test_non_unique_pickle(self, mgr_string): + mgr = create_mgr(mgr_string) mgr2 = tm.round_trip_pickle(mgr) tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) @@ -428,22 +423,25 @@ def test_sparse_mixed(self): # TODO: what to test here? - def test_as_array_float(self): - mgr = create_mgr("c: f4; d: f2; e: f8") - assert mgr.as_array().dtype == np.float64 - - mgr = create_mgr("c: f4; d: f2") - assert mgr.as_array().dtype == np.float32 - - def test_as_array_int_bool(self): - mgr = create_mgr("a: bool-1; b: bool-2") - assert mgr.as_array().dtype == np.bool_ - - mgr = create_mgr("a: i8-1; b: i8-2; c: i4; d: i2; e: u1") - assert mgr.as_array().dtype == np.int64 + @pytest.mark.parametrize( + "mgr_string, dtype", + [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)], + ) + def test_as_array_float(self, mgr_string, dtype): + mgr = create_mgr(mgr_string) + assert mgr.as_array().dtype == dtype - mgr = create_mgr("c: i4; d: i2; e: u1") - assert mgr.as_array().dtype == np.int32 + @pytest.mark.parametrize( + "mgr_string, dtype", + [ + ("a: bool-1; b: bool-2", np.bool_), + ("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64), + ("c: i4; d: i2; e: u1", np.int32), + ], + ) + def test_as_array_int_bool(self, mgr_string, dtype): + mgr = create_mgr(mgr_string) + assert mgr.as_array().dtype == dtype def test_as_array_datetime(self): mgr = create_mgr("h: datetime-1; g: datetime-2") @@ -541,8 +539,14 @@ def _compare(old_mgr, new_mgr): assert new_mgr.get("g").dtype == np.float64 assert new_mgr.get("h").dtype == np.float16 - def test_interleave(self): + def test_invalid_ea_block(self): + with pytest.raises(AssertionError, match="block.size != values.size"): + create_mgr("a: category; b: category") + + with pytest.raises(AssertionError, match="block.size != values.size"): + create_mgr("a: category2; b: category2") + def test_interleave(self): # self for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: mgr = create_mgr(f"a: {dtype}") @@ -550,17 +554,37 @@ def test_interleave(self): mgr = create_mgr(f"a: {dtype}; b: {dtype}") assert mgr.as_array().dtype == dtype + @pytest.mark.parametrize( + "mgr_string, dtype", + [ + ("a: category", "i8"), + ("a: category; b: category", "i8"), + ("a: category; b: category2", "object"), + ("a: category2", "object"), + ("a: category2; b: category2", "object"), + ("a: f8", "f8"), + ("a: f8; b: i8", "f8"), + ("a: f4; b: i8", "f8"), + ("a: f4; b: i8; d: object", "object"), + ("a: bool; b: i8", "object"), + ("a: complex", "complex"), + ("a: f8; b: category", "object"), + ("a: M8[ns]; b: category", "object"), + ("a: M8[ns]; b: bool", "object"), + ("a: M8[ns]; 
b: i8", "object"), + ("a: m8[ns]; b: bool", "object"), + ("a: m8[ns]; b: i8", "object"), + ("a: M8[ns]; b: m8[ns]", "object"), + ], + ) + def test_interleave_dtype(self, mgr_string, dtype): # will be converted according the actual dtype of the underlying mgr = create_mgr("a: category") assert mgr.as_array().dtype == "i8" - mgr = create_mgr("a: category; b: category") - assert mgr.as_array().dtype == "i8" mgr = create_mgr("a: category; b: category2") assert mgr.as_array().dtype == "object" mgr = create_mgr("a: category2") assert mgr.as_array().dtype == "object" - mgr = create_mgr("a: category2; b: category2") - assert mgr.as_array().dtype == "object" # combinations mgr = create_mgr("a: f8") @@ -687,13 +711,12 @@ def test_get_bool_data(self): def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) - def test_equals(self): + @pytest.mark.parametrize( + "mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"] + ) + def test_equals(self, mgr_string): # unique items - bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") - bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) - assert bm1.equals(bm2) - - bm1 = create_mgr("a,a,a: i8-1; b,b,b: i8-2") + bm1 = create_mgr(mgr_string) bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) @@ -703,7 +726,7 @@ def test_equals(self): "a:i8;b:f8", # basic case "a:i8;b:f8;c:c8;d:b", # many types "a:i8;e:dt;f:td;g:string", # more types - "a:i8;b:category;c:category2;d:category2", # categories + "a:i8;b:category;c:category2", # categories "c:sparse;d:sparse_na;b:f8", # sparse ], ) @@ -903,97 +926,111 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): class TestBlockPlacement: - def test_slice_len(self): - assert len(BlockPlacement(slice(0, 4))) == 4 - assert len(BlockPlacement(slice(0, 4, 2))) == 2 - assert len(BlockPlacement(slice(0, 3, 2))) == 2 - - assert len(BlockPlacement(slice(0, 1, 2))) == 1 - assert len(BlockPlacement(slice(1, 0, -1))) == 1 + @pytest.mark.parametrize( + "slc, expected", + [ + (slice(0, 4), 4), + (slice(0, 4, 2), 2), + (slice(0, 3, 2), 2), + (slice(0, 1, 2), 1), + (slice(1, 0, -1), 1), + ], + ) + def test_slice_len(self, slc, expected): + assert len(BlockPlacement(slc)) == expected - def test_zero_step_raises(self): + @pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)]) + def test_zero_step_raises(self, slc): msg = "slice step cannot be zero" - with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 1, 0)) + BlockPlacement(slc) + + @pytest.mark.parametrize( + "slc", + [ + slice(None, None), + slice(10, None), + slice(None, None, -1), + slice(None, 10, -1), + # These are "unbounded" because negative index will + # change depending on container shape. + slice(-1, None), + slice(None, -1), + slice(-1, -1), + slice(-1, None, -1), + slice(None, -1, -1), + slice(-1, -1, -1), + ], + ) + def test_unbounded_slice_raises(self, slc): + msg = "unbounded slice" with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 2, 0)) - - def test_unbounded_slice_raises(self): - def assert_unbounded_slice_error(slc): - with pytest.raises(ValueError, match="unbounded slice"): - BlockPlacement(slc) - - assert_unbounded_slice_error(slice(None, None)) - assert_unbounded_slice_error(slice(10, None)) - assert_unbounded_slice_error(slice(None, None, -1)) - assert_unbounded_slice_error(slice(None, 10, -1)) - - # These are "unbounded" because negative index will change depending on - # container shape. 
- assert_unbounded_slice_error(slice(-1, None)) - assert_unbounded_slice_error(slice(None, -1)) - assert_unbounded_slice_error(slice(-1, -1)) - assert_unbounded_slice_error(slice(-1, None, -1)) - assert_unbounded_slice_error(slice(None, -1, -1)) - assert_unbounded_slice_error(slice(-1, -1, -1)) - - def test_not_slice_like_slices(self): - def assert_not_slice_like(slc): - assert not BlockPlacement(slc).is_slice_like - - assert_not_slice_like(slice(0, 0)) - assert_not_slice_like(slice(100, 0)) - - assert_not_slice_like(slice(100, 100, -1)) - assert_not_slice_like(slice(0, 100, -1)) - - assert not BlockPlacement(slice(0, 0)).is_slice_like - assert not BlockPlacement(slice(100, 100)).is_slice_like - - def test_array_to_slice_conversion(self): - def assert_as_slice_equals(arr, slc): - assert BlockPlacement(arr).as_slice == slc - - assert_as_slice_equals([0], slice(0, 1, 1)) - assert_as_slice_equals([100], slice(100, 101, 1)) - - assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) - assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) - assert_as_slice_equals([0, 100], slice(0, 200, 100)) - - assert_as_slice_equals([2, 1], slice(2, 0, -1)) - - def test_not_slice_like_arrays(self): - def assert_not_slice_like(arr): - assert not BlockPlacement(arr).is_slice_like - - assert_not_slice_like([]) - assert_not_slice_like([-1]) - assert_not_slice_like([-1, -2, -3]) - assert_not_slice_like([-10]) - assert_not_slice_like([-1]) - assert_not_slice_like([-1, 0, 1, 2]) - assert_not_slice_like([-2, 0, 2, 4]) - assert_not_slice_like([1, 0, -1]) - assert_not_slice_like([1, 1, 1]) - - def test_slice_iter(self): - assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2] - assert list(BlockPlacement(slice(0, 0))) == [] - assert list(BlockPlacement(slice(3, 0))) == [] - - def test_slice_to_array_conversion(self): - def assert_as_array_equals(slc, asarray): - tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, np.asarray(asarray, dtype=np.int64) - ) + BlockPlacement(slc) + + @pytest.mark.parametrize( + "slc", + [ + slice(0, 0), + slice(100, 0), + slice(100, 100), + slice(100, 100, -1), + slice(0, 100, -1), + ], + ) + def test_not_slice_like_slices(self, slc): + assert not BlockPlacement(slc).is_slice_like + + @pytest.mark.parametrize( + "arr, slc", + [ + ([0], slice(0, 1, 1)), + ([100], slice(100, 101, 1)), + ([0, 1, 2], slice(0, 3, 1)), + ([0, 5, 10], slice(0, 15, 5)), + ([0, 100], slice(0, 200, 100)), + ([2, 1], slice(2, 0, -1)), + ], + ) + def test_array_to_slice_conversion(self, arr, slc): + assert BlockPlacement(arr).as_slice == slc + + @pytest.mark.parametrize( + "arr", + [ + [], + [-1], + [-1, -2, -3], + [-10], + [-1], + [-1, 0, 1, 2], + [-2, 0, 2, 4], + [1, 0, -1], + [1, 1, 1], + ], + ) + def test_not_slice_like_arrays(self, arr): + assert not BlockPlacement(arr).is_slice_like - assert_as_array_equals(slice(0, 3), [0, 1, 2]) - assert_as_array_equals(slice(0, 0), []) - assert_as_array_equals(slice(3, 0), []) + @pytest.mark.parametrize( + "slc, expected", + [(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])], + ) + def test_slice_iter(self, slc, expected): + assert list(BlockPlacement(slc)) == expected - assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + @pytest.mark.parametrize( + "slc, arr", + [ + (slice(0, 3), [0, 1, 2]), + (slice(0, 0), []), + (slice(3, 0), []), + (slice(3, 0, -1), [3, 2, 1]), + ], + ) + def test_slice_to_array_conversion(self, slc, arr): + tm.assert_numpy_array_equal( + BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64) + ) def test_blockplacement_add(self): bpl 
= BlockPlacement(slice(0, 5)) @@ -1001,30 +1038,30 @@ def test_blockplacement_add(self): assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2) assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5] - def test_blockplacement_add_int(self): - def assert_add_equals(val, inc, result): - assert list(BlockPlacement(val).add(inc)) == result - - assert_add_equals(slice(0, 0), 0, []) - assert_add_equals(slice(1, 4), 0, [1, 2, 3]) - assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) - assert_add_equals([1, 2, 4], 0, [1, 2, 4]) - - assert_add_equals(slice(0, 0), 10, []) - assert_add_equals(slice(1, 4), 10, [11, 12, 13]) - assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) - assert_add_equals([1, 2, 4], 10, [11, 12, 14]) - - assert_add_equals(slice(0, 0), -1, []) - assert_add_equals(slice(1, 4), -1, [0, 1, 2]) - assert_add_equals([1, 2, 4], -1, [0, 1, 3]) + @pytest.mark.parametrize( + "val, inc, expected", + [ + (slice(0, 0), 0, []), + (slice(1, 4), 0, [1, 2, 3]), + (slice(3, 0, -1), 0, [3, 2, 1]), + ([1, 2, 4], 0, [1, 2, 4]), + (slice(0, 0), 10, []), + (slice(1, 4), 10, [11, 12, 13]), + (slice(3, 0, -1), 10, [13, 12, 11]), + ([1, 2, 4], 10, [11, 12, 14]), + (slice(0, 0), -1, []), + (slice(1, 4), -1, [0, 1, 2]), + ([1, 2, 4], -1, [0, 1, 3]), + ], + ) + def test_blockplacement_add_int(self, val, inc, expected): + assert list(BlockPlacement(val).add(inc)) == expected + @pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]]) + def test_blockplacement_add_int_raises(self, val): msg = "iadd causes length change" - - with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 4)).add(-10) with pytest.raises(ValueError, match=msg): - BlockPlacement([1, 2, 4]).add(-10) + BlockPlacement(val).add(-10) class DummyElement: @@ -1152,6 +1189,23 @@ def test_binop_other(self, op, value, dtype): tm.assert_series_equal(result, expected) +class TestShouldStore: + def test_should_store_categorical(self): + cat = pd.Categorical(["A", "B", "C"]) + df = pd.DataFrame(cat) + blk = df._data.blocks[0] + + # matching dtype + assert blk.should_store(cat) + assert blk.should_store(cat[:-1]) + + # different dtype + assert not blk.should_store(cat.as_ordered()) + + # ndarray instead of Categorical + assert not blk.should_store(np.asarray(cat)) + + @pytest.mark.parametrize( "typestr, holder", [ diff --git a/pandas/tests/io/data/excel/high_surrogate.xlsx b/pandas/tests/io/data/excel/high_surrogate.xlsx new file mode 100644 index 0000000000000..1e29b6bee6586 Binary files /dev/null and b/pandas/tests/io/data/excel/high_surrogate.xlsx differ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 10ed192062d9c..60c943d95e510 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -114,7 +114,7 @@ def test_to_excel_with_openpyxl_engine(ext, tmpdir): df2 = DataFrame({"B": np.linspace(1, 20, 10)}) df = pd.concat([df1, df2], axis=1) styled = df.style.applymap( - lambda val: "color: %s" % "red" if val < 0 else "black" + lambda val: "color: %s" % ("red" if val < 0 else "black") ).highlight_max() filename = tmpdir / "styled.xlsx" diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a59b409809eed..b1502ed3f3c09 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -426,7 +426,8 @@ def test_reader_dtype(self, read_ext): expected["c"] = ["001", "002", "003", "004"] tm.assert_frame_equal(actual, expected) - with pytest.raises(ValueError): + msg = "Unable to 
convert column d to type int64" + with pytest.raises(ValueError, match=msg): pd.read_excel(basename + read_ext, dtype={"d": "int64"}) @pytest.mark.parametrize( @@ -629,6 +630,17 @@ def test_read_from_py_localpath(self, read_ext): tm.assert_frame_equal(expected, actual) + @td.check_file_leaks + def test_close_from_py_localpath(self, read_ext): + + # GH31467 + str_path = os.path.join("test1" + read_ext) + with open(str_path, "rb") as f: + x = pd.read_excel(f, "Sheet1", index_col=0) + del x + # should not throw an exception because the passed file was closed + f.read() + def test_reader_seconds(self, read_ext): if pd.read_excel.keywords["engine"] == "pyxlsb": pytest.xfail("Sheets containing datetimes not supported by pyxlsb") @@ -811,13 +823,15 @@ def test_excel_old_index_format(self, read_ext): def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 + msg = "Passing a bool to header is invalid" for arg in [True, False]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) def test_read_excel_chunksize(self, read_ext): # GH 8011 - with pytest.raises(NotImplementedError): + msg = "chunksize keyword of read_excel is not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.read_excel("test1" + read_ext, chunksize=100) def test_read_excel_skiprows_list(self, read_ext): @@ -1020,10 +1034,10 @@ def test_excel_read_buffer(self, engine, read_ext): tm.assert_frame_equal(expected, actual) def test_reader_closes_file(self, engine, read_ext): - f = open("test1" + read_ext, "rb") - with pd.ExcelFile(f) as xlsx: - # parses okay - pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) + with open("test1" + read_ext, "rb") as f: + with pd.ExcelFile(f) as xlsx: + # parses okay + pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) assert f.closed @@ -1044,3 +1058,11 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + + def test_excel_high_surrogate(self, engine): + # GH 23809 + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 506d223dbedb4..0811f2f822198 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -330,7 +330,8 @@ def test_excel_sheet_by_name_raise(self, path): tm.assert_frame_equal(gt, df) - with pytest.raises(xlrd.XLRDError): + msg = "No sheet named <'0'>" + with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, frame, path): @@ -452,7 +453,7 @@ def test_float_types(self, np_type, path): reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) - tm.assert_frame_equal(df, recons, check_dtype=False) + tm.assert_frame_equal(df, recons) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, np_type, path): @@ -564,7 +565,7 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=[0, 1]) - tm.assert_frame_equal(df, recons, check_less_precise=True) + tm.assert_frame_equal(df, recons) def test_excel_roundtrip_indexname(self, merge_cells, path): df = DataFrame(np.random.randn(10, 4)) @@ -973,7 +974,11 @@ def roundtrip(data, 
header=True, parser_hdr=0, index=True): # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. if c_idx_nlevels > 1: - with pytest.raises(NotImplementedError): + msg = ( + "Writing to Excel with MultiIndex columns and no index " + "\\('index'=False\\) is not yet implemented." + ) + with pytest.raises(NotImplementedError, match=msg): roundtrip(df, use_headers, index=False) else: res = roundtrip(df, use_headers) diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index 01feab08eb5e3..a2d8b9fce9767 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -18,7 +18,12 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): [("site", ""), ("2014", "height"), ("2014", "weight")] ) df = DataFrame(np.random.randn(10, 3), columns=cols) - with pytest.raises(NotImplementedError): + + msg = ( + "Writing to Excel with MultiIndex columns and no index " + "\\('index'=False\\) is not yet implemented." + ) + with pytest.raises(NotImplementedError, match=msg): with tm.ensure_clean(ext) as path: df.to_excel(path, index=False) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index bf7b98eb78f11..1a5d122d732a9 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1508,7 +1508,8 @@ def test_to_string_specified_header(self): assert df_s == expected - with pytest.raises(ValueError): + msg = "Writing 2 cols but got 1 aliases" + with pytest.raises(ValueError, match=msg): df.to_string(header=["X"]) def test_to_string_no_index(self): diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index a2659079be7c0..ec4614538004c 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -37,7 +37,8 @@ def h(x, foo="bar"): ] def test_init_non_pandas(self): - with pytest.raises(TypeError): + msg = "``data`` must be a Series or DataFrame" + with pytest.raises(TypeError, match=msg): Styler([1, 2, 3]) def test_init_series(self): @@ -1013,7 +1014,8 @@ def test_bar_align_zero_nans(self): def test_bar_bad_align_raises(self): df = pd.DataFrame({"A": [-100, -60, -30, -20]}) - with pytest.raises(ValueError): + msg = "`align` must be one of {'left', 'zero',' mid'}" + with pytest.raises(ValueError, match=msg): df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) def test_format_with_na_rep(self): @@ -1082,7 +1084,8 @@ def test_format_non_numeric_na(self): def test_format_with_bad_na_rep(self): # GH 21527 28358 df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) - with pytest.raises(TypeError): + msg = "Expected a string, got -1 instead" + with pytest.raises(TypeError, match=msg): df.style.format(None, na_rep=-1) def test_highlight_null(self, null_color="red"): @@ -1091,12 +1094,30 @@ def test_highlight_null(self, null_color="red"): expected = {(0, 0): [""], (1, 0): ["background-color: red"]} assert result == expected + def test_highlight_null_subset(self): + # GH 31345 + df = pd.DataFrame({"A": [0, np.nan], "B": [0, np.nan]}) + result = ( + df.style.highlight_null(null_color="red", subset=["A"]) + .highlight_null(null_color="green", subset=["B"]) + ._compute() + .ctx + ) + expected = { + (0, 0): [""], + (1, 0): ["background-color: red"], + (0, 1): [""], + (1, 1): ["background-color: green"], + } + assert result == expected + def test_nonunique_raises(self): df = pd.DataFrame([[1, 2]], columns=["A", "A"]) - with 
pytest.raises(ValueError): + msg = "style is not supported for non-unique indices." + with pytest.raises(ValueError, match=msg): df.style - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Styler(df) def test_caption(self): @@ -1243,9 +1264,12 @@ def test_display_format(self): def test_display_format_raises(self): df = pd.DataFrame(np.random.randn(2, 2)) - with pytest.raises(TypeError): + msg = "Expected a template string or callable, got 5 instead" + with pytest.raises(TypeError, match=msg): df.style.format(5) - with pytest.raises(TypeError): + + msg = "Expected a template string or callable, got True instead" + with pytest.raises(TypeError, match=msg): df.style.format(True) def test_display_set_precision(self): @@ -1318,19 +1342,21 @@ def test_display_dict(self): def test_bad_apply_shape(self): df = pd.DataFrame([[1, 2], [3, 4]]) - with pytest.raises(ValueError): + msg = "returned the wrong shape" + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: "x", subset=pd.IndexSlice[[0, 1], :]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: [""], subset=pd.IndexSlice[[0, 1], :]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: ["", "", "", ""]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: ["", "", ""], subset=1) - with pytest.raises(ValueError): + msg = "Length mismatch: Expected axis has 3 elements" + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: ["", "", ""], axis=1) def test_apply_bad_return(self): @@ -1338,7 +1364,8 @@ def f(x): return "" df = pd.DataFrame([[1, 2], [3, 4]]) - with pytest.raises(TypeError): + msg = "must return a DataFrame when passed to `Styler.apply` with axis=None" + with pytest.raises(TypeError, match=msg): df.style._apply(f, axis=None) def test_apply_bad_labels(self): @@ -1346,7 +1373,8 @@ def f(x): return pd.DataFrame(index=[1, 2], columns=["a", "b"]) df = pd.DataFrame([[1, 2], [3, 4]]) - with pytest.raises(ValueError): + msg = "must have identical index and columns as the input" + with pytest.raises(ValueError, match=msg): df.style._apply(f, axis=None) def test_get_level_lengths(self): diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index c2fbc59b8f482..509e5bcb33304 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -664,7 +664,8 @@ def test_to_latex_specified_header(self): assert withoutescape_result == withoutescape_expected - with pytest.raises(ValueError): + msg = "Writing 2 cols but got 1 aliases" + with pytest.raises(ValueError, match=msg): df.to_latex(header=["A"]) def test_to_latex_decimal(self, float_frame): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 2ac2acc6748d1..c0d40048a72fe 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -603,8 +603,7 @@ def test_timestamp_in_columns(self): result = df.to_json(orient="table") js = json.loads(result) assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z" - # TODO - below expectation is not correct; see GH 28256 - assert js["schema"]["fields"][2]["name"] == 10000 + assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S" @pytest.mark.parametrize( "case", diff --git a/pandas/tests/io/json/test_normalize.py 
b/pandas/tests/io/json/test_normalize.py index 91b204ed41ebc..b7a9918ff46da 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -486,6 +486,16 @@ def test_non_interable_record_path_errors(self): with pytest.raises(TypeError, match=msg): json_normalize([test_input], record_path=[test_path]) + def test_meta_non_iterable(self): + # GH 31507 + data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]""" + + result = json_normalize(json.loads(data), record_path=["data"], meta=["id"]) + expected = DataFrame( + {"one": [1], "two": [2], "id": np.array([99], dtype=object)} + ) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord: def test_flat_stays_flat(self): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f2d35bfb3b5ae..e13b2b34d611b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -42,22 +42,9 @@ def assert_json_roundtrip_equal(result, expected, orient): @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: - @pytest.fixture(scope="function", autouse=True) - def setup(self, datapath): - self.dirpath = datapath("io", "json", "data") - - self.ts = tm.makeTimeSeries() - self.ts.name = "ts" - - self.series = tm.makeStringSeries() - self.series.name = "series" - - self.objSeries = tm.makeObjectSeries() - self.objSeries.name = "objects" - - self.empty_series = Series([], index=[], dtype=np.float64) + @pytest.fixture(autouse=True) + def setup(self): self.empty_frame = DataFrame() - self.frame = _frame.copy() self.frame2 = _frame2.copy() self.intframe = _intframe.copy() @@ -67,15 +54,6 @@ def setup(self, datapath): yield - del self.dirpath - - del self.ts - - del self.series - - del self.objSeries - - del self.empty_series del self.empty_frame del self.frame @@ -457,7 +435,7 @@ def test_frame_mixedtype_orient(self): # GH10289 left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) - def test_v12_compat(self): + def test_v12_compat(self, datapath): df = DataFrame( [ [1.56808523, 0.65727391, 1.81021139, -0.17251653], @@ -474,12 +452,13 @@ def test_v12_compat(self): df["modified"] = df["date"] df.iloc[1, df.columns.get_loc("modified")] = pd.NaT - v12_json = os.path.join(self.dirpath, "tsframe_v012.json") + dirpath = datapath("io", "json", "data") + v12_json = os.path.join(dirpath, "tsframe_v012.json") df_unser = pd.read_json(v12_json) tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) - v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json") + v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = pd.read_json(v12_iso_json) tm.assert_frame_equal(df_iso, df_unser_iso) @@ -572,7 +551,6 @@ def test_blocks_compat_GH9037(self): df_roundtrip, check_index_type=True, check_column_type=True, - check_frame_type=True, by_blocks=True, check_exact=True, ) @@ -634,15 +612,15 @@ def test_series_non_unique_index(self): unser = read_json(s.to_json(orient="records"), orient="records", typ="series") tm.assert_numpy_array_equal(s.values, unser.values) - def test_series_default_orient(self): - assert self.series.to_json() == self.series.to_json(orient="index") + def test_series_default_orient(self, string_series): + assert string_series.to_json() == string_series.to_json(orient="index") @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_simple(self, orient, numpy): - data = 
self.series.to_json(orient=orient) + def test_series_roundtrip_simple(self, orient, numpy, string_series): + data = string_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.series.copy() + expected = string_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -652,13 +630,13 @@ def test_series_roundtrip_simple(self, orient, numpy): @pytest.mark.parametrize("dtype", [False, None]) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_object(self, orient, numpy, dtype): - data = self.objSeries.to_json(orient=orient) + def test_series_roundtrip_object(self, orient, numpy, dtype, object_series): + data = object_series.to_json(orient=orient) result = pd.read_json( data, typ="series", orient=orient, numpy=numpy, dtype=dtype ) - expected = self.objSeries.copy() + expected = object_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -667,12 +645,11 @@ def test_series_roundtrip_object(self, orient, numpy, dtype): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_empty(self, orient, numpy): - data = self.empty_series.to_json(orient=orient) + def test_series_roundtrip_empty(self, orient, numpy, empty_series): + data = empty_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.empty_series.copy() - # TODO: see what causes inconsistency + expected = empty_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) else: @@ -681,11 +658,11 @@ def test_series_roundtrip_empty(self, orient, numpy): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_timeseries(self, orient, numpy): - data = self.ts.to_json(orient=orient) + def test_series_roundtrip_timeseries(self, orient, numpy, datetime_series): + data = datetime_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.ts.copy() + expected = datetime_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -773,7 +750,7 @@ def test_path(self): df.to_json(path) read_json(path) - def test_axis_dates(self): + def test_axis_dates(self, datetime_series): # frame json = self.tsframe.to_json() @@ -781,12 +758,12 @@ def test_axis_dates(self): tm.assert_frame_equal(result, self.tsframe) # series - json = self.ts.to_json() + json = datetime_series.to_json() result = read_json(json, typ="series") - tm.assert_series_equal(result, self.ts, check_names=False) + tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - def test_convert_dates(self): + def test_convert_dates(self, datetime_series): # frame df = self.tsframe.copy() @@ -806,7 +783,7 @@ def test_convert_dates(self): tm.assert_frame_equal(result, expected) # series - ts = Series(Timestamp("20130101"), index=self.ts.index) + ts = Series(Timestamp("20130101"), index=datetime_series.index) json = ts.to_json() result = read_json(json, typ="series") tm.assert_series_equal(result, ts) @@ -901,8 +878,8 @@ def test_date_format_frame_raises(self): ("20130101 20:43:42.123456789", "ns"), ], ) - def test_date_format_series(self, date, date_unit): - ts = Series(Timestamp(date), index=self.ts.index) + def test_date_format_series(self, date, 
date_unit, datetime_series): + ts = Series(Timestamp(date), index=datetime_series.index) ts.iloc[1] = pd.NaT ts.iloc[5] = pd.NaT if date_unit: @@ -915,8 +892,8 @@ def test_date_format_series(self, date, date_unit): expected = expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) - def test_date_format_series_raises(self): - ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) + def test_date_format_series_raises(self, datetime_series): + ts = Series(Timestamp("20130101 20:43:42.123"), index=datetime_series.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") @@ -1058,6 +1035,29 @@ def test_mixed_timedelta_datetime(self): result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize("date_format", ["iso", "epoch"]) + @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta]) + def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): + # GH28156: to_json not correctly formatting Timedelta + data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT] + if as_object: + data.append("a") + + ser = pd.Series(data, index=data) + if date_format == "iso": + expected = ( + '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' + ) + else: + expected = '{"86400000":86400000,"172800000":172800000,"null":null}' + + if as_object: + expected = expected.replace("}", ',"a":"a"}') + + result = ser.to_json(date_format=date_format) + assert result == expected + def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index e966db7a1cc71..34dd9ba9bc7b6 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,7 +16,7 @@ from pandas._libs.tslib import Timestamp import pandas.compat as compat -from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range +from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm @@ -48,6 +48,18 @@ def numpy(request): return request.param +def get_int32_compat_dtype(numpy, orient): + # See GH#32527 + dtype = np.int64 + if not ((numpy is None or orient == "index") or (numpy is True and orient is None)): + if compat.is_platform_windows(): + dtype = np.int32 + else: + dtype = np.intp + + return dtype + + class TestUltraJSONTests: @pytest.mark.skipif( compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" @@ -833,13 +845,20 @@ def test_dataframe(self, orient, numpy): if orient == "records" and numpy: pytest.skip("Not idiomatic pandas") + dtype = get_int32_compat_dtype(numpy, orient) + df = DataFrame( - [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + [[1, 2, 3], [4, 5, 6]], + index=["a", "b"], + columns=["x", "y", "z"], + dtype=dtype, ) encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) + assert (df.dtypes == dtype).all() output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs) + assert (df.dtypes == dtype).all() # Ensure proper DataFrame initialization. 
if orient == "split": @@ -857,7 +876,8 @@ def test_dataframe(self, orient, numpy): elif orient == "index": df = df.transpose() - tm.assert_frame_equal(output, df, check_dtype=False) + assert (df.dtypes == dtype).all() + tm.assert_frame_equal(output, df) def test_dataframe_nested(self, orient): df = DataFrame( @@ -897,14 +917,20 @@ def test_dataframe_numpy_labelled(self, orient): tm.assert_frame_equal(output, df) def test_series(self, orient, numpy): + dtype = get_int32_compat_dtype(numpy, orient) s = Series( - [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + [10, 20, 30, 40, 50, 60], + name="series", + index=[6, 7, 8, 9, 10, 15], + dtype=dtype, ).sort_values() + assert s.dtype == dtype encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) output = ujson.decode(ujson.encode(s, **encode_kwargs), **decode_kwargs) + assert s.dtype == dtype if orient == "split": dec = _clean_dict(output) @@ -920,7 +946,8 @@ def test_series(self, orient, numpy): s.name = None s.index = [0, 1, 2, 3, 4, 5] - tm.assert_series_equal(output, s, check_dtype=False) + assert s.dtype == dtype + tm.assert_series_equal(output, s) def test_series_nested(self, orient): s = Series( @@ -1076,3 +1103,24 @@ def test_encode_set(self): for v in dec: assert v in s + + @pytest.mark.parametrize( + "td", + [ + Timedelta(days=366), + Timedelta(days=-1), + Timedelta(hours=13, minutes=5, seconds=5), + Timedelta(hours=13, minutes=20, seconds=30), + Timedelta(days=-1, nanoseconds=5), + Timedelta(nanoseconds=1), + Timedelta(microseconds=1, nanoseconds=1), + Timedelta(milliseconds=1, microseconds=1, nanoseconds=1), + Timedelta(milliseconds=999, microseconds=999, nanoseconds=999), + ], + ) + def test_encode_timedelta_iso(self, td): + # GH 28256 + result = ujson.encode(td, iso_dates=True) + expected = f'"{td.isoformat()}"' + + assert result == expected diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b3aa1aa14a509..5bf9587a6ca22 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,6 +15,7 @@ from pandas._libs.tslib import Timestamp from pandas.errors import DtypeWarning, EmptyDataError, ParserError +import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat import pandas._testing as tm @@ -959,13 +960,23 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = f"{tm.rands(10)}.csv" - msg = f"File {path} does not exist" if parser.engine == "c" else r"\[Errno 2\]" + msg = r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) + assert path == e.value.filename - filename = e.value.filename - assert path == filename +@td.skip_if_windows # os.chmod does not work in windows +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename def test_missing_trailing_delimiters(all_parsers): @@ -1062,14 +1073,14 @@ def test_escapechar(all_parsers): data = '''SEARCH_TERM,ACTUAL_URL "bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" "tv p\xc3\xa5 
hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa parser = all_parsers result = parser.read_csv( StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" ) - assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie' + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series' tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) @@ -2079,3 +2090,39 @@ def test_integer_precision(all_parsers): result = parser.read_csv(StringIO(s), header=None)[4] expected = Series([4321583677327450765, 4321113141090630389], name=4) tm.assert_series_equal(result, expected) + + +def test_file_descriptor_leak(all_parsers): + # GH 31488 + + parser = all_parsers + with tm.ensure_clean() as path: + + def test(): + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + + td.check_file_leaks(test)() + + +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) + + +def test_no_header_two_extra_columns(all_parsers): + # GH 26218 + column_names = ["one", "two", "three"] + ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) + stream = StringIO("foo,bar,baz,bam,blah") + parser = all_parsers + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 11dcf7f04f76b..e68dcb3aa577e 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): pth = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers encoding = "utf-16" - sep = "," + sep = "\t" expected = parser.read_csv(pth, sep=sep, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 3661e4e056db2..13b74cf29f857 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -5,6 +5,7 @@ from io import BytesIO import os +import tempfile import numpy as np import pytest @@ -174,3 +175,25 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) result = parser.read_csv(f, encoding=encoding if pass_encoding else None) tm.assert_frame_equal(result, expected) + + +def test_encoding_named_temp_file(all_parsers): + # see gh-31819 + parser = all_parsers + encoding = "shift-jis" + + if parser.engine == "python": + pytest.skip("NamedTemporaryFile does not work with Python engine") + + title = "てすと" + data = "こむ" + + expected = DataFrame({title: [data]}) + + 
with tempfile.NamedTemporaryFile() as f: + f.write(f"{title}\n{data}".encode(encoding)) + + f.seek(0) + + result = parser.read_csv(f, encoding=encoding) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 31573e4e6ecce..2fcac6fa57cf8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1516,3 +1516,35 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert except_out_dateutil == except_in_dateutil assert result == expected + + +@pytest.mark.parametrize( + "names, usecols, parse_dates, missing_cols", + [ + (None, ["val"], ["date", "time"], "date, time"), + (None, ["val"], [0, "time"], "time"), + (None, ["val"], [["date", "time"]], "date, time"), + (None, ["val"], [[0, "time"]], "time"), + (None, ["val"], {"date": [0, "time"]}, "time"), + (None, ["val"], {"date": ["date", "time"]}, "date, time"), + (None, ["val"], [["date", "time"], "date"], "date, time"), + (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), + ( + ["date1", "time1", "temperature"], + ["date1", "temperature"], + ["date1", "time"], + "time", + ), + ], +) +def test_missing_parse_dates_column_raises( + all_parsers, names, usecols, parse_dates, missing_cols +): + # gh-31251 column names provided in parse_dates could be missing. + parser = all_parsers + content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") + msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates, + ) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index fd585a73f6ce6..9a0788ea068ad 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -13,8 +13,6 @@ from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_categorical_dtype - import pandas as pd from pandas import ( Categorical, @@ -342,7 +340,7 @@ def test_repr(self, setup_path): df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan + df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with catch_warnings(record=True): @@ -846,7 +844,7 @@ def test_put_mixed_type(self, setup_path): df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan + df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: @@ -1057,18 +1055,7 @@ def test_latin_encoding(self, setup_path, dtype, val): s_nan = ser.replace(nan_rep, np.nan) - if is_categorical_dtype(s_nan): - assert is_categorical_dtype(retr) - tm.assert_series_equal( - s_nan, retr, check_dtype=False, check_categorical=False - ) - else: - tm.assert_series_equal(s_nan, retr) - - # FIXME: don't leave commented-out - # fails: - # for x in examples: - # roundtrip(s, nan_rep=b'\xf8\xfc') + tm.assert_series_equal(s_nan, retr) def test_append_some_nans(self, setup_path): @@ -1230,14 +1217,14 @@ def test_read_missing_key_opened_store(self, setup_path): df = pd.DataFrame({"a": 
range(2), "b": range(2)}) df.to_hdf(path, "k1") - store = pd.HDFStore(path, "r") + with pd.HDFStore(path, "r") as store: - with pytest.raises(KeyError, match="'No object named k2 in the file'"): - pd.read_hdf(store, "k2") + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + pd.read_hdf(store, "k2") - # Test that the file is still open after a KeyError and that we can - # still read from it. - pd.read_hdf(store, "k1") + # Test that the file is still open after a KeyError and that we can + # still read from it. + pd.read_hdf(store, "k1") def test_append_frame_column_oriented(self, setup_path): with ensure_clean_store(setup_path) as store: @@ -1372,11 +1359,11 @@ def check_col(key, name, size): _maybe_remove(store, "df") df = tm.makeTimeDataFrame() df["string"] = "foo" - df.loc[1:4, "string"] = np.nan + df.loc[df.index[1:4], "string"] = np.nan df["string2"] = "bar" - df.loc[4:8, "string2"] = np.nan + df.loc[df.index[4:8], "string2"] = np.nan df["string3"] = "bah" - df.loc[1:, "string3"] = np.nan + df.loc[df.index[1:], "string3"] = np.nan store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df) @@ -1492,8 +1479,8 @@ def test_append_with_data_columns(self, setup_path): # data column selection with a string data_column df_new = df.copy() df_new["string"] = "foo" - df_new.loc[1:4, "string"] = np.nan - df_new.loc[5:6, "string"] = "bar" + df_new.loc[df_new.index[1:4], "string"] = np.nan + df_new.loc[df_new.index[5:6], "string"] = "bar" _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"]) result = store.select("df", "string='foo'") @@ -1574,12 +1561,12 @@ def check_col(key, name, size): # doc example df_dc = df.copy() df_dc["string"] = "foo" - df_dc.loc[4:6, "string"] = np.nan - df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc["string2"] = "cool" df_dc["datetime"] = Timestamp("20010102") df_dc = df_dc._convert(datetime=True) - df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan + df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan _maybe_remove(store, "df_dc") store.append( @@ -1602,8 +1589,8 @@ def check_col(key, name, size): np.random.randn(8, 3), index=index, columns=["A", "B", "C"] ) df_dc["string"] = "foo" - df_dc.loc[4:6, "string"] = np.nan - df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() df_dc["string2"] = "cool" @@ -2024,7 +2011,7 @@ def test_table_mixed_dtypes(self, setup_path): df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan + df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: @@ -2200,7 +2187,7 @@ def test_invalid_terms(self, setup_path): df = tm.makeTimeDataFrame() df["string"] = "foo" - df.loc[0:4, "string"] = "bar" + df.loc[df.index[0:4], "string"] = "bar" store.put("df", df, format="table") @@ -2311,9 +2298,7 @@ def test_index_types(self, setup_path): with catch_warnings(record=True): values = np.random.randn(2) - func = lambda l, r: tm.assert_series_equal( - l, r, check_dtype=True, check_index_type=True, check_series_type=True - ) + func = lambda l, r: tm.assert_series_equal(l, r, check_index_type=True) with catch_warnings(record=True): ser = Series(values, [0, "y"]) 
@@ -3343,7 +3328,7 @@ def test_string_select(self, setup_path): # test string ==/!= df["x"] = "none" - df.loc[2:7, "x"] = "" + df.loc[df.index[2:7], "x"] = "" store.append("df", df, data_columns=["x"]) @@ -3365,7 +3350,7 @@ def test_string_select(self, setup_path): # int ==/!= df["int"] = 1 - df.loc[2:7, "int"] = 2 + df.loc[df.index[2:7], "int"] = 2 store.append("df3", df, data_columns=["int"]) @@ -3419,7 +3404,7 @@ def test_read_column(self, setup_path): # a data column with NaNs, result excludes the NaNs df3 = df.copy() df3["string"] = "foo" - df3.loc[4:6, "string"] = np.nan + df3.loc[df3.index[4:6], "string"] = np.nan store.append("df3", df3, data_columns=["string"]) result = store.select_column("df3", "string") tm.assert_almost_equal(result.values, df3["string"].values) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3458cfb6ad254..b627e0e1cad54 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -75,7 +75,11 @@ def df(request): ) elif data_type == "mixed": return DataFrame( - {"a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde")} + { + "a": np.arange(1.0, 6.0) + 0.01, + "b": np.arange(1, 6).astype(np.int64), + "c": list("abcde"), + } ) elif data_type == "float": return tm.makeCustomDataframe( @@ -146,7 +150,7 @@ class TestClipboard: def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): data.to_clipboard(excel=excel, sep=sep, encoding=encoding) result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) - tm.assert_frame_equal(data, result, check_dtype=False) + tm.assert_frame_equal(data, result) # Test that default arguments copy as tab delimited def test_round_trip_frame(self, df): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 730043e6ec7d7..84bc29ebc65e0 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -137,11 +137,11 @@ def test_iterator(self): (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) - def test_read_non_existant(self, reader, module, error_class, fn_ext): + def test_read_non_existent(self, reader, module, error_class, fn_ext): pytest.importorskip(module) path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? 
does not exist" msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index fc3876eee9d66..bf0ed4fe25346 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2633,6 +2633,8 @@ def test_write_row_by_row(self): result = sql.read_sql("select * from test", con=self.conn) result.index = frame.index tm.assert_frame_equal(result, frame, check_less_precise=True) + # GH#32571 result comes back rounded to 6 digits in some builds; + # no obvious pattern def test_chunksize_read_type(self): frame = tm.makeTimeDataFrame() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index b65efac2bd527..eaa92fa53d799 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -254,12 +254,21 @@ def test_read_dta4(self, file): ) # these are all categoricals - expected = pd.concat( - [expected[col].astype("category") for col in expected], axis=1 - ) + for col in expected: + orig = expected[col].copy() + + categories = np.asarray(expected["fully_labeled"][orig.notna()]) + if col == "incompletely_labeled": + categories = orig + + cat = orig.astype("category")._values + cat = cat.set_categories(categories, ordered=True) + cat.categories.rename(None, inplace=True) + + expected[col] = cat # stata doesn't save .category metadata - tm.assert_frame_equal(parsed, expected, check_categorical=False) + tm.assert_frame_equal(parsed, expected) # File containing strls def test_read_dta12(self): @@ -952,19 +961,27 @@ def test_categorical_writing(self, version): original = pd.concat( [original[col].astype("category") for col in original], axis=1 ) + expected.index.name = "index" expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str) expected["unlabeled"] = expected["unlabeled"].apply(str) - expected = pd.concat( - [expected[col].astype("category") for col in expected], axis=1 - ) - expected.index.name = "index" + for col in expected: + orig = expected[col].copy() + + cat = orig.astype("category")._values + cat = cat.as_ordered() + if col == "unlabeled": + cat = cat.set_categories(orig, ordered=True) + + cat.categories.rename(None, inplace=True) + + expected[col] = cat with tm.ensure_clean() as path: original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") - tm.assert_frame_equal(res, expected, check_categorical=False) + tm.assert_frame_equal(res, expected) def test_categorical_warnings_and_errors(self): # Warning for non-string labels @@ -1009,7 +1026,14 @@ def test_categorical_with_stata_missing_values(self, version): original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") - tm.assert_frame_equal(res, original, check_categorical=False) + + expected = original.copy() + for col in expected: + cat = expected[col]._values + new_cats = cat.remove_unused_categories().categories + cat = cat.set_categories(new_cats, ordered=True) + expected[col] = cat + tm.assert_frame_equal(res, expected) @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"]) def test_categorical_order(self, file): @@ -1027,7 +1051,9 @@ def test_categorical_order(self, file): cols = [] for is_cat, col, labels, codes in expected: if is_cat: - cols.append((col, pd.Categorical.from_codes(codes, labels))) + cols.append( + (col, 
pd.Categorical.from_codes(codes, labels, ordered=True)) + ) else: cols.append((col, pd.Series(labels, dtype=np.float32))) expected = DataFrame.from_dict(dict(cols)) @@ -1035,7 +1061,7 @@ def test_categorical_order(self, file): # Read with and with out categoricals, ensure order is identical file = getattr(self, file) parsed = read_stata(file) - tm.assert_frame_equal(expected, parsed, check_categorical=False) + tm.assert_frame_equal(expected, parsed) # Check identity of codes for col in expected: @@ -1056,9 +1082,11 @@ def test_categorical_sorting(self, file): parsed.index = np.arange(parsed.shape[0]) codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] - cat = pd.Categorical.from_codes(codes=codes, categories=categories) + cat = pd.Categorical.from_codes( + codes=codes, categories=categories, ordered=True + ) expected = pd.Series(cat, name="srh") - tm.assert_series_equal(expected, parsed["srh"], check_categorical=False) + tm.assert_series_equal(expected, parsed["srh"]) @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"]) def test_categorical_ordering(self, file): @@ -1118,18 +1146,30 @@ def test_read_chunks_117( chunk = itr.read(chunksize) except StopIteration: break - from_frame = parsed.iloc[pos : pos + chunksize, :] + from_frame = parsed.iloc[pos : pos + chunksize, :].copy() + from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, - chunk, - check_dtype=False, - check_datetimelike_compat=True, - check_categorical=False, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, ) pos += chunksize itr.close() + @staticmethod + def _convert_categorical(from_frame: DataFrame) -> DataFrame: + """ + Emulate the categorical casting behavior we expect from roundtripping. 
+ """ + for col in from_frame: + ser = from_frame[col] + if is_categorical_dtype(ser.dtype): + cat = ser._values.remove_unused_categories() + if cat.categories.dtype == object: + categories = pd.Index(cat.categories._values) + cat = cat.set_categories(categories) + from_frame[col] = cat + return from_frame + def test_iterator(self): fname = self.dta3_117 @@ -1204,13 +1244,10 @@ def test_read_chunks_115( chunk = itr.read(chunksize) except StopIteration: break - from_frame = parsed.iloc[pos : pos + chunksize, :] + from_frame = parsed.iloc[pos : pos + chunksize, :].copy() + from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, - chunk, - check_dtype=False, - check_datetimelike_compat=True, - check_categorical=False, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, ) pos += chunksize diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index ea0ec8ad98ffe..75b825687209c 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -34,6 +34,7 @@ def setup_method(self, method): self.mpl_ge_2_2_3 = compat._mpl_ge_2_2_3() self.mpl_ge_3_0_0 = compat._mpl_ge_3_0_0() self.mpl_ge_3_1_0 = compat._mpl_ge_3_1_0() + self.mpl_ge_3_2_0 = compat._mpl_ge_3_2_0() self.bp_n_objects = 7 self.polycollection_factor = 2 diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 979b89a87d843..b85a2affc4e4b 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1468,7 +1468,9 @@ def test_matplotlib_scatter_datetime64(self): ax.scatter(x="time", y="y", data=df) self.plt.draw() label = ax.get_xticklabels()[0] - if self.mpl_ge_3_0_0: + if self.mpl_ge_3_2_0: + expected = "2018-01-01" + elif self.mpl_ge_3_0_0: expected = "2017-12-08" else: expected = "2017-12-12" diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ffbd135466709..32673b9a0a5cf 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3316,6 +3316,16 @@ def test_missing_markers_legend_using_style(self): self._check_legend_labels(ax, labels=["A", "B", "C"]) self._check_legend_marker(ax, expected_markers=[".", ".", "."]) + def test_colors_of_columns_with_same_name(self): + # ISSUE 11136 -> https://github.com/pandas-dev/pandas/issues/11136 + # Creating a DataFrame with duplicate column labels and testing colors of them. 
+ df = pd.DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) + df1 = pd.DataFrame({"a": [2, 4, 6]}) + df_concat = pd.concat([df, df1], axis=1) + result = df_concat.plot() + for legend, line in zip(result.get_legend().legendHandles, result.lines): + assert legend.get_color() == line.get_color() + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 211d0d52d8357..abd99aadfb484 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -55,9 +55,7 @@ def test_ops(self, opname, obj): if not isinstance(obj, PeriodIndex): expected = getattr(obj.values, opname)() else: - expected = pd.Period( - ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq - ) + expected = pd.Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq) try: assert result == expected except TypeError: diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index d5b71a6e4cee1..fb2111a60a261 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -134,7 +134,7 @@ def series(index, _series_name, _static_values): @pytest.fixture -def empty_series(series): +def empty_series_dti(series): """ Fixture for parametrization of empty Series with date_range, period_range and timedelta_range indexes diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index c84a5bf653b0a..3384c2a94487b 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -94,13 +94,13 @@ def test_raises_on_non_datetimelike_index(): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) -def test_resample_empty_series(freq, empty_series, resample_method): +def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 if resample_method == "ohlc": pytest.skip("need to test for ohlc from GH13083") - s = empty_series + s = empty_series_dti result = getattr(s.resample(freq), resample_method)() expected = s.copy() @@ -114,13 +114,13 @@ def test_resample_empty_series(freq, empty_series, resample_method): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) -def test_resample_count_empty_series(freq, empty_series, resample_method): +def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 - result = getattr(empty_series.resample(freq), resample_method)() + result = getattr(empty_series_dti.resample(freq), resample_method)() - index = _asfreq_compat(empty_series.index, freq) + index = _asfreq_compat(empty_series_dti.index, freq) - expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + expected = pd.Series([], dtype="int64", index=index, name=empty_series_dti.name) tm.assert_series_equal(result, expected) @@ -188,9 +188,9 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. 
(GH #10228) - empty_series = Series([], index, dtype) + empty_series_dti = Series([], index, dtype) try: - getattr(empty_series.resample("d"), resample_method)() + getattr(empty_series_dti.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) @@ -227,9 +227,9 @@ def test_resample_loffset_arg_type(frame, create_index, arg): @all_ts -def test_apply_to_empty_series(empty_series): +def test_apply_to_empty_series(empty_series_dti): # GH 14313 - s = empty_series + s = empty_series_dti for freq in ["M", "D", "H"]: result = s.resample(freq).apply(lambda x: 1) expected = s.resample(freq).apply(np.sum) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d552241f9126f..5044a18e33248 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -287,7 +287,7 @@ def test_agg_consistency(): r = df.resample("3T") - msg = "nested renamer is not supported" + msg = r"Column\(s\) \['r1', 'r2'\] do not exist" with pytest.raises(pd.core.base.SpecificationError, match=msg): r.agg({"r1": "mean", "r2": "sum"}) @@ -419,7 +419,7 @@ def test_agg_misc(): [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] ) - msg = "nested renamer is not supported" + msg = r"Column\(s\) \['result1', 'result2'\] do not exist" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t[["A", "B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) @@ -440,6 +440,8 @@ def test_agg_misc(): result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) tm.assert_frame_equal(result, expected, check_like=True) + msg = "nested renamer is not supported" + # series like aggs for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): @@ -580,3 +582,27 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), ) tm.assert_frame_equal(result, expected) + + +def test_resample_agg_readonly(): + # GH#31710 cython needs to allow readonly data + index = pd.date_range("2020-01-01", "2020-01-02", freq="1h") + arr = np.zeros_like(index) + arr.setflags(write=False) + + ser = pd.Series(arr, index=index) + rs = ser.resample("1D") + + expected = pd.Series([pd.Timestamp(0), pd.Timestamp(0)], index=index[::24]) + + result = rs.agg("last") + tm.assert_series_equal(result, expected) + + result = rs.agg("first") + tm.assert_series_equal(result, expected) + + result = rs.agg("max") + tm.assert_series_equal(result, expected) + + result = rs.agg("min") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 725157b7c8523..dc1efa46403be 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -298,7 +298,7 @@ def test_join_on_inner(self): expected = df.join(df2, on="key") expected = expected[expected["value"].notna()] - tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False) + tm.assert_series_equal(joined["key"], expected["key"]) tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d80e2e7afceef..51e6f80df657d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ 
b/pandas/tests/reshape/merge/test_merge.py @@ -2077,8 +2077,7 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): } ).set_index("foo") - # Categorical is unordered, so don't check ordering. - tm.assert_frame_equal(result, expected, check_categorical=False) + tm.assert_frame_equal(result, expected) def test_merge_equal_cat_dtypes2(): @@ -2100,8 +2099,7 @@ def test_merge_equal_cat_dtypes2(): {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)} ).set_index("foo") - # Categorical is unordered, so don't check ordering. - tm.assert_frame_equal(result, expected, check_categorical=False) + tm.assert_frame_equal(result, expected) def test_merge_on_cat_and_ext_array(): diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py new file mode 100644 index 0000000000000..8795af2e11122 --- /dev/null +++ b/pandas/tests/reshape/test_crosstab.py @@ -0,0 +1,700 @@ +import numpy as np +import pytest + +from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, Series, crosstab +import pandas._testing as tm + + +class TestCrosstab: + def setup_method(self, method): + df = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + self.df = df.append(df, ignore_index=True) + + def test_crosstab_single(self): + df = self.df + result = crosstab(df["A"], df["C"]) + expected = df.groupby(["A", "C"]).size().unstack() + tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) + + def test_crosstab_multiple(self): + df = self.df + + result = crosstab(df["A"], [df["B"], df["C"]]) + expected = df.groupby(["A", "B", "C"]).size() + expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + result = crosstab([df["B"], df["C"]], df["A"]) + expected = df.groupby(["B", "C", "A"]).size() + expected = expected.unstack("A").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + def test_crosstab_ndarray(self): + a = np.random.randint(0, 5, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 10, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) + expected = crosstab(df["a"], [df["b"], df["c"]]) + tm.assert_frame_equal(result, expected) + + result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) + expected = crosstab([df["b"], df["c"]], df["a"]) + tm.assert_frame_equal(result, expected) + + # assign arbitrary names + result = crosstab(self.df["A"].values, self.df["C"].values) + assert result.index.name == "row_0" + assert result.columns.name == "col_0" + + def test_crosstab_non_aligned(self): + # GH 17005 + a = Series([0, 1, 1], index=["a", "b", "c"]) + b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) + c = np.array([3, 4, 3]) + + expected = DataFrame( + [[1, 0], [1, 1]], + index=Index([0, 1], name="row_0"), + columns=Index([3, 4], name="col_0"), + ) + + result = crosstab(a, b) + tm.assert_frame_equal(result, expected) + + result = crosstab(a, c) + tm.assert_frame_equal(result, expected) + + def test_crosstab_margins(self): + a = np.random.randint(0, 7, 
size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["All", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["All"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("All", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["All"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) + exp_rows.name = "All" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + def test_crosstab_margins_set_margin_name(self): + # GH 15972 + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name="TOTAL", + ) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["TOTAL", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("TOTAL", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["TOTAL"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) + exp_rows.name = "TOTAL" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + msg = "margins_name argument must be a string" + for margins_name in [666, None, ["a", "b"]]: + with pytest.raises(ValueError, match=msg): + crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name=margins_name, + ) + + def test_crosstab_pass_values(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + values = np.random.randn(100) + + table = crosstab( + [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + ) + + df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) + + expected = df.pivot_table( + "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + ) + tm.assert_frame_equal(table, expected) + + def test_crosstab_dropna(self): + # GH 3820 + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) + m = MultiIndex.from_tuples( + [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], + names=["b", "c"], + ) + tm.assert_index_equal(res.columns, m) + + def test_crosstab_no_overlap(self): + # GS 10291 + + s1 = Series([1, 2, 3], index=[1, 2, 3]) + s2 = Series([4, 5, 6], index=[4, 5, 6]) + + actual = crosstab(s1, s2) + expected = DataFrame() + + tm.assert_frame_equal(actual, expected) + + def 
test_margin_dropna(self): + # GH 12577 + # pivot_table counts null into margin ('All') + # when margins=true and dropna=true + + df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + # GH 12642 + # _add_margins raises KeyError: Level None not found + # when margins=True and dropna=False + df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = crosstab(df.a, df.b, margins=True, dropna=False) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=False) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + + actual = crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [ + ["one", "one", "two", "two", "All"], + ["dull", "shiny", "dull", "shiny", ""], + ], + names=["b", "c"], + ) + expected = DataFrame( + [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + ) + expected.index = Index(["bar", "foo", "All"], name="a") + tm.assert_frame_equal(actual, expected) + + actual = crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + actual = crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + 
tm.assert_frame_equal(actual, expected) + + def test_crosstab_normalize(self): + # Issue 12578 + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + rindex = Index([1, 2], name="a") + cindex = Index([3, 4], name="b") + full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) + row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex) + col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) + + # Check all normalize args + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=1), + crosstab(df.a, df.b, normalize="columns"), + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"), + ) + + row_normal_margins = DataFrame( + [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4], name="b", dtype="object"), + ) + col_normal_margins = DataFrame( + [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=Index([1, 2], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + + all_normal_margins = DataFrame( + [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins, + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins + ) + + # Test arrays + crosstab( + [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) + ) + + # Test with aggfunc + norm_counts = DataFrame( + [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b"), + ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_counts) + + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} + ) + + norm_sum = DataFrame( + [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_sum) + + def test_crosstab_with_empties(self): + # Check handling of empties + df = DataFrame( + { + "a": [1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4], + "c": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + + empty = DataFrame( + [[0.0, 0.0], [0.0, 0.0]], + index=Index([1, 2], name="a", dtype="int64"), + columns=Index([3, 4], name="b"), + ) + + for i in [True, "index", "columns"]: + calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i) + tm.assert_frame_equal(empty, calculated) + + nans = DataFrame( + [[0.0, np.nan], [0.0, 0.0]], + index=Index([1, 2], name="a", dtype="int64"), + columns=Index([3, 4], name="b"), + ) + + calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", 
normalize=False) + tm.assert_frame_equal(nans, calculated) + + def test_crosstab_errors(self): + # Issue 12578 + + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + error = "values cannot be used without an aggfunc." + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, values=df.c) + + error = "aggfunc cannot be used without values" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, aggfunc=np.mean) + + error = "Not a valid normalize argument" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize="42") + + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize=42) + + error = "Not a valid margins argument" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize="all", margins=42) + + def test_crosstab_with_categorial_columns(self): + # GH 8860 + df = DataFrame( + { + "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], + "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], + } + ) + categories = ["Sedan", "Electric", "Pickup"] + df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) + result = crosstab(df["MAKE"], df["MODEL"]) + + expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE") + expected_columns = CategoricalIndex( + categories, categories=categories, ordered=False, name="MODEL" + ) + expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] + expected = DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_with_numpy_size(self): + # GH 4003 + df = DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + } + ) + result = crosstab( + index=[df["A"], df["B"]], + columns=[df["C"]], + margins=True, + aggfunc=np.size, + values=df["D"], + ) + expected_index = MultiIndex( + levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=["A", "B"], + ) + expected_column = Index(["bar", "foo", "All"], dtype="object", name="C") + expected_data = np.array( + [ + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [12.0, 12.0, 24.0], + ] + ) + expected = DataFrame( + expected_data, index=expected_index, columns=expected_column + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_dup_index_names(self): + # GH 13279 + s = Series(range(3), name="foo") + + result = crosstab(s, s) + expected_index = Index(range(3), name="foo") + expected = DataFrame( + np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) + def test_crosstab_tuple_name(self, names): + s1 = Series(range(3), name=names[0]) + s2 = Series(range(1, 4), name=names[1]) + + mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names) + expected = Series(1, index=mi).unstack(1, fill_value=0) + + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_both_tuple_names(self): + # GH 18321 + s1 = Series(range(3), name=("a", "b")) + s2 = Series(range(3), name=("c", "d")) + + expected = DataFrame( + np.eye(3, 
dtype="int64"), + index=Index(range(3), name=("a", "b")), + columns=Index(range(3), name=("c", "d")), + ) + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_unsorted_order(self): + df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) + result = crosstab(df.index, [df.b, df.a]) + e_idx = Index(["A", "B", "C"], name="row_0") + e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"]) + expected = DataFrame( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_normalize_multiple_columns(self): + # GH 15150 + df = DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": [0] * 24, + "E": [0] * 24, + } + ) + result = crosstab( + [df.A, df.B], + df.C, + values=df.D, + aggfunc=np.sum, + normalize=True, + margins=True, + ) + expected = DataFrame( + np.array([0] * 29 + [1], dtype=float).reshape(10, 3), + columns=Index(["bar", "foo", "All"], dtype="object", name="C"), + index=MultiIndex.from_tuples( + [ + ("one", "A"), + ("one", "B"), + ("one", "C"), + ("three", "A"), + ("three", "B"), + ("three", "C"), + ("two", "A"), + ("two", "B"), + ("two", "C"), + ("All", ""), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py 
b/pandas/tests/reshape/test_pivot.py index e09a2a7907177..cdb1a73abc431 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -17,7 +17,7 @@ ) import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT -from pandas.core.reshape.pivot import crosstab, pivot_table +from pandas.core.reshape.pivot import pivot_table @pytest.fixture(params=[True, False]) @@ -1026,6 +1026,14 @@ def test_pivot_table_multiindex_only(self, cols): tm.assert_frame_equal(result, expected) + def test_pivot_table_retains_tz(self): + dti = date_range("2016-01-01", periods=3, tz="Europe/Amsterdam") + df = DataFrame({"A": np.random.randn(3), "B": np.random.randn(3), "C": dti}) + result = df.pivot_table(index=["B", "C"], dropna=False) + + # check tz retention + assert result.index.levels[1].equals(dti) + def test_pivot_integer_columns(self): # caused by upstream bug in unstack @@ -2064,708 +2072,3 @@ def agg(l): ) with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) - - -class TestCrosstab: - def setup_method(self, method): - df = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) - - self.df = df.append(df, ignore_index=True) - - def test_crosstab_single(self): - df = self.df - result = crosstab(df["A"], df["C"]) - expected = df.groupby(["A", "C"]).size().unstack() - tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) - - def test_crosstab_multiple(self): - df = self.df - - result = crosstab(df["A"], [df["B"], df["C"]]) - expected = df.groupby(["A", "B", "C"]).size() - expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) - tm.assert_frame_equal(result, expected) - - result = crosstab([df["B"], df["C"]], df["A"]) - expected = df.groupby(["B", "C", "A"]).size() - expected = expected.unstack("A").fillna(0).astype(np.int64) - tm.assert_frame_equal(result, expected) - - def test_crosstab_ndarray(self): - a = np.random.randint(0, 5, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 10, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) - expected = crosstab(df["a"], [df["b"], df["c"]]) - tm.assert_frame_equal(result, expected) - - result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) - expected = crosstab([df["b"], df["c"]], df["a"]) - tm.assert_frame_equal(result, expected) - - # assign arbitrary names - result = crosstab(self.df["A"].values, self.df["C"].values) - assert result.index.name == "row_0" - assert result.columns.name == "col_0" - - def test_crosstab_non_aligned(self): - # GH 17005 - a = pd.Series([0, 1, 1], index=["a", "b", "c"]) - b = pd.Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) - c = np.array([3, 4, 3]) - - expected = pd.DataFrame( - [[1, 0], [1, 1]], - index=Index([0, 1], name="row_0"), - columns=Index([3, 4], name="col_0"), - ) - - result = crosstab(a, b) - tm.assert_frame_equal(result, expected) - - result = crosstab(a, c) - tm.assert_frame_equal(result, expected) - - def test_crosstab_margins(self): - a = np.random.randint(0, 7, 
size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) - - assert result.index.names == ("a",) - assert result.columns.names == ["b", "c"] - - all_cols = result["All", ""] - exp_cols = df.groupby(["a"]).size().astype("i8") - # to keep index.name - exp_margin = Series([len(df)], index=Index(["All"], name="a")) - exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ("All", "") - - tm.assert_series_equal(all_cols, exp_cols) - - all_rows = result.loc["All"] - exp_rows = df.groupby(["b", "c"]).size().astype("i8") - exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) - exp_rows.name = "All" - - exp_rows = exp_rows.reindex(all_rows.index) - exp_rows = exp_rows.fillna(0).astype(np.int64) - tm.assert_series_equal(all_rows, exp_rows) - - def test_crosstab_margins_set_margin_name(self): - # GH 15972 - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab( - a, - [b, c], - rownames=["a"], - colnames=("b", "c"), - margins=True, - margins_name="TOTAL", - ) - - assert result.index.names == ("a",) - assert result.columns.names == ["b", "c"] - - all_cols = result["TOTAL", ""] - exp_cols = df.groupby(["a"]).size().astype("i8") - # to keep index.name - exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) - exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ("TOTAL", "") - - tm.assert_series_equal(all_cols, exp_cols) - - all_rows = result.loc["TOTAL"] - exp_rows = df.groupby(["b", "c"]).size().astype("i8") - exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) - exp_rows.name = "TOTAL" - - exp_rows = exp_rows.reindex(all_rows.index) - exp_rows = exp_rows.fillna(0).astype(np.int64) - tm.assert_series_equal(all_rows, exp_rows) - - msg = "margins_name argument must be a string" - for margins_name in [666, None, ["a", "b"]]: - with pytest.raises(ValueError, match=msg): - crosstab( - a, - [b, c], - rownames=["a"], - colnames=("b", "c"), - margins=True, - margins_name=margins_name, - ) - - def test_crosstab_pass_values(self): - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - values = np.random.randn(100) - - table = crosstab( - [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] - ) - - df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) - - expected = df.pivot_table( - "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum - ) - tm.assert_frame_equal(table, expected) - - def test_crosstab_dropna(self): - # GH 3820 - a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) - b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) - c = np.array( - ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object - ) - res = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) - m = MultiIndex.from_tuples( - [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], - names=["b", "c"], - ) - tm.assert_index_equal(res.columns, m) - - def test_crosstab_no_overlap(self): - # GS 10291 - - s1 = pd.Series([1, 2, 3], index=[1, 2, 3]) - s2 = pd.Series([4, 5, 6], index=[4, 5, 6]) - - actual = crosstab(s1, s2) - expected = pd.DataFrame() - - tm.assert_frame_equal(actual, expected) - - 
def test_margin_dropna(self): - # GH 12577 - # pivot_table counts null into margin ('All') - # when margins=true and dropna=true - - df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - # GH 12642 - # _add_margins raises KeyError: Level None not found - # when margins=True and dropna=False - df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) - b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) - c = np.array( - ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object - ) - - actual = pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False - ) - m = MultiIndex.from_arrays( - [ - ["one", "one", "two", "two", "All"], - ["dull", "shiny", "dull", "shiny", ""], - ], - names=["b", "c"], - ) - expected = DataFrame( - [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m - ) - expected.index = Index(["bar", "foo", "All"], name="a") - tm.assert_frame_equal(actual, expected) - - actual = pd.crosstab( - [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False - ) - m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], - names=["a", "b"], - ) - expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m - ) - expected.columns = Index(["dull", "shiny", "All"], name="c") - tm.assert_frame_equal(actual, expected) - - actual = pd.crosstab( - [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True - ) - m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], - names=["a", "b"], - ) - expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m - ) - expected.columns = 
Index(["dull", "shiny", "All"], name="c") - tm.assert_frame_equal(actual, expected) - - def test_crosstab_normalize(self): - # Issue 12578 - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} - ) - - rindex = pd.Index([1, 2], name="a") - cindex = pd.Index([3, 4], name="b") - full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) - row_normal = pd.DataFrame( - [[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex - ) - col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) - - # Check all normalize args - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="all"), full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="index"), row_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="columns"), col_normal) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=1), - pd.crosstab(df.a, df.b, normalize="columns"), - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=0), - pd.crosstab(df.a, df.b, normalize="index"), - ) - - row_normal_margins = pd.DataFrame( - [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4], name="b", dtype="object"), - ) - col_normal_margins = pd.DataFrame( - [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], - index=pd.Index([1, 2], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - - all_normal_margins = pd.DataFrame( - [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize="columns", margins=True), - col_normal_margins, - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins - ) - - # Test arrays - pd.crosstab( - [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) - ) - - # Test with aggfunc - norm_counts = pd.DataFrame( - [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b"), - ) - test_case = pd.crosstab( - df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True - ) - tm.assert_frame_equal(test_case, norm_counts) - - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} - ) - - norm_sum = pd.DataFrame( - [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - test_case = pd.crosstab( - df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True - ) - tm.assert_frame_equal(test_case, norm_sum) - - def test_crosstab_with_empties(self): - # Check handling of empties - df = pd.DataFrame( - { - "a": [1, 2, 2, 2, 2], - "b": [3, 3, 4, 4, 4], - "c": [np.nan, np.nan, np.nan, np.nan, np.nan], - } - ) - - empty = pd.DataFrame( - [[0.0, 0.0], [0.0, 0.0]], - index=pd.Index([1, 2], name="a", dtype="int64"), - columns=pd.Index([3, 4], name="b"), - ) - - for i in [True, "index", "columns"]: - calculated = pd.crosstab( - df.a, df.b, values=df.c, aggfunc="count", normalize=i - ) - tm.assert_frame_equal(empty, calculated) - - nans = pd.DataFrame( - [[0.0, 
np.nan], [0.0, 0.0]], - index=pd.Index([1, 2], name="a", dtype="int64"), - columns=pd.Index([3, 4], name="b"), - ) - - calculated = pd.crosstab( - df.a, df.b, values=df.c, aggfunc="count", normalize=False - ) - tm.assert_frame_equal(nans, calculated) - - def test_crosstab_errors(self): - # Issue 12578 - - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} - ) - - error = "values cannot be used without an aggfunc." - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, values=df.c) - - error = "aggfunc cannot be used without values" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, aggfunc=np.mean) - - error = "Not a valid normalize argument" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize="42") - - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize=42) - - error = "Not a valid margins argument" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize="all", margins=42) - - def test_crosstab_with_categorial_columns(self): - # GH 8860 - df = pd.DataFrame( - { - "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], - "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], - } - ) - categories = ["Sedan", "Electric", "Pickup"] - df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) - result = pd.crosstab(df["MAKE"], df["MODEL"]) - - expected_index = pd.Index(["Acura", "Honda", "Tesla"], name="MAKE") - expected_columns = pd.CategoricalIndex( - categories, categories=categories, ordered=False, name="MODEL" - ) - expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_columns - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_with_numpy_size(self): - # GH 4003 - df = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), - } - ) - result = pd.crosstab( - index=[df["A"], df["B"]], - columns=[df["C"]], - margins=True, - aggfunc=np.size, - values=df["D"], - ) - expected_index = pd.MultiIndex( - levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], - codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], - names=["A", "B"], - ) - expected_column = pd.Index(["bar", "foo", "All"], dtype="object", name="C") - expected_data = np.array( - [ - [2.0, 2.0, 4.0], - [2.0, 2.0, 4.0], - [2.0, 2.0, 4.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [12.0, 12.0, 24.0], - ] - ) - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_column - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_dup_index_names(self): - # GH 13279 - s = pd.Series(range(3), name="foo") - - result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name="foo") - expected = pd.DataFrame( - np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) - def test_crosstab_tuple_name(self, names): - s1 = pd.Series(range(3), name=names[0]) - s2 = pd.Series(range(1, 4), name=names[1]) - - mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names) - expected = pd.Series(1, index=mi).unstack(1, 
fill_value=0) - - result = pd.crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - - def test_crosstab_both_tuple_names(self): - # GH 18321 - s1 = pd.Series(range(3), name=("a", "b")) - s2 = pd.Series(range(3), name=("c", "d")) - - expected = pd.DataFrame( - np.eye(3, dtype="int64"), - index=pd.Index(range(3), name=("a", "b")), - columns=pd.Index(range(3), name=("c", "d")), - ) - result = crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - - def test_crosstab_unsorted_order(self): - df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) - result = pd.crosstab(df.index, [df.b, df.a]) - e_idx = pd.Index(["A", "B", "C"], name="row_0") - e_columns = pd.MultiIndex.from_tuples( - [(1, 4), (2, 6), (3, 5)], names=["b", "a"] - ) - expected = pd.DataFrame( - [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_normalize_multiple_columns(self): - # GH 15150 - df = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": [0] * 24, - "E": [0] * 24, - } - ) - result = pd.crosstab( - [df.A, df.B], - df.C, - values=df.D, - aggfunc=np.sum, - normalize=True, - margins=True, - ) - expected = pd.DataFrame( - np.array([0] * 29 + [1], dtype=float).reshape(10, 3), - columns=Index(["bar", "foo", "All"], dtype="object", name="C"), - index=MultiIndex.from_tuples( - [ - ("one", "A"), - ("one", "B"), - ("one", "C"), - ("three", "A"), - ("three", "B"), - ("three", "C"), - ("two", "A"), - ("two", "B"), - ("two", "C"), - ("All", ""), - ], - names=["A", "B"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_margin_normalize(self): - # GH 27500 - df = pd.DataFrame( - { - "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], - "C": [ - "small", - "large", - "large", - "small", - "small", - "large", - "small", - "small", - "large", - ], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], - } - ) - # normalize on index - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 - ) - expected = pd.DataFrame( - [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] - ) - expected.index = MultiIndex( - levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], - codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], - names=["A", "B"], - ) - expected.columns = Index(["large", "small"], dtype="object", name="C") - tm.assert_frame_equal(result, expected) - - # normalize on columns - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 - ) - expected = pd.DataFrame( - [ - [0.25, 0.2, 0.222222], - [0.25, 0.2, 0.222222], - [0.5, 0.2, 0.333333], - [0, 0.4, 0.222222], - ] - ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) - expected.index = MultiIndex( - levels=[["bar", "foo"], ["one", "two"]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=["A", "B"], - ) - tm.assert_frame_equal(result, expected) - - # normalize on both index and column - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True - ) - expected = pd.DataFrame( - [ - [0.111111, 0.111111, 0.222222], - [0.111111, 0.111111, 0.222222], - [0.222222, 0.111111, 0.333333], - [0.000000, 0.222222, 0.222222], - [0.444444, 0.555555, 1], - ] - ) - expected.columns = Index( - 
["large", "small", "Sub-Total"], dtype="object", name="C" - ) - expected.index = MultiIndex( - levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], - codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], - names=["A", "B"], - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index a503173bd74b1..8918d19e4ba7b 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -41,7 +41,7 @@ def test_union_categorical(self): for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) - tm.assert_categorical_equal(result, expected, check_category_order=True) + tm.assert_categorical_equal(result, expected) # new categories ordered by appearance s = Categorical(["x", "y", "z"]) diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index cd518dda4edbf..9d074b5ade425 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -25,6 +25,22 @@ def test_datetimeindex(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) + def test_tzaware_retained(self): + x = date_range("2000-01-01", periods=2, tz="US/Pacific") + y = np.array([3, 4]) + result1, result2 = cartesian_product([x, y]) + + expected = x.repeat(2) + tm.assert_index_equal(result1, expected) + + def test_tzaware_retained_categorical(self): + x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category") + y = np.array([3, 4]) + result1, result2 = cartesian_product([x, y]) + + expected = x.repeat(2) + tm.assert_index_equal(result1, expected) + def test_empty(self): # product of empty factors X = [[], [0, 1], []] diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 3846274dacd75..1fee40c2a902b 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -347,10 +347,18 @@ def test_period_from_ordinal(self): assert p == res assert isinstance(res, Period) - def test_period_cons_nat(self): - p = Period("NaT", freq="M") - assert p is NaT + @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + def test_construct_from_nat_string_and_freq(self, freq): + per = Period("NaT", freq=freq) + assert per is NaT + + per = Period("NaT", freq="2" + freq) + assert per is NaT + per = Period("NaT", freq="3" + freq) + assert per is NaT + + def test_period_cons_nat(self): p = Period("nat", freq="W-SUN") assert p is NaT @@ -930,87 +938,83 @@ def test_get_period_field_array_raises_on_out_of_range(self): libperiod.get_period_field_arr(-1, np.empty(1), 0) -class TestComparisons: - def setup_method(self, method): - self.january1 = Period("2000-01", "M") - self.january2 = Period("2000-01", "M") - self.february = Period("2000-02", "M") - self.march = Period("2000-03", "M") - self.day = Period("2012-01-01", "D") - - def test_equal(self): - assert self.january1 == self.january2 - - def test_equal_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 == self.day - - def test_notEqual(self): - assert self.january1 != 1 - assert self.january1 != self.february - - def test_greater(self): - assert self.february > self.january1 - - def test_greater_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 > self.day - - def test_greater_Raises_Type(self): - with 
pytest.raises(TypeError): - self.january1 > 1 - - def test_greaterEqual(self): - assert self.january1 >= self.january2 - - def test_greaterEqual_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 >= self.day - - with pytest.raises(TypeError): - print(self.january1 >= 1) - - def test_smallerEqual(self): - assert self.january1 <= self.january2 - - def test_smallerEqual_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 <= self.day +class TestPeriodComparisons: + def test_comparison_same_period_different_object(self): + # Separate Period objects for the same period + left = Period("2000-01", "M") + right = Period("2000-01", "M") - def test_smallerEqual_Raises_Type(self): - with pytest.raises(TypeError): - self.january1 <= 1 + assert left == right + assert left >= right + assert left <= right + assert not left < right + assert not left > right - def test_smaller(self): - assert self.january1 < self.february + def test_comparison_same_freq(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") - def test_smaller_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 < self.day + assert not jan == feb + assert jan != feb + assert jan < feb + assert jan <= feb + assert not jan > feb + assert not jan >= feb - def test_smaller_Raises_Type(self): - with pytest.raises(TypeError): - self.january1 < 1 + def test_comparison_mismatched_freq(self): + jan = Period("2000-01", "M") + day = Period("2012-01-01", "D") - def test_sort(self): - periods = [self.march, self.january1, self.february] - correctPeriods = [self.january1, self.february, self.march] + msg = r"Input has different freq=D from Period\(freq=M\)" + with pytest.raises(IncompatibleFrequency, match=msg): + jan == day + with pytest.raises(IncompatibleFrequency, match=msg): + jan != day + with pytest.raises(IncompatibleFrequency, match=msg): + jan < day + with pytest.raises(IncompatibleFrequency, match=msg): + jan <= day + with pytest.raises(IncompatibleFrequency, match=msg): + jan > day + with pytest.raises(IncompatibleFrequency, match=msg): + jan >= day + + def test_comparison_invalid_type(self): + jan = Period("2000-01", "M") + + assert not jan == 1 + assert jan != 1 + + msg = "Cannot compare type Period with type int" + for left, right in [(jan, 1), (1, jan)]: + + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + + def test_sort_periods(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") + mar = Period("2000-03", "M") + periods = [mar, jan, feb] + correctPeriods = [jan, feb, mar] assert sorted(periods) == correctPeriods - def test_period_nat_comp(self): - p_nat = Period("NaT", freq="D") + def test_period_cmp_nat(self): p = Period("2011-01-01", freq="D") - nat = Timestamp("NaT") t = Timestamp("2011-01-01") # confirm Period('NaT') work identical with Timestamp('NaT') for left, right in [ - (p_nat, p), - (p, p_nat), - (p_nat, p_nat), - (nat, t), - (t, nat), - (nat, nat), + (NaT, p), + (p, NaT), + (NaT, t), + (t, NaT), ]: assert not left < right assert not left > right @@ -1043,13 +1047,6 @@ def test_add_sub_nat(self): assert p - NaT is NaT assert NaT - p is NaT - p = Period("NaT", freq="M") - assert p is NaT - assert p + NaT is NaT - assert NaT + p is NaT - assert p - NaT is NaT - assert NaT - p is NaT - def test_add_invalid(self): # GH#4731 per1 = 
Period(freq="D", year=2008, month=1, day=1) @@ -1281,91 +1278,6 @@ def test_add_offset(self): with pytest.raises(IncompatibleFrequency): o + p - def test_add_offset_nat(self): - # freq is DateOffset - for freq in ["A", "2A", "3A"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.YearEnd(2)]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p + o is NaT - assert o + p is NaT - - for freq in ["M", "2M", "3M"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p + o is NaT - assert o + p is NaT - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(5), - offsets.Hour(24), - np.timedelta64(2, "D"), - np.timedelta64(3600 * 24, "s"), - timedelta(-2), - timedelta(hours=48), - ]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - assert p + o is NaT - assert o + p is NaT - - for freq in ["H", "2H", "3H"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(2), - offsets.Hour(3), - np.timedelta64(3, "h"), - np.timedelta64(3600, "s"), - timedelta(minutes=120), - timedelta(days=4, minutes=180), - ]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - assert p + o is NaT - assert o + p is NaT - def test_sub_offset(self): # freq is DateOffset for freq in ["A", "2A", "3A"]: @@ -1440,92 +1352,10 @@ def test_sub_offset(self): with pytest.raises(IncompatibleFrequency): p - o - def test_sub_offset_nat(self): - # freq is DateOffset - for freq in ["A", "2A", "3A"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.YearEnd(2)]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p - o is NaT - - for freq in ["M", "2M", "3M"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p - o is NaT - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(5), - offsets.Hour(24), - np.timedelta64(2, "D"), - np.timedelta64(3600 * 24, "s"), - timedelta(-2), - timedelta(hours=48), - ]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - assert p - o is NaT - - for freq in ["H", "2H", "3H"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(2), - offsets.Hour(3), - np.timedelta64(3, "h"), - np.timedelta64(3600, "s"), - timedelta(minutes=120), - timedelta(days=4, minutes=180), - ]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, 
"s"), - timedelta(hours=23, minutes=30), - ]: - assert p - o is NaT - @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) - def test_nat_ops(self, freq): - p = Period("NaT", freq=freq) - assert p is NaT - assert p + 1 is NaT - assert 1 + p is NaT - assert p - 1 is NaT - assert p - Period("2011-01", freq=freq) is NaT - assert Period("2011-01", freq=freq) - p is NaT + def test_period_addsub_nat(self, freq): + assert NaT - Period("2011-01", freq=freq) is NaT + assert Period("2011-01", freq=freq) - NaT is NaT def test_period_ops_offset(self): p = Period("2011-04-01", freq="D") diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index a537f000959e3..f94b96b47fc05 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -20,6 +20,7 @@ TimedeltaIndex, Timestamp, isna, + offsets, ) import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray @@ -508,3 +509,38 @@ def test_nat_comparisons(compare_operators_no_eq_ne, other): # GH 26039 assert getattr(NaT, compare_operators_no_eq_ne)(other) is False assert getattr(other, compare_operators_no_eq_ne)(NaT) is False + + +@pytest.mark.parametrize( + "obj", + [ + offsets.YearEnd(2), + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.MonthEnd(2), + offsets.MonthEnd(12), + offsets.Day(2), + offsets.Day(5), + offsets.Hour(24), + offsets.Hour(3), + offsets.Minute(), + np.timedelta64(3, "h"), + np.timedelta64(4, "h"), + np.timedelta64(3200, "s"), + np.timedelta64(3600, "s"), + np.timedelta64(3600 * 24, "s"), + np.timedelta64(2, "D"), + np.timedelta64(365, "D"), + timedelta(-2), + timedelta(365), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + timedelta(hours=23), + timedelta(hours=23, minutes=30), + timedelta(hours=48), + ], +) +def test_nat_addsub_tdlike_scalar(obj): + assert NaT + obj is NaT + assert obj + NaT is NaT + assert NaT - obj is NaT diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index ea02a76275443..3cb868dd88605 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -88,6 +88,13 @@ def test_td_add_datetimelike_scalar(self, op): result = op(td, NaT) assert result is NaT + def test_td_add_timestamp_overflow(self): + with pytest.raises(OverflowError): + Timestamp("1700-01-01") + Timedelta(13 * 19999, unit="D") + + with pytest.raises(OverflowError): + Timestamp("1700-01-01") + timedelta(days=13 * 19999) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): td = Timedelta(10, unit="d") @@ -365,6 +372,26 @@ def test_td_div_timedeltalike_scalar(self): assert np.isnan(td / NaT) + def test_td_div_td64_non_nano(self): + + # truediv + td = Timedelta("1 days 2 hours 3 ns") + result = td / np.timedelta64(1, "D") + assert result == td.value / float(86400 * 1e9) + result = td / np.timedelta64(1, "s") + assert result == td.value / float(1e9) + result = td / np.timedelta64(1, "ns") + assert result == td.value + + # floordiv + td = Timedelta("1 days 2 hours 3 ns") + result = td // np.timedelta64(1, "D") + assert result == 1 + result = td // np.timedelta64(1, "s") + assert result == 93600 + result = td // np.timedelta64(1, "ns") + assert result == td.value + def test_td_div_numeric_scalar(self): # GH#19738 td = Timedelta(10, unit="d") @@ -589,6 +616,13 @@ def test_td_rfloordiv_timedeltalike_array(self): expected = np.array([10, np.nan]) tm.assert_numpy_array_equal(res, expected) + def 
test_td_rfloordiv_intarray(self):
+        # deprecated GH#19761, enforced GH#29797
+        ints = np.array([1349654400, 1349740800, 1349827200, 1349913600]) * 10 ** 9
+
+        with pytest.raises(TypeError, match="Invalid dtype"):
+            ints // Timedelta(1, unit="s")
+
     def test_td_rfloordiv_numeric_series(self):
         # GH#18846
         td = Timedelta(hours=3, minutes=3)
@@ -796,3 +830,129 @@ def test_rdivmod_invalid(self):
     def test_td_op_timedelta_timedeltalike_array(self, op, arr):
         with pytest.raises(TypeError):
             op(arr, Timedelta("1D"))
+
+
+class TestTimedeltaComparison:
+    def test_compare_tick(self, tick_classes):
+        cls = tick_classes
+
+        off = cls(4)
+        td = off.delta
+        assert isinstance(td, Timedelta)
+
+        assert td == off
+        assert not td != off
+        assert td <= off
+        assert td >= off
+        assert not td < off
+        assert not td > off
+
+        assert not td == 2 * off
+        assert td != 2 * off
+        assert td <= 2 * off
+        assert td < 2 * off
+        assert not td >= 2 * off
+        assert not td > 2 * off
+
+    def test_comparison_object_array(self):
+        # analogous to GH#15183
+        td = Timedelta("2 days")
+        other = Timedelta("3 hours")
+
+        arr = np.array([other, td], dtype=object)
+        res = arr == td
+        expected = np.array([False, True], dtype=bool)
+        assert (res == expected).all()
+
+        # 2D case
+        arr = np.array([[other, td], [td, other]], dtype=object)
+        res = arr != td
+        expected = np.array([[True, False], [False, True]], dtype=bool)
+        assert res.shape == expected.shape
+        assert (res == expected).all()
+
+    def test_compare_timedelta_ndarray(self):
+        # GH#11835
+        periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")]
+        arr = np.array(periods)
+        result = arr[0] > arr
+        expected = np.array([False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0")
+    def test_compare_custom_object(self):
+        """
+        Make sure unsupported operations on Timedelta return NotImplemented
+        and yield to the other operand (GH#20829).
+ """ + + class CustomClass: + def __init__(self, cmp_result=None): + self.cmp_result = cmp_result + + def generic_result(self): + if self.cmp_result is None: + return NotImplemented + else: + return self.cmp_result + + def __eq__(self, other): + return self.generic_result() + + def __gt__(self, other): + return self.generic_result() + + t = Timedelta("1s") + + assert not (t == "string") + assert not (t == 1) + assert not (t == CustomClass()) + assert not (t == CustomClass(cmp_result=False)) + + assert t < CustomClass(cmp_result=True) + assert not (t < CustomClass(cmp_result=False)) + + assert t == CustomClass(cmp_result=True) + + @pytest.mark.parametrize("val", ["string", 1]) + def test_compare_unknown_type(self, val): + # GH#20829 + t = Timedelta("1s") + with pytest.raises(TypeError): + t >= val + with pytest.raises(TypeError): + t > val + with pytest.raises(TypeError): + t <= val + with pytest.raises(TypeError): + t < val + + +def test_ops_notimplemented(): + class Other: + pass + + other = Other() + + td = Timedelta("1 day") + assert td.__add__(other) is NotImplemented + assert td.__sub__(other) is NotImplemented + assert td.__truediv__(other) is NotImplemented + assert td.__mul__(other) is NotImplemented + assert td.__floordiv__(other) is NotImplemented + + +def test_ops_error_str(): + # GH#13624 + td = Timedelta("1 day") + + for left, right in [(td, "a"), ("a", td)]: + + with pytest.raises(TypeError): + left + right + + with pytest.raises(TypeError): + left > right + + assert not left == right + assert left != right diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 9cdbeb6ab4845..0f2486be3a626 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -4,56 +4,14 @@ import numpy as np import pytest -from pandas._libs.tslibs import NaT, Timestamp, iNaT +from pandas._libs.tslibs import NaT, iNaT import pandas as pd -from pandas import Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta +from pandas import Timedelta, TimedeltaIndex, offsets, to_timedelta import pandas._testing as tm -class TestTimedeltaArithmetic: - def test_arithmetic_overflow(self): - with pytest.raises(OverflowError): - Timestamp("1700-01-01") + Timedelta(13 * 19999, unit="D") - - with pytest.raises(OverflowError): - Timestamp("1700-01-01") + timedelta(days=13 * 19999) - - def test_array_timedelta_floordiv(self): - # deprecated GH#19761, enforced GH#29797 - ints = pd.date_range("2012-10-08", periods=4, freq="D").view("i8") - - with pytest.raises(TypeError, match="Invalid dtype"): - ints // Timedelta(1, unit="s") - - def test_ops_error_str(self): - # GH 13624 - td = Timedelta("1 day") - - for left, right in [(td, "a"), ("a", td)]: - - with pytest.raises(TypeError): - left + right - - with pytest.raises(TypeError): - left > right - - assert not left == right - assert left != right - - def test_ops_notimplemented(self): - class Other: - pass - - other = Other() - - td = Timedelta("1 day") - assert td.__add__(other) is NotImplemented - assert td.__sub__(other) is NotImplemented - assert td.__truediv__(other) is NotImplemented - assert td.__mul__(other) is NotImplemented - assert td.__floordiv__(other) is NotImplemented - +class TestTimedeltaUnaryOps: def test_unary_ops(self): td = Timedelta(10, unit="d") @@ -68,102 +26,6 @@ def test_unary_ops(self): assert abs(-td) == Timedelta("10d") -class TestTimedeltaComparison: - def test_compare_tick(self, tick_classes): - cls = tick_classes - - 
off = cls(4) - td = off.delta - assert isinstance(td, Timedelta) - - assert td == off - assert not td != off - assert td <= off - assert td >= off - assert not td < off - assert not td > off - - assert not td == 2 * off - assert td != 2 * off - assert td <= 2 * off - assert td < 2 * off - assert not td >= 2 * off - assert not td > 2 * off - - def test_comparison_object_array(self): - # analogous to GH#15183 - td = Timedelta("2 days") - other = Timedelta("3 hours") - - arr = np.array([other, td], dtype=object) - res = arr == td - expected = np.array([False, True], dtype=bool) - assert (res == expected).all() - - # 2D case - arr = np.array([[other, td], [td, other]], dtype=object) - res = arr != td - expected = np.array([[True, False], [False, True]], dtype=bool) - assert res.shape == expected.shape - assert (res == expected).all() - - def test_compare_timedelta_ndarray(self): - # GH11835 - periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")] - arr = np.array(periods) - result = arr[0] > arr - expected = np.array([False, False]) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0") - def test_compare_custom_object(self): - """ - Make sure non supported operations on Timedelta returns NonImplemented - and yields to other operand (GH#20829). - """ - - class CustomClass: - def __init__(self, cmp_result=None): - self.cmp_result = cmp_result - - def generic_result(self): - if self.cmp_result is None: - return NotImplemented - else: - return self.cmp_result - - def __eq__(self, other): - return self.generic_result() - - def __gt__(self, other): - return self.generic_result() - - t = Timedelta("1s") - - assert not (t == "string") - assert not (t == 1) - assert not (t == CustomClass()) - assert not (t == CustomClass(cmp_result=False)) - - assert t < CustomClass(cmp_result=True) - assert not (t < CustomClass(cmp_result=False)) - - assert t == CustomClass(cmp_result=True) - - @pytest.mark.parametrize("val", ["string", 1]) - def test_compare_unknown_type(self, val): - # GH20829 - t = Timedelta("1s") - with pytest.raises(TypeError): - t >= val - with pytest.raises(TypeError): - t > val - with pytest.raises(TypeError): - t <= val - with pytest.raises(TypeError): - t < val - - class TestTimedeltas: @pytest.mark.parametrize( "unit, value, expected", @@ -209,26 +71,6 @@ def test_conversion(self): td = Timedelta("1 days, 10:11:12.012345678") assert td != td.to_pytimedelta() - def test_freq_conversion(self): - - # truediv - td = Timedelta("1 days 2 hours 3 ns") - result = td / np.timedelta64(1, "D") - assert result == td.value / float(86400 * 1e9) - result = td / np.timedelta64(1, "s") - assert result == td.value / float(1e9) - result = td / np.timedelta64(1, "ns") - assert result == td.value - - # floordiv - td = Timedelta("1 days 2 hours 3 ns") - result = td // np.timedelta64(1, "D") - assert result == 1 - result = td // np.timedelta64(1, "s") - assert result == 93600 - result = td // np.timedelta64(1, "ns") - assert result == td.value - def test_fields(self): def check(value): # that we are int @@ -457,13 +299,15 @@ def test_to_numpy_alias(self): td = Timedelta("10m7s") assert td.to_timedelta64() == td.to_numpy() - def test_round(self): - - t1 = Timedelta("1 days 02:34:56.789123456") - t2 = Timedelta("-1 days 02:34:56.789123456") - - for (freq, s1, s2) in [ - ("N", t1, t2), + @pytest.mark.parametrize( + "freq,s1,s2", + [ + # This first case has s1, s2 being the same as t1,t2 below + ( + "N", + Timedelta("1 days 
02:34:56.789123456"), + Timedelta("-1 days 02:34:56.789123456"), + ), ( "U", Timedelta("1 days 02:34:56.789123000"), @@ -481,75 +325,21 @@ def test_round(self): ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), ("d", Timedelta("1 days"), Timedelta("-1 days")), - ]: - r1 = t1.round(freq) - assert r1 == s1 - r2 = t2.round(freq) - assert r2 == s2 - - # invalid - for freq, msg in [ - ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), - ("foobar", "Invalid frequency: foobar"), - ]: - with pytest.raises(ValueError, match=msg): - t1.round(freq) + ], + ) + def test_round(self, freq, s1, s2): - t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") - t2 = -1 * t1 - t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") - t1c = TimedeltaIndex([1, 1, 1], unit="D") + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") - # note that negative times round DOWN! so don't give whole numbers - for (freq, s1, s2) in [ - ("N", t1, t2), - ("U", t1, t2), - ( - "L", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ( - "S", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ( - "12T", - t1c, - TimedeltaIndex( - ["-1 days", "-1 days", "-1 days"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ( - "H", - t1c, - TimedeltaIndex( - ["-1 days", "-1 days", "-1 days"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), - ]: + r1 = t1.round(freq) + assert r1 == s1 + r2 = t2.round(freq) + assert r2 == s2 - r1 = t1.round(freq) - tm.assert_index_equal(r1, s1) - r2 = t2.round(freq) - tm.assert_index_equal(r2, s2) + def test_round_invalid(self): + t1 = Timedelta("1 days 02:34:56.789123456") - # invalid for freq, msg in [ ("Y", " is a non-fixed frequency"), ("M", " is a non-fixed frequency"), @@ -561,7 +351,7 @@ def test_round(self): def test_contains(self): # Checking for any NaT-like objects # GH 13603 - td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) for v in [NaT, None, float("nan"), np.nan]: assert not (v in td) @@ -652,29 +442,6 @@ def conv(v): with pytest.raises(ValueError): Timedelta("- 1days, 00") - def test_overflow(self): - # GH 9442 - s = Series(pd.date_range("20130101", periods=100000, freq="H")) - s[0] += Timedelta("1s 1ms") - - # mean - result = (s - s.min()).mean() - expected = Timedelta((TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) - - # the computation is converted to float so - # might be some loss of precision - assert np.allclose(result.value / 1000, expected.value / 1000) - - # sum - msg = "overflow in timedelta operation" - with pytest.raises(ValueError, match=msg): - (s - s.min()).sum() - s1 = s[0:10000] - with pytest.raises(ValueError, match=msg): - (s1 - s1.min()).sum() - s2 = s[0:1000] - result = (s2 - s2.min()).sum() - def test_pickle(self): v = Timedelta("1 days 10:11:12.0123456") @@ -690,7 +457,7 @@ def test_timedelta_hash_equality(self): d = {td: 2} assert d[v] == 2 - tds = timedelta_range("1 second", periods=20) + tds = [Timedelta(seconds=1) + Timedelta(days=n) for n in range(20)] assert all(hash(td) == hash(td.to_pytimedelta()) for td in tds) # python timedeltas drop ns resolution @@ -734,57 +501,6 @@ def 
test_total_seconds_precision(self): assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 - def test_timedelta_arithmetic(self): - data = Series(["nat", "32 days"], dtype="timedelta64[ns]") - deltas = [timedelta(days=1), Timedelta(1, unit="D")] - for delta in deltas: - result_method = data.add(delta) - result_operator = data + delta - expected = Series(["nat", "33 days"], dtype="timedelta64[ns]") - tm.assert_series_equal(result_operator, expected) - tm.assert_series_equal(result_method, expected) - - result_method = data.sub(delta) - result_operator = data - delta - expected = Series(["nat", "31 days"], dtype="timedelta64[ns]") - tm.assert_series_equal(result_operator, expected) - tm.assert_series_equal(result_method, expected) - # GH 9396 - result_method = data.div(delta) - result_operator = data / delta - expected = Series([np.nan, 32.0], dtype="float64") - tm.assert_series_equal(result_operator, expected) - tm.assert_series_equal(result_method, expected) - - def test_apply_to_timedelta(self): - timedelta_NaT = to_timedelta("NaT") - - list_of_valid_strings = ["00:00:01", "00:00:02"] - a = to_timedelta(list_of_valid_strings) - b = Series(list_of_valid_strings).apply(to_timedelta) - # Can't compare until apply on a Series gives the correct dtype - # assert_series_equal(a, b) - - list_of_strings = ["00:00:01", np.nan, NaT, timedelta_NaT] - - # TODO: unused? - a = to_timedelta(list_of_strings) # noqa - b = Series(list_of_strings).apply(to_timedelta) # noqa - # Can't compare until apply on a Series gives the correct dtype - # assert_series_equal(a, b) - - def test_components(self): - rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") - rng.components - - # with nat - s = Series(rng) - s[1] = np.nan - - result = s.dt.components - assert not result.iloc[0].isna().all() - assert result.iloc[1].isna().all() - def test_resolution_string(self): assert Timedelta(days=1).resolution_string == "D" assert Timedelta(days=1, hours=6).resolution_string == "H" diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 1cab007c20a0e..ccd7bf721430a 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + from pandas import Timedelta, Timestamp from pandas.tseries import offsets @@ -60,6 +62,18 @@ def test_overflow_offset_raises(self): with pytest.raises(OverflowError, match=msg): stamp - offset_overflow + def test_overflow_timestamp_raises(self): + # https://github.com/pandas-dev/pandas/issues/31774 + msg = "Result is too large" + a = Timestamp("2101-01-01 00:00:00") + b = Timestamp("1688-01-01 00:00:00") + + with pytest.raises(OutOfBoundsDatetime, match=msg): + a - b + + # but we're OK for timestamp and datetime.datetime + assert (a - b.to_pydatetime()) == (a.to_pydatetime() - b) + def test_delta_preserve_nanos(self): val = Timestamp(1337299200000000123) result = val + timedelta(1) diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 05bd967903e9d..a3c431696b689 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -523,9 +523,9 @@ def test_drop_unique_and_non_unique_index( ], ) def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): - + ser = 
Series(data, index=index)
     with pytest.raises(error_type, match=error_desc):
-        Series(data, index=index).drop(drop_labels, axis=axis)
+        ser.drop(drop_labels, axis=axis)


 def test_drop_with_ignore_errors():
@@ -565,6 +565,7 @@ def test_drop_empty_list(index, drop_labels):
 )


 def test_drop_non_empty_list(data, index, drop_labels):
     # GH 21494 and GH 16877
+    dtype = object if data is None else None
+    ser = pd.Series(data=data, index=index, dtype=dtype)
     with pytest.raises(KeyError, match="not found in axis"):
-        dtype = object if data is None else None
-        pd.Series(data=data, index=index, dtype=dtype).drop(drop_labels)
+        ser.drop(drop_labels)
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index fc9d4ec5290a5..b5d04fd499c08 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -293,7 +293,7 @@ def test_getitem_setitem_datetimeindex():

     result = ts.copy()
     result[ts.index[4:8]] = 0
-    result[4:8] = ts[4:8]
+    result.iloc[4:8] = ts.iloc[4:8]
     tm.assert_series_equal(result, ts)

     # also test partial date slicing
@@ -349,7 +349,7 @@ def test_getitem_setitem_periodindex():

     result = ts.copy()
     result[ts.index[4:8]] = 0
-    result[4:8] = ts[4:8]
+    result.iloc[4:8] = ts.iloc[4:8]
     tm.assert_series_equal(result, ts)

diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py
index 438b61ed203a3..5847141a44ef5 100644
--- a/pandas/tests/series/indexing/test_get.py
+++ b/pandas/tests/series/indexing/test_get.py
@@ -132,3 +132,20 @@ def test_get_nan_multiple():
     idx = [np.nan, np.nan]

     assert s.get(idx) is None
+
+
+def test_get_with_default():
+    # GH#7725
+    d0 = ["a", "b", "c", "d"]
+    d1 = np.arange(4, dtype="int64")
+    others = ["e", 10]
+
+    for data, index in ((d0, d1), (d1, d0)):
+        s = Series(data, index=index)
+        for i, d in zip(index, data):
+            assert s.get(i) == d
+            assert s.get(i, d) == d
+            assert s.get(i, "z") == d
+        for other in others:
+            assert s.get(other, "z") == "z"
+            assert s.get(other, other) == other
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index 18fcbea683dd3..5b3786e1a0d3c 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -923,3 +923,15 @@ def test_getitem_2d_no_warning():
     series = pd.Series([1, 2, 3], index=[1, 2, 3])
     with tm.assert_produces_warning(None):
         series[:, None]
+
+
+def test_getitem_unrecognized_scalar():
+    # GH#32684 a scalar key that is not recognized by lib.is_scalar
+
+    # a series that might be produced via `frame.dtypes`
+    ser = pd.Series([1, 2], index=[np.dtype("O"), np.dtype("i8")])
+
+    key = ser.index[1]
+
+    result = ser[key]
+    assert result == 2
diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py
index c7fe6ed19a2eb..4353eb4c8cd64 100644
--- a/pandas/tests/series/methods/test_argsort.py
+++ b/pandas/tests/series/methods/test_argsort.py
@@ -49,8 +49,8 @@ def test_argsort_stable(self):
         mexpected = np.argsort(s.values, kind="mergesort")
         qexpected = np.argsort(s.values, kind="quicksort")

-        tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False)
-        tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False)
+        tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected))
+        tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected))
         msg = (
             r"ndarray Expected type <class 'pandas\.core\.series\.Series'>, "
             r"found <class 'numpy\.ndarray'> instead"
         )
diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py
index b121efd202744..ad5a2de6eabac 100644
--- a/pandas/tests/series/methods/test_asof.py
+++ b/pandas/tests/series/methods/test_asof.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest

+from pandas._libs.tslibs import IncompatibleFrequency
+
 from pandas import Series, Timestamp, date_range, isna, notna, offsets
 import pandas._testing as tm
@@ -12,7 +14,7 @@ def test_basic(self):
         N = 50
         rng = date_range("1/1/1990", periods=N, freq="53s")
         ts = Series(np.random.randn(N), index=rng)
-        ts[15:30] = np.nan
+        ts.iloc[15:30] = np.nan

         dates = date_range("1/1/1990", periods=N * 3, freq="25s")
         result = ts.asof(dates)
@@ -37,8 +39,8 @@ def test_scalar(self):
         N = 30
         rng = date_range("1/1/1990", periods=N, freq="53s")
         ts = Series(np.arange(N), index=rng)
-        ts[5:10] = np.NaN
-        ts[15:20] = np.NaN
+        ts.iloc[5:10] = np.NaN
+        ts.iloc[15:20] = np.NaN

         val1 = ts.asof(ts.index[7])
         val2 = ts.asof(ts.index[19])
@@ -94,7 +96,7 @@ def test_periodindex(self):
         N = 50
         rng = period_range("1/1/1990", periods=N, freq="H")
         ts = Series(np.random.randn(N), index=rng)
-        ts[15:30] = np.nan
+        ts.iloc[15:30] = np.nan

         dates = date_range("1/1/1990", periods=N * 3, freq="37min")
         result = ts.asof(dates)
@@ -112,8 +114,8 @@ def test_periodindex(self):
         rs = result[mask]
         assert (rs == ts[lb]).all()

-        ts[5:10] = np.nan
-        ts[15:20] = np.nan
+        ts.iloc[5:10] = np.nan
+        ts.iloc[15:20] = np.nan

         val1 = ts.asof(ts.index[7])
         val2 = ts.asof(ts.index[19])
@@ -132,6 +134,11 @@ def test_periodindex(self):
         d = ts.index[0].to_timestamp() - offsets.BDay()
         assert isna(ts.asof(d))

+        # Mismatched freq
+        msg = "Input has different freq"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            ts.asof(rng.asfreq("D"))
+
     def test_errors(self):

         s = Series(
diff --git a/pandas/tests/series/methods/test_autocorr.py b/pandas/tests/series/methods/test_autocorr.py
new file mode 100644
index 0000000000000..05e3540a7e702
--- /dev/null
+++ b/pandas/tests/series/methods/test_autocorr.py
@@ -0,0 +1,30 @@
+import numpy as np
+
+
+class TestAutoCorr:
+    def test_autocorr(self, datetime_series):
+        # Just run the function
+        corr1 = datetime_series.autocorr()
+
+        # Now run it with the lag parameter
+        corr2 = datetime_series.autocorr(lag=1)
+
+        # corr() with lag needs Series of at least length 2
+        if len(datetime_series) <= 2:
+            assert np.isnan(corr1)
+            assert np.isnan(corr2)
+        else:
+            assert corr1 == corr2
+
+        # Choose a random lag between 1 and length of Series - 2
+        # and compare the result with the Series corr() function
+        n = 1 + np.random.randint(max(1, len(datetime_series) - 2))
+        corr1 = datetime_series.corr(datetime_series.shift(n))
+        corr2 = datetime_series.autocorr(lag=n)
+
+        # corr() with lag needs Series of at least length 2
+        if len(datetime_series) <= 2:
+            assert np.isnan(corr1)
+            assert np.isnan(corr2)
+        else:
+            assert corr1 == corr2
diff --git a/pandas/tests/series/methods/test_between_time.py b/pandas/tests/series/methods/test_between_time.py
index 3fa26afe77a1d..e9d2f8e6f1637 100644
--- a/pandas/tests/series/methods/test_between_time.py
+++ b/pandas/tests/series/methods/test_between_time.py
@@ -139,6 +139,6 @@ def test_between_time_axis(self):
         assert len(ts.between_time(stime, etime)) == expected_length
         assert len(ts.between_time(stime, etime, axis=0)) == expected_length
-        msg = "No axis named 1 for object type <class 'pandas.core.series.Series'>"
+        msg = "No axis named 1 for object type Series"
         with pytest.raises(ValueError, match=msg):
             ts.between_time(stime, etime, axis=1)
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 17527a09f07a1..a41f893e3753f 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -279,3 +279,8 @@ def test_convert_string_dtype(self):
         )
         result = df.convert_dtypes()
         tm.assert_frame_equal(df, result)
+
+    def test_convert_bool_dtype(self):
+        # GH32287
+        df = pd.DataFrame({"A": pd.array([True])})
+        tm.assert_frame_equal(df, df.convert_dtypes())
diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py
index 2d052505d5ecc..54f32f979232d 100644
--- a/pandas/tests/series/methods/test_drop_duplicates.py
+++ b/pandas/tests/series/methods/test_drop_duplicates.py
@@ -44,6 +44,26 @@ def test_drop_duplicates_bool(keep, expected):
     tm.assert_series_equal(sc, tc[~expected])


+@pytest.mark.parametrize("values", [[], list(range(5))])
+def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
+    tc = Series(values, dtype=np.dtype(any_numpy_dtype))
+    expected = Series([False] * len(tc), dtype="bool")
+
+    if tc.dtype == "bool":
+        # 0 -> False and 1 -> True
+        # any other value would be duplicated
+        tc = tc[:2]
+        expected = expected[:2]
+
+    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
+
+    result_dropped = tc.drop_duplicates(keep=keep)
+    tm.assert_series_equal(result_dropped, tc)
+
+    # validate shallow copy
+    assert result_dropped is not tc
+
+
 class TestSeriesDropDuplicates:
     @pytest.mark.parametrize(
         "dtype",
diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py
index 3d4688c8274f9..caaffb7d5b61f 100644
--- a/pandas/tests/series/methods/test_rank.py
+++ b/pandas/tests/series/methods/test_rank.py
@@ -202,9 +202,7 @@ def test_rank_categorical(self):
     def test_rank_signature(self):
         s = Series([0, 1])
         s.rank(method="average")
-        msg = (
-            "No axis named average for object type <class 'pandas.core.series.Series'>"
-        )
+        msg = "No axis named average for object type Series"
         with pytest.raises(ValueError, match=msg):
             s.rank("average")

diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 26eaf53616282..904a455870ab1 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -294,7 +294,7 @@ def test_replace_categorical(self, categorical, numeric):
         s = pd.Series(categorical)
         result = s.replace({"A": 1, "B": 2})
         expected = pd.Series(numeric)
-        tm.assert_series_equal(expected, result, check_dtype=False)
+        tm.assert_series_equal(expected, result)

     def test_replace_categorical_single(self):
         # GH 26988
diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py
index 8256e2f33b936..e8d7f5958d0a1 100644
--- a/pandas/tests/series/methods/test_shift.py
+++ b/pandas/tests/series/methods/test_shift.py
@@ -263,3 +263,13 @@ def test_shift_categorical(self):

         tm.assert_index_equal(s.values.categories, sp1.values.categories)
         tm.assert_index_equal(s.values.categories, sn2.values.categories)
+
+    def test_shift_dt64values_int_fill_deprecated(self):
+        # GH#31971
+        ser = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])
+
+        with tm.assert_produces_warning(FutureWarning):
+            result = ser.shift(1, fill_value=0)
+
+        expected = pd.Series([pd.Timestamp(0), ser[0]])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py
index 6fa4eeaee34c0..d4ebc9062a0c9 100644
--- a/pandas/tests/series/methods/test_sort_index.py
+++ b/pandas/tests/series/methods/test_sort_index.py
@@ -30,7 +30,7 @@ def test_sort_index(self, datetime_series):
         sorted_series = random_order.sort_index(axis=0)
         tm.assert_series_equal(sorted_series, datetime_series)

-        msg = "No axis named 1 for object type <class 'pandas.core.series.Series'>"
+        msg = "No axis named 1 for object type Series"
         with pytest.raises(ValueError, match=msg):
             random_order.sort_values(axis=1)

diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py
index fdb35befeb0c2..f97362ce9c2a9 100644
--- a/pandas/tests/series/methods/test_value_counts.py
+++ b/pandas/tests/series/methods/test_value_counts.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest

 import pandas as pd
 from pandas import Categorical, CategoricalIndex, Series
@@ -177,3 +178,28 @@ def test_value_counts_categorical_with_nan(self):
         exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
         res = ser.value_counts(dropna=False, sort=False)
         tm.assert_series_equal(res, exp)
+
+    @pytest.mark.parametrize(
+        "ser, dropna, exp",
+        [
+            (
+                pd.Series([False, True, True, pd.NA]),
+                False,
+                pd.Series([2, 1, 1], index=[True, False, pd.NA]),
+            ),
+            (
+                pd.Series([False, True, True, pd.NA]),
+                True,
+                pd.Series([2, 1], index=[True, False]),
+            ),
+            (
+                pd.Series(range(3), index=[True, False, np.nan]).index,
+                False,
+                pd.Series([1, 1, 1], index=[True, False, pd.NA]),
+            ),
+        ],
+    )
+    def test_value_counts_bool_with_nan(self, ser, dropna, exp):
+        # GH32146
+        out = ser.value_counts(dropna=dropna)
+        tm.assert_series_equal(out, exp)
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 6f45b72154805..149d0aae8ab99 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -209,3 +209,27 @@ def test_validate_stat_keepdims(self):
         )
         with pytest.raises(ValueError, match=msg):
             np.sum(s, keepdims=True)
+
+    def test_td64_summation_overflow(self):
+        # GH 9442
+        s = pd.Series(pd.date_range("20130101", periods=100000, freq="H"))
+        s[0] += pd.Timedelta("1s 1ms")
+
+        # mean
+        result = (s - s.min()).mean()
+        expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)).sum())
+
+        # the computation is converted to float so
+        # might be some loss of precision
+        assert np.allclose(result.value / 1000, expected.value / 1000)
+
+        # sum
+        msg = "overflow in timedelta operation"
+        with pytest.raises(ValueError, match=msg):
+            (s - s.min()).sum()
+
+        s1 = s[0:10000]
+        with pytest.raises(ValueError, match=msg):
+            (s1 - s1.min()).sum()
+        s2 = s[0:1000]
+        (s2 - s2.min()).sum()
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
index a4c55a80a9f0f..3c2cb5275f3a8 100644
--- a/pandas/tests/series/test_apply.py
+++ b/pandas/tests/series/test_apply.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest

+from pandas.core.dtypes.generic import ABCMultiIndex
+
 import pandas as pd
 from pandas import DataFrame, Index, Series, isna
 import pandas._testing as tm
@@ -514,9 +516,11 @@ def test_map(self, datetime_series):
         exp = Series([np.nan, "B", "C", "D"])
         tm.assert_series_equal(a.map(c), exp)

-    @pytest.mark.parametrize("index", tm.all_index_generator(10))
-    def test_map_empty(self, index):
-        s = Series(index)
+    def test_map_empty(self, indices):
+        if isinstance(indices, ABCMultiIndex):
+            pytest.skip("Initializing a Series from a MultiIndex is not supported")
+
+        s = Series(indices)

         result = s.map({})
         expected = pd.Series(np.nan, index=s.index)
pd.Series(np.nan, index=s.index) @@ -787,3 +791,25 @@ def test_map_float_to_string_precision(self): result = ser.map(lambda val: str(val)).to_dict() expected = {0: "0.3333333333333333"} assert result == expected + + def test_map_with_invalid_na_action_raises(self): + # https://github.com/pandas-dev/pandas/issues/32815 + s = pd.Series([1, 2, 3]) + msg = "na_action must either be 'ignore' or None" + with pytest.raises(ValueError, match=msg): + s.map(lambda x: x, na_action="____") + + def test_apply_to_timedelta(self): + list_of_valid_strings = ["00:00:01", "00:00:02"] + a = pd.to_timedelta(list_of_valid_strings) + b = Series(list_of_valid_strings).apply(pd.to_timedelta) + # FIXME: don't leave commented-out + # Can't compare until apply on a Series gives the correct dtype + # assert_series_equal(a, b) + + list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] + + a = pd.to_timedelta(list_of_strings) # noqa + b = Series(list_of_strings).apply(pd.to_timedelta) # noqa + # Can't compare until apply on a Series gives the correct dtype + # assert_series_equal(a, b) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 10197766ce4a6..95d04c9a45d25 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -74,7 +74,7 @@ def test_add_series_with_period_index(self): result = ts + ts[::2] expected = ts + ts - expected[1::2] = np.nan + expected.iloc[1::2] = np.nan tm.assert_series_equal(result, expected) result = ts + _permute(ts[::2]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1a794f8656abe..e4c25f31c4b43 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -393,7 +393,7 @@ def test_constructor_categorical_dtype(self): expected = Series( ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) ) - tm.assert_series_equal(result, expected, check_categorical=True) + tm.assert_series_equal(result, expected) def test_constructor_categorical_string(self): # GH 26336: the string 'category' maintains existing CategoricalDtype @@ -1115,7 +1115,7 @@ def create_data(constructor): tm.assert_series_equal(result_datetime, expected) tm.assert_series_equal(result_Timestamp, expected) - def test_contructor_dict_tuple_indexer(self): + def test_constructor_dict_tuple_indexer(self): # GH 12948 data = {(1, 1, None): -1.0} result = Series(data) @@ -1428,3 +1428,10 @@ def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): result = Series([Timestamp("2019", tz=tz)], dtype="datetime64[ns]") expected = Series([Timestamp("2019")]) tm.assert_series_equal(result, expected) + + def test_constructor_datetime64(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + dates = np.asarray(rng) + + series = Series(dates) + assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 80a024eda7848..31f17be2fac7b 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -296,18 +296,18 @@ def cmp(a, b): # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) - # valid conversion - for valid in [ - lambda x: x.astype("category"), - lambda x: x.astype(CategoricalDtype()), - lambda x: x.astype("object").astype("category"), - lambda x: x.astype("object").astype(CategoricalDtype()), - ]: - - result = valid(s) - # compare series values - # internal
.categories can't be compared because it is sorted - tm.assert_series_equal(result, s, check_categorical=False) + tm.assert_series_equal(s.astype("category"), s) + tm.assert_series_equal(s.astype(CategoricalDtype()), s) + + roundtrip_expected = s.cat.set_categories( + s.cat.categories.sort_values() + ).cat.remove_unused_categories() + tm.assert_series_equal( + s.astype("object").astype("category"), roundtrip_expected + ) + tm.assert_series_equal( + s.astype("object").astype(CategoricalDtype()), roundtrip_expected + ) # invalid conversion (these are NOT a dtype) msg = ( diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 3513db6177951..89181a08819b1 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -47,9 +47,9 @@ def test_unique(): # GH 18051 s = Series(Categorical([])) - tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) + tm.assert_categorical_equal(s.unique(), Categorical([])) s = Series(Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False) + tm.assert_categorical_equal(s.unique(), Categorical([np.nan])) def test_unique_data_ownership(): diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 1566d8f36373b..3a1996b2938a5 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -232,7 +232,7 @@ def test_from_list_dtype(self): assert result._data.blocks[0].is_extension is False -def test_hasnans_unchached_for_series(): +def test_hasnans_uncached_for_series(): # GH#19700 idx = pd.Index([0, 1]) assert idx.hasnans is False diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index bac005465034f..15f1bc8941d47 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -823,7 +823,7 @@ def test_dropna_empty(self): assert len(s) == 0 # invalid axis - msg = "No axis named 1 for object type " + msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): s.dropna(axis=1) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 64a8c4569406e..77f942a9e32ec 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -218,6 +218,25 @@ def test_index_repr_in_frame_with_nan(self): assert repr(s) == exp + def test_format_pre_1900_dates(self): + rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, pd.NaT.value], dtype="M8[ns]") + + result = repr(series) + expected = ( + "0 1970-01-01 00:00:00.000000\n" + "1 1970-01-01 00:00:00.000001\n" + "2 1970-01-01 00:00:00.000002\n" + "3 NaT\n" + "dtype: datetime64[ns]" + ) + assert result == expected + class TestCategoricalRepr: def test_categorical_repr_unicode(self): diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 9796a32532b99..6ca67b6cc8429 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1,53 +1,12 @@ -from io import StringIO - import numpy as np - -from pandas._libs.tslib import iNaT +import pytest import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, date_range, timedelta_range import pandas._testing as tm -def _simple_ts(start, end, freq="D"): - rng = date_range(start, end, freq=freq) - return 
Series(np.random.randn(len(rng)), index=rng) - - -def assert_range_equal(left, right): - assert left.equals(right) - assert left.freq == right.freq - assert left.tz == right.tz - - class TestTimeSeries: - def test_autocorr(self, datetime_series): - # Just run the function - corr1 = datetime_series.autocorr() - - # Now run it with the lag parameter - corr2 = datetime_series.autocorr(lag=1) - - # corr() with lag needs Series of at least length 2 - if len(datetime_series) <= 2: - assert np.isnan(corr1) - assert np.isnan(corr2) - else: - assert corr1 == corr2 - - # Choose a random lag between 1 and length of Series - 2 - # and compare the result with the Series corr() function - n = 1 + np.random.randint(max(1, len(datetime_series) - 2)) - corr1 = datetime_series.corr(datetime_series.shift(n)) - corr2 = datetime_series.autocorr(lag=n) - - # corr() with lag needs Series of at least length 2 - if len(datetime_series) <= 2: - assert np.isnan(corr1) - assert np.isnan(corr2) - else: - assert corr1 == corr2 - def test_mpl_compat_hack(self, datetime_series): # This is currently failing because the test was relying on @@ -74,33 +33,13 @@ def test_contiguous_boolean_preserve_freq(self): masked = rng[mask] expected = rng[10:20] - assert expected.freq is not None - assert_range_equal(masked, expected) + assert expected.freq == rng.freq + tm.assert_index_equal(masked, expected) mask[22] = True masked = rng[mask] assert masked.freq is None - def test_series_ctor_datetime64(self): - rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") - dates = np.asarray(rng) - - series = Series(dates) - assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) - - def test_series_repr_nat(self): - series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") - - result = repr(series) - expected = ( - "0 1970-01-01 00:00:00.000000\n" - "1 1970-01-01 00:00:00.000001\n" - "2 1970-01-01 00:00:00.000002\n" - "3 NaT\n" - "dtype: datetime64[ns]" - ) - assert result == expected - def test_promote_datetime_date(self): rng = date_range("1/1/2000", periods=20) ts = Series(np.random.randn(20), index=rng) @@ -124,12 +63,6 @@ def test_promote_datetime_date(self): expected = rng.get_indexer(ts_slice.index) tm.assert_numpy_array_equal(result, expected) - def test_format_pre_1900_dates(self): - rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") - rng.format() - ts = Series(1, index=rng) - repr(ts) - def test_groupby_count_dateparseerror(self): dr = date_range(start="1/1/2012", freq="5min", periods=10) @@ -144,15 +77,6 @@ def test_groupby_count_dateparseerror(self): tm.assert_series_equal(result, expected) - def test_to_csv_numpy_16_bug(self): - frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) - - buf = StringIO() - frame.to_csv(buf) - - result = buf.getvalue() - assert "2000-01-01" in result - def test_series_map_box_timedelta(self): # GH 11349 s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) @@ -196,6 +120,19 @@ def test_view_tz(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_asarray_object_dt64(self, tz): + ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) + + with tm.assert_produces_warning(None): + # Future behavior (for tzaware case) with no warning + result = np.asarray(ser, dtype=object) + + expected = np.array( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) + tm.assert_numpy_array_equal(result, expected) + def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
ser = pd.Series(pd.date_range("2000", periods=2)) @@ -204,12 +141,6 @@ def test_asarray_tz_naive(self): tm.assert_numpy_array_equal(result, expected) - # optionally, object - result = np.asarray(ser, dtype=object) - - expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) - tm.assert_numpy_array_equal(result, expected) - def test_asarray_tz_aware(self): tz = "US/Central" ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) @@ -222,11 +153,3 @@ def test_asarray_tz_aware(self): result = np.asarray(ser, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) - - # Future behavior with no warning - expected = np.array( - [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] - ) - result = np.asarray(ser, dtype=object) - - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 536f15ea75d69..c7fc37a278e83 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -171,7 +171,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("shuffle", SHUFFLE) @pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning") -def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): +def test_multiple_output_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): # Test that # the same conditions from binary_ufunc_scalar apply to # ufuncs with multiple outputs. @@ -204,7 +204,7 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): +def test_multiple_output_ufunc(sparse, arrays_for_binary_ufunc): # Test that the same conditions from unary input apply to multi-output # ufuncs array, _ = arrays_for_binary_ufunc diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a1de9c435c9ba..ad7028702ec8c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -760,6 +760,16 @@ def test_categorical_from_codes(self): result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) + def test_categorical_isin(self): + vals = np.array([0, 1, 2, 0]) + cats = ["a", "b", "c"] + cat = Categorical(1).from_codes(vals, cats) + other = Categorical(1).from_codes(np.array([0, 1]), cats) + + expected = np.array([True, True, False, True]) + result = algos.isin(cat, other) + tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in(self): # GH 22160 # nan is special, because from " a is b" doesn't follow "a == b" diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 186c735a0bff9..bcfed2d0d3a10 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p17 + import pandas as pd from pandas import Series, Timestamp from pandas.core import ops @@ -59,8 +61,31 @@ def test_random_state(): # check with no arg random state assert com.random_state() is np.random + # check array-like + # GH32503 + state_arr_like = npr.randint(0, 2 ** 31, size=624, dtype="uint32") + assert ( + com.random_state(state_arr_like).uniform() + == npr.RandomState(state_arr_like).uniform() + ) + + # Check BitGenerators + # GH32503 + if not _np_version_under1p17: + assert ( + 
com.random_state(npr.MT19937(3)).uniform() + == npr.RandomState(npr.MT19937(3)).uniform() + ) + assert ( + com.random_state(npr.PCG64(11)).uniform() + == npr.RandomState(npr.PCG64(11)).uniform() + ) + # Error for floats or strings - msg = "random_state must be an integer, a numpy RandomState, or None" + msg = ( + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" + ) with pytest.raises(ValueError, match=msg): com.random_state("test") diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index b2a85b539fd86..122ef1f47968e 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -107,7 +107,6 @@ def test_pandas_datareader(): # importing from pandas, Cython import warning @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") -@pytest.mark.skip(reason="Anaconda installation issue - GH32144") def test_geopandas(): geopandas = import_module("geopandas") # noqa diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index f7e652eb78e2d..cac6a59527a6e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -19,6 +19,14 @@ has_c16 = hasattr(np, "complex128") +@pytest.fixture(params=[True, False]) +def skipna(request): + """ + Fixture to pass skipna to nanops functions. + """ + return request.param + + class TestnanopsDataFrame: def setup_method(self, method): np.random.seed(11235) @@ -89,28 +97,14 @@ def teardown_method(self, method): def check_results(self, targ, res, axis, check_dtype=True): res = getattr(res, "asm8", res) - res = getattr(res, "values", res) - - # timedeltas are a beast here - def _coerce_tds(targ, res): - if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": - if len(targ) == 1: - targ = targ[0].item() - res = res.item() - else: - targ = targ.view("i8") - return targ, res - try: - if ( - axis != 0 - and hasattr(targ, "shape") - and targ.ndim - and targ.shape != res.shape - ): - res = np.split(res, [targ.shape[0]], axis=0)[0] - except (ValueError, IndexError): - targ, res = _coerce_tds(targ, res) + if ( + axis != 0 + and hasattr(targ, "shape") + and targ.ndim + and targ.shape != res.shape + ): + res = np.split(res, [targ.shape[0]], axis=0)[0] try: tm.assert_almost_equal(targ, res, check_dtype=check_dtype) @@ -118,9 +112,7 @@ def _coerce_tds(targ, res): # handle timedelta dtypes if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": - targ, res = _coerce_tds(targ, res) - tm.assert_almost_equal(targ, res, check_dtype=check_dtype) - return + raise # There are sometimes rounding errors with # complex and object dtypes. 
@@ -149,29 +141,29 @@ def check_fun_data( targfunc, testarval, targarval, + skipna, check_dtype=True, empty_targfunc=None, **kwargs, ): for axis in list(range(targarval.ndim)) + [None]: - for skipna in [False, True]: - targartempval = targarval if skipna else testarval - if skipna and empty_targfunc and isna(targartempval).all(): - targ = empty_targfunc(targartempval, axis=axis, **kwargs) - else: - targ = targfunc(targartempval, axis=axis, **kwargs) + targartempval = targarval if skipna else testarval + if skipna and empty_targfunc and isna(targartempval).all(): + targ = empty_targfunc(targartempval, axis=axis, **kwargs) + else: + targ = targfunc(targartempval, axis=axis, **kwargs) - res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if skipna: + res = testfunc(testarval, axis=axis, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if axis is None: + res = testfunc(testarval, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if skipna and axis is None: + res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) - if skipna: - res = testfunc(testarval, axis=axis, **kwargs) - self.check_results(targ, res, axis, check_dtype=check_dtype) - if axis is None: - res = testfunc(testarval, skipna=skipna, **kwargs) - self.check_results(targ, res, axis, check_dtype=check_dtype) - if skipna and axis is None: - res = testfunc(testarval, **kwargs) - self.check_results(targ, res, axis, check_dtype=check_dtype) if testarval.ndim <= 1: return @@ -184,12 +176,15 @@ def check_fun_data( targfunc, testarval2, targarval2, + skipna=skipna, check_dtype=check_dtype, empty_targfunc=empty_targfunc, **kwargs, ) - def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): + def check_fun( + self, testfunc, targfunc, testar, skipna, empty_targfunc=None, **kwargs + ): targar = testar if testar.endswith("_nan") and hasattr(self, testar[:-4]): @@ -202,6 +197,7 @@ def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): targfunc, testarval, targarval, + skipna=skipna, empty_targfunc=empty_targfunc, **kwargs, ) @@ -210,6 +206,7 @@ def check_funs( self, testfunc, targfunc, + skipna, allow_complex=True, allow_all_nan=True, allow_date=True, @@ -217,10 +214,10 @@ def check_funs( allow_obj=True, **kwargs, ): - self.check_fun(testfunc, targfunc, "arr_float", **kwargs) - self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs) - self.check_fun(testfunc, targfunc, "arr_int", **kwargs) - self.check_fun(testfunc, targfunc, "arr_bool", **kwargs) + self.check_fun(testfunc, targfunc, "arr_float", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_float_nan", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_int", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_bool", skipna, **kwargs) objs = [ self.arr_float.astype("O"), self.arr_int.astype("O"), @@ -228,18 +225,18 @@ def check_funs( ] if allow_all_nan: - self.check_fun(testfunc, targfunc, "arr_nan", **kwargs) + self.check_fun(testfunc, targfunc, "arr_nan", skipna, **kwargs) if allow_complex: - self.check_fun(testfunc, targfunc, "arr_complex", **kwargs) - self.check_fun(testfunc, targfunc, "arr_complex_nan", **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex_nan", skipna, **kwargs) if 
allow_all_nan: - self.check_fun(testfunc, targfunc, "arr_nan_nanj", **kwargs) + self.check_fun(testfunc, targfunc, "arr_nan_nanj", skipna, **kwargs) objs += [self.arr_complex.astype("O")] if allow_date: targfunc(self.arr_date) - self.check_fun(testfunc, targfunc, "arr_date", **kwargs) + self.check_fun(testfunc, targfunc, "arr_date", skipna, **kwargs) objs += [self.arr_date.astype("O")] if allow_tdelta: @@ -248,7 +245,7 @@ def check_funs( except TypeError: pass else: - self.check_fun(testfunc, targfunc, "arr_tdelta", **kwargs) + self.check_fun(testfunc, targfunc, "arr_tdelta", skipna, **kwargs) objs += [self.arr_tdelta.astype("O")] if allow_obj: @@ -260,7 +257,7 @@ def check_funs( targfunc = partial( self._badobj_wrap, func=targfunc, allow_complex=allow_complex ) - self.check_fun(testfunc, targfunc, "arr_obj", **kwargs) + self.check_fun(testfunc, targfunc, "arr_obj", skipna, **kwargs) def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): if value.dtype.kind == "O": @@ -273,28 +270,22 @@ def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): @pytest.mark.parametrize( "nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)] ) - def test_nan_funcs(self, nan_op, np_op): - # TODO: allow tdelta, doesn't break tests - self.check_funs( - nan_op, np_op, allow_all_nan=False, allow_date=False, allow_tdelta=False - ) + def test_nan_funcs(self, nan_op, np_op, skipna): + self.check_funs(nan_op, np_op, skipna, allow_all_nan=False, allow_date=False) - def test_nansum(self): + def test_nansum(self, skipna): self.check_funs( nanops.nansum, np.sum, + skipna, allow_date=False, check_dtype=False, empty_targfunc=np.nansum, ) - def test_nanmean(self): + def test_nanmean(self, skipna): self.check_funs( - nanops.nanmean, - np.mean, - allow_complex=False, # TODO: allow this, doesn't break test - allow_obj=False, - allow_date=False, + nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False, ) def test_nanmean_overflow(self): @@ -336,22 +327,24 @@ def test_returned_dtype(self, dtype): else: assert result.dtype == dtype - def test_nanmedian(self): + def test_nanmedian(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) self.check_funs( nanops.nanmedian, np.median, + skipna, allow_complex=False, allow_date=False, allow_obj="convert", ) @pytest.mark.parametrize("ddof", range(3)) - def test_nanvar(self, ddof): + def test_nanvar(self, ddof, skipna): self.check_funs( nanops.nanvar, np.var, + skipna, allow_complex=False, allow_date=False, allow_obj="convert", @@ -359,10 +352,11 @@ def test_nanvar(self, ddof): ) @pytest.mark.parametrize("ddof", range(3)) - def test_nanstd(self, ddof): + def test_nanstd(self, ddof, skipna): self.check_funs( nanops.nanstd, np.std, + skipna, allow_complex=False, allow_date=False, allow_obj="convert", @@ -371,13 +365,14 @@ def test_nanstd(self, ddof): @td.skip_if_no_scipy @pytest.mark.parametrize("ddof", range(3)) - def test_nansem(self, ddof): + def test_nansem(self, ddof, skipna): from scipy.stats import sem with np.errstate(invalid="ignore"): self.check_funs( nanops.nansem, sem, + skipna, allow_complex=False, allow_date=False, allow_tdelta=False, @@ -388,10 +383,10 @@ def test_nansem(self, ddof): @pytest.mark.parametrize( "nan_op,np_op", [(nanops.nanmin, np.min), (nanops.nanmax, np.max)] ) - def test_nanops_with_warnings(self, nan_op, np_op): + def test_nanops_with_warnings(self, nan_op, np_op, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - 
self.check_funs(nan_op, np_op, allow_obj=False) + self.check_funs(nan_op, np_op, skipna, allow_obj=False) def _argminmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -408,17 +403,17 @@ def _argminmax_wrap(self, value, axis=None, func=None): res = -1 return res - def test_nanargmax(self): + def test_nanargmax(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmax) - self.check_funs(nanops.nanargmax, func, allow_obj=False) + self.check_funs(nanops.nanargmax, func, skipna, allow_obj=False) - def test_nanargmin(self): + def test_nanargmin(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmin) - self.check_funs(nanops.nanargmin, func, allow_obj=False) + self.check_funs(nanops.nanargmin, func, skipna, allow_obj=False) def _skew_kurt_wrap(self, values, axis=None, func=None): if not isinstance(values.dtype.type, np.floating): @@ -433,7 +428,7 @@ def _skew_kurt_wrap(self, values, axis=None, func=None): return result @td.skip_if_no_scipy - def test_nanskew(self): + def test_nanskew(self, skipna): from scipy.stats import skew func = partial(self._skew_kurt_wrap, func=skew) @@ -441,13 +436,14 @@ def test_nanskew(self): self.check_funs( nanops.nanskew, func, + skipna, allow_complex=False, allow_date=False, allow_tdelta=False, ) @td.skip_if_no_scipy - def test_nankurt(self): + def test_nankurt(self, skipna): from scipy.stats import kurtosis func1 = partial(kurtosis, fisher=True) @@ -456,15 +452,17 @@ def test_nankurt(self): self.check_funs( nanops.nankurt, func, + skipna, allow_complex=False, allow_date=False, allow_tdelta=False, ) - def test_nanprod(self): + def test_nanprod(self, skipna): self.check_funs( nanops.nanprod, np.prod, + skipna, allow_date=False, allow_tdelta=False, empty_targfunc=np.nanprod, @@ -602,7 +600,7 @@ def test_nancorr_spearman(self): def test_invalid_method(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] - msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'" + msg = "Unknown method 'foo', expected one of 'kendall', 'spearman'" with pytest.raises(ValueError, match=msg): self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo") diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1338d801e39f4..6abf174aa7fd2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1157,6 +1157,18 @@ def test_repeat(self): assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) + def test_repeat_with_null(self): + # GH: 31632 + values = Series(["a", None], dtype="string") + result = values.str.repeat([3, 4]) + exp = Series(["aaa", None], dtype="string") + tm.assert_series_equal(result, exp) + + values = Series(["a", "b"], dtype="string") + result = values.str.repeat([3, None]) + exp = Series(["aaa", None], dtype="string") + tm.assert_series_equal(result, exp) + def test_match(self): # New match behavior introduced in 0.13 values = Series(["fooBAD__barBAD", np.nan, "foo"]) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 19385e797467c..263887a8ea36e 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -627,3 +627,25 @@ def test_non_coerce_uint64_conflict(errors, exp): else: result = to_numeric(ser, 
errors=errors) tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"]) +@pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"]) +def test_downcast_empty(dc1, dc2): + # GH32493 + + tm.assert_numpy_array_equal( + pd.to_numeric([], downcast=dc1), + pd.to_numeric([], downcast=dc2), + check_dtype=False, + ) + + +def test_failure_to_convert_uint64_string_to_NaN(): + # GH 32394 + result = to_numeric("uint64", errors="coerce") + assert np.isnan(result) + + ser = Series([32, 64, np.nan]) + result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce") + tm.assert_series_equal(result, ser) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 3090343ba2fd9..fe3e1ff906919 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,5 +1,6 @@ import pytest +import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -177,6 +178,7 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different {obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[4, 5, 6\\] \\[right\\]: \\[4, 5, 7\\]""" @@ -196,6 +198,7 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different {obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[é, è, ë\\] \\[right\\]: \\[é, è, e̊\\]""", ), @@ -205,6 +208,7 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different {obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[á, à, ä\\] \\[right\\]: \\[a, a, a\\]""", ), @@ -218,3 +222,41 @@ def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): msg = msg.format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) + + +def test_assert_frame_equal_extension_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = left.astype(int) + + msg = ( + "Attributes of DataFrame\\.iloc\\[:, 0\\] " + '\\(column name="a"\\) are different\n\n' + 'Attribute "dtype" are different\n' + "\\[left\\]: Int64\n" + "\\[right\\]: int[32|64]" + ) + + tm.assert_frame_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(left, right, check_dtype=True) + + +def test_assert_frame_equal_interval_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = DataFrame({"a": [pd.Interval(0, 1)]}, dtype="interval") + right = left.astype(object) + + msg = ( + "Attributes of DataFrame\\.iloc\\[:, 0\\] " + '\\(column name="a"\\) are different\n\n' + 'Attribute "dtype" are different\n' + "\\[left\\]: interval\\[int64\\]\n" + "\\[right\\]: object" + ) + + tm.assert_frame_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(left, right, check_dtype=True) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index eaf0824f52927..8bf3d82672695 100644 --- 
a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -1,5 +1,6 @@ import pytest +import pandas as pd from pandas import Categorical, DataFrame, Series import pandas._testing as tm @@ -168,6 +169,7 @@ def test_series_equal_values_mismatch(check_less_precise): msg = """Series are different Series values are different \\(33\\.33333 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[1, 2, 3\\] \\[right\\]: \\[1, 2, 4\\]""" @@ -194,3 +196,58 @@ def test_series_equal_categorical_mismatch(check_categorical): tm.assert_series_equal(s1, s2, check_categorical=check_categorical) else: _assert_series_equal_both(s1, s2, check_categorical=check_categorical) + + +def test_assert_series_equal_extension_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = Series(pd.array([1, 2, 3], dtype="Int64")) + right = left.astype(int) + + msg = """Attributes of Series are different + +Attribute "dtype" are different +\\[left\\]: Int64 +\\[right\\]: int[32|64]""" + + tm.assert_series_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(left, right, check_dtype=True) + + +def test_assert_series_equal_interval_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = Series([pd.Interval(0, 1)], dtype="interval") + right = left.astype(object) + + msg = """Attributes of Series are different + +Attribute "dtype" are different +\\[left\\]: interval\\[int64\\] +\\[right\\]: object""" + + tm.assert_series_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(left, right, check_dtype=True) + + +def test_series_equal_series_type(): + class MySeries(Series): + pass + + s1 = Series([1, 2]) + s2 = Series([1, 2]) + s3 = MySeries([1, 2]) + + tm.assert_series_equal(s1, s2, check_series_type=False) + tm.assert_series_equal(s1, s2, check_series_type=True) + + tm.assert_series_equal(s1, s3, check_series_type=False) + tm.assert_series_equal(s3, s1, check_series_type=False) + + with pytest.raises(AssertionError, match="Series classes are different"): + tm.assert_series_equal(s1, s3, check_series_type=True) + + with pytest.raises(AssertionError, match="Series classes are different"): + tm.assert_series_equal(s3, s1, check_series_type=True) diff --git a/pandas/tests/util/test_doc.py b/pandas/tests/util/test_doc.py index 7e5e24456b9a7..50859564e654f 100644 --- a/pandas/tests/util/test_doc.py +++ b/pandas/tests/util/test_doc.py @@ -14,13 +14,15 @@ def cumsum(whatever): @doc( cumsum, - """ - Examples - -------- + dedent( + """ + Examples + -------- - >>> cumavg([1, 2, 3]) - 2 - """, + >>> cumavg([1, 2, 3]) + 2 + """ + ), method="cumavg", operation="average", ) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 5f5e10b5dd497..0c5289cd78fed 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -709,20 +709,25 @@ def test_rolling_cov_offset(self): tm.assert_series_equal(result, expected2) def test_rolling_on_decreasing_index(self): - # GH-19248 + # GH-19248, GH-32385 index = [ - Timestamp("20190101 09:00:00"), - Timestamp("20190101 09:00:02"), - Timestamp("20190101 09:00:03"), - Timestamp("20190101 09:00:05"), - Timestamp("20190101 09:00:06"), + Timestamp("20190101 09:00:30"), + Timestamp("20190101 09:00:27"), + Timestamp("20190101 09:00:20"), + Timestamp("20190101 09:00:18"), + 
Timestamp("20190101 09:00:10"), ] - df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index)) - result = df.rolling("2s").min() - expected = DataFrame( - {"column": [3.0, 3.0, 3.0, 2.0, 1.0]}, index=reversed(index) - ) + df = DataFrame({"column": [3, 4, 4, 5, 6]}, index=index) + result = df.rolling("5s").min() + expected = DataFrame({"column": [3.0, 3.0, 4.0, 4.0, 6.0]}, index=index) + tm.assert_frame_equal(result, expected) + + def test_rolling_on_empty(self): + # GH-32385 + df = DataFrame({"column": []}, index=[]) + result = df.rolling("5s").min() + expected = DataFrame({"column": []}, index=[]) tm.assert_frame_equal(result, expected) def test_rolling_on_multi_index_level(self): diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 1a1b7e8e1bd08..077c5046ac44d 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -4,7 +4,6 @@ import warnings import numpy as np -from pytz import AmbiguousTimeError from pandas._libs.algos import unique_deltas from pandas._libs.tslibs import Timedelta, Timestamp @@ -20,7 +19,7 @@ from pandas.core.dtypes.common import ( is_datetime64_dtype, - is_period_arraylike, + is_period_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCSeries @@ -148,13 +147,11 @@ def to_offset(freq) -> Optional[DateOffset]: delta = None stride_sign = None try: - splitted = re.split(libfreqs.opattern, freq) - if splitted[-1] != "" and not splitted[-1].isspace(): + split = re.split(libfreqs.opattern, freq) + if split[-1] != "" and not split[-1].isspace(): # the last element must be blank raise ValueError("last element must be blank") - for sep, stride, name in zip( - splitted[0::4], splitted[1::4], splitted[2::4] - ): + for sep, stride, name in zip(split[0::4], split[1::4], split[2::4]): if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = libfreqs._lite_rule_alias.get(name) or name @@ -250,9 +247,14 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: Returns ------- str or None - None if no discernible frequency - TypeError if the index is not datetime-like - ValueError if there are less than three values. + None if no discernible frequency. + + Raises + ------ + TypeError + If the index is not datetime-like. + ValueError + If there are fewer than three values. """ import pandas as pd @@ -270,7 +272,7 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: index = values inferer: _FrequencyInferer - if is_period_arraylike(index): + if is_period_dtype(index): raise TypeError( "PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq." 
@@ -285,13 +287,10 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: raise TypeError( f"cannot infer freq from a non-convertible index type {type(index)}" ) - index = index.values + index = index._values if not isinstance(index, pd.DatetimeIndex): - try: - index = pd.DatetimeIndex(index) - except AmbiguousTimeError: - index = pd.DatetimeIndex(index.asi8) + index = pd.DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() @@ -304,13 +303,13 @@ class _FrequencyInferer: def __init__(self, index, warn: bool = True): self.index = index - self.values = index.asi8 + self.i8values = index.asi8 # This moves the values, which are implicitly in UTC, to the # the timezone so they are in local time if hasattr(index, "tz"): if index.tz is not None: - self.values = tz_convert(self.values, UTC, index.tz) + self.i8values = tz_convert(self.i8values, UTC, index.tz) self.warn = warn @@ -323,10 +322,12 @@ def __init__(self, index, warn: bool = True): @cache_readonly def deltas(self): - return unique_deltas(self.values) + return unique_deltas(self.i8values) @cache_readonly def deltas_asi8(self): + # NB: we cannot use self.i8values here because we may have converted + # the tz in __init__ return unique_deltas(self.index.asi8) @cache_readonly @@ -340,7 +341,7 @@ def is_unique_asi8(self) -> bool: def get_freq(self) -> Optional[str]: """ Find the appropriate frequency string to describe the inferred - frequency of self.values + frequency of self.i8values Returns ------- @@ -392,11 +393,11 @@ def hour_deltas(self): @cache_readonly def fields(self): - return build_field_sarray(self.values) + return build_field_sarray(self.i8values) @cache_readonly def rep_stamp(self): - return Timestamp(self.values[0]) + return Timestamp(self.i8values[0]) def month_position_check(self): return libresolution.month_position_check(self.fields, self.index.dayofweek) @@ -490,6 +491,7 @@ def _is_business_daily(self) -> bool: ) def _get_wom_rule(self) -> Optional[str]: + # FIXME: don't leave commented-out # wdiffs = unique(np.diff(self.index.week)) # We also need -47, -49, -48 to catch index spanning year boundary # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index b6bbe008812cb..bc20d784c8dee 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -337,9 +337,6 @@ def apply_index(self, i): # integer addition on PeriodIndex is deprecated, # so we directly use _time_shift instead asper = i.to_period("W") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data shifted = asper._time_shift(weeks) i = shifted.to_timestamp() + i.to_perioddelta("W") @@ -629,9 +626,6 @@ def apply_index(self, i): # to_period rolls forward to next BDay; track and # reduce n where it does when rolling forward asper = i.to_period("B") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data if self.n > 0: shifted = (i.to_perioddelta("B") - time).asi8 != 0 @@ -1384,9 +1378,6 @@ def apply_index(self, i): # integer-array addition on PeriodIndex is deprecated, # so we use _addsub_int_array directly asper = i.to_period("M") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data shifted = asper._addsub_int_array(roll // 2, operator.add) i = type(dti)(shifted.to_timestamp()) @@ -1582,9 +1573,6 @@ def apply_index(self, i): # integer addition on PeriodIndex is deprecated, # so
we use _time_shift directly asper = i.to_period("W") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data shifted = asper._time_shift(self.n) return shifted.to_timestamp() + i.to_perioddelta("W") @@ -1608,9 +1596,6 @@ def _end_apply_index(self, dtindex): base, mult = libfrequencies.get_freq_code(self.freqstr) base_period = dtindex.to_period(base) - if not isinstance(base_period._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - base_period = base_period._data if self.n > 0: # when adding, dates on end roll to next diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index d854be062fcbb..7a804792174c7 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -250,9 +250,11 @@ def doc(*args: Union[str, Callable], **kwargs: str) -> Callable[[F], F]: A decorator take docstring templates, concatenate them and perform string substitution on it. - This decorator is robust even if func.__doc__ is None. This decorator will - add a variable "_docstr_template" to the wrapped function to save original - docstring template for potential usage. + This decorator will add a variable "_docstring_components" to the wrapped + function to keep track of the original docstring templates for potential usage. + If a component should be considered as a template, it will be saved as a string. + Otherwise, it will be saved as a callable, and its __doc__ will later be + dedented to build the docstring. Parameters ---------- @@ -268,17 +270,28 @@ def decorator(func: F) -> F: def wrapper(*args, **kwargs) -> Callable: return func(*args, **kwargs) - templates = [func.__doc__ if func.__doc__ else ""] + # collecting docstring and docstring templates + docstring_components: List[Union[str, Callable]] = [] + if func.__doc__: + docstring_components.append(dedent(func.__doc__)) + for arg in args: - if isinstance(arg, str): - templates.append(arg) - elif hasattr(arg, "_docstr_template"): - templates.append(arg._docstr_template) # type: ignore - elif arg.__doc__: - templates.append(arg.__doc__) - - wrapper._docstr_template = "".join(dedent(t) for t in templates) # type: ignore - wrapper.__doc__ = wrapper._docstr_template.format(**kwargs) # type: ignore + if hasattr(arg, "_docstring_components"): + docstring_components.extend(arg._docstring_components) # type: ignore + elif isinstance(arg, str) or arg.__doc__: + docstring_components.append(arg) + + # formatting templates and concatenating docstring + wrapper.__doc__ = "".join( + [ + arg.format(**kwargs) + if isinstance(arg, str) + else dedent(arg.__doc__ or "") + for arg in docstring_components + ] + ) + + wrapper._docstring_components = docstring_components # type: ignore return cast(F, wrapper) diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 8fd4566d7763b..71965b8e7dd9d 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -126,7 +126,7 @@ def _insert_index(self, data): if col_nlevels > 1: col = data.columns._get_level_values(0) values = [ - data.columns._get_level_values(i).values for i in range(1, col_nlevels) + data.columns._get_level_values(i)._values for i in range(1, col_nlevels) ] col_df = pd.DataFrame(values) data.columns = col_df.columns diff --git a/requirements-dev.txt b/requirements-dev.txt index a469cbdd93ceb..9ee67c56ab8ca 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -70,5 +70,5 @@ sqlalchemy xarray pyreadstat tabulate>=0.8.3 -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
+git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master git+https://github.com/numpy/numpydoc \ No newline at end of file diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py new file mode 100755 index 0000000000000..17752134e5049 --- /dev/null +++ b/scripts/validate_rst_title_capitalization.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python +""" +Validate that the titles in the rst files follow the proper capitalization convention. + +Print the titles that do not follow the convention. + +Usage:: +./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst +./scripts/validate_rst_title_capitalization.py doc/source/ + +""" +import argparse +import sys +import re +import os +from typing import Tuple, Generator, List +import glob + + +CAPITALIZATION_EXCEPTIONS = { + "pandas", + "Python", + "IPython", + "PyTables", + "Excel", + "JSON", + "HTML", + "SAS", + "SQL", + "BigQuery", + "STATA", + "Interval", + "PEP8", + "Period", + "Series", + "Index", + "DataFrame", + "C", + "Git", + "GitHub", + "NumPy", + "Apache", + "Arrow", + "Parquet", + "MultiIndex", + "NumFOCUS", + "sklearn", + "Docker", +} + +CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS} + +err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" + +symbols = ("*", "=", "-", "^", "~", "#", '"') + + +def correct_title_capitalization(title: str) -> str: + """ + Algorithm to create the correct capitalization for a given title. + + Parameters + ---------- + title : str + Heading string to correct. + + Returns + ------- + str + Correctly capitalized heading. + """ + + # Strip all non-word characters from the beginning of the title to the + # first word character. + correct_title: str = re.sub(r"^\W*", "", title).capitalize() + + # Remove a URL from the title. We do this because words in a URL must + # stay lowercase, even if they are a capitalization exception. + removed_https_title = re.sub(r"<https?:\/\/.*[\r\n]*>", "", correct_title) + + # Split a title into a list using non-word character delimiters. + word_list = re.split(r"\W", removed_https_title) + + for word in word_list: + if word.lower() in CAP_EXCEPTIONS_DICT: + correct_title = re.sub( + rf"\b{word}\b", CAP_EXCEPTIONS_DICT[word.lower()], correct_title + ) + + return correct_title + + +def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: + """ + Algorithm to identify particular text that should be considered headings in an + RST file. + + See the reStructuredText documentation for details + on what constitutes a string as a heading in RST. + + Parameters + ---------- + rst_file : str + RST file to scan through for headings. + + Yields + ------- + title : str + A heading found in the rst file. + + line_number : int + The corresponding line number of the heading. + """ + + with open(rst_file, "r") as fd: + previous_line = "" + for i, line in enumerate(fd): + line = line[:-1] + line_chars = set(line) + if ( + len(line_chars) == 1 + and line_chars.pop() in symbols + and len(line) == len(previous_line) + ): + yield re.sub(r"[`\*_]", "", previous_line), i + previous_line = line + + +def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: + """ + Given the command line arguments of directory paths, this method + yields the paths of the .rst files that these paths contain. + + Parameters + ---------- + source_paths : List[str] + List of directories to validate, provided through command line arguments.
+ + Yields + ------- + str + Path of a .rst file found in the command line argument directories. + """ + + for directory_address in source_paths: + if not os.path.exists(directory_address): + raise ValueError( + "Please enter a valid path, pointing to a valid file/directory." + ) + elif directory_address.endswith(".rst"): + yield directory_address + else: + for filename in glob.glob( + pathname=f"{directory_address}/**/*.rst", recursive=True + ): + yield filename + + +def main(source_paths: List[str], output_format: str) -> int: + """ + The main method to print all headings with incorrect capitalization. + + Parameters + ---------- + source_paths : List[str] + List of directories to validate, provided through command line arguments. + output_format : str + Output format of the script. + + Returns + ------- + int + Number of incorrect headings found overall. + """ + + number_of_errors: int = 0 + + for filename in find_rst_files(source_paths): + for title, line_number in find_titles(filename): + if title != correct_title_capitalization(title): + print( + f"""{filename}:{line_number}:{err_msg} "{title}" to "{ + correct_title_capitalization(title)}" """ + ) + number_of_errors += 1 + + return number_of_errors + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Validate heading capitalization") + + parser.add_argument( + "paths", nargs="+", default=".", help="Source paths of file/directory to check." + ) + + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}:{heading}:{correct_heading}", + help="Output format of incorrectly capitalized titles", + ) + + args = parser.parse_args() + + sys.exit(main(args.paths, args.format)) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index fbf3bb5cfccf2..c5f257c641b25 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -4,7 +4,7 @@ Check where there is a string that needs to be concatenated. -This is necessary after black formating, +This is necessary after black formatting, where for example black transforms this: >>> foo = ( diff --git a/setup.cfg b/setup.cfg index bbd8489622005..87802190ea26a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -98,6 +98,7 @@ exclude_lines = # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: + if TYPE_CHECKING: [coverage:html] directory = coverage_html_report @@ -195,9 +196,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False -[mypy-pandas.core.indexing] -check_untyped_defs=False - [mypy-pandas.core.internals.blocks] check_untyped_defs=False diff --git a/setup.py b/setup.py index 2d49d7e1e85f2..461ef005c3df3 100755 --- a/setup.py +++ b/setup.py @@ -433,8 +433,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - # args to ignore warnings - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") @@ -477,6 +476,14 @@ def run(self): # we can't do anything about these warnings because they stem from # cython+numpy version mismatches.
macros.append(("NPY_NO_DEPRECATED_API", "0")) +if "-Werror" in extra_compile_args: + try: + import numpy as np + except ImportError: + pass + else: + if np.__version__ < LooseVersion("1.16.0"): + extra_compile_args.remove("-Werror") # ---------------------------------------------------------------------- diff --git a/web/pandas/config.yml b/web/pandas/config.yml index a52c580f23530..d943ad3833b52 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -127,7 +127,7 @@ sponsors: url: https://chanzuckerberg.com/ logo: /static/img/partners/czi.svg kind: regular - description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintainance, improve extension types, and a efficient string type." + description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintenance, improving extension types, and an efficient string type." inkind: # not included in active so they don't appear in the home page - name: "OVH" url: https://us.ovhcloud.com/
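As a quick illustration of the capitalization pass added in scripts/validate_rst_title_capitalization.py above, the sketch below shows the intended behavior of correct_title_capitalization. It is illustrative only, not part of the patch, and assumes the scripts/ directory is on sys.path:

    # Hypothetical usage sketch (assumes scripts/ is importable).
    from validate_rst_title_capitalization import correct_title_capitalization

    # The heading is first sentence-cased, then each word with an entry in
    # CAPITALIZATION_EXCEPTIONS is restored to its canonical casing:
    # "numpy" becomes "NumPy", while "pandas" stays lowercase by convention.
    assert correct_title_capitalization("working with pandas and numpy") == (
        "Working with pandas and NumPy"
    )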