diff --git a/.circleci/config.yml b/.circleci/config.yml index ac9db5f451bf3..ba124533e953a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,8 +46,9 @@ jobs: fi - run: name: Build aarch64 wheels + no_output_timeout: 30m # Sometimes the tests won't generate any output; make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.14.1 + pip3 install cibuildwheel==2.15.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: CIBW_BUILD: << parameters.cibw-build >> @@ -91,5 +92,4 @@ workflows: only: /^v.*/ matrix: parameters: - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"] diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 73d7723e2fb49..3ee10efaaf96f 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -25,8 +25,8 @@ runs: - name: Build Pandas run: | if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v + pip install -e . --no-build-isolation -v --no-deps else - pip install . --no-build-isolation -v + pip install . --no-build-isolation -v --no-deps fi shell: bash -el {0} diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 0954bf7d9231c..f87aef5385898 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.0.x + - 2.1.x pull_request: branches: - main - - 2.0.x + - 2.1.x env: ENV_FILE: environment.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --strict --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 9840596357d26..6fa98ff352d38 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.0.x + - 2.1.x tags: - '*' pull_request: branches: - main - - 2.0.x + - 2.1.x env: ENV_FILE: environment.yml @@ -67,7 +67,7 @@ jobs: run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' web/build/ web@${{ secrets.server_ip }}:/var/www/html if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload dev docs diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 6c9ba89d82794..7d20ef92587d9 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.0.x + - 2.1.x pull_request: branches: - main - - 2.0.x + - 2.1.x types: [ labeled, opened, synchronize, reopened ] permissions: @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml",
"plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 1770d18d4eb41..14abcdbfdb310 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.0.x + - 2.1.x pull_request: branches: - main - - 2.0.x + - 2.1.x paths-ignore: - "doc/**" - "web/**" @@ -77,16 +77,12 @@ jobs: env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" test_args: "-W error::DeprecationWarning -W error::FutureWarning" - # TODO(cython3): Re-enable once next-beta(after beta 1) comes out - # There are some warnings failing the build with -werror - pandas_ci: "0" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: - ENV_FILE: ci/deps/${{ matrix.env_file }} PATTERN: ${{ matrix.pattern }} EXTRA_APT: ${{ matrix.extra_apt || '' }} LANG: ${{ matrix.lang || 'C.UTF-8' }} @@ -150,14 +146,13 @@ jobs: - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests - run: | - sudo locale-gen ${{ matrix.extra_loc }} + run: sudo locale-gen ${{ matrix.extra_loc }} if: ${{ matrix.extra_loc }} - name: Set up Conda uses: ./.github/actions/setup-conda with: - environment-file: ${{ env.ENV_FILE }} + environment-file: ci/deps/${{ matrix.env_file }} - name: Build Pandas id: build @@ -235,11 +230,13 @@ jobs: git -c user.email="you@example.com" merge --no-commit my_ref_name fi - name: Build environment and Run Tests + # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -276,8 +273,8 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -312,15 +309,14 @@ jobs: # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. #if: false # Uncomment this to freeze the workflow, comment it to unfreeze + defaults: + run: + shell: bash -eou pipefail {0} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - # TODO: Disable macOS for now, Github Actions bug where python is not - # symlinked correctly to 3.12 - # xref https://github.com/actions/setup-python/issues/701 - #os: [ubuntu-22.04, macOS-latest, windows-latest] - os: [ubuntu-22.04, windows-latest] + os: [ubuntu-22.04, macOS-latest, windows-latest] timeout-minutes: 180 @@ -345,22 +341,15 @@ jobs: with: python-version: '3.12-dev' - - name: Install dependencies + - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list - - name: Build Pandas - run: | - python -m pip install -ve . --no-build-isolation --no-index - - - name: Build Version - run: | - python -c "import pandas; pandas.show_versions();" - - - name: Test + - name: Run Tests uses: ./.github/actions/run-tests diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index f486a57bdb67d..95b48ee4b7426 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -97,8 +97,7 @@ jobs: - [macos-12, macosx_*] - [windows-2022, win_amd64] # TODO: support PyPy? 
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -139,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.14.1 + uses: pypa/cibuildwheel@v2.16.2 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -150,8 +149,10 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: environment-name: wheel-env + # Use a fixed Python, since we might have an unreleased Python not + # yet present on conda-forge create-args: >- - python=${{ matrix.python[1] }} + python=3.11 anaconda-client wheel cache-downloads: true @@ -167,12 +168,13 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; - python -m pip install --find-links=pandas\wheelhouse --no-index pandas; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; + python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ - docker pull python:${{ matrix.python[1] }}-windowsservercore - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD + # add rc to the end of the image name if the Python version is unreleased + docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v3 with: diff --git a/README.md b/README.md index 8ea473beb107e..6fa20d237babe 100644 --- a/README.md +++ b/README.md @@ -130,23 +130,17 @@ In the `pandas` directory (same one where you found this file after cloning the git repo), execute: ```sh -python setup.py install +pip install . ``` or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): ```sh -python -m pip install -e . --no-build-isolation --no-use-pep517 +python -m pip install -ve . --no-build-isolation --config-settings=editable-verbose=true ``` -or alternatively - -```sh -python setup.py develop -``` - -See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source). +See the full instructions for [installing from source](https://pandas.pydata.org/docs/dev/development/contributing_environment.html). 
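One optional sanity check after either command — a minimal sketch, using ``pandas.show_versions()``, pandas' own build-info helper (the same call the CI previously ran as a separate "Build Version" step):

```sh
python -c "import pandas; pandas.show_versions()"
```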
## License [BSD 3](LICENSE) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index ac79ab65cea81..92797425b2c30 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -247,7 +247,7 @@ def setup(self, series_type, vals_type): elif series_type == "long": ser_vals = np.arange(N_many) elif series_type == "long_floats": - ser_vals = np.arange(N_many, dtype=np.float_) + ser_vals = np.arange(N_many, dtype=np.float64) self.series = Series(ser_vals).astype(object) @@ -258,7 +258,7 @@ def setup(self, series_type, vals_type): elif vals_type == "long": values = np.arange(N_many) elif vals_type == "long_floats": - values = np.arange(N_many, dtype=np.float_) + values = np.arange(N_many, dtype=np.float64) self.values = values.astype(object) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ffa7732c604a0..82f41811bf56e 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,27 +7,25 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -35,7 +33,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -47,7 +45,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 5a6a26c2e1ad8..8422aba5239f9 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -8,27 +8,25 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -36,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -48,7 +46,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 2cd4d5f3528f8..695bd483fc572 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,9 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 + - cython>=0.29.33 # test dependencies - pytest>=7.3.2 @@ -17,7 +18,6 @@ dependencies: # causes an InternalError within pytest - pytest-xdist>=2.2.0, <3 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # pandas dependencies - python-dateutil @@ -25,9 +25,7 @@ 
dependencies: - pip - pip: - - "cython" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" - - "scipy" - "tzdata>=2022.1" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index f24e866af0439..b5df68968ea46 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - cython>=0.29.33 - meson-python=0.13.1 @@ -15,11 +15,10 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9d60d734db5b3..cb1771c485297 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,27 +7,25 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -35,7 +33,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -47,7 +45,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 0e2fcf87c2d6e..e4db8cb0a540b 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -9,27 +9,25 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4 + - numpy=1.22.4, <2 - pytz=2020.1 # optional dependencies - beautifulsoup4=4.11.1 - blosc=1.21.0 - bottleneck=1.3.4 - - brotlipy=0.7.0 - fastparquet=0.8.1 - fsspec=2022.05.0 - html5lib=1.1 @@ -49,7 +47,6 @@ dependencies: - pymysql=1.0.2 - pyreadstat=1.1.5 - pytables=3.7.0 - - python-snappy=0.6.1 - pyxlsb=1.0.9 - s3fs=2022.05.0 - scipy=1.8.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 6ea0d41b947dc..745781298b86a 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -7,27 +7,25 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -35,7 +33,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - 
matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -47,7 +45,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 035395d55eb3a..ce02ea2b9a090 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -10,18 +10,17 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-asyncio>=0.17.0 - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 # required - - numpy + - numpy<2 - python-dateutil - pytz - pip: diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index df4e8e285bd02..08a422c6d538a 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -7,27 +7,25 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -35,7 +33,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 # test_numba_vs_cython segfaults with numba 0.57 - numba>=0.55.2, <0.57.0 - numexpr>=2.8.0 @@ -48,7 +46,6 @@ dependencies: - pymysql>=1.0.2 # - pyreadstat>=1.1.5 not available on ARM - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/meta.yaml b/ci/meta.yaml index 77f536a18a204..5b343f60b3dce 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -64,7 +64,6 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-asyncio >=0.17.0 - pytest-xdist >=2.2.0 - pytest-cov - hypothesis >=6.46.1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..6a70ea1df3e71 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,8 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE +PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 7a83d50416186..f0eaa7362c52c 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -107,7 +107,7 @@ methods. .. 
ipython:: python frame = pd.DataFrame( - {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]} + {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]} ) frame diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1896dffa9a105..7570f9f33d265 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -206,6 +206,7 @@ Package Minimum support `NumPy `__ 1.22.4 `python-dateutil `__ 2.8.2 `pytz `__ 2020.1 +`tzdata `__ 2022.1 ================================================================ ========================== .. _install.optional_dependencies: @@ -246,14 +247,14 @@ Dependency Minimum Version pip ext Visualization ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[plot, output_formatting]"``. +Installable with ``pip install "pandas[plot, output-formatting]"``. ========================= ================== ================== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== ================== ============================================================= matplotlib 3.6.1 plot Plotting library -Jinja2 3.1.2 output_formatting Conditional formatting with DataFrame.style -tabulate 0.8.10 output_formatting Printing in Markdown-friendly format (see `tabulate`_) +Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style +tabulate 0.8.10 output-formatting Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ================== ============================================================= Computation @@ -411,8 +412,6 @@ Installable with ``pip install "pandas[compression]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -brotli 0.7.0 compression Brotli compression -python-snappy 0.6.1 compression Snappy compression Zstandard 0.17.0 compression Zstandard compression ========================= ================== =============== ============================================================= diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index eb2529716b55e..e177e2b1d87d5 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -39,6 +39,7 @@ objects. api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings api.extensions.ExtensionArray._hash_pandas_object + api.extensions.ExtensionArray._pad_or_backfill api.extensions.ExtensionArray._reduce api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize @@ -54,7 +55,6 @@ objects. 
api.extensions.ExtensionArray.interpolate api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna - api.extensions.ExtensionArray.pad_or_backfill api.extensions.ExtensionArray.ravel api.extensions.ExtensionArray.repeat api.extensions.ExtensionArray.searchsorted diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 5a43e5796d1d9..41705620d4bc7 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -314,6 +314,7 @@ Datetime properties Series.dt.weekday Series.dt.dayofyear Series.dt.day_of_year + Series.dt.days_in_month Series.dt.quarter Series.dt.is_month_start Series.dt.is_month_end @@ -327,6 +328,7 @@ Datetime properties Series.dt.tz Series.dt.freq Series.dt.unit + Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2e299da5e5794..d3c9d83b943ce 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -408,20 +408,6 @@ raise a ValueError: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) -Note that this is different from the NumPy behavior where a comparison can -be broadcast: - -.. ipython:: python - - np.array([1, 2, 3]) == np.array([2]) - -or it can return False if broadcasting can not be done: - -.. ipython:: python - :okwarning: - - np.array([1, 2, 3]) == np.array([1, 2]) - Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 66ee571d6b5a5..b7a59516ae671 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -794,12 +794,12 @@ Apply index=["I", "II", "III"], ) - def make_df(ser): - new_vals = [pd.Series(value, name=name) for name, value in ser.items()] - return pd.DataFrame(new_vals) - - df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()}) + def SeriesFromSubList(aList): + return pd.Series(aList) + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index 59bdb1926895f..d0c57b56585db 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -7,8 +7,8 @@ Copy-on-Write (CoW) ******************* Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the -optimizations that become possible through CoW are implemented and supported. A complete list -can be found at :ref:`Copy-on-Write optimizations `. +optimizations that become possible through CoW are implemented and supported. All possible +optimizations are supported starting from pandas 2.1. We expect that CoW will be enabled by default in version 3.0. @@ -154,6 +154,77 @@ With copy on write this can be done by using ``loc``. df.loc[df["bar"] > 5, "foo"] = 100 +Read-only NumPy arrays +---------------------- + +Accessing the underlying NumPy array of a DataFrame will return a read-only array if the array +shares data with the initial DataFrame: + +The array is a copy if the initial DataFrame consists of more than one array: + + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]}) + df.to_numpy() + +The array shares data with the DataFrame if the DataFrame consists of only one NumPy array: + +.. 
ipython:: python + + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.to_numpy() + +This array is read-only, which means that it can't be modified inplace: + +.. ipython:: python + :okexcept: + + arr = df.to_numpy() + arr[0, 0] = 100 + +The same holds true for a Series, since a Series always consists of a single array. + +There are two potential solutions to this: + +- Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array. +- Make the array writeable. This is a more performant solution but circumvents Copy-on-Write rules, so + it should be used with caution. + +.. ipython:: python + + arr = df.to_numpy() + arr.flags.writeable = True + arr[0, 0] = 100 + arr + +Patterns to avoid +----------------- + +No defensive copy will be performed if two objects share the same data while +you are modifying one object inplace. + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = df.reset_index() + df2.iloc[0, 0] = 100 + +This creates two objects that share data and thus the setitem operation will trigger a +copy. This is not necessary if the initial object ``df`` isn't needed anymore. +Simply reassigning to the same variable will invalidate the reference that is +held by the object. + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = df.reset_index() + df.iloc[0, 0] = 100 + +No copy is necessary in this example. +Creating multiple references keeps unnecessary references alive +and thus will hurt performance with Copy-on-Write. + .. _copy_on_write.optimizations: Copy-on-Write optimizations @@ -161,59 +232,8 @@ A new lazy copy mechanism that defers the copy until the object in question is modified and only if this object shares data with another object.
This mechanism was added to -following methods: - - - :meth:`DataFrame.reset_index` / :meth:`Series.reset_index` - - :meth:`DataFrame.set_index` - - :meth:`DataFrame.set_axis` / :meth:`Series.set_axis` - - :meth:`DataFrame.set_flags` / :meth:`Series.set_flags` - - :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis` - - :meth:`DataFrame.reindex` / :meth:`Series.reindex` - - :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` - - :meth:`DataFrame.assign` - - :meth:`DataFrame.drop` - - :meth:`DataFrame.dropna` / :meth:`Series.dropna` - - :meth:`DataFrame.select_dtypes` - - :meth:`DataFrame.align` / :meth:`Series.align` - - :meth:`Series.to_frame` - - :meth:`DataFrame.rename` / :meth:`Series.rename` - - :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix` - - :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix` - - :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates` - - :meth:`DataFrame.droplevel` / :meth:`Series.droplevel` - - :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels` - - :meth:`DataFrame.between_time` / :meth:`Series.between_time` - - :meth:`DataFrame.filter` / :meth:`Series.filter` - - :meth:`DataFrame.head` / :meth:`Series.head` - - :meth:`DataFrame.tail` / :meth:`Series.tail` - - :meth:`DataFrame.isetitem` - - :meth:`DataFrame.pipe` / :meth:`Series.pipe` - - :meth:`DataFrame.pop` / :meth:`Series.pop` - - :meth:`DataFrame.replace` / :meth:`Series.replace` - - :meth:`DataFrame.shift` / :meth:`Series.shift` - - :meth:`DataFrame.sort_index` / :meth:`Series.sort_index` - - :meth:`DataFrame.sort_values` / :meth:`Series.sort_values` - - :meth:`DataFrame.squeeze` / :meth:`Series.squeeze` - - :meth:`DataFrame.swapaxes` - - :meth:`DataFrame.swaplevel` / :meth:`Series.swaplevel` - - :meth:`DataFrame.take` / :meth:`Series.take` - - :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp` - - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - - :meth:`DataFrame.truncate` - - :meth:`DataFrame.iterrows` - - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` - - :meth:`DataFrame.fillna` / :meth:`Series.fillna` - - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` - - :meth:`DataFrame.ffill` / :meth:`Series.ffill` - - :meth:`DataFrame.bfill` / :meth:`Series.bfill` - - :meth:`DataFrame.where` / :meth:`Series.where` - - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - - :meth:`DataFrame.astype` / :meth:`Series.astype` - - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes` - - :meth:`DataFrame.join` - - :meth:`DataFrame.eval` - - :func:`concat` - - :func:`merge` +methods that don't require a copy of the underlying data. Popular examples are :meth:`DataFrame.drop` for ``axis=1`` +and :meth:`DataFrame.rename`. These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 2ddc3e709be85..c4721f3a6b09c 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -183,8 +183,8 @@ can be improved by passing an ``np.ndarray``. 
...: return s * dx ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): - ...: assert (col_a.dtype == np.float_ - ...: and col_b.dtype == np.float_ and col_N.dtype == np.int_) + ...: assert (col_a.dtype == np.float64 + ...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int)) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 67106df328361..99c85ac66623d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -327,7 +327,7 @@ present in the more domain-specific statistical programming language `R ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` ``numpy.object_`` | ``object_`` ``numpy.bool_`` | ``bool_`` - ``numpy.character`` | ``string_, unicode_`` + ``numpy.character`` | ``bytes_, str_`` The R language, by contrast, only has a handful of built-in data types: ``integer``, ``numeric`` (floating-point), ``character``, and @@ -379,7 +379,7 @@ constructors using something similar to the following: .. ipython:: python x = np.array(list(range(10)), ">i4") # big endian - newx = x.byteswap().newbyteorder() # force native byteorder + newx = x.byteswap().view(x.dtype.newbyteorder()) # force native byteorder s = pd.Series(newx) See `the NumPy documentation on byte order diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 75c816f66d5e4..92fa38c9da5c2 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1207,6 +1207,19 @@ The dimension of the returned result can also change: grouped.apply(f) +``apply`` on a Series can operate on a returned value from the applied function +that is itself a series, and possibly upcast the result to a DataFrame: + +.. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) + Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the apply function. If the results from different groups have different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6e352c52cd60e..df2f1bccc3cff 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4881,7 +4881,7 @@ unspecified columns of the given DataFrame. The argument ``selector`` defines which table is the selector table (which you can make queries from). The argument ``dropna`` will drop rows from the input ``DataFrame`` to ensure tables are synchronized. This means that if a row for one of the tables -being written to is entirely ``np.NaN``, that row will be dropped from all tables. +being written to is entirely ``np.nan``, that row will be dropped from all tables. If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. Remember that entirely ``np.Nan`` rows are not written to the HDFStore, so if diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index fc225d0f44497..eef65366a2762 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,9 @@ Version 2.1 .. 
toctree:: :maxdepth: 2 + v2.1.3 + v2.1.2 + v2.1.1 v2.1.0 Version 2.0 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 3425986a37743..3f1db2e3a9cd9 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -244,26 +244,15 @@ Convenience methods ``ffill`` and ``bfill`` have been added: function, that is itself a series, and possibly upcast the result to a DataFrame - .. code-block:: python - - >>> def f(x): - ... return pd.Series([x, x ** 2], index=["x", "x^2"]) - >>> - >>> s = pd.Series(np.random.rand(5)) - >>> s - 0 0.340445 - 1 0.984729 - 2 0.919540 - 3 0.037772 - 4 0.861549 - dtype: float64 - >>> s.apply(f) - x x^2 - 0 0.340445 0.115903 - 1 0.984729 0.969691 - 2 0.919540 0.845555 - 3 0.037772 0.001427 - 4 0.861549 0.742267 + .. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) - New API functions for working with pandas options (:issue:`2097`): diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index abbda2ffc9be2..5a668e5fabd9c 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -726,10 +726,10 @@ be broadcast: or it can return False if broadcasting can not be done: -.. ipython:: python - :okwarning: +.. code-block:: ipython - np.array([1, 2, 3]) == np.array([1, 2]) + In [11]: np.array([1, 2, 3]) == np.array([1, 2]) + Out[11]: False Changes to boolean comparisons vs. None ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 73a523b14f9f7..38c6e1123aaae 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -556,7 +556,7 @@ You must pass in the ``line_terminator`` explicitly, even in this case. .. _whatsnew_0240.bug_fixes.nan_with_str_dtype: -Proper handling of ``np.NaN`` in a string data-typed column with the Python engine +Proper handling of ``np.nan`` in a string data-typed column with the Python engine ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ There was bug in :func:`read_excel` and :func:`read_csv` with the Python diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 313bf61ecabf9..d422fb1be9fdf 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_210: -What's new in 2.1.0 (Aug 11, 2023) +What's new in 2.1.0 (Aug 30, 2023) -------------------------------------- These are the changes in pandas 2.1.0. See :ref:`release` for a full changelog @@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0 `PyArrow `_ will become a required dependency of pandas starting with pandas 3.0. This decision was made based on -`PDEP 12 `_. +`PDEP 10 `_. 
This will enable more changes that are hugely beneficial to pandas users, including but not limited to: @@ -39,12 +39,18 @@ We are collecting feedback on this decision `here ` (:issue:`48347`) +- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype` (:issue:`52201`) +- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`.ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) - Improved error handling when using :meth:`DataFrame.to_json` with incompatible ``index`` and ``orient`` arguments (:issue:`52143`) -- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) -- Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`pandas.api.indexers.VariableOffsetWindowIndexer` (:issue:`54379`) +- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns (:issue:`52084`) +- Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`.VariableOffsetWindowIndexer` (:issue:`54379`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`). 
- :meth:`DataFrame.shift` and :meth:`Series.shift` now allow shifting by multiple periods by supplying a list of periods (:issue:`44424`) -- Groupby aggregations (such as :meth:`.DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) +- Groupby aggregations with ``numba`` (such as :meth:`.DataFrameGroupBy.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) - Improved error message when :meth:`.DataFrameGroupBy.agg` failed (:issue:`52930`) -- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) -- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) +- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`) +- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) +- :meth:`Index.all` and :meth:`Index.any` with floating dtypes and timedelta64 dtypes no longer raise ``TypeError``, matching the :meth:`Series.all` and :meth:`Series.any` behavior (:issue:`54566`) +- :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) -- Performance improvement in :meth:`.GroupBy.quantile` (:issue:`51722`) - -.. --------------------------------------------------------------------------- -.. _whatsnew_210.notable_bug_fixes: - -Notable bug fixes -~~~~~~~~~~~~~~~~~ - -These are bug fixes that might have notable behavior changes. +- Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`) +- PyArrow-backed integer dtypes now support bitwise operations (:issue:`54495`) .. --------------------------------------------------------------------------- .. _whatsnew_210.api_breaking: @@ -363,7 +365,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- :class:`arrays.PandasArray` has been renamed ``NumpyExtensionArray`` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`) +- :class:`arrays.PandasArray` has been renamed :class:`.NumpyExtensionArray` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`) .. --------------------------------------------------------------------------- .. 
_whatsnew_210.deprecations: @@ -374,6 +376,8 @@ Deprecations Deprecated silent upcasting in setitem-like Series operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +PDEP-6: https://pandas.pydata.org/pdeps/0006-ban-upcasting.html + Setitem-like operations on Series (or DataFrame columns) which silently upcast the dtype are deprecated and show a warning. Examples of affected operations are: @@ -506,34 +510,33 @@ and ``datetime.datetime.strptime``: Other Deprecations ^^^^^^^^^^^^^^^^^^ -- Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) -- Deprecated 'downcast' keyword in :meth:`Index.fillna` (:issue:`53956`) -- Deprecated 'fill_method' and 'limit' keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`, explicitly call ``ffill`` or ``bfill`` before calling ``pct_change`` instead (:issue:`53491`) -- Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) -- Deprecated 'quantile' keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed as 'q' instead (:issue:`52550`) - Deprecated :attr:`.DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`) - Deprecated :attr:`DataFrame._data` and :attr:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :func:`concat` behavior when any of the objects being concatenated have length 0; in the past the dtypes of empty objects were ignored when determining the resulting dtype, in a future version they will not (:issue:`39122`) +- Deprecated :meth:`.Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Deprecated :meth:`.DataFrameGroupBy.all` and :meth:`.DataFrameGroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`) -- Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`) -- Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`) - Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`) +- Deprecated ``broadcast_axis`` keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) +- Deprecated ``downcast`` keyword in :meth:`Index.fillna` (:issue:`53956`) +- Deprecated ``fill_method`` and ``limit`` keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`, explicitly call e.g. 
:meth:`DataFrame.ffill` or :meth:`DataFrame.bfill` before calling ``pct_change`` instead (:issue:`53491`) +- Deprecated ``method``, ``limit``, and ``fill_axis`` keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call :meth:`DataFrame.fillna` or :meth:`Series.fillna` on the alignment results instead (:issue:`51856`) +- Deprecated ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead (:issue:`52550`) - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) - Deprecated behavior of :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin` in with all-NA entries or any-NA and ``skipna=False``; in a future version these will raise ``ValueError`` (:issue:`51276`) - Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`) -- Deprecated making functions given to :meth:`Series.agg` attempt to operate on each element in the :class:`Series` and only operate on the whole :class:`Series` if the elementwise operations failed. In the future, functions given to :meth:`Series.agg` will always operate on the whole :class:`Series` only. To keep the current behavior, use :meth:`Series.transform` instead. (:issue:`53325`) -- Deprecated making the functions in a list of functions given to :meth:`DataFrame.agg` attempt to operate on each element in the :class:`DataFrame` and only operate on the columns of the :class:`DataFrame` if the elementwise operations failed. To keep the current behavior, use :meth:`DataFrame.transform` instead. (:issue:`53325`) +- Deprecated making functions given to :meth:`Series.agg` attempt to operate on each element in the :class:`Series` and only operate on the whole :class:`Series` if the elementwise operations failed. In the future, functions given to :meth:`Series.agg` will always operate on the whole :class:`Series` only. To keep the current behavior, use :meth:`Series.transform` instead (:issue:`53325`) +- Deprecated making the functions in a list of functions given to :meth:`DataFrame.agg` attempt to operate on each element in the :class:`DataFrame` and only operate on the columns of the :class:`DataFrame` if the elementwise operations failed. 
To keep the current behavior, use :meth:`DataFrame.transform` instead (:issue:`53325`) - Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) - Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) -- Deprecated the "downcast" keyword in :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`Series.ffill`, :meth:`DataFrame.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.bfill` (:issue:`40988`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.resample`, :meth:`Series.resample` (:issue:`51778`) +- Deprecated the ``downcast`` keyword in :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`Series.ffill`, :meth:`DataFrame.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.bfill` (:issue:`40988`) - Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) - Deprecated the behavior of :meth:`Series.argsort` in the presence of NA values; in a future version these will be sorted at the end instead of giving -1 (:issue:`54219`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) -- Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) -- Deprecated the 'axis' keyword in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.fillna`, :meth:`.DataFrameGroupBy.take`, :meth:`.DataFrameGroupBy.skew`, :meth:`.DataFrameGroupBy.rank`, :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cumsum`, :meth:`.DataFrameGroupBy.cummax`, :meth:`.DataFrameGroupBy.cummin`, :meth:`.DataFrameGroupBy.pct_change`, :meth:`DataFrameGroupBy.diff`, :meth:`.DataFrameGroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) +- Deprecating pinning ``group.name`` to each group in :meth:`.SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) +- Deprecated the ``axis`` keyword in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.fillna`, :meth:`.DataFrameGroupBy.take`, :meth:`.DataFrameGroupBy.skew`, :meth:`.DataFrameGroupBy.rank`, :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cumsum`, :meth:`.DataFrameGroupBy.cummax`, :meth:`.DataFrameGroupBy.cummin`, :meth:`.DataFrameGroupBy.pct_change`, :meth:`.DataFrameGroupBy.diff`, :meth:`.DataFrameGroupBy.shift`, and :meth:`.DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) - Deprecated :class:`.DataFrameGroupBy` with ``as_index=False`` not including groupings in the result when they are not columns of the DataFrame (:issue:`49519`) - Deprecated 
:func:`is_categorical_dtype`, use ``isinstance(obj.dtype, pd.CategoricalDtype)`` instead (:issue:`52527`) - Deprecated :func:`is_datetime64tz_dtype`, check ``isinstance(dtype, pd.DatetimeTZDtype)`` instead (:issue:`52607`) @@ -545,49 +548,49 @@ Other Deprecations - Deprecated :meth:`.Styler.applymap`. Use the new :meth:`.Styler.map` method instead (:issue:`52708`) - Deprecated :meth:`DataFrame.applymap`. Use the new :meth:`DataFrame.map` method instead (:issue:`52353`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) -- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) -- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`) -- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, case to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`) +- Deprecated ``freq`` parameter in :class:`.PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) +- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`.ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`) +- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, case to one of ``numpy.ndarray``, :class:`Index`, :class:`.ExtensionArray`, or :class:`Series` before calling (:issue:`52986`) - Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`) - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) -- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) -- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) -- Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`) +- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :attr:`Series.dt` properties (:issue:`20306`) +- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. 
``list``, ``tuple``), wrap a sequence in a :class:`Series` or NumPy array before operating instead (:issue:`51521`) - Deprecated parameter ``convert_dtype`` in :meth:`Series.apply` (:issue:`52140`) - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) -- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) +- Deprecated the ``fastpath`` keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) - Deprecated the behavior of :func:`is_bool_dtype` returning ``True`` for object-dtype :class:`Index` of bool objects (:issue:`52680`) - Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) -- Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`) -- Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`) -- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`) +- Deprecated unused ``closed`` and ``normalize`` keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`) +- Deprecated unused ``closed`` keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`) +- Deprecated logical operation between two non-boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs (:issue:`52500`, :issue:`52538`) - Deprecated :class:`Period` and :class:`PeriodDtype` with ``BDay`` freq, use a :class:`DatetimeIndex` with ``BDay`` freq instead (:issue:`53446`) - Deprecated :func:`value_counts`, use ``pd.Series(obj).value_counts()`` instead (:issue:`47862`) -- Deprecated :meth:`Series.first` and :meth:`DataFrame.first` (please create a mask and filter using ``.loc`` instead) (:issue:`45908`) +- Deprecated :meth:`Series.first` and :meth:`DataFrame.first`; create a mask and filter using ``.loc`` instead (:issue:`45908`) - Deprecated :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for object-dtype (:issue:`53631`) -- Deprecated :meth:`Series.last` and :meth:`DataFrame.last` (please create a mask and filter using ``.loc`` instead) (:issue:`53692`) +- Deprecated :meth:`Series.last` and :meth:`DataFrame.last`; create a mask and filter using ``.loc`` instead (:issue:`53692`) - Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`) - Deprecated allowing bool dtype in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile`, consistent with the :meth:`Series.quantile` and :meth:`DataFrame.quantile` behavior (:issue:`51424`) - Deprecated behavior of :func:`.testing.assert_series_equal` and :func:`.testing.assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None`` as equivalent) (:issue:`52081`) -- Deprecated bytes input to :func:`read_excel`. To read a file path, use a string or path-like object. (:issue:`53767`) -- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) +- Deprecated bytes input to :func:`read_excel`.
To read a file path, use a string or path-like object (:issue:`53767`) +- Deprecated constructing :class:`.SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) -- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) -- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`) -- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`) -- Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) +- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead (:issue:`53409`) +- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead (:issue:`53767`) +- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead (:issue:`53767`) +- Deprecated option ``mode.use_inf_as_na``, convert inf entries to ``NaN`` beforehand instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`.DataFrameGroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) - Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. ``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`) - Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`) -- Deprecated the "method" and "limit" keywords in ``ExtensionArray.fillna``, implement and use ``pad_or_backfill`` instead (:issue:`53621`) -- Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) +- Deprecated the ``method`` and ``limit`` keywords in :meth:`.ExtensionArray.fillna`, implement ``_pad_or_backfill`` instead (:issue:`53621`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) +- Deprecated the ``method`` and ``limit`` keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the behavior of :meth:`Series.__getitem__`, :meth:`Series.__setitem__`, :meth:`DataFrame.__getitem__`, :meth:`DataFrame.__setitem__` with an integer slice on objects with a floating-dtype index, in a future version this will be treated as *positional* indexing (:issue:`49612`) - Deprecated the use of non-supported datetime64 and timedelta64 resolutions with :func:`pandas.array`.
Supported resolutions are "s", "ms", "us", and "ns" (:issue:`53058`) -- Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) -- Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and skipna=True or any-NAs and skipna=False returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name``. (:issue:`54229`) +- Deprecated values ``"pad"``, ``"ffill"``, ``"bfill"``, ``"backfill"`` for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) +- Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and ``skipna=True`` or any-NAs and ``skipna=False`` returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name`` and ``con`` (:issue:`54229`) +- Deprecated silently ignoring ``fill_value`` when passing both ``freq`` and ``fill_value`` to :meth:`DataFrame.shift`, :meth:`Series.shift` and :meth:`.DataFrameGroupBy.shift`; in a future version this will raise ``ValueError`` (:issue:`53832`) .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: @@ -596,7 +599,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`) -- Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`) +- Performance improvement in :func:`read_orc` when reading a remote URI file path (:issue:`51609`) - Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) @@ -611,9 +614,10 @@ Performance improvements - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`) - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`) - Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`) -- :class:`Period`'s default formatter (`period_format`) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`) +- :class:`Period`'s default formatter (``period_format``) is now significantly (~twice) faster.
This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. ``to_csv`` operations involving :class:`.PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated (:issue:`51459`) - Performance improvement accessing :attr:`arrays.IntegerArray.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`) -- Performance improvement for :class:`DataFrameGroupBy`/:class:`SeriesGroupBy` aggregations (e.g. :meth:`DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`) +- Performance improvement for :class:`.DataFrameGroupBy`/:class:`.SeriesGroupBy` aggregations (e.g. :meth:`.DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`) +- Performance improvement in :class:`DataFrame` reductions with ``axis=1`` and extension dtypes (:issue:`54341`) - Performance improvement in :class:`DataFrame` reductions with ``axis=None`` and extension dtypes (:issue:`54308`) - Performance improvement in :class:`MultiIndex` and multi-column operations (e.g. :meth:`DataFrame.sort_values`, :meth:`DataFrame.groupby`, :meth:`Series.unstack`) when index/column values are already sorted (:issue:`53806`) - Performance improvement in :class:`Series` reductions (:issue:`52341`) @@ -621,24 +625,26 @@ Performance improvements - Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`) - Performance improvement in :func:`merge` for PyArrow backed strings (:issue:`54443`) - Performance improvement in :func:`read_csv` with ``engine="c"`` (:issue:`52632`) +- Performance improvement in :meth:`.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`) +- Performance improvement in :meth:`DataFrame.iloc` when the input is a single integer and the DataFrame is backed by extension dtypes (:issue:`54508`) - Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) +- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single PyArrow dtype (:issue:`54224`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g.
:class:`Int64` (:issue:`52836`) -- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single pyarrow dtype (:issue:`54224`) -- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) +- Performance improvement in :meth:`Series.add` for PyArrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) -- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) -- Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) -- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) -- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) -- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) +- Performance improvement in :meth:`Series.drop_duplicates` for ``ArrowDtype`` (:issue:`54667`) +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with PyArrow dtypes (:issue:`53950`) +- Performance improvement in :meth:`Series.str.get_dummies` for PyArrow-backed strings (:issue:`53655`) +- Performance improvement in :meth:`Series.str.get` for PyArrow-backed strings (:issue:`53152`) +- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for PyArrow-backed strings (:issue:`53585`) +- Performance improvement in :meth:`Series.to_numpy` when dtype is a NumPy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a PyArrow timestamp or duration dtype to NumPy (:issue:`53326`) - Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`) -- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) -- Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) +- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding unnecessary validation (:issue:`53013`) +- Performance improvement when indexing with PyArrow timestamp and duration dtypes (:issue:`53368`) - Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`) .. --------------------------------------------------------------------------- @@ -655,28 +661,29 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected. (:issue:`51644`) -- :meth:`DatetimeIndex.slice_indexer` now raises ``KeyError`` for non-monotonic indexes if either of the slice bounds is not in the index, this behaviour was previously deprecated but inconsistently handled.
(:issue:`53983`) +- :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) +- :meth:`DatetimeIndex.slice_indexer` now raises ``KeyError`` for non-monotonic indexes if either of the slice bounds is not in the index; this behaviour was previously deprecated but inconsistently handled (:issue:`53983`) - Bug in :class:`DateOffset` which had inconsistent behavior when multiplying a :class:`DateOffset` object by a constant (:issue:`47953`) - Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`) -- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of ``pyarrow`` timestamps to numpy datetimes (:issue:`52545`) -- Bug in :meth:`DataFrame.to_sql` raising ``ValueError`` for pyarrow-backed date like dtypes (:issue:`53854`) +- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of PyArrow timestamps to NumPy datetimes (:issue:`52545`) +- Bug in :meth:`.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) +- Bug in :meth:`DataFrame.to_sql` raising ``ValueError`` for PyArrow-backed date-like dtypes (:issue:`53854`) - Bug in :meth:`Timestamp.date`, :meth:`Timestamp.isocalendar`, :meth:`Timestamp.timetuple`, and :meth:`Timestamp.toordinal` returning incorrect results for inputs outside those supported by the Python standard library's datetime module (:issue:`53668`) - Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`) -- Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`) - Bug in constructing a :class:`Timestamp` from a string representing a time without a date inferring an incorrect unit (:issue:`54097`) - Bug in constructing a :class:`Timestamp` with ``ts_input=pd.NA`` raising ``TypeError`` (:issue:`45481`) - Bug in parsing datetime strings with weekday but no day e.g.
"2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`) +- Bug in the repr for :class:`Series` when dtype is a timezone aware datetime with non-nanosecond resolution raising ``OutOfBoundsDatetime`` (:issue:`54623`) Timedelta ^^^^^^^^^ -- :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) - Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`) -- Bug in :class:`Timedelta` with Numpy timedelta64 objects not properly raising ``ValueError`` (:issue:`52806`) -- Bug in :func:`to_timedelta` converting :class:`Series` or :class:`DataFrame` containing :class:`ArrowDtype` of ``pyarrow.duration`` to numpy ``timedelta64`` (:issue:`54298`) +- Bug in :class:`Timedelta` with NumPy ``timedelta64`` objects not properly raising ``ValueError`` (:issue:`52806`) +- Bug in :func:`to_timedelta` converting :class:`Series` or :class:`DataFrame` containing :class:`ArrowDtype` of ``pyarrow.duration`` to NumPy ``timedelta64`` (:issue:`54298`) - Bug in :meth:`Timedelta.__hash__`, raising an ``OutOfBoundsTimedelta`` on certain large values of second resolution (:issue:`54037`) - Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) +- Bug in :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` (:issue:`51644`) - Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) Timezones @@ -688,10 +695,10 @@ Numeric ^^^^^^^ - Bug in :class:`RangeIndex` setting ``step`` incorrectly when being the subtrahend with minuend a numeric value (:issue:`53255`) - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) -- Bug when calling :meth:`Series.kurt` and :meth:`Series.skew` on numpy data of all zero returning a python type instead of a numpy type (:issue:`53482`) +- Bug when calling :meth:`Series.kurt` and :meth:`Series.skew` on NumPy data of all zero returning a Python type instead of a NumPy type (:issue:`53482`) - Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. 
"2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`) -- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`) -- Bug in :meth:`DataFrame.size` and :meth:`Series.size` returning 64-bit integer instead of int (:issue:`52897`) +- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for PyArrow-backed dtypes (:issue:`52314`) +- Bug in :meth:`DataFrame.size` and :meth:`Series.size` returning 64-bit integer instead of a Python int (:issue:`52897`) - Bug in :meth:`DateFrame.dot` returning ``object`` dtype for :class:`ArrowDtype` data (:issue:`53979`) - Bug in :meth:`Series.any`, :meth:`Series.all`, :meth:`DataFrame.any`, and :meth:`DataFrame.all` had the default value of ``bool_only`` set to ``None`` instead of ``False``; this change should have no impact on users (:issue:`53258`) - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) @@ -702,9 +709,9 @@ Numeric Conversion ^^^^^^^^^^ - Bug in :func:`DataFrame.style.to_latex` and :func:`DataFrame.style.to_html` if the DataFrame contains integers with more digits than can be represented by floating point double precision (:issue:`52272`) -- Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`NumpyExtensionArray` instead of :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`52859`) -- Bug in :func:`array` when given an empty list and no dtype returning :class:`NumpyExtensionArray` instead of :class:`FloatingArray` (:issue:`54371`) -- Bug in :meth:`ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`) +- Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`.NumpyExtensionArray` instead of :class:`.DatetimeArray` or :class:`.TimedeltaArray` (:issue:`52859`) +- Bug in :func:`array` when given an empty list and no dtype returning :class:`.NumpyExtensionArray` instead of :class:`.FloatingArray` (:issue:`54371`) +- Bug in :meth:`.ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`) - Bug in :meth:`DataFrame.__repr__` incorrectly raising a ``TypeError`` when the dtype of a column is ``np.record`` (:issue:`48526`) - Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set (:issue:`51922`) - Bug in :meth:`DataFrame.insert` raising ``TypeError`` if ``loc`` is ``np.int64`` (:issue:`53193`) @@ -714,6 +721,7 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str` that did not raise a ``TypeError`` when iterated (:issue:`54173`) +- Bug in ``repr`` for :class:`DataFrame`` with string-dtype columns (:issue:`54797`) Interval ^^^^^^^^ @@ -726,13 +734,14 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` losing dtype when setting a :class:`DataFrame` into duplicated columns (:issue:`53143`) - Bug in :meth:`DataFrame.__setitem__` with a boolean mask and :meth:`DataFrame.putmask` with mixed non-numeric dtypes and a value other than ``NaN`` incorrectly raising ``TypeError`` (:issue:`53291`) - Bug in :meth:`DataFrame.iloc` when using ``nan`` as the only element (:issue:`52234`) +- Bug in :meth:`Series.loc` casting :class:`Series` to ``np.dnarray`` when assigning :class:`Series` at predefined index of ``object`` dtype :class:`Series` (:issue:`48933`) 
Missing ^^^^^^^ -- Bug in :meth:`DataFrame.interpolate` failing to fill across multiblock data when ``method`` is "pad", "ffill", "bfill", or "backfill" (:issue:`53898`) +- Bug in :meth:`DataFrame.interpolate` failing to fill across data when ``method`` is ``"pad"``, ``"ffill"``, ``"bfill"``, or ``"backfill"`` (:issue:`53898`) - Bug in :meth:`DataFrame.interpolate` ignoring ``inplace`` when :class:`DataFrame` is empty (:issue:`53199`) - Bug in :meth:`Series.idxmin`, :meth:`Series.idxmax`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax` with a :class:`DatetimeIndex` index containing ``NaT`` incorrectly returning ``NaN`` instead of ``NaT`` (:issue:`43587`) -- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or "infer" (:issue:`53103`) +- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or ``"infer"`` (:issue:`53103`) - Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` with complex dtype incorrectly failing to fill ``NaN`` entries (:issue:`53635`) MultiIndex @@ -744,25 +753,23 @@ I/O ^^^ - :meth:`DataFrame.to_orc` now raises ``ValueError`` when a non-default :class:`Index` is given (:issue:`51828`) - :meth:`DataFrame.to_sql` now raises ``ValueError`` when the ``name`` parameter is left empty while using SQLAlchemy to connect (:issue:`52675`) -- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`) -- Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) +- Bug in :func:`json_normalize` where metadata fields of list type could not be parsed (:issue:`37782`) - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) -- Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) -- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`) -- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) -- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` raising when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) +- Bug in :func:`read_hdf` not properly closing store after an ``IndexError`` is raised (:issue:`52781`) +- Bug in :func:`read_html` where style elements were read into DataFrames (:issue:`52197`) +- Bug in :func:`read_html` where tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - Bug in :func:`read_sql_table` raising an exception when reading a view (:issue:`52969`) - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`) - Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`) - Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`) - Bug in :meth:`DataFrame.to_html` where conversion for an empty :class:`DataFrame` with complex dtype raised a ``ValueError`` (:issue:`54167`) -- Bug in :meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`)
+- Bug in :meth:`DataFrame.to_json` where :class:`.DatetimeArray`/:class:`.DatetimeIndex` with non-nanosecond precision could not be serialized correctly (:issue:`53686`) - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`) - Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`) Period ^^^^^^ -- :meth:`PeriodIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) - Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`) - Bug in :class:`PeriodDtype` constructor incorrectly returning the same ``normalize`` for different :class:`DateOffset` ``freq`` inputs (:issue:`24121`) - Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`) @@ -770,6 +777,7 @@ Period - Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` returning ``object`` dtype columns instead of ``float64`` dtype columns for columns that are all null (:issue:`52087`) - Bug in :meth:`Period.now` not accepting the ``freq`` parameter as a keyword argument (:issue:`53369`) +- Bug in :meth:`PeriodIndex.map` with ``na_action="ignore"`` (:issue:`51644`) - Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Bug in incorrectly allowing construction of :class:`Period` or :class:`PeriodDtype` with :class:`CustomBusinessDay` freq; use :class:`BusinessDay` instead (:issue:`52534`) @@ -780,29 +788,29 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` :class:`Datetimelike` ``origin`` has no effect in resample when values are outside of axis (:issue:`53662`) +- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` returning the wrong dtype when used on an empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where ``origin`` has no effect in resample when values are outside of the axis (:issue:`53662`) - Bug in weighted rolling aggregations when specifying ``min_periods=0`` (:issue:`51449`) -- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby`, where, when the index of the +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where, when the index of the grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex` or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument, - the function operated on the whole index rather than each element of the index.
(:issue:`51979`) + the function operated on the whole index rather than each element of the index (:issue:`51979`) - Bug in :meth:`.DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`) -- Bug in :meth:`.DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) +- Bug in :meth:`.DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same (:issue:`52444`) - Bug in :meth:`.DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`) -- Bug in :meth:`.GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) -- Bug in :meth:`.GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) +- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` with a datetime key in conjunction with another key producing an incorrect number of group keys (:issue:`51158`) +- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` implicitly sorting the result index even with ``sort=False`` (:issue:`53009`) - Bug in :meth:`.SeriesGroupBy.size` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. ``Int64``) (:issue:`53831`) -- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list of a single element. (:issue:`53500`) -- Bug in :meth:`.GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) -- Bug in :meth:`.DataFrameGroupby.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`) +- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list consisting of a single element (:issue:`53500`) +- Bug in :meth:`.DataFrameGroupBy.var` and :meth:`.SeriesGroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) +- Bug in :meth:`.DataFrameGroupBy.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`) - Bug in :meth:`.Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`) - Bug in :meth:`.SeriesGroupBy.count` and :meth:`.DataFrameGroupBy.count` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g.
``Int64``) (:issue:`53831`) - Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` would not subset columns (:issue:`53518`) - Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` raised after performing column selection when using ``dropna="any"`` or ``dropna="all"`` resulted in rows being dropped (:issue:`53518`) -- Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupby.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` (:issue:`53606`) +- Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupBy.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` instead of ``np.inf`` and ``-np.inf`` respectively (:issue:`53606`) - Bug in :meth:`Series.groupby` raising an error when grouped :class:`Series` has a :class:`DatetimeIndex` index and a :class:`Series` with a name that is a month is given to the ``by`` argument (:issue:`48509`) Reshaping @@ -813,6 +821,7 @@ Reshaping - Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`) - Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`) - Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`) +- Bug in :func:`merge` when merging on integer ``ExtensionDtype`` and float NumPy dtype raising ``TypeError`` (:issue:`46178`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns returning incorrect type when a dict-like argument is passed (:issue:`51099`) - Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`) - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) @@ -825,23 +834,26 @@ Reshaping Sparse ^^^^^^ -- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a ``numpy`` dtype (:issue:`53160`) +- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a NumPy dtype (:issue:`53160`) - Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`) ExtensionArray ^^^^^^^^^^^^^^ -- Bug in :class:`ArrowStringArray` constructor raises ``ValueError`` with dictionary types of strings (:issue:`54074`) +- Bug in :class:`.ArrowStringArray` constructor raising ``ValueError`` with dictionary types of strings (:issue:`54074`) - Bug in :class:`DataFrame` constructor not copying :class:`Series` with extension dtype when given in dict (:issue:`53744`) - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) -- Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) +- Bug in :meth:`Series.quantile` for PyArrow temporal types raising ``ArrowInvalid`` (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) +- Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning Python
datetime and timedelta objects for non-nano dtypes (:issue:`53326`) -- Bug where the :class:`DataFrame` repr would not work when a column would have an :class:`ArrowDtype` with an ``pyarrow.ExtensionDtype`` (:issue:`54063`) -- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) +- Bug in :meth:`~arrays.ArrowExtensionArray.factorize` returning incorrect uniques for a ``pyarrow.dictionary`` type ``pyarrow.chunked_array`` with more than one chunk (:issue:`54844`) +- Bug when passing an :class:`ExtensionArray` subclass to ``dtype`` keywords. This will now raise a ``UserWarning`` to encourage passing an instance instead (:issue:`31356`, :issue:`54592`) +- Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`) +- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`) Styler ^^^^^^ -- Bug in :meth:`Styler._copy` calling overridden methods in subclasses of :class:`Styler` (:issue:`52728`) +- Bug in :meth:`.Styler._copy` calling overridden methods in subclasses of :class:`.Styler` (:issue:`52728`) Metadata ^^^^^^^^ @@ -851,21 +863,20 @@ Metadata Other ^^^^^ +- Bug in :meth:`.FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`) - Bug in :class:`DataFrame` and :class:`Series` raising for data of complex dtype when ``NaN`` values are present (:issue:`53627`) - Bug in :class:`DatetimeIndex` where ``repr`` of an index constructed with time components does not print the time when it is midnight and the freq is not day-based (:issue:`53470`) -- Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`) -- Bug in :func:`.testing.assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) +- Bug in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` now throwing an assertion error for two unequal sets (:issue:`51727`) - Bug in :func:`.testing.assert_frame_equal` checking category dtypes even when asked not to check index type (:issue:`52126`) - Bug in :func:`api.interchange.from_dataframe` was not respecting ``allow_copy`` argument (:issue:`54322`) - Bug in :func:`api.interchange.from_dataframe` was raising during interchanging from non-pandas tz-aware data containing null values (:issue:`54287`) - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`) - Bug in :func:`from_dummies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`54300`) - Bug in :func:`from_dummies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`54300`) +- Bug in :meth:`.DataFrameGroupBy.first`, :meth:`.DataFrameGroupBy.last`, :meth:`.SeriesGroupBy.first`, and :meth:`.SeriesGroupBy.last` where an empty group would return ``np.nan`` instead of the corresponding :class:`.ExtensionArray` NA value (:issue:`39098`) - Bug in :meth:`DataFrame.pivot_table` casting the mean of ints back to an int (:issue:`16676`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)
-- Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`) - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`) -- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where an empty group would return ``np.nan`` instead of a an ExtensionArray's NA value (:issue:`39098`) - Bug in :meth:`Index.sort_values` when a ``key`` is passed (:issue:`52764`) - Bug in :meth:`Series.align`, :meth:`DataFrame.align`, :meth:`Series.reindex`, :meth:`DataFrame.reindex`, :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, incorrectly failing to raise with ``method="asfreq"`` (:issue:`53620`) - Bug in :meth:`Series.argsort` failing to raise when an invalid ``axis`` is passed (:issue:`54257`) @@ -879,3 +890,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.0.3..v2.1.0 diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst new file mode 100644 index 0000000000000..6101333ae760c --- /dev/null +++ b/doc/source/whatsnew/v2.1.1.rst @@ -0,0 +1,55 @@ +.. _whatsnew_211: + +What's new in 2.1.1 (September 20, 2023) +---------------------------------------- + +These are the changes in pandas 2.1.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_211.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`concat` when :class:`DataFrame` objects have two different extension dtypes (:issue:`54848`) +- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) +- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) +- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is ``True`` (:issue:`54918`, :issue:`54931`) +- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) +- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`) +- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) +- Fixed regression in :meth:`DataFrameGroupBy.agg` when aggregating a DataFrame with duplicate column names using a dictionary (:issue:`55006`) +- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) +- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) +- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) +- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) +- Fixed regression in comparison operations for PyArrow backed columns not propagating exceptions correctly (:issue:`54944`) +- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) + +.. --------------------------------------------------------------------------- +..
_whatsnew_211.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Fixed bug in :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) +- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` when the columns are a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) +- Fixed bug in :meth:`Series.dt.tz` with :class:`ArrowDtype` where a string was returned instead of a ``tzinfo`` object (:issue:`55003`) +- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_211.other: + +Other +~~~~~ +- Reverted the deprecation that disallowed :meth:`Series.apply` returning a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object (:issue:`52116`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_211.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.0..v2.1.1 diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst new file mode 100644 index 0000000000000..a13a273b01257 --- /dev/null +++ b/doc/source/whatsnew/v2.1.2.rst @@ -0,0 +1,70 @@ +.. _whatsnew_212: + +What's new in 2.1.2 (October 26, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- Reverted deprecation of ``fill_method=None`` in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`DataFrameGroupBy.pct_change`, and :meth:`SeriesGroupBy.pct_change`; the values ``'backfill'``, ``'bfill'``, ``'pad'``, and ``'ffill'`` are still deprecated (:issue:`53491`) + +.. --------------------------------------------------------------------------- +..
_whatsnew_212.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) +- Fixed regression in :meth:`~DataFrame.rolling` where non-nanosecond index or ``on`` column would produce incorrect results (:issue:`55026`, :issue:`55106`, :issue:`55299`) +- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) +- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) +- Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to ``True``, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) +- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) +- Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) +- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`) +- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) +- Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`) +- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`) +- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) +- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) +- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) +- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) +- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) +- Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) +- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`) +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) +- Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) +- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) +- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) +- Fixed bug in :class:`Series` constructor not inferring string dtype when ``NA`` is the first value and ``infer_string`` is set (:issue:`55655`) + +..
--------------------------------------------------------------------------- +.. _whatsnew_212.other: + +Other +~~~~~ +- Fixed non-working installation of the optional dependency group ``output_formatting``; replacing the underscore ``_`` with a dash ``-`` fixes the broken dependency resolution. The correct usage is now ``pip install pandas[output-formatting]``. +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.1..v2.1.2 diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst new file mode 100644 index 0000000000000..af626895a9e0e --- /dev/null +++ b/doc/source/whatsnew/v2.1.3.rst @@ -0,0 +1,33 @@ +.. _whatsnew_213: + +What's new in 2.1.3 (November 10, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) +- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.contributors: + +Contributors +~~~~~~~~~~~~ + +..
contributors:: v2.1.2..v2.1.3|HEAD diff --git a/environment.yml b/environment.yml index 3a0da0bfc703d..3f6f0de68748b 100644 --- a/environment.yml +++ b/environment.yml @@ -9,25 +9,23 @@ dependencies: # build dependencies - versioneer[toml] - cython=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - coverage # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies - beautifulsoup4>=4.11.1 - blosc - - brotlipy>=0.7.0 - bottleneck>=1.3.4 - fastparquet>=0.8.1 - fsspec>=2022.05.0 @@ -37,7 +35,7 @@ dependencies: - ipython - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - openpyxl>=3.0.10 @@ -48,7 +46,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 @@ -86,7 +83,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme + - pydata-sphinx-theme=0.13 - pytest-cython # doctest - sphinx - sphinx-design diff --git a/generate_version.py b/generate_version.py index 5534f49c0ea58..06e38ce0fd978 100644 --- a/generate_version.py +++ b/generate_version.py @@ -1,18 +1,32 @@ +#!/usr/bin/env python3 + # Note: This file has to live next to setup.py or versioneer will not work import argparse import os +import sys import versioneer +sys.path.insert(0, "") + def write_version_info(path): + version = None + git_version = None + + try: + import _version_meson + + version = _version_meson.__version__ + git_version = _version_meson.__git_version__ + except ImportError: + version = versioneer.get_version() + git_version = versioneer.get_versions()["full-revisionid"] if os.environ.get("MESON_DIST_ROOT"): path = os.path.join(os.environ.get("MESON_DIST_ROOT"), path) with open(path, "w", encoding="utf-8") as file: - file.write(f'__version__="{versioneer.get_version()}"\n') - file.write( - f'__git_version__="{versioneer.get_versions()["full-revisionid"]}"\n' - ) + file.write(f'__version__="{version}"\n') + file.write(f'__git_version__="{git_version}"\n') def main(): @@ -43,7 +57,13 @@ def main(): write_version_info(args.outfile) if args.print: - print(versioneer.get_version()) + try: + import _version_meson + + version = _version_meson.__version__ + except ImportError: + version = versioneer.get_version() + print(version) main() diff --git a/meson.build b/meson.build index a927b59abeaf9..68018046c081f 100644 --- a/meson.build +++ b/meson.build @@ -2,24 +2,17 @@ project( 'pandas', 'c', 'cpp', 'cython', - version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(), + version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', - meson_version: '>=1.0.1', + meson_version: '>=1.2.1', default_options: [ - # TODO: investigate, does meson try to compile against debug Python - # when buildtype = debug, this seems to be causing problems on CI - # where provided Python is not compiled in debug mode 'buildtype=release', - # TODO: Reactivate werror, some warnings on Windows - #'werror=true', 'c_std=c99' ] ) -py_mod = import('python') fs = import('fs') -py = py_mod.find_installation('python') -py_dep = py.dependency() +py = import('python').find_installation(pure: false) tempita = files('generate_pxi.py') versioneer = files('generate_version.py') @@ -35,7 +28,7 @@ 
add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c if fs.exists('_version_meson.py') - py.install_sources('_version_meson.py', pure: false, subdir: 'pandas') + py.install_sources('_version_meson.py', subdir: 'pandas') else custom_target('write_version_file', output: '_version_meson.py', @@ -45,11 +38,15 @@ else build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir(pure: false) / 'pandas' + install_dir: py.get_install_dir() / 'pandas' ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources('pyproject.toml', pure: false, subdir: 'pandas') +py.install_sources( + 'pyproject.toml', + subdir: 'pandas' +) + subdir('pandas') diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0b6ea58f987d4..9eed70a23c9dd 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -59,7 +59,7 @@ from pandas._libs.util cimport get_nat cdef: float64_t FP_ERR = 1e-13 - float64_t NaN = np.NaN + float64_t NaN = np.nan int64_t NPY_NAT = get_nat() diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 20499016f951e..7635b261d4149 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -52,7 +52,7 @@ from pandas._libs.missing cimport checknull cdef int64_t NPY_NAT = util.get_nat() -cdef float64_t NaN = np.NaN +cdef float64_t NaN = np.nan cdef enum InterpolationEnumType: INTERPOLATION_LINEAR, diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index adf4e8c926fa3..10b0f4776e0d1 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -944,12 +944,29 @@ cdef class BlockValuesRefs: """ cdef: public list referenced_blocks + public int clear_counter def __cinit__(self, blk: SharedBlock | None = None) -> None: if blk is not None: self.referenced_blocks = [weakref.ref(blk)] else: self.referenced_blocks = [] + self.clear_counter = 500 # set reasonably high + + def _clear_dead_references(self, force=False) -> None: + # Use exponential backoff to decide when we want to clear references + # if force=False. Clearing for every insertion causes slowdowns if + # all these objects stay alive, e.g. df.items() for wide DataFrames + # see GH#55245 and GH#55008 + if force or len(self.referenced_blocks) > self.clear_counter: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + nr_of_refs = len(self.referenced_blocks) + if nr_of_refs < self.clear_counter // 2: + self.clear_counter = max(self.clear_counter // 2, 500) + elif nr_of_refs > self.clear_counter: + self.clear_counter = max(self.clear_counter * 2, nr_of_refs) def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. @@ -959,6 +976,7 @@ cdef class BlockValuesRefs: blk: SharedBlock The block that the new references should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(blk)) def add_index_reference(self, index: object) -> None: @@ -969,6 +987,7 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. 
""" + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(index)) def has_reference(self) -> bool: @@ -981,8 +1000,6 @@ cdef class BlockValuesRefs: ------- bool """ - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self._clear_dead_references(force=True) # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 32641319a6b96..15bd5a7379105 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -195,6 +195,7 @@ def array_equivalent_object( right: npt.NDArray[np.object_], ) -> bool: ... def has_infs(arr: np.ndarray) -> bool: ... # const floating[:] +def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... # const floating[:] def get_reverse_indexer( indexer: np.ndarray, # const intp_t[:] length: int, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 55819ebd1f15e..704a5f12204c4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -144,7 +144,7 @@ cdef: object oINT64_MIN = INT64_MIN object oUINT64_MAX = UINT64_MAX - float64_t NaN = np.NaN + float64_t NaN = np.nan # python-visible i8max = INT64_MAX @@ -530,6 +530,22 @@ def has_infs(floating[:] arr) -> bool: return ret +@cython.boundscheck(False) +@cython.wraparound(False) +def has_only_ints_or_nan(floating[:] arr) -> bool: + cdef: + floating val + intp_t i + + for i in range(len(arr)): + val = arr[i] + if (val != val) or (val == val): + continue + else: + return False + return True + + def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) @@ -2626,6 +2642,9 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif val is C_NA: + seen.object_ = True + continue else: seen.object_ = True break @@ -2681,14 +2700,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if is_string_array(objects, skipna=True): - if using_pyarrow_string_dtype(): - import pyarrow as pa - - from pandas.core.dtypes.dtypes import ArrowDtype + if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype - dtype = ArrowDtype(pa.string()) - return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + dtype = StringDtype(storage="pyarrow_numpy") + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True elif seen.interval_: diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index f302c649bc7bd..fd632790546f6 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -69,7 +69,8 @@ libs_sources = { 'index': {'sources': ['index.pyx', _index_class_helper]}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper]}, + 'interval': {'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep}, 'join': {'sources': ['join.pyx', _khash_primitive_helper], 'deps': _khash_primitive_helper_dep}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, @@ -113,8 +114,40 @@ foreach ext_name, ext_dict : libs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs') +# Basically just __init__.py and the .pyi files +sources_to_install = [ + '__init__.py', + 'algos.pyi', + 'arrays.pyi', + 'byteswap.pyi', + 'groupby.pyi', + 'hashing.pyi', + 'hashtable.pyi', + 
'index.pyi', + 'indexing.pyi', + 'internals.pyi', + 'interval.pyi', + 'join.pyi', + 'json.pyi', + 'lib.pyi', + 'missing.pyi', + 'ops.pyi', + 'ops_dispatch.pyi', + 'parsers.pyi', + 'properties.pyi', + 'reshape.pyi', + 'sas.pyi', + 'sparse.pyi', + 'testing.pyi', + 'tslib.pyi', + 'writers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs' + ) +endforeach subdir('window') diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index abd3fb9e1fef3..ce8a38df172ef 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -664,7 +664,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) ((c == delimiter) || (delim_whitespace && isblank(c))) +#define IS_DELIMITER(c) \ + ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 20a18cf56779f..e61ad28c54afb 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -671,7 +671,7 @@ cdef _array_to_datetime_object( if len(unique_timezones) > 1: warnings.warn( "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise a warning unless `utc=True`. " + "zones will raise an error unless `utc=True`. " "Please specify `utc=True` to opt in to the new behaviour " "and silence this warning. To create a `Series` with mixed offsets and " "`object` dtype, please use `apply` and `datetime.datetime.strptime`", diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 14d2eef46da20..a1b0c54d1f48c 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -31,6 +31,28 @@ foreach ext_name, ext_dict : tslibs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs/tslibs') +sources_to_install = [ + '__init__.py', + 'ccalendar.pyi', + 'conversion.pyi', + 'dtypes.pyi', + 'fields.pyi', + 'nattype.pyi', + 'np_datetime.pyi', + 'offsets.pyi', + 'parsing.pyi', + 'period.pyi', + 'strptime.pyi', + 'timedeltas.pyi', + 'timestamps.pyi', + 'timezones.pyi', + 'tzconversion.pyi', + 'vectorized.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/tslibs' + ) +endforeach diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e17d264333264..af27acf16b771 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -332,13 +332,16 @@ timedelta-like} # Shift the delta_idx by if the UTC offset of # the target tz is greater than 0 and we're moving forward # or vice versa - first_delta = info.deltas[0] - if (shift_forward or shift_delta > 0) and first_delta > 0: - delta_idx_offset = 1 - elif (shift_backward or shift_delta < 0) and first_delta < 0: - delta_idx_offset = 1 - else: - delta_idx_offset = 0 + # TODO: delta_idx_offset and info.deltas are needed for zoneinfo timezones, + # but are not applicable for all timezones. 
Setting the former to 0 and + # length checking the latter avoids UB, but this could use a larger refactor + delta_idx_offset = 0 + if len(info.deltas): + first_delta = info.deltas[0] + if (shift_forward or shift_delta > 0) and first_delta > 0: + delta_idx_offset = 1 + elif (shift_backward or shift_delta < 0) and first_delta < 0: + delta_idx_offset = 1 for i in range(n): val = vals[i] diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e25e7e8b94e1d..519d3fc939efa 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -75,7 +75,7 @@ cdef inline bint is_integer_object(object obj) noexcept nogil: cdef inline bint is_float_object(object obj) noexcept nogil: """ - Cython equivalent of `isinstance(val, (float, np.float_))` + Cython equivalent of `isinstance(val, (float, np.float64))` Parameters ---------- @@ -91,7 +91,7 @@ cdef inline bint is_float_object(object obj) noexcept nogil: cdef inline bint is_complex_object(object obj) noexcept nogil: """ - Cython equivalent of `isinstance(val, (complex, np.complex_))` + Cython equivalent of `isinstance(val, (complex, np.complex128))` Parameters ---------- diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 425c5ade2e2d4..9c151b8269a52 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -57,7 +57,7 @@ cdef: float32_t MAXfloat32 = np.inf float64_t MAXfloat64 = np.inf - float64_t NaN = np.NaN + float64_t NaN = np.nan cdef bint is_monotonic_increasing_start_end_bounds( ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index b83d8730f9447..ad15644f73a0c 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -3,7 +3,6 @@ py.extension_module( ['aggregations.pyx'], cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], - dependencies: [py_dep], subdir: 'pandas/_libs/window', override_options : ['cython_language=cpp'], install: true @@ -14,7 +13,19 @@ py.extension_module( ['indexers.pyx'], cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], - dependencies: [py_dep], subdir: 'pandas/_libs/window', install: true ) + +sources_to_install = [ + '__init__.py', + 'aggregations.pyi', + 'indexers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/window' + ) +endforeach diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index be0a762642e46..6896945344b24 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,6 +30,8 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: @@ -186,6 +188,8 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under9p0", "pa_version_under11p0", "pa_version_under13p0", + "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index fe4e6457ff08c..c5792fa1379fe 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -18,7 +18,6 @@ "bs4": "4.11.1", "blosc": "1.21.0", "bottleneck": "1.3.4", - "brotli": "0.7.0", "dataframe-api-compat": "0.1.7", "fastparquet": "0.8.1", "fsspec": "2022.05.0", @@ -41,7 +40,6 @@ "pyxlsb": "1.0.9", "s3fs": "2022.05.0", "scipy": "1.8.1", - "snappy": "0.6.1", "sqlalchemy": "1.4.36", "tables": 
"3.7.0", "tabulate": "0.8.10", @@ -60,12 +58,10 @@ INSTALL_MAPPING = { "bs4": "beautifulsoup4", "bottleneck": "Bottleneck", - "brotli": "brotlipy", "jinja2": "Jinja2", "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", - "snappy": "python-snappy", "sqlalchemy": "SQLAlchemy", "tables": "pytables", } @@ -75,13 +71,6 @@ def get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) if version is None: - if module.__name__ == "brotli": - # brotli doesn't contain attributes to confirm it's version - return "" - if module.__name__ == "snappy": - # snappy doesn't contain attributes to confirm it's version - # See https://github.com/andrix/python-snappy/pull/119 - return "" raise ImportError(f"Can't determine version for {module.__name__}") if module.__name__ == "psycopg2": # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index d376fa4c1919e..0552301b59400 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,6 @@ """ support numpy compatibility across versions """ +import warnings + import numpy as np from pandas.util.version import Version @@ -9,6 +11,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") +np_version_gt2 = _nlv >= Version("2.0.0.dev0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.22.4" @@ -21,6 +24,27 @@ ) +np_long: type +np_ulong: type + +if np_version_gt2: + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*In the future `np\.long` will be defined as.*", + FutureWarning, + ) + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] + except AttributeError: + np_long = np.int_ + np_ulong = np.uint +else: + np_long = np.int_ + np_ulong = np.uint + + __all__ = [ "np", "_np_version", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 049ce50920e28..96501b47e07f6 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,8 @@ pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") + pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -23,3 +25,5 @@ pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True + pa_version_under14p0 = True + pa_version_under14p1 = True diff --git a/pandas/conftest.py b/pandas/conftest.py index f756da82157b8..b1b35448af134 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -71,6 +71,7 @@ Index, MultiIndex, ) +from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( @@ -190,6 +191,10 @@ def pytest_collection_modifyitems(items, config) -> None: item.add_marker(pytest.mark.arraymanager) +hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] +if Version(hypothesis.__version__) >= Version("6.83.2"): + hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) + # Hypothesis hypothesis.settings.register_profile( "ci", @@ -201,7 +206,7 @@ def pytest_collection_modifyitems(items, config) -> None: # 2022-02-09: Changed deadline from 500 -> None. 
Deadline leads to # non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969) deadline=None, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), + suppress_health_check=tuple(hypothesis_health_checks), ) hypothesis.settings.load_profile("ci") @@ -777,7 +782,7 @@ def series_with_multilevel_index() -> Series: index = MultiIndex.from_tuples(tuples) data = np.random.default_rng(2).standard_normal(8) ser = Series(data, index=index) - ser.iloc[3] = np.NaN + ser.iloc[3] = np.nan return ser @@ -1321,6 +1326,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1329,6 +1335,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'pyarrow_numpy' """ return request.param @@ -1380,6 +1387,7 @@ def object_dtype(request): "object", "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] ) def any_string_dtype(request): @@ -1996,3 +2004,8 @@ def warsaw(request) -> str: tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo. """ return request.param + + +@pytest.fixture() +def arrow_string_storage(): + return ("pyarrow", "pyarrow_numpy") diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 14dee202a9d8d..5f9fced2e415c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,6 +55,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( + ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -517,9 +518,9 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: return isin(np.asarray(comps_array), np.asarray(values)) # GH16012 - # Ensure np.in1d doesn't get object types or it *may* throw an exception + # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), - # in1d is faster for small sizes + # isin is faster for small sizes if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 @@ -530,10 +531,10 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: if isna(values).any(): def f(c, v): - return np.logical_or(np.in1d(c, v), np.isnan(c)) + return np.logical_or(np.isin(c, v).ravel(), np.isnan(c)) else: - f = np.in1d + f = lambda a, b: np.isin(a, b).ravel() else: common = np_find_common_type(values.dtype, comps_array.dtype) @@ -877,7 +878,9 @@ def value_counts_internal( if bins is not None: from pandas.core.reshape.tile import cut - values = Series(values, copy=False) + if isinstance(values, Series): + values = values._values + try: ii = cut(values, bins, include_lowest=True) except TypeError as err: @@ -996,9 +999,13 @@ def duplicated( ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) + if hasattr(values, "dtype"): + if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": + values = values._to_masked() # type: ignore[union-attr] + + if isinstance(values.dtype, BaseMaskedDtype): + values = cast("BaseMaskedArray", values) + return htable.duplicated(values._data, keep=keep, mask=values._mask) values = _ensure_data(values) return htable.duplicated(values, keep=keep) @@ -1634,7 +1641,7 @@ def safe_sort( else: mask = None else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `-1` next, so we # may deal with them here without performance loss using `mode='wrap'` diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6ab2f958b8730..e5683359c2fb9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -434,7 +434,13 @@ def compute_dict_like( Data for result. When aggregating with a Series, this can contain any Python object. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + obj = self.obj + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) func = cast(AggFuncTypeDict, self.func) func = self.normalize_dictlike_arg(op_name, selected_obj, func) @@ -448,7 +454,7 @@ def compute_dict_like( colg = obj._gotitem(selection, ndim=1) results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()] keys = list(func.keys()) - elif is_non_unique_col: + elif not is_groupby and is_non_unique_col: # key used for column selection and output # GH#51099 results = [] @@ -1283,14 +1289,6 @@ def curried(x): ) if len(mapped) and isinstance(mapped[0], ABCSeries): - warnings.warn( - "Returning a DataFrame from Series.apply when the supplied function " - "returns a Series is deprecated and will be removed in a future " - "version.", - FutureWarning, - stacklevel=find_stack_level(), - ) # GH52116 - # GH#43986 Need to do list(mapped) in order to get treated as nested # See also GH#25959 regarding EA support return obj._constructor_expanddim(list(mapped), index=obj.index) @@ -1824,12 +1822,12 @@ def warn_alias_replacement( full_alias = alias else: full_alias = f"{type(obj).__name__}.{alias}" - alias = f"'{alias}'" + alias = f'"{alias}"' warnings.warn( f"The provided callable {func} is currently using " f"{full_alias}. In a future version of pandas, " f"the provided callable will be used directly. 
To keep current " - f"behavior pass {alias} instead.", + f"behavior pass the string {alias} instead.", category=FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py new file mode 100644 index 0000000000000..63db03340683b --- /dev/null +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import Literal + +import numpy as np + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + +class ArrowStringArrayMixin: + _pa_array = None + + def __init__(self, *args, **kwargs) -> None: + raise NotImplementedError + + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + pa_pad = pc.utf8_lpad + elif side == "right": + pa_pad = pc.utf8_rpad + elif side == "both": + pa_pad = pc.utf8_center + else: + raise ValueError( + f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" + ) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) + + def _str_get(self, i: int): + lengths = pc.utf8_length(self._pa_array) + if i >= 0: + out_of_bounds = pc.greater_equal(i, lengths) + start = i + stop = i + 1 + step = 1 + else: + out_of_bounds = pc.greater(-i, lengths) + start = i + stop = i - 1 + step = -1 + not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) + selected = pc.utf8_slice_codeunits( + self._pa_array, start=start, stop=stop, step=step + ) + null_value = pa.scalar( + None, type=self._pa_array.type # type: ignore[attr-defined] + ) + result = pc.if_else(not_out_of_bounds, selected, null_value) + return type(self)(result) + + def _str_slice_replace( + self, start: int | None = None, stop: int | None = None, repl: str | None = None + ): + if repl is None: + repl = "" + if start is None: + start = 0 + if stop is None: + stop = np.iinfo(np.int64).max + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + + def _str_capitalize(self): + return type(self)(pc.utf8_capitalize(self._pa_array)) + + def _str_title(self): + return type(self)(pc.utf8_title(self._pa_array)) + + def _str_swapcase(self): + return type(self)(pc.utf8_swapcase(self._pa_array)) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 359e0161e763c..6d21f4a1ac8a2 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -296,13 +296,8 @@ def _fill_mask_inplace( func = missing.get_fill_func(method, ndim=self.ndim) func(self._ndarray.T, limit=limit, mask=mask.T) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: mask = self.isna() if mask.any(): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 88695f11fba59..9c8f28d660450 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import ( Timedelta, Timestamp, + timezones, ) 
from pandas.compat import ( pa_version_under7p0, @@ -29,7 +30,9 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( + CategoricalDtype, is_array_like, is_bool_dtype, is_integer, @@ -40,8 +43,12 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna -from pandas.core import roperator +from pandas.core import ( + missing, + roperator, +) from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -83,6 +90,15 @@ "rxor": lambda x, y: pc.xor(y, x), } + ARROW_BIT_WISE_FUNCS = { + "and_": pc.bit_wise_and, + "rand_": lambda x, y: pc.bit_wise_and(y, x), + "or_": pc.bit_wise_or, + "ror_": lambda x, y: pc.bit_wise_or(y, x), + "xor": pc.bit_wise_xor, + "rxor": lambda x, y: pc.bit_wise_xor(y, x), + } + def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar ) -> pa.ChunkedArray: @@ -100,7 +116,8 @@ def floordiv_compat( ) -> pa.ChunkedArray: # Ensure int // int -> int mirroring Python/Numpy behavior # as pc.floor(pc.divide_checked(int, int)) -> float - result = pc.floor(pc.divide(left, right)) + converted_left = cast_for_truediv(left, right) + result = pc.floor(pc.divide(converted_left, right)) if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): result = result.cast(left.type) return result @@ -184,7 +201,10 @@ def to_pyarrow_type( class ArrowExtensionArray( - OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods + OpsMixin, + ExtensionArraySupportsAnyAll, + ArrowStringArrayMixin, + BaseStringArrayMethods, ): """ Pandas ExtensionArray backed by a PyArrow ChunkedArray. 
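A worked example for the floordiv_compat change above, as a minimal standalone sketch (assuming pyarrow >= 7.0 is available; the sample values are illustrative): pyarrow's pc.divide truncates integer division toward zero, so flooring that result disagrees with Python's // for operands of mixed sign, which is what casting the dividend to float first avoids.

import pyarrow as pa
import pyarrow.compute as pc

left, right = pa.array([-7]), pa.array([2])
# Integer divide truncates toward zero: -7 / 2 -> -3, and floor(-3) is -3.
pc.divide(left, right)
# Casting first gives -3.5, whose floor is -4, matching Python's -7 // 2.
pc.divide(left.cast(pa.float64()), right)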
@@ -368,8 +388,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: elif isna(value): pa_scalar = pa.scalar(None, type=pa_type) else: - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 if isinstance(value, Timedelta): if pa_type is None: pa_type = pa.duration(value.unit) @@ -435,8 +454,7 @@ def _box_pa_array( and pa.types.is_duration(pa_type) and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") ): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) @@ -449,8 +467,7 @@ def _box_pa_array( pa_array = pa.array(value, from_pandas=True) if pa_type is None and pa.types.is_duration(pa_array.type): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value) @@ -502,7 +519,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -578,7 +598,11 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: return self.to_numpy(dtype=dtype) def __invert__(self) -> Self: - return type(self)(pc.invert(self._pa_array)) + # This is a bit wise op for integer types + if pa.types.is_integer(self._pa_array.type): + return type(self)(pc.bit_wise_not(self._pa_array)) + else: + return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: return type(self)(pc.negate_checked(self._pa_array)) @@ -606,20 +630,24 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - try: + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): result = pc_func(self._pa_array, self._box_pa(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - if is_scalar(other): + elif is_scalar(other): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) + else: + raise NotImplementedError( + f"{op.__name__} not implemented for {type(other)}" + ) return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs): @@ -653,7 +681,12 @@ def _evaluate_op_method(self, other, op, arrow_funcs): return type(self)(result) def _logical_method(self, other, op): - return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) + 
# For integer types `^`, `|`, `&` are bitwise operators and return + # integer types. Otherwise these are boolean ops. + if pa.types.is_integer(self._pa_array.type): + return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS) + else: + return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) def _arith_method(self, other, op): return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) @@ -899,23 +932,30 @@ def dropna(self) -> Self: """ return type(self)(pc.drop_null(self._pa_array)) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: if not self._hasna: # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() + if limit is None: + method = missing.clean_fill_method(method) + try: + if method == "pad": + return type(self)(pc.fill_null_forward(self._pa_array)) + elif method == "backfill": + return type(self)(pc.fill_null_backward(self._pa_array)) + except pa.ArrowNotImplementedError: + # ArrowNotImplementedError: Function 'coalesce' has no kernel + # matching input types (duration[ns], duration[ns]) + # TODO: remove try/except wrapper if/when pyarrow implements + # a kernel for duration types. + pass + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) @doc(ExtensionArray.fillna) def fillna( @@ -931,9 +971,12 @@ def fillna( # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is not None or method is not None: + if limit is not None: return super().fillna(value=value, method=method, limit=limit, copy=copy) + if method is not None: + return super().fillna(method=method, limit=limit, copy=copy) + if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may # end up passing it to the super() method. 
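The pc.fill_null_forward / pc.fill_null_backward fast path added above, shown standalone. A minimal sketch (assuming pyarrow >= 7.0; the sample array is illustrative). These kernels take no limit argument, which is why the branch is only taken when limit is None:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[1, None, None, 4]])
pc.fill_null_forward(arr)   # [1, 1, 1, 4] -- pad/ffill semantics
pc.fill_null_backward(arr)  # [1, 4, 4, 4] -- backfill/bfill semantics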
@@ -943,31 +986,14 @@ def fillna( f" expected {len(self)}" ) - def convert_fill_value(value, pa_type, dtype): - if value is None: - return value - if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): - return value - if is_array_like(value): - pa_box = pa.array - else: - pa_box = pa.scalar - try: - value = pa_box(value, type=pa_type, from_pandas=True) - except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {dtype}" - raise TypeError(msg) from err - return value - - fill_value = convert_fill_value(value, self._pa_array.type, self.dtype) + try: + fill_value = self._box_pa(value, pa_type=self._pa_array.type) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + raise TypeError(msg) from err try: - if method is None: - return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) - elif method == "pad": - return type(self)(pc.fill_null_forward(self._pa_array)) - elif method == "backfill": - return type(self)(pc.fill_null_backward(self._pa_array)) + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) except pa.ArrowNotImplementedError: # ArrowNotImplementedError: Function 'coalesce' has no kernel # matching input types (duration[ns], duration[ns]) @@ -1011,12 +1037,11 @@ def factorize( ) -> tuple[np.ndarray, ExtensionArray]: null_encoding = "mask" if use_na_sentinel else "encode" - pa_type = self._pa_array.type - if pa.types.is_duration(pa_type): + data = self._pa_array + pa_type = data.type + if pa_version_under11p0 and pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array + data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): encoded = data @@ -1026,15 +1051,17 @@ def factorize( indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) else: - pa_indices = encoded.combine_chunks().indices + # GH 54844 + combined = encoded.combine_chunks() + pa_indices = combined.indices if pa_indices.null_count > 0: pa_indices = pc.fill_null(pa_indices, -1) indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( np.intp, copy=False ) - uniques = type(self)(encoded.chunk(0).dictionary) + uniques = type(self)(combined.dictionary) - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) return indices, uniques @@ -1273,7 +1300,7 @@ def unique(self) -> Self: """ pa_type = self._pa_array.type - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 data = self._pa_array.cast(pa.int64()) else: @@ -1281,7 +1308,7 @@ def unique(self) -> Self: pa_result = pc.unique(data) - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): pa_result = pa_result.cast(pa_type) return type(self)(pa_result) @@ -1304,7 +1331,7 @@ def value_counts(self, dropna: bool = True) -> Series: Series.value_counts """ pa_type = self._pa_array.type - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 data = self._pa_array.cast(pa.int64()) else: @@ -1324,7 +1351,7 @@ def value_counts(self, dropna: bool = True) -> Series: values = 
values.filter(mask) counts = counts.filter(mask) - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): values = values.cast(pa_type) counts = ArrowExtensionArray(counts) @@ -1386,6 +1413,9 @@ def _accumulate( NotImplementedError : subclass does not define accumulations """ pyarrow_name = { + "cummax": "cumulative_max", + "cummin": "cumulative_min", + "cumprod": "cumulative_prod_checked", "cumsum": "cumulative_sum_checked", }.get(name, name) pyarrow_meth = getattr(pc, pyarrow_name, None) @@ -1395,12 +1425,20 @@ def _accumulate( data_to_accum = self._pa_array pa_dtype = data_to_accum.type - if pa.types.is_duration(pa_dtype): - data_to_accum = data_to_accum.cast(pa.int64()) + + convert_to_int = ( + pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"] + ) or (pa.types.is_duration(pa_dtype) and name == "cumsum") + + if convert_to_int: + if pa_dtype.bit_width == 32: + data_to_accum = data_to_accum.cast(pa.int32()) + else: + data_to_accum = data_to_accum.cast(pa.int64()) result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) - if pa.types.is_duration(pa_dtype): + if convert_to_int: result = result.cast(pa_dtype) return type(self)(result) @@ -1559,16 +1597,33 @@ def _reduce( ------ TypeError : subclass does not define reductions """ + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _reduce_calc( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: - result = pa.array([pa_result.as_py()], type=pa_result.type) - return type(self)(result) + if isinstance(pa_result, pa.Scalar): + result = pa.array([pa_result.as_py()], type=pa_result.type) + else: + result = pa.array( + [pa_result], + type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), + ) + return result if pc.is_null(pa_result).as_py(): return self.dtype.na_value - else: + elif isinstance(pa_result, pa.Scalar): return pa_result.as_py() + else: + return pa_result def _explode(self): """ @@ -1667,7 +1722,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1676,9 +1731,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. - """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1693,7 +1745,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1732,7 +1784,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. 
+ """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ @@ -1812,6 +1886,7 @@ def _mode(self, dropna: bool = True) -> Self: if pa.types.is_temporal(pa_type): most_common = most_common.cast(pa_type) + most_common = most_common.take(pc.array_sort_indices(most_common)) return type(self)(most_common) def _maybe_convert_setitem_value(self, value): @@ -1987,24 +2062,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_pad( - self, - width: int, - side: Literal["left", "right", "both"] = "left", - fillchar: str = " ", - ): - if side == "left": - pa_pad = pc.utf8_lpad - elif side == "right": - pa_pad = pc.utf8_rpad - elif side == "both": - pa_pad = pc.utf8_center - else: - raise ValueError( - f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" - ) - return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2089,26 +2146,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): ) return type(self)(result) - def _str_get(self, i: int): - lengths = pc.utf8_length(self._pa_array) - if i >= 0: - out_of_bounds = pc.greater_equal(i, lengths) - start = i - stop = i + 1 - step = 1 - else: - out_of_bounds = pc.greater(-i, lengths) - start = i - stop = i - 1 - step = -1 - not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) - selected = pc.utf8_slice_codeunits( - self._pa_array, start=start, stop=stop, step=step - ) - null_value = pa.scalar(None, type=self._pa_array.type) - result = pc.if_else(not_out_of_bounds, selected, null_value) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type): result = self._apply_elementwise(list) @@ -2138,15 +2175,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_slice_replace( - self, start: int | None = None, stop: int | None = None, repl: str | None = None - ): - if repl is None: - repl = "" - if start is None: - start = 0 - return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) @@ -2171,18 +2199,9 @@ def _str_isspace(self): def _str_istitle(self): return type(self)(pc.utf8_is_title(self._pa_array)) - def _str_capitalize(self): - return type(self)(pc.utf8_capitalize(self._pa_array)) - - def _str_title(self): - return type(self)(pc.utf8_title(self._pa_array)) - def _str_isupper(self): return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_swapcase(self): - return type(self)(pc.utf8_swapcase(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) @@ -2223,12 +2242,6 @@ def _str_removeprefix(self, prefix: str): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) @@ -2453,7 
+2466,7 @@ def _dt_time(self): @property def _dt_tz(self): - return self.dtype.pyarrow_dtype.tz + return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) @property def _dt_unit(self): diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 3249c1c829546..36d536bf86847 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,61 @@ def to_pandas_dtype(self): # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': +storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7babce46a3977..bfd6ae361e1e8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -132,7 +132,6 @@ class ExtensionArray: interpolate isin isna - pad_or_backfill ravel repeat searchsorted @@ -148,6 +147,7 @@ class ExtensionArray: _from_sequence _from_sequence_of_strings _hash_pandas_object + _pad_or_backfill _reduce _values_for_argsort _values_for_factorize @@ -183,7 +183,7 @@ class ExtensionArray: methods: * fillna - * pad_or_backfill + * _pad_or_backfill * dropna * unique * factorize / _values_for_factorize @@ -889,7 +889,6 @@ def interpolate( limit, limit_direction, limit_area, - fill_value, copy: bool, **kwargs, ) -> Self: @@ -917,13 +916,8 @@ def interpolate( f"{type(self).__name__} does not implement interpolate" ) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, 
copy: bool = True ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -959,26 +953,26 @@ def pad_or_backfill( Examples -------- >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) - >>> arr.pad_or_backfill(method="backfill", limit=1) + >>> arr._pad_or_backfill(method="backfill", limit=1) <IntegerArray> [<NA>, 2, 2, 3, <NA>, <NA>] Length: 6, dtype: Int64 """ # If a 3rd-party EA has implemented this functionality in fillna, - # we warn that they need to implement pad_or_backfill instead. + # we warn that they need to implement _pad_or_backfill instead. if ( type(self).fillna is not ExtensionArray.fillna - and type(self).pad_or_backfill is ExtensionArray.pad_or_backfill + and type(self)._pad_or_backfill is ExtensionArray._pad_or_backfill ): - # Check for pad_or_backfill here allows us to call - # super().pad_or_backfill without getting this warning + # Check for _pad_or_backfill here allows us to call + # super()._pad_or_backfill without getting this warning warnings.warn( "ExtensionArray.fillna 'method' keyword is deprecated. " - "In a future version. arr.pad_or_backfill will be called " + "In a future version. arr._pad_or_backfill will be called " "instead. 3rd-party ExtensionArray authors need to implement " - "pad_or_backfill.", - FutureWarning, + "_pad_or_backfill.", + DeprecationWarning, stacklevel=find_stack_level(), ) return self.fillna(method=method, limit=limit) @@ -1062,8 +1056,7 @@ def fillna( if method is not None: warnings.warn( f"The 'method' keyword in {type(self).__name__}.fillna is " - "deprecated and will be removed in a future version. " - "Use pad_or_backfill instead.", + "deprecated and will be removed in a future version.", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6107388bfe78b..53c942f615abe 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2109,7 +2109,7 @@ def _codes(self) -> np.ndarray: def _box_func(self, i: int): if i == -1: - return np.NaN + return np.nan return self.categories[i] def _unbox_scalar(self, key) -> int: @@ -2950,7 +2950,7 @@ def recode_for_categories( return codes indexer = coerce_indexer_dtype( - new_categories.get_indexer(old_categories), new_categories + new_categories.get_indexer_for(old_categories), new_categories ) new_codes = take_nd(indexer, codes, fill_value=-1) return new_codes diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c3b8d1c0e79e8..1a6438b2445c4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -107,6 +107,7 @@ algorithms, missing, nanops, + ops, ) from pandas.core.algorithms import ( checked_add_with_arr, @@ -943,8 +944,12 @@ def _cmp_method(self, other, op): dtype = getattr(other, "dtype", None) if is_object_dtype(dtype): - return op(np.asarray(self, dtype=object), other) - + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would raise when comparing to None + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) + return result if other is NaT: if op is operator.ne: result = np.ones(self.shape, dtype=bool) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 3263dd73fe4dc..d0510ede5a366 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -890,19 +890,12 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1]
return obj[indexer] - def pad_or_backfill( # pylint: disable=useless-parent-delegation - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True @@ -1844,14 +1837,14 @@ def isin(self, values) -> npt.NDArray[np.bool_]: # complex128 ndarray is much more performant. left = self._combined.view("complex128") right = values._combined.view("complex128") - # error: Argument 1 to "in1d" has incompatible type + # error: Argument 1 to "isin" has incompatible type # "Union[ExtensionArray, ndarray[Any, Any], # ndarray[Any, dtype[Any]]]"; expected # "Union[_SupportsArray[dtype[Any]], # _NestedSequence[_SupportsArray[dtype[Any]]], bool, # int, float, complex, str, bytes, _NestedSequence[ # Union[bool, int, float, complex, str, bytes]]]" - return np.in1d(left, right) # type: ignore[arg-type] + return np.isin(left, right).ravel() # type: ignore[arg-type] elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion( values.left.dtype diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bec875f2bbfa1..2cf28c28427ab 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -190,13 +190,8 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: mask = self._mask diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 99a3586871d10..efe0c0df45e00 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -240,7 +240,10 @@ def _values_for_factorize(self) -> tuple[np.ndarray, float | None]: fv = np.nan return self._ndarray, fv - def pad_or_backfill( + # Base EA class (and all other EA classes) don't have limit_area keyword + # This can be removed here as well when the interpolate ffill/bfill method + # deprecation is enforced + def _pad_or_backfill( self, *, method: FillnaOptions, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 024cd6d4aad14..a2e4b595c42aa 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -791,20 +791,13 @@ def searchsorted( m8arr = self._ndarray.view("M8[ns]") return m8arr.searchsorted(npvalue, side=side, sorter=sorter) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta.pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + 
result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) if copy: return cast("Self", result.view(self.dtype)) else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d32c98535d7cb..e38fa0a3bdae5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -712,19 +712,12 @@ def isna(self): mask[self.sp_index.indices] = isna(self.sp_values) return type(self)(mask, fill_value=False, dtype=dtype) - def pad_or_backfill( # pylint: disable=useless-parent-delegation - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. - return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) def fillna( self, diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 25f1c2ec6ce4f..a41214ae17044 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,6 +14,7 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked +from pandas._libs.lib import ensure_string_array from pandas.compat import pa_version_under7p0 from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -76,7 +77,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -98,21 +99,30 @@ class StringDtype(StorageExtensionDtype): name = "string" - #: StringDtype().na_value uses pandas.NA + #: StringDtype().na_value uses pandas.NA except the implementation that + # follows NumPy semantics, which uses nan. @property - def na_value(self) -> libmissing.NAType: - return libmissing.NA + def na_value(self) -> libmissing.NAType | float: # type: ignore[override] + if self.storage == "pyarrow_numpy": + return np.nan + else: + return libmissing.NA _metadata = ("storage",) def __init__(self, storage=None) -> None: if storage is None: - storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + infer_string = get_option("future.infer_string") + if infer_string: + storage = "pyarrow_numpy" + else: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." 
) @@ -160,6 +170,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -176,12 +188,17 @@ def construct_array_type( # type: ignore[override] ------- type """ - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) if self.storage == "python": return StringArray - else: + elif self.storage == "pyarrow": return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -193,6 +210,10 @@ def __from_arrow__( from pandas.core.arrays.string_arrow import ArrowStringArray return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) else: import pyarrow @@ -202,11 +223,19 @@ def __from_arrow__( # pyarrow.ChunkedArray chunks = array.chunks + results = [] + for arr in chunks: + # convert chunk by chunk to numpy and concatenate then, to avoid + # overflow for large string data when concatenating the pyarrow arrays + arr = arr.to_numpy(zero_copy_only=False) + arr = ensure_string_array(arr, na_value=libmissing.NA) + results.append(arr) + if len(chunks) == 0: arr = np.array([], dtype=object) else: - arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) - arr = lib.convert_nans_to_NA(arr) + arr = np.concatenate(results) + # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a70fcf6b5a93..2eef240af53f8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import re from typing import ( TYPE_CHECKING, @@ -27,6 +28,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -46,6 +48,7 @@ if TYPE_CHECKING: from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -113,10 +116,11 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] + _storage = "pyarrow" def __init__(self, values) -> None: super().__init__(values) - self._dtype = StringDtype(storage="pyarrow") + self._dtype = StringDtype(storage=self._storage) if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -144,7 +148,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) if isinstance(scalars, 
BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -178,6 +185,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) + @classmethod + def _result_converter(cls, values, na=None): + return BooleanDtype().__from_arrow__(values) + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" if is_scalar(value): @@ -313,7 +324,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -322,7 +333,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -331,7 +342,7 @@ def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -369,39 +380,39 @@ def _str_fullmatch( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_len(self): result = pc.utf8_length(self._pa_array) @@ -433,3 +444,186 @@ def _str_rstrip(self, to_strip=None): else: result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("argmin", "argmax") and isinstance(result, pa.Array): + return self._convert_int_dtype(result) + elif isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _convert_int_dtype(self, result): + return 
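The ``_result_converter`` hook introduced above is what lets both storages share the ``_str_*`` predicate implementations while returning different result types: a masked BooleanDtype array for "pyarrow", and (via the override shown further below) a plain NumPy array for the NumPy-semantics variant. A small illustration, assuming both storages are available:

    import pandas as pd

    s_masked = pd.Series(["a", "b"], dtype="string[pyarrow]")
    s_numpy = pd.Series(["a", "b"], dtype="string[pyarrow_numpy]")
    print(s_masked.str.isalpha().dtype)  # boolean (masked extension dtype)
    print(s_numpy.str.isalpha().dtype)   # bool (plain NumPy)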
Int64Dtype().__from_arrow__(result) + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _storage = "pyarrow_numpy" + + def __init__(self, values) -> None: + _chk_pyarrow_available() + + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( + values.type + ): + values = pc.cast(values, pa.string()) + super().__init__(values) + + @classmethod + def _result_converter(cls, values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + + def __getattribute__(self, item): + # ArrowStringArray and we both inherit from ArrowExtensionArray, which + # creates inheritance problems (Diamond inheritance) + if item in ArrowStringArrayMixin.__dict__ and item not in ( + "_pa_array", + "__dict__", + ): + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
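The ``__getattribute__`` override above works around diamond inheritance: methods defined on ``ArrowStringArrayMixin`` would otherwise lose to same-named methods arriving through ``ArrowStringArray``/``ArrowExtensionArray`` in the MRO, so they are bound explicitly with ``functools.partial``. A self-contained sketch of the trick with generic names (not the pandas classes):

    from functools import partial

    class Mixin:
        def label(self):
            return f"mixin:{self.name}"

    class Base:
        def label(self):
            return f"base:{self.name}"

    class Child(Base):
        name = "x"

        def __getattribute__(self, item):
            # prefer the mixin's implementation over the inherited one
            if item in Mixin.__dict__ and item != "__dict__":
                return partial(getattr(Mixin, item), self)
            return super().__getattribute__(item)

    assert Child().label() == "mixin:x"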
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _convert_int_dtype(self, result): + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + elif not isinstance(result, np.ndarray): + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat).to_numpy() + return self._convert_int_dtype(result) + + def _str_len(self): + result = pc.utf8_length(self._pa_array).to_numpy() + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result.to_numpy()) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True): + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + if not skipna and name == "all": + nas = pc.invert(pc.is_null(self._pa_array)) + arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) # type: ignore[return-value] diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 9050fb6e76b9c..852bfae1cc79a 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -537,8 +537,8 @@ def __init__(self, lhs, rhs) -> None: ) # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float_] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) + acceptable_dtypes = [np.float32, np.float64] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) UNARY_OPS_SYMS = ("+", "-", "~", "not") diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..765b24fbc7868 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -492,7 +492,8 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. + The default storage for StringDtype. This option is ignored if + ``future.infer_string`` is set to True. 
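One detail worth calling out in the ``_reduce`` override above: under NumPy semantics, string truthiness follows Python, so ``any``/``all`` reduce ``pc.not_equal(arr, "")`` with Kleene logic for missing values. A sketch of the resulting behavior, assuming the dtype is available:

    import pandas as pd

    ser = pd.Series(["", "a", None], dtype="string[pyarrow_numpy]")
    print(ser.any())  # True  -- "a" is truthy
    print(ser.all())  # False -- "" is falsy; None is skipped with skipna=True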
""" with cf.config_prefix("mode"): @@ -500,7 +501,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5757c69bb6ec7..5903187769f08 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,6 +19,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import ( Period, @@ -538,6 +540,7 @@ def sanitize_array( ------- np.ndarray or ExtensionArray """ + original_dtype = dtype if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) @@ -560,7 +563,16 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") + if ( + isinstance(data, str) + and using_pyarrow_string_dtype() + and original_dtype is None + ): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): @@ -589,6 +601,11 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype(storage="pyarrow_numpy") + subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: subarr = subarr.copy() diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f7c0b3e36032..9a9a8ed22f282 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -700,7 +700,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): - if not np.can_cast(fill_value, dtype): + if not np_can_cast_scalar(fill_value, dtype): # type: ignore[arg-type] # upcast to prevent overflow mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) @@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: dtype = _dtype_obj if using_pyarrow_string_dtype(): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - pa_dtype = pa.string() - dtype = ArrowDtype(pa_dtype) + dtype = StringDtype(storage="pyarrow_numpy") elif isinstance(val, (np.datetime64, dt.datetime)): try: @@ -850,7 +849,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: dtype = np.dtype(np.float64) elif is_complex(val): - dtype = np.dtype(np.complex_) + dtype = np.dtype(np.complex128) if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) @@ -1893,4 +1892,25 @@ def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: """ if not len(rng): return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) + return np_can_cast_scalar(rng.start, dtype) and np_can_cast_scalar(rng.stop, dtype) + + +def np_can_cast_scalar(element: Scalar, dtype: np.dtype) -> bool: + """ + np.can_cast pandas-equivalent for pre 2-0 behavior that allowed scalar + inference + + Parameters + ---------- + element : Scalar + dtype : np.dtype + + Returns + ------- + bool + """ + try: + np_can_hold_element(dtype, element) + return True + except (LossySetitemError, NotImplementedError): + return False diff --git 
a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a0feb49f47c4e..3db36fc50e343 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1351,7 +1351,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: False >>> is_complex_dtype(int) False - >>> is_complex_dtype(np.complex_) + >>> is_complex_dtype(np.complex128) True >>> is_complex_dtype(np.array(['a', 'b'])) False @@ -1614,6 +1614,15 @@ def pandas_dtype(dtype) -> DtypeObj: # registered extension types result = registry.find(dtype) if result is not None: + if isinstance(result, type): + # GH 31356, GH 54592 + warnings.warn( + f"Instantiating {result.__name__} without any arguments. " + f"Pass a {result.__name__} instance to silence this warning.", + UserWarning, + stacklevel=find_stack_level(), + ) + result = result() return result # try a numpy dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 53f0fb2843653..272e9928b96cb 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2109,6 +2109,8 @@ def type(self): return CategoricalDtypeType elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list + elif pa.types.is_fixed_size_list(pa_type): + return list elif pa.types.is_map(pa_type): return list elif pa.types.is_struct(pa_type): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index de99f828d604f..8760c8eeca454 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -285,7 +285,7 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has # type "ndarray[Any, dtype[bool_]]") result = values.isna() # type: ignore[assignment] - elif isinstance(values, np.recarray): + elif isinstance(values, np.rec.recarray): # GH 48526 result = _isna_recarray_dtype(values, inf_as_na=inf_as_na) elif is_string_or_object_np_dtype(values.dtype): @@ -332,7 +332,9 @@ def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_: return np.any(is_inf_in_record) -def _isna_recarray_dtype(values: np.recarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: +def _isna_recarray_dtype( + values: np.rec.recarray, inf_as_na: bool +) -> npt.NDArray[np.bool_]: result = np.zeros(values.shape, dtype=bool) for i, record in enumerate(values): record_as_array = np.array(record.tolist()) @@ -622,6 +624,9 @@ def infer_fill_value(val): return np.array("NaT", dtype=DT64NS_DTYPE) elif dtype in ["timedelta", "timedelta64"]: return np.array("NaT", dtype=TD64NS_DTYPE) + return np.array(np.nan, dtype=object) + elif val.dtype.kind == "U": + return np.array(np.nan, dtype=val.dtype) return np.nan diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4908b535bcb1c..cb04905323af1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -639,14 +639,12 @@ def _constructor(self) -> Callable[..., DataFrame]: return DataFrame def _constructor_from_mgr(self, mgr, axes): - df = self._from_mgr(mgr, axes=axes) - - if type(self) is DataFrame: - # fastpath avoiding constructor call - return df + if self._constructor is DataFrame: + # we are pandas.DataFrame (or a subclass that doesn't override _constructor) + return DataFrame._from_mgr(mgr, axes=axes) else: assert axes is mgr.axes - return self._constructor(df, copy=False) + return self._constructor(mgr) _constructor_sliced: Callable[..., Series] = Series @@ -654,13 +652,12 @@ def _sliced_from_mgr(self, mgr, axes) -> Series: return Series._from_mgr(mgr, axes) def
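With the new ``pandas_dtype`` branch above, passing a registered extension dtype *class* rather than an instance now warns before instantiating it (GH 31356, GH 54592). For example:

    import pandas as pd

    pd.api.types.pandas_dtype(pd.CategoricalDtype)    # UserWarning: pass an instance
    pd.api.types.pandas_dtype(pd.CategoricalDtype())  # no warning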
_constructor_sliced_from_mgr(self, mgr, axes): - ser = self._sliced_from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is DataFrame: - # fastpath avoiding constructor call + if self._constructor_sliced is Series: + ser = self._sliced_from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name return ser assert axes is mgr.axes - return self._constructor_sliced(ser, copy=False) + return self._constructor_sliced(mgr) # ---------------------------------------------------------------------- # Constructors @@ -2402,7 +2399,7 @@ def maybe_reorder( def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None - ) -> np.recarray: + ) -> np.rec.recarray: """ Convert DataFrame to a NumPy record array. @@ -2427,7 +2424,7 @@ def to_records( Returns ------- - numpy.recarray + numpy.rec.recarray NumPy ndarray with the DataFrame labels as fields and each row of the DataFrame as entries. @@ -2435,7 +2432,7 @@ def to_records( -------- DataFrame.from_records: Convert structured or record ndarray to DataFrame. - numpy.recarray: An ndarray that allows field access using + numpy.rec.recarray: An ndarray that allows field access using attributes, analogous to typed columns in a spreadsheet. @@ -2909,10 +2906,7 @@ def to_parquet( 'pyarrow' is unavailable. compression : str or None, default 'snappy' Name of the compression to use. Use ``None`` for no compression. - The supported compression methods actually depend on which engine - is used. For 'pyarrow', 'snappy', 'gzip', 'brotli', 'lz4', 'zstd' - are all supported. For 'fastparquet', only 'gzip' and 'snappy' are - supported. + Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -4950,7 +4944,7 @@ def insert( value, refs = self._sanitize_column(value) self._mgr.insert(loc, column, value, refs=refs) - def assign(self, **kwargs) -> DataFrame: + def assign(self, __kwargs_dict__=None, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. @@ -5012,12 +5006,20 @@ def assign(self, **kwargs) -> DataFrame: Portland 17.0 62.6 290.15 Berkeley 25.0 77.0 298.15 """ - data = self.copy(deep=None) + if __kwargs_dict__ is not None: + kwargs = __kwargs_dict__ + for key in kwargs: + if not isinstance(key, str): + raise TypeError( + f"assign() only supports string column names. " + f"Use df[{key!r}] = ... to assign non-string column names like tuples." + ) + + data = self.copy(deep=None) for k, v in kwargs.items(): data[k] = com.apply_if_callable(v, data) return data - def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: """ Ensures new columns (which go into the BlockManager as new blocks) are @@ -5620,10 +5622,14 @@ def shift( ) -> DataFrame: if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Cannot pass both 'freq' and 'fill_value' to " - f"{type(self).__name__}.shift" + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. 
This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default axis = self._get_axis_number(axis) @@ -7263,7 +7269,7 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna).grouper.size() + counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size() counts.name = name if sort: @@ -7932,6 +7938,7 @@ def to_series(right): ) elif isinstance(right, Series): # axis=1 is default for DataFrame-with-Series op + axis = axis if axis is not None else 1 if not flex: if not left.axes[axis].equals(right.index): raise ValueError( @@ -11109,14 +11116,20 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) + dtype_has_keepdims: dict[ExtensionDtype, bool] = {} + def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): if not is_1d_only_ea_dtype(values.dtype) and not isinstance( self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - sign = signature(values._reduce) - if "keepdims" in sign.parameters: + has_keepdims = dtype_has_keepdims.get(values.dtype) + if has_keepdims is None: + sign = signature(values._reduce) + has_keepdims = "keepdims" in sign.parameters + dtype_has_keepdims[values.dtype] = has_keepdims + if has_keepdims: return values._reduce(name, skipna=skipna, keepdims=True, **kwds) else: warnings.warn( @@ -11166,6 +11179,32 @@ def _get_data() -> DataFrame: ).iloc[:0] result.index = df.index return result + + # kurtosis excluded since groupby does not implement it + if df.shape[1] and name != "kurt": + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + if isinstance(dtype, ExtensionDtype): + # GH 54341: fastpath for EA-backed axis=1 reductions + # This flattens the frame into a single 1D array while keeping + # track of the row and column indices of the original frame. Once + # flattened, grouping by the row indices and aggregating should + # be equivalent to transposing the original frame and aggregating + # with axis=0. + name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) + df = df.astype(dtype, copy=False) + arr = concat_compat(list(df._iter_column_arrays())) + nrows, ncols = df.shape + row_index = np.tile(np.arange(nrows), ncols) + col_index = np.repeat(np.arange(ncols), nrows) + ser = Series(arr, index=col_index, copy=False) + result = ser.groupby(row_index).agg(name, **kwds) + result.index = df.index + if not skipna and name not in ("any", "all"): + mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) + other = -1 if name in ("idxmax", "idxmin") else lib.no_default + result = result.mask(mask, other) + return result + df = df.T # After possibly _get_data and transposing, we are now in the diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be0d046697ba9..3281d245dca56 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2796,7 +2796,7 @@ def to_hdf( @final @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "name"], name="to_sql" + version="3.0", allowed_args=["self", "name", "con"], name="to_sql" ) def to_sql( self, @@ -5307,7 +5307,7 @@ def reindex( level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level. - fill_value : scalar, default np.NaN + fill_value : scalar, default np.nan Value to use for missing values. Defaults to NaN, but can be any "compatible" value. 
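The fastpath comment above compresses a useful identity: an axis=1 reduction over an EA-backed frame is equivalent to flattening the frame column-major and grouping by row position. A plain-NumPy sketch of that equivalence:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    nrows, ncols = df.shape
    flat = np.concatenate([df[c].to_numpy() for c in df.columns])
    row_index = np.tile(np.arange(nrows), ncols)  # row position of each element
    by_row = pd.Series(flat).groupby(row_index).sum()
    assert by_row.tolist() == df.sum(axis=1).tolist()  # [4, 6]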
limit : int, default None @@ -5701,10 +5701,12 @@ def filter( if items is not None: name = self._get_axis_name(axis) + items = Index(items).intersection(labels) + if len(items) == 0: + # Keep the dtype of labels when we are empty + items = items.astype(labels.dtype) # error: Keywords must be strings - return self.reindex( # type: ignore[misc] - **{name: labels.intersection(items)} - ) + return self.reindex(**{name: items}) # type: ignore[misc] elif like: def f(x) -> bool_t: @@ -7079,6 +7081,8 @@ def fillna( See Also -------- + ffill : Fill values by propagating the last valid observation to next valid. + bfill : Fill values by using the next valid observation to fill the gap. interpolate : Fill NaN values using interpolation. reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. @@ -7338,7 +7342,10 @@ def ffill( ... @final - @doc(klass=_shared_doc_kwargs["klass"]) + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def ffill( self, *, @@ -7348,7 +7355,28 @@ def ffill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + Fill NA/NaN values by propagating the last valid observation to next valid. + + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). Returns ------- @@ -7376,7 +7404,7 @@ def ffill( 2 3.0 4.0 NaN 1.0 3 3.0 3.0 NaN 4.0 - >>> ser = pd.Series([1, np.NaN, 2, 3]) + >>> ser = pd.Series([1, np.nan, 2, 3]) >>> ser.ffill() 0 1.0 1 1.0 @@ -7417,7 +7445,7 @@ def pad( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + Fill NA/NaN values by propagating the last valid observation to next valid. .. deprecated:: 2.0 @@ -7474,7 +7502,10 @@ def bfill( ... @final - @doc(klass=_shared_doc_kwargs["klass"]) + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def bfill( self, *, @@ -7484,7 +7515,28 @@ def bfill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + Fill NA/NaN values by using the next valid observation to fill the gap. + + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). 
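A user-visible consequence of intersecting from the ``items`` side above: ``filter(items=...)`` now keeps the order of ``items`` (and preserves the label dtype when nothing matches) instead of following the column order. For example:

    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
    print(df.filter(items=["c", "a", "x"]).columns.tolist())  # ['c', 'a']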
+ limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). Returns ------- @@ -7563,7 +7615,7 @@ def backfill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + Fill NA/NaN values by using the next valid observation to fill the gap. .. deprecated:: 2.0 @@ -8110,10 +8162,11 @@ def interpolate( stacklevel=find_stack_level(), ) - if "fill_value" in kwargs: + if method in fillna_methods and "fill_value" in kwargs: raise ValueError( "'fill_value' is not a valid keyword for " - f"{type(self).__name__}.interpolate" + f"{type(self).__name__}.interpolate with method from " + f"{fillna_methods}" ) if isinstance(obj.index, MultiIndex) and method != "linear": @@ -8375,7 +8428,7 @@ def isna(self) -> Self: -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], ... born=[pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... name=['Alfred', 'Batman', ''], @@ -8394,7 +8447,7 @@ def isna(self) -> Self: Show which entries in a Series are NA. - >>> ser = pd.Series([5, 6, np.NaN]) + >>> ser = pd.Series([5, 6, np.nan]) >>> ser 0 5.0 1 6.0 @@ -8442,7 +8495,7 @@ def notna(self) -> Self: -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], ... born=[pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... name=['Alfred', 'Batman', ''], @@ -8461,7 +8514,7 @@ def notna(self) -> Self: Show which entries in a Series are not NA. - >>> ser = pd.Series([5, 6, np.NaN]) + >>> ser = pd.Series([5, 6, np.nan]) >>> ser 0 5.0 1 6.0 @@ -8628,7 +8681,7 @@ def clip( Clips using specific lower threshold per column element, with missing values: - >>> t = pd.Series([2, -4, np.NaN, 6, 3]) + >>> t = pd.Series([2, -4, np.nan, 6, 3]) >>> t 0 2.0 1 -4.0 @@ -9044,6 +9097,10 @@ def resample( .. versionadded:: 1.3.0 + .. note:: + + Only takes effect for Tick-frequencies (i.e. fixed frequencies like + days, hours, and minutes, rather than months or quarters). offset : Timedelta or str, default is None An offset timedelta added to the origin. @@ -9314,12 +9371,12 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17T, dtype: int64 - >>> ts.resample('17W', origin='2000-01-01').sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: @@ -9828,7 +9885,7 @@ def align( copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. 
- fill_value : scalar, default np.NaN + fill_value : scalar, default np.nan Value to use for missing values. Defaults to NaN, but can be any "compatible" value. method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None @@ -10752,10 +10809,14 @@ def shift( if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Cannot pass both 'freq' and 'fill_value' to " - f"{type(self).__name__}.shift" + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default if periods == 0: return self.copy(deep=None) @@ -11518,6 +11579,7 @@ def pct_change( How to handle NAs **before** computing percent changes. .. deprecated:: 2.1 + All options of `fill_method` are deprecated except `fill_method=None`. limit : int, default None The number of consecutive NAs to fill before stopping. @@ -11624,26 +11686,33 @@ def pct_change( APPL -0.252395 -0.011860 NaN """ # GH#53491 - if fill_method is not lib.no_default or limit is not lib.no_default: + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: warnings.warn( - "The 'fill_method' and 'limit' keywords in " - f"{type(self).__name__}.pct_change are deprecated and will be " - "removed in a future version. Call " - f"{'bfill' if fill_method in ('backfill', 'bfill') else 'ffill'} " - "before calling pct_change instead.", + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", FutureWarning, stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if self.isna().values.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if limit is lib.no_default: + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will " + "be removed in a future version. 
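The ``shift`` change above softens the GH#53832 hard error into a deprecation: combining ``freq`` with ``fill_value`` now warns and ignores ``fill_value``. For example:

    import pandas as pd

    ts = pd.Series([1, 2], index=pd.date_range("2023-01-01", periods=2, freq="D"))
    ts.shift(1, freq="D", fill_value=0)  # FutureWarning; fill_value is ignored
    ts.shift(1, freq="D")                # equivalent, no warning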
Either fill in any " + "non-leading NA values prior to calling pct_change or " + "specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8bef167b747e2..8ba644b588a08 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -236,10 +236,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) kwargs = {} if isinstance(func, str): - if maybe_use_numba(engine): + if maybe_use_numba(engine) and engine is not None: # Not all agg functions support numba, only propagate numba kwargs - # if user asks for numba + # if user asks for numba, and engine is not None + # (if engine is None, the called function will handle the case where + # numba is requested via the global option) kwargs["engine"] = engine + if engine_kwargs is not None: kwargs["engine_kwargs"] = engine_kwargs return getattr(self, func)(*args, **kwargs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 69a99c1bc867c..187c62bc0c9ce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -105,6 +105,12 @@ class providing the base-class of operations. ExtensionArray, FloatingArray, IntegerArray, + SparseArray, +) +from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, ) from pandas.core.base import ( PandasObject, @@ -1057,7 +1063,8 @@ def get_group(self, name, obj=None) -> DataFrame | Series: raise KeyError(name) if obj is None: - return self._selected_obj.iloc[inds] + indexer = inds if self.axis == 0 else (slice(None), inds) + return self._selected_obj.iloc[indexer] else: warnings.warn( "obj is deprecated and will be removed in a future version. " @@ -1852,8 +1859,7 @@ def _agg_py_fallback( ser = Series(values, copy=False) else: # We only get here with values.dtype == object - # TODO: special case not needed with ArrayManager - df = DataFrame(values.T) + df = DataFrame(values.T, dtype=values.dtype) # bc we split object blocks in grouped_reduce, we have only 1 col # otherwise we'd have to worry about block-splitting GH#39329 assert df.shape[1] == 1 @@ -1909,7 +1915,10 @@ def array_func(values: ArrayLike) -> ArrayLike: # and non-applicable functions # try to python agg # TODO: shouldn't min_count matter? - if how in ["any", "all", "std", "sem"]: + # TODO: avoid special casing SparseArray here + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? 
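Under the reworked ``pct_change`` deprecation above, ``fill_method=None`` is the explicit opt-out that silences the warning: NA values are left alone rather than padded. For example:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1.0, np.nan, 2.0])
    ser.pct_change(fill_method=None)  # no filling, no FutureWarning; NaN propagates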
should not be reached else: return result @@ -2257,7 +2266,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: return IntegerArray( counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) ) - elif isinstance(bvalues, ArrowExtensionArray): + elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( + bvalues.dtype, StringDtype + ): return type(bvalues)._from_sequence(counted[0]) if is_series: assert counted.ndim == 2 @@ -2796,7 +2807,9 @@ def _value_counts( result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - result_frame.columns = columns + [name] + orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] # noqa: E501 + cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) + result_frame.columns = cols result = result_frame return result.__finalize__(self.obj, method="value_counts") @@ -2948,7 +2961,12 @@ def size(self) -> DataFrame | Series: dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): - dtype_backend = "pyarrow" + if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): + dtype_backend = None + elif isinstance(self.obj.array, ArrowStringArray): + dtype_backend = "numpy_nullable" + else: + dtype_backend = "pyarrow" elif isinstance(self.obj.array, BaseMaskedArray): dtype_backend = "numpy_nullable" # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? @@ -5199,7 +5217,7 @@ def diff( def pct_change( self, periods: int = 1, - fill_method: FillnaOptions | lib.NoDefault = lib.no_default, + fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, limit: int | None | lib.NoDefault = lib.no_default, freq=None, axis: Axis | lib.NoDefault = lib.no_default, @@ -5251,23 +5269,26 @@ def pct_change( goldfish 0.2 0.125 """ # GH#53491 - if fill_method is not lib.no_default or limit is not lib.no_default: + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: warnings.warn( - "The 'fill_method' and 'limit' keywords in " - f"{type(self).__name__}.pct_change are deprecated and will be " - "removed in a future version. Call " - f"{'bfill' if fill_method in ('backfill', 'bfill') else 'ffill'} " - "before calling pct_change instead.", + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", FutureWarning, stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if any(grp.isna().values.any() for _, grp in self): + if limit is lib.no_default and any( + grp.isna().values.any() for _, grp in self + ): warnings.warn( "The default fill_method='ffill' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", + f"{type(self).__name__}.pct_change is deprecated and will " + "be removed in a future version. 
Either fill in any " + "non-leading NA values prior to calling pct_change or " + "specify 'fill_method=None' to not fill NA values.", FutureWarning, stacklevel=find_stack_level(), ) @@ -5414,7 +5435,7 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: def _reindex_output( self, output: OutputFrameOrSeries, - fill_value: Scalar = np.NaN, + fill_value: Scalar = np.nan, qs: npt.NDArray[np.float64] | None = None, ) -> OutputFrameOrSeries: """ @@ -5432,7 +5453,7 @@ def _reindex_output( ---------- output : Series or DataFrame Object resulting from grouping and applying an operation. - fill_value : scalar, default np.NaN + fill_value : scalar, default np.nan Value to use for unobserved categories if self.observed is False. qs : np.ndarray[float64] or None, default None quantile values, only relevant for quantile. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ea92fbae9566d..9877ddf0ea7a6 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -207,12 +207,12 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17T, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 288fd35892fd0..85b68c1bc2ec7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1419,8 +1419,8 @@ def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t] values = self._values - if is_object_dtype(values.dtype): - values = cast(np.ndarray, values) + if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): + values = np.asarray(values) values = lib.maybe_convert_objects(values, safe=True) result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] @@ -2848,7 +2848,7 @@ def isna(self) -> npt.NDArray[np.bool_]: Show which entries in a pandas.Index are NA. The result is an array. - >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx = pd.Index([5.2, 6.0, np.nan]) >>> idx Index([5.2, 6.0, nan], dtype='float64') >>> idx.isna() @@ -2904,7 +2904,7 @@ def notna(self) -> npt.NDArray[np.bool_]: Show which entries in an Index are not NA. The result is an array. 
- >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx = pd.Index([5.2, 6.0, np.nan]) >>> idx Index([5.2, 6.0, nan], dtype='float64') >>> idx.notna() @@ -4586,7 +4586,7 @@ def join( ------- join_index, (left_indexer, right_indexer) - Examples + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) @@ -5204,7 +5204,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ if isinstance(self.values, BaseMaskedArray): return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) - elif isinstance(self.values, ArrowExtensionArray): + elif isinstance(self.values, (ArrowExtensionArray, StringArray)): return type(self.values)._from_sequence(result) return result @@ -7024,7 +7024,8 @@ def infer_objects(self, copy: bool = True) -> Index: result._references.add_index_reference(result) return result - def diff(self, periods: int = 1): + @final + def diff(self, periods: int = 1) -> Index: """ Computes the difference between consecutive values in the Index object. @@ -7050,7 +7051,7 @@ def diff(self, periods: int = 1): Index([nan, 10.0, 10.0, 10.0, 10.0], dtype='float64') """ - return self._constructor(self.to_series().diff(periods)) + return Index(self.to_series().diff(periods)) def round(self, decimals: int = 0): """ @@ -7215,11 +7216,12 @@ def any(self, *args, **kwargs): """ nv.validate_any(args, kwargs) self._maybe_disable_logical_methods("any") - # error: Argument 1 to "any" has incompatible type "ArrayLike"; expected - # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int, - # float, complex, str, bytes, generic]], Sequence[Sequence[Any]], - # _SupportsArray]" - return np.any(self.values) # type: ignore[arg-type] + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "any" to get TypeError instead + # of AttributeError + return vals._reduce("any") + return np.any(vals) def all(self, *args, **kwargs): """ @@ -7262,11 +7264,12 @@ def all(self, *args, **kwargs): """ nv.validate_all(args, kwargs) self._maybe_disable_logical_methods("all") - # error: Argument 1 to "all" has incompatible type "ArrayLike"; expected - # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int, - # float, complex, str, bytes, generic]], Sequence[Sequence[Any]], - # _SupportsArray]" - return np.all(self.values) # type: ignore[arg-type] + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. 
EA, call _reduce instead of "all" to get TypeError instead + # of AttributeError + return vals._reduce("all") + return np.all(vals) @final def _maybe_disable_logical_methods(self, opname: str_t) -> None: @@ -7275,9 +7278,9 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: """ if ( isinstance(self, ABCMultiIndex) - or needs_i8_conversion(self.dtype) - or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) - or is_float_dtype(self.dtype) + # TODO(3.0): PeriodArray and DatetimeArray any/all will raise, + # so checking needs_i8_conversion will be unnecessary + or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m") ): # This call will raise make_invalid_op(opname)(self) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f915c08bb8294..e8b3676e71ae0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -124,7 +124,7 @@ def _get_next_label(label): elif is_integer_dtype(dtype): return label + 1 elif is_float_dtype(dtype): - return np.nextafter(label, np.infty) + return np.nextafter(label, np.inf) else: raise TypeError(f"cannot determine next label for type {repr(type(label))}") @@ -141,7 +141,7 @@ def _get_prev_label(label): elif is_integer_dtype(dtype): return label - 1 elif is_float_dtype(dtype): - return np.nextafter(label, -np.infty) + return np.nextafter(label, -np.inf) else: raise TypeError(f"cannot determine next label for type {repr(type(label))}") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 33eb411374e67..bdc9e05a38d1c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2460,12 +2460,12 @@ def _reorder_ilevels(self, order) -> MultiIndex: def _recode_for_new_levels( self, new_levels, copy: bool = True ) -> Generator[np.ndarray, None, None]: - if len(new_levels) != self.nlevels: + if len(new_levels) > self.nlevels: raise AssertionError( f"Length of new_levels ({len(new_levels)}) " - f"must be same as self.nlevels ({self.nlevels})" + f"must be <= self.nlevels ({self.nlevels})" ) - for i in range(self.nlevels): + for i in range(len(new_levels)): yield recode_for_categories( self.codes[i], self.levels[i], new_levels[i], copy=copy ) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ca0b1705b5c38..a2871f364f092 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -985,8 +985,9 @@ def _getitem_tuple_same_dim(self, tup: tuple): """ retval = self.obj # Selecting columns before rows is signficiantly faster + start_val = (self.ndim - len(tup)) + 1 for i, key in enumerate(reversed(tup)): - i = self.ndim - i - 1 + i = self.ndim - i - start_val if com.is_null_slice(key): continue diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 214fbf9f36435..d45ae37890ba7 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -266,10 +266,9 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units - data_buff, protocol_data_dtype = buffers["data"] + data_buff, _ = buffers["data"] # We're going to reinterpret the buffer as uint8, so make sure we can do it safely - assert protocol_data_dtype[1] == 8 - assert protocol_data_dtype[2] in ( + assert col.dtype[2] in ( ArrowCTypes.STRING, ArrowCTypes.LARGE_STRING, ) # format_str == utf-8 @@ -377,15 +376,16 @@ def 
datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any """ buffers = col.get_buffers() - _, _, format_str, _ = col.dtype - dbuf, dtype = buffers["data"] + _, col_bit_width, format_str, _ = col.dtype + dbuf, _ = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( dbuf, ( - DtypeKind.UINT, - dtype[1], - getattr(ArrowCTypes, f"UINT{dtype[1]}"), + DtypeKind.INT, + col_bit_width, + getattr(ArrowCTypes, f"INT{col_bit_width}"), Endianness.NATIVE, ), offset=col.offset, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ecb9cd47d7995..c0b78e734b4b4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,6 +18,7 @@ from pandas._config import using_copy_on_write from pandas._libs import ( + NaT, internals as libinternals, lib, writers, @@ -60,7 +61,10 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_float_dtype, + is_integer_dtype, is_list_like, + is_scalar, is_string_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -454,6 +458,25 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and will receive the same block """ new_dtype = find_result_type(self.values.dtype, other) + + # In a future version of pandas, the default will be that + # setting `nan` into an integer series won't raise. + if ( + is_scalar(other) + and is_integer_dtype(self.values.dtype) + and isna(other) + and other is not NaT + ): + warn_on_upcast = False + elif ( + isinstance(other, np.ndarray) + and other.ndim == 1 + and is_integer_dtype(self.values.dtype) + and is_float_dtype(other.dtype) + and lib.has_only_ints_or_nan(other) + ): + warn_on_upcast = False + if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " @@ -1133,7 +1156,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: # length checking check_setitem_lengths(indexer, value, values) - value = extract_array(value, extract_numpy=True) + if self.dtype != _dtype_obj: + # GH48933: extract_array would convert a pd.Series value to np.ndarray + value = extract_array(value, extract_numpy=True) try: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: @@ -1427,7 +1452,7 @@ def pad_or_backfill( vals = cast(NumpyExtensionArray, self.array_values) if axis == 1: vals = vals.T - new_values = vals.pad_or_backfill( + new_values = vals._pad_or_backfill( method=method, limit=limit, limit_area=limit_area, @@ -1930,9 +1955,9 @@ def pad_or_backfill( if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T.pad_or_backfill(method=method, limit=limit).T + new_values = values.T._pad_or_backfill(method=method, limit=limit).T else: - new_values = values.pad_or_backfill(method=method, limit=limit) + new_values = values._pad_or_backfill(method=method, limit=limit) return [self.make_block_same_class(new_values)] @@ -1990,7 +2015,7 @@ def fillna( "need to implement this keyword or an exception will be " "raised. 
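The ``warn_on_upcast`` carve-outs above cover the cases slated to become the default behavior: assigning ``nan`` (or a float array containing only integers and NaN) into an integer column upcasts without the incompatible-dtype warning. For example:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1, 2, 3])  # int64
    ser[0] = np.nan             # silently upcasts instead of warning
    print(ser.dtype)            # float64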
In the interim, the keyword is ignored by " f"{type(self.values).__name__}.", - FutureWarning, + DeprecationWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4d33f0137d3c4..b2d463a8c6c26 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -177,7 +177,7 @@ def concatenate_managers( values = np.concatenate(vals, axis=1) # type: ignore[arg-type] elif is_1d_only_ea_dtype(blk.dtype): # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals, axis=1, ea_compat_axis=True) + values = concat_compat(vals, axis=0, ea_compat_axis=True) values = ensure_block_shape(values, ndim=2) else: values = concat_compat(vals, axis=1) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2290cd86f35e6..8bb6c6b5de7ea 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,6 +13,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.core.dtypes.astype import astype_is_view @@ -65,6 +67,7 @@ from pandas.core.internals.blocks import ( BlockPlacement, ensure_block_shape, + new_block, new_block_2d, ) from pandas.core.internals.managers import ( @@ -156,7 +159,7 @@ def arrays_to_mgr( def rec_array_to_mgr( - data: np.recarray | np.ndarray, + data: np.rec.recarray | np.ndarray, index, columns, dtype: DtypeObj | None, @@ -372,6 +375,19 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] + elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + dtype = StringDtype(storage="pyarrow_numpy") + + obj_columns = list(values) + block_values = [ + new_block( + dtype.construct_array_type()._from_sequence(data, dtype=dtype), + BlockPlacement(slice(i, i + 1)), + ndim=2, + ) + for i, data in enumerate(obj_columns) + ] + else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7bb579b22aeed..4a6d3c333a6a5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -968,19 +968,10 @@ def fast_xs(self, loc: int) -> SingleBlockManager: n = len(self) - # GH#46406 - immutable_ea = isinstance(dtype, ExtensionDtype) and dtype._is_immutable - - if isinstance(dtype, ExtensionDtype) and not immutable_ea: - cls = dtype.construct_array_type() - result = cls._empty((n,), dtype=dtype) + if isinstance(dtype, ExtensionDtype): + result = np.empty(n, dtype=object) else: - # error: Argument "dtype" to "empty" has incompatible type - # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected - # "None" - result = np.empty( - n, dtype=object if immutable_ea else dtype # type: ignore[arg-type] - ) + result = np.empty(n, dtype=dtype) result = ensure_wrapped_if_datetimelike(result) for blk in self.blocks: @@ -989,9 +980,9 @@ def fast_xs(self, loc: int) -> SingleBlockManager: for i, rl in enumerate(blk.mgr_locs): result[rl] = blk.iget((i, loc)) - if immutable_ea: - dtype = cast(ExtensionDtype, dtype) - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) bp = BlockPlacement(slice(0, len(result))) block = new_block(result, placement=bp, 
ndim=1) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9b8d1c870091d..b75005ff202e3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2463,16 +2463,8 @@ def _get_timestamp_range_edges( """ if isinstance(freq, Tick): index_tz = first.tz - - if isinstance(origin, Timestamp) and origin.tz != index_tz: + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): raise ValueError("The origin must have the same timezone as the index.") - - elif isinstance(origin, Timestamp): - if origin <= first: - first = origin - elif origin >= last: - last = origin - if origin == "epoch": # set the epoch based on the timezone to have similar bins results when # resampling on the same kind of indexes on different timezones @@ -2494,9 +2486,6 @@ def _get_timestamp_range_edges( first = first.tz_localize(index_tz) last = last.tz_localize(index_tz) else: - if isinstance(origin, Timestamp): - first = origin - first = first.normalize() last = last.normalize() diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..0b343a1fd2d82 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -53,6 +53,7 @@ ensure_object, is_bool, is_bool_dtype, + is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -90,6 +91,7 @@ ExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -1384,6 +1386,21 @@ def _maybe_coerce_merge_keys(self) -> None: if lk.dtype.kind == rk.dtype.kind: continue + if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype( + rk.dtype + ): + ct = find_common_type([lk.dtype, rk.dtype]) + if is_extension_array_dtype(ct): + rk = ct.construct_array_type()._from_sequence(rk) # type: ignore[union-attr] # noqa: E501 + else: + rk = rk.astype(ct) # type: ignore[arg-type] + elif is_extension_array_dtype(rk.dtype): + ct = find_common_type([lk.dtype, rk.dtype]) + if is_extension_array_dtype(ct): + lk = ct.construct_array_type()._from_sequence(lk) # type: ignore[union-attr] # noqa: E501 + else: + lk = lk.astype(ct) # type: ignore[arg-type] + # check whether ints and floats if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int @@ -2176,6 +2193,10 @@ def injection(obj: ArrayLike): # TODO: conversions for EAs that can be no-copy. 
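The relaxed ``origin`` check above only requires the origin and the index to agree on timezone-awareness, not on the timezone itself. For example:

    import pandas as pd

    idx = pd.date_range("2023-01-01", periods=6, freq="H", tz="US/Eastern")
    ser = pd.Series(range(6), index=idx)
    # a tz-aware origin in a different timezone is accepted:
    ser.resample("2H", origin=pd.Timestamp("2023-01-01", tz="UTC")).sum()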
lbv = np.asarray(lbv) rbv = np.asarray(rbv) + if needs_i8_conversion(lbv.dtype): + lbv = lbv.view("i8") + if needs_i8_conversion(rbv.dtype): + rbv = rbv.view("i8") else: # We get here with non-ndarrays in test_merge_by_col_tz_aware # and test_merge_groupby_multiple_column_with_categorical_column @@ -2399,21 +2420,10 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - if not isinstance(lk, BaseMaskedArray) and not ( - # exclude arrow dtypes that would get cast to object - isinstance(lk.dtype, ArrowDtype) - and ( - is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort - ) + if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] ): - lk, _ = lk._values_for_factorize() - - # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute - # "_values_for_factorize" - rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): import pyarrow as pa import pyarrow.compute as pc @@ -2428,14 +2438,35 @@ def _factorize_keys( length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), - pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), + pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp, copy=False), len(dc.dictionary), ) + if dc.null_count > 0: + count += 1 if how == "right": return rlab, llab, count return llab, rlab, count + if not isinstance(lk, BaseMaskedArray) and not ( + # exclude arrow dtypes that would get cast to object + isinstance(lk.dtype, ArrowDtype) + and ( + is_numeric_dtype(lk.dtype.numpy_dtype) + or is_string_dtype(lk.dtype) + and not sort + ) + ): + lk, _ = lk._values_for_factorize() + + # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute + # "_values_for_factorize" + rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: # GH#23917 TODO: Needs tests for non-matching dtypes # GH#23917 TODO: needs tests for case where lk is integer-dtype diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fc8d827cd31bb..bf7c7a1ee4dc7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -908,7 +908,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: data = frame.copy() else: # Take the data from frame corresponding to this idx value - if not isinstance(idx, tuple): + if len(level) == 1: idx = (idx,) gen = iter(idx) column_indexer = tuple( diff --git a/pandas/core/series.py b/pandas/core/series.py index 814a770b192bf..7b22d89bfe22d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -579,14 +579,14 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - ser = self._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is Series: - # fastpath avoiding constructor call + if self._constructor is Series: + # we are pandas.Series (or a subclass that doesn't override _constructor) + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name return ser else: assert axes is mgr.axes - return 
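The restructured branch above funnels pyarrow-backed string keys through a single dictionary-encode pass: both sides are concatenated, encoded once, and the dictionary indices (nulls filled with one past the last code) become the join labels. A standalone sketch mirroring the calls used above:

    import numpy as np
    import pyarrow as pa
    import pyarrow.compute as pc

    lk = pa.chunked_array([["a", "b", None]])
    rk = pa.chunked_array([["b", "c"]])
    len_lk = len(lk)
    dc = pa.chunked_array(lk.chunks + rk.chunks).combine_chunks().dictionary_encode()
    length = len(dc.dictionary)
    llab = pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy().astype(np.intp)
    rlab = pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy().astype(np.intp)
    # llab -> [0, 1, 3], rlab -> [1, 2]; equal labels mark equal keys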
self._constructor(ser, copy=False) + return self._constructor(mgr) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -610,12 +610,12 @@ def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: return DataFrame._from_mgr(mgr, axes=mgr.axes) def _constructor_expanddim_from_mgr(self, mgr, axes): - df = self._expanddim_from_mgr(mgr, axes) - if type(self) is Series: - # fastpath avoiding constructor - return df + from pandas.core.frame import DataFrame + + if self._constructor_expanddim is DataFrame: + return self._expanddim_from_mgr(mgr, axes) assert axes is mgr.axes - return self._constructor_expanddim(df, copy=False) + return self._constructor_expanddim(mgr) # types @property @@ -4634,11 +4634,6 @@ def apply( """ Invoke function on values of Series. - .. deprecated:: 2.1.0 - - If the result from ``func`` is a ``Series``, wrapping the output in a - ``DataFrame`` instead of a ``Series`` has been deprecated. - Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values. @@ -5586,7 +5581,7 @@ def dropna( Empty strings are not considered NA values. ``None`` is considered an NA value. - >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay']) + >>> ser = pd.Series([np.nan, 2, pd.NaT, '', None, 'I stay']) >>> ser 0 NaN 1 2 diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 7579f816d0ace..ba793b9c11c27 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -113,6 +113,12 @@ axis : {0 or 'index', 1 or 'columns'}, default 0 Split along rows (0) or columns (1). For `Series` this parameter is unused and defaults to 0. + + .. deprecated:: 2.1.0 + + Will be removed and behave like axis=0 in a future version. + For ``axis=1``, do ``frame.T.groupby(...)`` instead. + level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both ``by`` and ``level``. 
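The ``.. deprecated:: 2.1.0`` note added to the shared ``groupby`` docs above suggests a transpose-based replacement for ``axis=1`` grouping. A minimal sketch of that migration, with made-up data (assuming pandas >= 2.1, where the old spelling emits a FutureWarning):

    import pandas as pd

    # Hypothetical frame with duplicate column labels to group on
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])

    # Deprecated spelling: df.groupby(level=0, axis=1).sum()
    # Replacement suggested by the new docstring note: transpose,
    # group along the rows, then transpose back
    result = df.T.groupby(level=0).sum().T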
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index bc6e29c3c7fbf..e6b54de9a8bfb 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -365,22 +365,15 @@ def lexsort_indexer( if codes_given: mask = k == -1 codes = k.copy() - n = len(codes) - mask_n = n - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray, - # ndarray[Any, Any]]" has no attribute "any" - if mask.any(): # type: ignore[union-attr] - n -= 1 + # error: Item "ExtensionArray" of "Series | ExtensionArray | + # ndarray[Any, Any]" has no attribute "max" + n = codes.max() + 1 if len(codes) else 0 # type: ignore[union-attr] else: cat = Categorical(k, ordered=True) n = len(cat.categories) codes = cat.codes.copy() mask = cat.codes == -1 - if mask.any(): - mask_n = n + 1 - else: - mask_n = n if order: # ascending if na_position == "last": @@ -391,12 +384,6 @@ def lexsort_indexer( # complex, str, bytes, _NestedSequence[Union[bool, int, float, # complex, str, bytes]]]" codes = np.where(mask, n, codes) # type: ignore[arg-type] - elif na_position == "first": - # error: Incompatible types in assignment (expression has type - # "Union[Any, int, ndarray[Any, dtype[signedinteger[Any]]]]", - # variable has type "Union[Series, ExtensionArray, ndarray[Any, Any]]") - # error: Unsupported operand types for + ("ExtensionArray" and "int") - codes += 1 # type: ignore[operator,assignment] else: # not order means descending if na_position == "last": # error: Unsupported operand types for - ("int" and "ExtensionArray") @@ -406,9 +393,7 @@ def lexsort_indexer( # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, # complex, str, bytes, _NestedSequence[Union[bool, int, float, # complex, str, bytes]]]" - codes = np.where( - mask, n, n - codes - 1 # type: ignore[operator,arg-type] - ) + codes = np.where(mask, n, n - codes - 1) # type: ignore[arg-type] elif na_position == "first": # error: Unsupported operand types for - ("int" and "ExtensionArray") # error: Argument 1 to "where" has incompatible type "Union[Any, @@ -417,9 +402,9 @@ def lexsort_indexer( # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, # complex, str, bytes, _NestedSequence[Union[bool, int, float, # complex, str, bytes]]]" - codes = np.where(mask, 0, n - codes) # type: ignore[operator,arg-type] + codes = np.where(mask, -1, n - codes) # type: ignore[arg-type] - shape.append(mask_n) + shape.append(n + 1) labels.append(codes) return indexer_from_factorized(labels, tuple(shape)) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index e59369db776da..b299f5d6deab3 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) wrapper.__doc__ = docstring return wrapper @@ -1215,7 +1217,7 @@ def contains( -------- Returning a Series of booleans using only a literal pattern. - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan]) >>> s1.str.contains('og', regex=False) 0 False 1 True @@ -1226,7 +1228,7 @@ def contains( Returning an Index of booleans using only a literal pattern. 
- >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan]) >>> ind.str.contains('23', regex=False) Index([False, False, False, True, nan], dtype='object') @@ -3447,10 +3449,9 @@ def _result_dtype(arr): # when the list of values is empty. from pandas.core.arrays.string_ import StringDtype - if isinstance(arr.dtype, StringDtype): + if isinstance(arr.dtype, (ArrowDtype, StringDtype)): return arr.dtype - else: - return object + return object def _get_single_group_name(regex: re.Pattern) -> Hashable: @@ -3500,7 +3501,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame: for match_i, match_tuple in enumerate(regex.findall(subject)): if isinstance(match_tuple, str): match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] + na_tuple = [np.nan if group == "" else group for group in match_tuple] match_list.append(na_tuple) result_key = tuple(subject_key + (match_i,)) index_list.append(result_key) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ac306229f3111..303136b7d3890 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -356,7 +356,7 @@ def _return_parsed_timezone_results( if len(non_na_timezones) > 1: warnings.warn( "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise a warning unless `utc=True`. Please specify `utc=True` " + "zones will raise an error unless `utc=True`. Please specify `utc=True` " "to opt in to the new behaviour and silence this warning. " "To create a `Series` with mixed offsets and `object` dtype, " "please use `apply` and `datetime.datetime.strptime`", @@ -788,7 +788,7 @@ def to_datetime( .. warning:: In a future version of pandas, parsing datetimes with mixed time - zones will raise a warning unless `utc=True`. + zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`. @@ -1023,7 +1023,7 @@ def to_datetime( >>> pd.to_datetime(['2020-10-25 02:00 +0200', ... '2020-10-25 04:00 +0100']) # doctest: +SKIP FutureWarning: In a future version of pandas, parsing datetimes with mixed - time zones will raise a warning unless `utc=True`. Please specify `utc=True` + time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`. @@ -1037,7 +1037,7 @@ def to_datetime( >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP FutureWarning: In a future version of pandas, parsing datetimes with mixed - time zones will raise a warning unless `utc=True`. Please specify `utc=True` + time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`. 
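The reworded warnings above ("will raise a warning" becoming "will raise an error") all point at the same opt-in: pass ``utc=True`` when parsing mixed-offset input. A small sketch reusing the inputs from the doctest above (behaviour as of pandas 2.1):

    import pandas as pd

    # With utc=True, mixed offsets convert to a single tz-aware dtype
    # instead of the deprecated object-dtype result
    idx = pd.to_datetime(
        ["2020-10-25 02:00 +0200", "2020-10-25 04:00 +0100"], utc=True
    )
    # DatetimeIndex(['2020-10-25 00:00:00+00:00', '2020-10-25 03:00:00+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)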
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index becbba703f92c..ddd6caaa7f783 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -21,6 +21,7 @@ from pandas._libs.tslibs import ( BaseOffset, + Timedelta, to_offset, ) import pandas._libs.window.aggregations as window_aggregations @@ -112,6 +113,8 @@ from pandas.core.generic import NDFrame from pandas.core.groupby.ops import BaseGrouper +from pandas.core.arrays.datetimelike import dtype_to_unit + class BaseWindow(SelectionMixin): """Provides utilities for performing windowing operations.""" @@ -1882,7 +1885,12 @@ def _validate(self): self._on.freq.nanos / self._on.freq.n ) else: - self._win_freq_i8 = freq.nanos + try: + unit = dtype_to_unit(self._on.dtype) # type: ignore[arg-type] + except TypeError: + # if not a datetime dtype, eg for empty dataframes + unit = "ns" + self._win_freq_i8 = Timedelta(freq.nanos).as_unit(unit)._value # min_periods must be an integer if self.min_periods is None: diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 27316b3ab0af0..3b2ae5daffdba 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -28,4 +28,7 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return {pa.string(): pd.ArrowDtype(pa.string())}.get + return { + pa.string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + }.get diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..b018b5720d126 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -117,6 +117,9 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501 + check_dtype_backend(dtype_backend) with get_handle( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9fe8cbfa159c6..2297f7945a264 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1715,7 +1715,7 @@ def format_percentiles( """ percentiles = np.asarray(percentiles) - # It checks for np.NaN as well + # It checks for np.nan as well if ( not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) @@ -1830,8 +1830,8 @@ def get_format_datetime64_from_values( class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" + ido = is_dates_only(self.values) values = self.values.astype(object) - ido = is_dates_only(values) formatter = self.formatter or get_format_datetime64( ido, date_format=self.date_format ) diff --git a/pandas/io/html.py b/pandas/io/html.py index 8a73c786825e3..10701be4f7e0b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -23,6 +23,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -32,6 +33,7 @@ from pandas.core.indexes.base import Index from pandas.core.indexes.multi import MultiIndex from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( file_exists, @@ -363,13 +365,13 @@ def _parse_tfoot_tr(self, table): """ raise AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): 
""" Return all tables from the parsed DOM. Parameters ---------- - doc : the DOM from which to parse the table element. + document : the DOM from which to parse the table element. match : str or regular expression The text to search for in the DOM tree. @@ -594,9 +596,9 @@ def __init__(self, *args, **kwargs) -> None: self._strainer = SoupStrainer("table") - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): element_name = self._strainer.name - tables = doc.find_all(element_name, attrs=attrs) + tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -726,7 +728,7 @@ def _parse_td(self, row): # or (see _parse_thead_tr). return row.xpath("./td|./th") - def _parse_tables(self, doc, match, kwargs): + def _parse_tables(self, document, match, kwargs): pattern = match.pattern # 1. check all descendants for the given pattern and only search tables @@ -738,7 +740,7 @@ def _parse_tables(self, doc, match, kwargs): if kwargs: xpath_expr += _build_xpath_expr(kwargs) - tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = document.xpath(xpath_expr, namespaces=_re_namespace) tables = self._handle_hidden_tables(tables, "attrib") if self.displayed_only: @@ -1026,6 +1028,7 @@ def _parse( return ret +@doc(storage_options=_shared_docs["storage_options"]) def read_html( io: FilePath | ReadBuffer[str], *, @@ -1096,13 +1099,13 @@ def read_html( passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - attrs = {'id': 'table'} + attrs = {{'id': 'table'}} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - attrs = {'asdf': 'table'} + attrs = {{'asdf': 'table'}} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 @@ -1144,13 +1147,13 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_links : {None, "all", "header", "body", "footer"} + extract_links : {{None, "all", "header", "body", "footer"}} Table elements in the specified section(s) with tags will have their href extracted. .. versionadded:: 1.5.0 - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -1161,6 +1164,10 @@ def read_html( .. versionadded:: 2.0 + {storage_options} + + .. versionadded:: 2.1.0 + Returns ------- dfs diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 833f4986b6da6..58979a29c97d3 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1316,7 +1316,7 @@ def _try_convert_to_date(self, data): warnings.filterwarnings( "ignore", ".*parsing datetimes with mixed time " - "zones will raise a warning", + "zones will raise an error", category=FutureWarning, ) new_data = to_datetime(new_data, errors="raise", unit=date_unit) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 91987e6531261..f51b98a929440 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -444,10 +444,7 @@ def to_parquet( if you wish to use its implementation. compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}}, default 'snappy'. Name of the compression to use. 
Use ``None`` - for no compression. The supported compression methods actually - depend on which engine is used. For 'pyarrow', 'snappy', 'gzip', - 'brotli', 'lz4', 'zstd' are all supported. For 'fastparquet', - only 'gzip' and 'snappy' are supported. + for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 60996a7d42187..6b1daa96782a0 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1147,7 +1147,7 @@ def converter(*date_cols, col: Hashable): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", - ".*parsing datetimes with mixed time zones will raise a warning", + ".*parsing datetimes with mixed time zones will raise an error", category=FutureWarning, ) result = tools.to_datetime( @@ -1169,7 +1169,7 @@ def converter(*date_cols, col: Hashable): warnings.filterwarnings( "ignore", ".*parsing datetimes with mixed time zones " - "will raise a warning", + "will raise an error", category=FutureWarning, ) result = tools.to_datetime( @@ -1187,7 +1187,7 @@ def converter(*date_cols, col: Hashable): warnings.filterwarnings( "ignore", ".*parsing datetimes with mixed time zones " - "will raise a warning", + "will raise an error", category=FutureWarning, ) return tools.to_datetime( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 520d2193e1c04..6846ea2b196b8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1176,17 +1176,17 @@ def _set_no_thousand_columns(self) -> set[int]: ) if self.columns and self.dtype: assert self._col_indices is not None - for i in self._col_indices: + for i, col in zip(self._col_indices, self.columns): if not isinstance(self.dtype, dict) and not is_numeric_dtype( self.dtype ): no_thousands_columns.add(i) if ( isinstance(self.dtype, dict) - and self.columns[i] in self.dtype + and col in self.dtype and ( - not is_numeric_dtype(self.dtype[self.columns[i]]) - or is_bool_dtype(self.dtype[self.columns[i]]) + not is_numeric_dtype(self.dtype[col]) + or is_bool_dtype(self.dtype[col]) ) ): no_thousands_columns.add(i) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f26411f65d91f..d3055f0ad2a38 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -68,7 +68,6 @@ ) from pandas.core.dtypes.missing import array_equivalent -import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3224,9 +3223,7 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - result = result.astype(pd.ArrowDtype(pa.string())) + result = result.astype("string[pyarrow_numpy]") return result # error: Signature of "write" incompatible with supertype "Fixed" @@ -3296,9 +3293,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") dfs.append(df) if len(dfs) > 0: @@ -4686,9 +4681,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = 
df.astype("string[pyarrow_numpy]") frames.append(df) if len(frames) == 1: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 7669d5aa4cea5..c1d68d71ac91c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2090,21 +2090,17 @@ def _adapt_time(t) -> str: # Python 3.12+ doesn't auto-register adapters for us anymore adapt_date_iso = lambda val: val.isoformat() - adapt_datetime_iso = lambda val: val.isoformat() - adapt_datetime_epoch = lambda val: int(val.timestamp()) + adapt_datetime_iso = lambda val: val.isoformat(" ") sqlite3.register_adapter(time, _adapt_time) sqlite3.register_adapter(date, adapt_date_iso) sqlite3.register_adapter(datetime, adapt_datetime_iso) - sqlite3.register_adapter(datetime, adapt_datetime_epoch) convert_date = lambda val: date.fromisoformat(val.decode()) - convert_datetime = lambda val: datetime.fromisoformat(val.decode()) - convert_timestamp = lambda val: datetime.fromtimestamp(int(val)) + convert_timestamp = lambda val: datetime.fromisoformat(val.decode()) sqlite3.register_converter("date", convert_date) - sqlite3.register_converter("datetime", convert_datetime) sqlite3.register_converter("timestamp", convert_timestamp) def sql_schema(self) -> str: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 698a2882ada39..0a02da09c9e15 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -977,7 +977,7 @@ def __init__(self) -> None: # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [(i, np.dtype(f"S{i}")) for i in range(1, 245)] + [ (251, np.dtype(np.int8)), (252, np.dtype(np.int16)), @@ -1771,7 +1771,7 @@ def read( self._data_read = True # if necessary, swap the byte order to native here if self._byteorder != self._native_byteorder: - raw_data = raw_data.byteswap().newbyteorder() + raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder()) if convert_categoricals: self._read_value_labels() @@ -2961,7 +2961,7 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: """No-op, future compatibility""" return data - def _prepare_data(self) -> np.recarray: + def _prepare_data(self) -> np.rec.recarray: data = self.data typlist = self.typlist convert_dates = self._convert_dates @@ -2993,7 +2993,7 @@ def _prepare_data(self) -> np.recarray: return data.to_records(index=False, column_dtypes=dtypes) - def _write_data(self, records: np.recarray) -> None: + def _write_data(self, records: np.rec.recarray) -> None: self._write_bytes(records.tobytes()) @staticmethod diff --git a/pandas/meson.build b/pandas/meson.build index 1dc9955aa4ff6..435103a954d86 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -26,7 +26,6 @@ subdir('_libs') subdirs_list = [ '_config', - '_libs', '_testing', 'api', 'arrays', @@ -40,8 +39,9 @@ subdirs_list = [ 'util' ] foreach subdir: subdirs_list - install_subdir(subdir, install_dir: py.get_install_dir(pure: false) / 'pandas') + install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach + top_level_py_list = [ '__init__.py', '_typing.py', @@ -49,8 +49,4 @@ top_level_py_list = [ 'conftest.py', 'testing.py' ] -foreach file: top_level_py_list - py.install_sources(file, - pure: false, - subdir: 'pandas') -endforeach +py.install_sources(top_level_py_list, subdir: 'pandas') diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index cd7823ba15e44..be0ded0ecdf57 100644 --- 
a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -14,6 +14,7 @@ Final, cast, ) +import warnings import matplotlib.dates as mdates from matplotlib.ticker import ( @@ -243,18 +244,29 @@ def _convert_1d(values, units, axis): if not hasattr(axis, "freq"): raise TypeError("Axis must have `freq` set to convert to Periods") valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) - if isinstance(values, valid_types) or is_integer(values) or is_float(values): - return get_datevalue(values, axis.freq) - elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 - elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == "period": - # https://github.com/pandas-dev/pandas/issues/24304 - # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 - elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + if ( + isinstance(values, valid_types) + or is_integer(values) + or is_float(values) + ): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).asi8 + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq).asi8 + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] return values @@ -567,14 +579,30 @@ def _daily_finder(vmin, vmax, freq: BaseOffset): # save this for later usage vmin_orig = vmin - (vmin, vmax) = ( - Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq), - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + (vmin, vmax) = ( + Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq), + ) assert isinstance(vmin, Period) assert isinstance(vmax, Period) span = vmax.ordinal - vmin.ordinal + 1 - dates_ = period_range(start=vmin, end=vmax, freq=freq) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + dates_ = period_range(start=vmin, end=vmax, freq=freq) + # Initialize the output info = np.zeros( span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] @@ -1072,7 +1100,13 @@ def __call__(self, x, pos: int = 0) -> str: fmt = self.formatdict.pop(x, "") if isinstance(fmt, np.bytes_): fmt = fmt.decode("utf-8") - period = Period(ordinal=int(x), freq=self.freq) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Period with BDay freq is deprecated", + category=FutureWarning, + ) + period = Period(ordinal=int(x), freq=self.freq) assert isinstance(period, Period) return 
period.strftime(fmt) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index ba052c6936dd9..3a3f73a68374b 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -637,15 +637,15 @@ def test_apply_with_byte_string(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("val", ["asd", 12, None, np.NaN]) +@pytest.mark.parametrize("val", ["asd", 12, None, np.nan]) def test_apply_category_equalness(val): # Check if categorical comparisons on apply, GH 21239 - df_values = ["asd", None, 12, "asd", "cde", np.NaN] + df_values = ["asd", None, 12, "asd", "cde", np.nan] df = DataFrame({"a": df_values}, dtype="category") result = df.a.apply(lambda x: x == val) expected = Series( - [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" + [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index aea1e03dfe0ee..aeb6a01eb587a 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -242,7 +242,7 @@ def test_apply_categorical(by_row): assert result.dtype == object -@pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) +@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) def test_apply_categorical_with_nan_values(series, by_row): # GH 20714 bug fixed in: GH 24275 s = Series(series, dtype="category") @@ -254,7 +254,7 @@ def test_apply_categorical_with_nan_values(series, by_row): result = s.apply(lambda x: x.split("-")[0], by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.NaN], dtype="category") + expected = Series(["1", "1", np.nan], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) @@ -420,8 +420,9 @@ def test_agg_evaluate_lambdas(string_series): def test_with_nested_series(datetime_series, op_name): # GH 2316 # .agg with a reducer and a transform, what to do - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "cannot aggregate" + warning = FutureWarning if op_name == "agg" else None + with tm.assert_produces_warning(warning, match=msg): # GH52123 result = getattr(datetime_series, op_name)( lambda x: Series([x, x**2], index=["x", "x^2"]) @@ -429,6 +430,10 @@ def test_with_nested_series(datetime_series, op_name): expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + def test_replicate_describe(string_series): # this also tests a result set that is all scalars @@ -512,10 +517,7 @@ def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): index = dti.tz_localize("UTC").index else: index = dti.index - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = Series(index).apply(lambda x: Series([1, 2])) + result = Series(index).apply(lambda x: Series([1, 2])) tm.assert_frame_equal(result, exp) @@ -662,19 +664,7 @@ def test_apply_dictlike_lambda(ops, by_row, expected): def test_apply_retains_column_name(by_row): # GH 16380 df = 
DataFrame({"x": range(3)}, Index(range(3), name="x")) - func = lambda x: Series(range(x + 1), Index(range(x + 1), name="y")) - - if not by_row: - # GH53400 - msg = "'Series' object cannot be interpreted as an integer" - with pytest.raises(TypeError, match=msg): - df.x.apply(func, by_row=by_row) - return - - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = df.x.apply(func, by_row=by_row) + result = df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y"))) expected = DataFrame( [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]], columns=Index(range(3), name="y"), @@ -689,3 +679,11 @@ def test_apply_type(): result = s.apply(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_series_apply_unpack_nested_data(): + # GH#55189 + ser = Series([[1, 2, 3], [4, 5, 6, 7]]) + result = ser.apply(lambda x: Series(x)) + expected = DataFrame({0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c42364d4d4377..c2c53fbc4637e 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -73,8 +73,8 @@ def test_min_max_reduce(self): @pytest.mark.parametrize( "categories,expected", [ - (list("ABC"), np.NaN), - ([1, 2, 3], np.NaN), + (list("ABC"), np.nan), + ([1, 2, 3], np.nan), pytest.param( Series(date_range("2020-01-01", periods=3), dtype="category"), NaT, diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index c0623faccd325..68c59706a6c3b 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,19 +1,16 @@ import pytest -from pandas.util._test_decorators import async_mark - import pandas._testing as tm class TestCategoricalWarnings: - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = pd.Categorical([])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index e16ef37e8799d..761b85287764f 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -129,7 +129,7 @@ def test_set_na(self, left_right_dtypes): # GH#45484 TypeError, not ValueError, matches what we get with # non-NA un-holdable value. 
with pytest.raises(TypeError, match=msg): - result[0] = np.NaN + result[0] = np.nan return result[0] = np.nan diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index cfd3314eb5944..68b2b01ddcaf3 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,12 +5,23 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under12p0 + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringArray -from pandas.util.version import Version +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) + + +def na_val(dtype): + if dtype.storage == "pyarrow_numpy": + return np.nan + else: + return pd.NA @pytest.fixture @@ -27,21 +38,34 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - expected = " A\n0 a\n1 \n2 b" + if dtype.storage == "pyarrow_numpy": + expected = " A\n0 a\n1 NaN\n2 b" + else: + expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = "0 a\n1 \n2 b\nName: A, dtype: string" + if dtype.storage == "pyarrow_numpy": + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + else: + expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + else: + arr_name = "StringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected def test_none_to_nan(cls): a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None - assert a[1] is pd.NA + assert a[1] is na_val(a.dtype) def test_setitem_validates(cls): @@ -115,8 +139,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request): - if dtype.storage == "pyarrow": +def test_add_2d(dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage: reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) @@ -144,8 +168,8 @@ def test_add_sequence(dtype): tm.assert_extension_array_equal(result, expected) -def test_mul(dtype, request): - if dtype.storage == "pyarrow": +def test_mul(dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage: reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) request.node.add_marker(mark) @@ -195,19 +219,30 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a]) + expected[1] = 
False + tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_pd_na(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_not_string(comparison_op, dtype): @@ -223,12 +258,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): return result = getattr(a, op_name)(other) - expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ - op_name - ] - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array(expected_data, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, False, True], + }[op_name] + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_array(comparison_op, dtype): @@ -237,15 +281,25 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.full(len(a), fill_value=None, dtype="object") - expected[-1] = getattr(other[-1], op_name)(a[-1]) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + result = getattr(a, op_name)(pd.NA) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, 
None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_constructor_raises(cls): @@ -297,7 +351,7 @@ def test_from_sequence_no_mutate(copy, cls, request): result = cls._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) @@ -364,13 +418,13 @@ def test_min_max(method, skipna, dtype, request): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is pd.NA + assert result is na_val(arr.dtype) @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request): - if dtype.storage == "pyarrow" and box is pd.array: +def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -384,7 +438,7 @@ def test_min_max_numpy(method, box, dtype, request): assert result == expected -def test_fillna_args(dtype, request): +def test_fillna_args(dtype, request, arrow_string_storage): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -397,7 +451,7 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": + if dtype.storage in arrow_string_storage: msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." @@ -412,7 +466,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) assert arr.equals(expected) @@ -432,7 +486,7 @@ def test_arrow_roundtrip(dtype, string_storage2): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is pd.NA + assert result.loc[2, "a"] is na_val(result["a"].dtype) def test_arrow_load_from_zero_chunks(dtype, string_storage2): @@ -455,6 +509,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -470,6 +526,8 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -503,10 +561,10 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype): +def test_memory_usage(dtype, arrow_string_storage): # GH 33963 - if dtype.storage == "pyarrow": + if dtype.storage in arrow_string_storage: pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) @@ -526,7 +584,7 @@ 
def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"], dtype=object) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -566,7 +624,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is pd.NA + assert ser.array[1] is na_val(ser.dtype) # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 6912d5038ae0d..c1d424f12bfc4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -12,7 +12,10 @@ StringArray, StringDtype, ) -from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) skip_if_no_pyarrow = pytest.mark.skipif( pa_version_under7p0, @@ -49,10 +52,10 @@ def test_config_bad_storage_raises(): @skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked): +def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): import pyarrow as pa - array = pa if array == "pyarrow" else np + array = pa if array in arrow_string_storage else np arr = array.array([1, 2, 3]) if chunked: @@ -166,6 +169,9 @@ def test_pyarrow_not_installed_raises(): with pytest.raises(ImportError, match=msg): ArrowStringArray([]) + with pytest.raises(ImportError, match=msg): + ArrowStringArrayNumpySemantics([]) + with pytest.raises(ImportError, match=msg): ArrowStringArray._from_sequence(["a", None, "b"]) @@ -235,9 +241,10 @@ def test_setitem_invalid_indexer_raises(): @skip_if_no_pyarrow -def test_pickle_roundtrip(): +@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) +def test_pickle_roundtrip(dtype): # GH 42600 - expected = pd.Series(range(10), dtype="string[pyarrow]") + expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) sliced_pickled = pickle.dumps(expected_sliced) @@ -249,3 +256,11 @@ def test_pickle_roundtrip(): result_sliced = pickle.loads(sliced_pickled) tm.assert_series_equal(result_sliced, expected_sliced) + + +@skip_if_no_pyarrow +def test_string_dtype_error_message(): + # GH#55051 + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." 
+ with pytest.raises(ValueError, match=msg): + StringDtype("bla") diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9eee2e0bea687..96aab94b24ddd 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -258,7 +258,7 @@ def test_fillna_method_doesnt_change_orig(self, method): fill_value = arr[3] if method == "pad" else arr[5] - result = arr.pad_or_backfill(method=method) + result = arr._pad_or_backfill(method=method) assert result[4] == fill_value # check that the original was not changed @@ -324,7 +324,12 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): ): arr.searchsorted("foo") - arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + if string_storage == "python": + arr_type = "StringArray" + elif string_storage == "pyarrow": + arr_type = "ArrowStringArray" + else: + arr_type = "ArrowStringArrayNumpySemantics" with pd.option_context("string_storage", string_storage): with pytest.raises( diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8e38a8c741b8d..c2d68a79f32d4 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -497,7 +497,7 @@ def test_fillna_preserves_tz(self, method): dtype=DatetimeTZDtype(tz="US/Central"), ) - result = arr.pad_or_backfill(method=method) + result = arr._pad_or_backfill(method=method) tm.assert_extension_array_equal(result, expected) # assert that arr and dti were not modified in-place @@ -510,12 +510,12 @@ def test_fillna_2d(self): dta[0, 1] = pd.NaT dta[1, 0] = pd.NaT - res1 = dta.pad_or_backfill(method="pad") + res1 = dta._pad_or_backfill(method="pad") expected1 = dta.copy() expected1[1, 0] = dta[0, 0] tm.assert_extension_array_equal(res1, expected1) - res2 = dta.pad_or_backfill(method="backfill") + res2 = dta._pad_or_backfill(method="backfill") expected2 = dta.copy() expected2 = dta.copy() expected2[1, 0] = dta[2, 0] @@ -529,10 +529,10 @@ def test_fillna_2d(self): assert not dta2._ndarray.flags["C_CONTIGUOUS"] tm.assert_extension_array_equal(dta, dta2) - res3 = dta2.pad_or_backfill(method="pad") + res3 = dta2._pad_or_backfill(method="pad") tm.assert_extension_array_equal(res3, expected1) - res4 = dta2.pad_or_backfill(method="backfill") + res4 = dta2._pad_or_backfill(method="backfill") tm.assert_extension_array_equal(res4, expected2) # test the DataFrame method while we're here diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index f958d25e51103..9c630e29ea8e6 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -725,7 +725,7 @@ def test_and_logic_string_match(self): class TestTypeCasting: @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... 
numexpr has too many upcasting rules now - # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) + # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) @pytest.mark.parametrize("dt", [np.float32, np.float64]) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) def test_binop_typecasting(self, engine, parser, op, dt, left_right): diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 9180bd5a3a426..a727331307d7e 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -119,3 +119,33 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): else: for col in df.columns: assert not np.shares_memory(get_array(df, col), get_array(df2, col)) + + +def test_exponential_backoff(): + # GH#55518 + df = DataFrame({"a": [1, 2, 3]}) + for i in range(490): + df.copy(deep=False) + + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491 + + df = DataFrame({"a": [1, 2, 3]}) + dfs = [df.copy(deep=False) for i in range(510)] + + for i in range(20): + df.copy(deep=False) + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531 + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + for i in range(500): + df.copy(deep=False) + + # Don't reduce since we still have over 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + dfs = dfs[:300] + for i in range(500): + df.copy(deep=False) + + # Reduce since there are less than 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 500 diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index b5d761b3549fa..50eaa1f4d8713 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -42,7 +42,7 @@ def test_infer_dtype_from_float_scalar(float_numpy_dtype): @pytest.mark.parametrize( - "data,exp_dtype", [(12, np.int64), (np.float_(12), np.float64)] + "data,exp_dtype", [(12, np.int64), (np.float64(12), np.float64)] ) def test_infer_dtype_from_python_scalar(data, exp_dtype): dtype, val = infer_dtype_from_scalar(data) @@ -58,7 +58,7 @@ def test_infer_dtype_from_boolean(bool_val): def test_infer_dtype_from_complex(complex_dtype): data = np.dtype(complex_dtype).type(1) dtype, val = infer_dtype_from_scalar(data) - assert dtype == np.complex_ + assert dtype == np.complex128 def test_infer_dtype_from_datetime(): @@ -153,7 +153,7 @@ def test_infer_dtype_from_scalar_errors(): ("foo", np.object_), (b"foo", np.object_), (1, np.int64), - (1.5, np.float_), + (1.5, np.float64), (np.datetime64("2016-01-01"), np.dtype("M8[s]")), (Timestamp("20160101"), np.dtype("M8[s]")), (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"), @@ -170,10 +170,10 @@ def test_infer_dtype_from_scalar(value, expected): @pytest.mark.parametrize( "arr, expected", [ - ([1], np.int_), + ([1], np.dtype(int)), (np.array([1], dtype=np.int64), np.int64), ([np.nan, 1, ""], np.object_), - (np.array([[1.0, 2.0]]), np.float_), + (np.array([[1.0, 2.0]]), np.float64), (Categorical(list("aabc")), "category"), (Categorical([1, 2, 3]), "category"), (date_range("20160101", periods=3), np.dtype("=M8[ns]")), diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 0043ace1b9590..165bf61302145 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -652,7 +652,7 @@ def test_is_complex_dtype(): assert not com.is_complex_dtype(pd.Series([1, 2])) assert not 
com.is_complex_dtype(np.array(["a", "b"])) - assert com.is_complex_dtype(np.complex_) + assert com.is_complex_dtype(np.complex128) assert com.is_complex_dtype(complex) assert com.is_complex_dtype(np.array([1 + 1j, 5])) @@ -782,3 +782,9 @@ def test_pandas_dtype_numpy_warning(): match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", ): pandas_dtype(np.integer) + + +def test_pandas_dtype_ea_not_instance(): + # GH 31356 GH 54592 + with tm.assert_produces_warning(UserWarning): + assert pandas_dtype(CategoricalDtype) == CategoricalDtype() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 375003e58c21a..df7c787d2b9bf 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -536,7 +536,7 @@ def test_isneginf_scalar(self, value, expected): ) def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp): # GH 40687 - arr = np.array([True, np.NaN], dtype=object) + arr = np.array([True, np.nan], dtype=object) result = libops.maybe_convert_bool( arr, set(), convert_to_masked_nullable=convert_to_masked_nullable ) @@ -862,7 +862,7 @@ def test_maybe_convert_objects_timedelta64_nat(self): ) def test_maybe_convert_objects_nullable_integer(self, exp): # GH27335 - arr = np.array([2, np.NaN], dtype=object) + arr = np.array([2, np.nan], dtype=object) result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) tm.assert_extension_array_equal(result, exp) @@ -890,7 +890,7 @@ def test_maybe_convert_numeric_nullable_integer( self, convert_to_masked_nullable, exp ): # GH 40687 - arr = np.array([2, np.NaN], dtype=object) + arr = np.array([2, np.nan], dtype=object) result = lib.maybe_convert_numeric( arr, set(), convert_to_masked_nullable=convert_to_masked_nullable ) @@ -1889,7 +1889,6 @@ def test_is_scalar_numpy_array_scalars(self): assert is_scalar(np.complex64(2)) assert is_scalar(np.object_("foobar")) assert is_scalar(np.str_("foobar")) - assert is_scalar(np.unicode_("foobar")) assert is_scalar(np.bytes_(b"foobar")) assert is_scalar(np.datetime64("2014-01-01")) assert is_scalar(np.timedelta64(1, "h")) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 170f4f49ba377..451ac2afd1d91 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -51,7 +51,7 @@ def test_notna_notnull(notna_f): assert notna_f(1.0) assert not notna_f(None) - assert not notna_f(np.NaN) + assert not notna_f(np.nan) msg = "use_inf_as_na option is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -112,7 +112,7 @@ def test_empty_object(self, shape): def test_isna_isnull(self, isna_f): assert not isna_f(1.0) assert isna_f(None) - assert isna_f(np.NaN) + assert isna_f(np.nan) assert isna_f(float("nan")) assert not isna_f(np.inf) assert not isna_f(-np.inf) @@ -156,7 +156,7 @@ def test_isna_lists(self): tm.assert_numpy_array_equal(result, exp) # GH20675 - result = isna([np.NaN, "world"]) + result = isna([np.nan, "world"]) exp = np.array([True, False]) tm.assert_numpy_array_equal(result, exp) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index a0c24ee068e81..bff4fbd73984d 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -160,9 +160,9 @@ def test_fillna_2d_method(self, data_missing, method): assert arr[0].isna().all() assert not arr[1].isna().any() - result = arr.pad_or_backfill(method=method, limit=None) + result =
arr._pad_or_backfill(method=method, limit=None) - expected = data_missing.pad_or_backfill(method=method).repeat(2).reshape(2, 2) + expected = data_missing._pad_or_backfill(method=method).repeat(2).reshape(2, 2) tm.assert_extension_array_equal(result, expected) # Reverse so that backfill is not a no-op. @@ -170,10 +170,10 @@ def test_fillna_2d_method(self, data_missing, method): assert not arr2[0].isna().any() assert arr2[1].isna().all() - result2 = arr2.pad_or_backfill(method=method, limit=None) + result2 = arr2._pad_or_backfill(method=method, limit=None) expected2 = ( - data_missing[::-1].pad_or_backfill(method=method).repeat(2).reshape(2, 2) + data_missing[::-1]._pad_or_backfill(method=method).repeat(2).reshape(2, 2) ) tm.assert_extension_array_equal(result2, expected2) @@ -237,7 +237,7 @@ def get_reduction_result_dtype(dtype): return NUMPY_INT_TO_DTYPE[np.dtype(int)] else: # i.e. dtype.kind == "u" - return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] + return NUMPY_INT_TO_DTYPE[np.dtype("uint")] if method in ["sum", "prod"]: # std and var are not dtype-preserving diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2dd62a4ca7538..16059155a7a8f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data): ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") + elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": + # TODO: avoid special-casing + expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 2a0bd0770dc07..40cc952d44200 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -94,7 +94,7 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data tm.assert_extension_array_equal(result, data) - result = data.pad_or_backfill(method="backfill") + result = data._pad_or_backfill(method="backfill") assert result is not data tm.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index a6532a6190467..91d4855dcefe5 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -25,7 +25,7 @@ def check_reduce(self, s, op_name, skipna): try: alt = s.astype("float64") - except TypeError: + except (TypeError, ValueError): # e.g. Interval can't cast, so let's cast to object and do # the reduction pointwise alt = s.astype(object) @@ -39,7 +39,7 @@ def check_reduce(self, s, op_name, skipna): expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): # Find the expected dtype when the given reduction is done on a DataFrame # column with this array. The default assumes float64-like behavior, # i.e. retains the dtype. @@ -58,7 +58,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} - cmp_dtype = self._get_expected_reduction_dtype(arr, op_name) + cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) # The DataFrame method just calls arr._reduce with keepdims=True, # so this first check is perfunctory. 
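Because ``_get_expected_reduction_dtype`` above now also receives ``skipna``, downstream extension-test suites that override it need to accept the extra argument. A hedged sketch of an updated override; the class name and dtype choices are hypothetical, not from this patch:

    class MyArrayReduceTests:  # stand-in for a subclass of the reduce test base
        def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
            # The hook now sees skipna, so expectations may differ between
            # skipna=True and skipna=False reductions (assumption for
            # illustration: sums and products widen to a nullable int dtype)
            if op_name in ("sum", "prod"):
                return "Int64"
            return arr.dtype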
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d2c3cb395ed44..9ce7ac309b6d3 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -285,7 +285,7 @@ def value_counts(self, dropna: bool = True): return value_counts(self.to_numpy(), dropna=dropna) # We override fillna here to simulate a 3rd party EA that has done so. This - # lets us test the deprecation telling authors to implement pad_or_backfill + # lets us test the deprecation telling authors to implement _pad_or_backfill # Simulate a 3rd-party EA that has not yet updated to include a "copy" # keyword in its fillna method. # error: Signature of "fillna" incompatible with supertype "ExtensionArray" diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index baa056550624f..e61a2894b9aa7 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -133,58 +133,96 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): def test_fillna_frame(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): super().test_fillna_frame(data_missing) def test_fillna_limit_pad(self, data_missing): msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_pad(data_missing) + + msg = "The 'method' keyword in DecimalArray.fillna is deprecated" + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, ): super().test_fillna_limit_pad(data_missing) def test_fillna_limit_backfill(self, data_missing): - msg = "|".join( - [ - "ExtensionArray.fillna added a 'copy' keyword", - "Series.fillna with 'method' is deprecated", - ] - ) + msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, ): super().test_fillna_limit_backfill(data_missing) - def test_fillna_no_op_returns_copy(self, data): msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_backfill(data_missing) + + msg = "The 'method' keyword in DecimalArray.fillna is deprecated" + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_backfill(data_missing) + + def test_fillna_no_op_returns_copy(self, data): + msg = "|".join( + [ + "ExtensionArray.fillna 'method' keyword is deprecated", + "The 'method' keyword in DecimalArray.fillna is deprecated", + ] + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False ): super().test_fillna_no_op_returns_copy(data) def test_fillna_series(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( - FutureWarning, match=msg, 
check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): super().test_fillna_series(data_missing) def test_fillna_series_method(self, data_missing, fillna_method): - msg = "ExtensionArray.fillna 'method' keyword is deprecated" + msg = "|".join( + [ + "ExtensionArray.fillna 'method' keyword is deprecated", + "The 'method' keyword in DecimalArray.fillna is deprecated", + ] + ) with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False ): super().test_fillna_series_method(data_missing, fillna_method) def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - warn = FutureWarning if not using_copy_on_write else None + warn = DeprecationWarning if not using_copy_on_write else None msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_frame(data_missing) def test_fillna_copy_series(self, data_missing, using_copy_on_write): - warn = FutureWarning if not using_copy_on_write else None + warn = DeprecationWarning if not using_copy_on_write else None msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_series(data_missing) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index dd1ff925adf5f..61474aa94d1c8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -31,6 +31,7 @@ import pytest from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( PY311, is_ci_environment, @@ -40,6 +41,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) from pandas.core.dtypes.dtypes import ( @@ -347,10 +349,15 @@ class TestBaseAccumulateTests(base.BaseAccumulateTests): def check_accumulate(self, ser, op_name, skipna): result = getattr(ser, op_name)(skipna=skipna) - if ser.dtype.kind == "m": + pa_type = ser.dtype.pyarrow_dtype + if pa.types.is_temporal(pa_type): # Just check that we match the integer behavior. 
- ser = ser.astype("int64[pyarrow]") - result = result.astype("int64[pyarrow]") + if pa_type.bit_width == 32: + int_type = "int32[pyarrow]" + else: + int_type = "int64[pyarrow]" + ser = ser.astype(int_type) + result = result.astype(int_type) result = result.astype("Float64") expected = getattr(ser.astype("Float64"), op_name)(skipna=skipna) @@ -361,14 +368,20 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): - if op_name in ["cumsum", "cumprod"]: + if ( + pa.types.is_string(pa_type) + or pa.types.is_binary(pa_type) + or pa.types.is_decimal(pa_type) + ): + if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: + return False + elif pa.types.is_boolean(pa_type): + if op_name in ["cumprod", "cummax", "cummin"]: return False - elif pa.types.is_temporal(pa_type) and not pa.types.is_duration(pa_type): - if op_name in ["cumsum", "cumprod"]: + elif pa.types.is_temporal(pa_type): + if op_name == "cumsum" and not pa.types.is_duration(pa_type): return False - elif pa.types.is_duration(pa_type): - if op_name == "cumprod": + elif op_name == "cumprod": return False return True @@ -384,7 +397,9 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques data, all_numeric_accumulations, skipna ) - if all_numeric_accumulations != "cumsum" or pa_version_under9p0: + if pa_version_under9p0 or ( + pa_version_under13p0 and all_numeric_accumulations != "cumsum" + ): # xfailing takes a long time to run because pytest # renders the exception messages even when not showing them opt = request.config.option @@ -528,7 +543,7 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": @@ -740,7 +755,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): class TestBaseUnaryOps(base.BaseUnaryOpsTests): def test_invert(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if not pa.types.is_boolean(pa_dtype): + if not (pa.types.is_boolean(pa_dtype) or pa.types.is_integer(pa_dtype)): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -938,7 +953,7 @@ def _is_temporal_supported(self, opname, pa_dtype): or ( opname in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__") - and not pa_version_under13p0 + and not pa_version_under14p0 ) ) and pa.types.is_duration(pa_dtype) @@ -1326,6 +1341,31 @@ def test_logical_masked_numpy(self, op, exp): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_bitwise(pa_type): + # GH 54495 + dtype = ArrowDtype(pa_type) + left = pd.Series([1, None, 3, 4], dtype=dtype) + right = pd.Series([None, 3, 5, 4], dtype=dtype) + + result = left | right + expected = pd.Series([None, None, 3 | 5, 4 | 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = left & right + expected = pd.Series([None, None, 3 & 5, 4 & 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = left ^ right + expected = pd.Series([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = ~left + expected = ~(left.fillna(0).to_numpy()) + expected = 
pd.Series(expected, dtype=dtype).mask(left.isnull()) + tm.assert_series_equal(result, expected) + + def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): with pytest.raises(NotImplementedError, match="Passing pyarrow type"): ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]") @@ -1434,7 +1474,7 @@ def test_quantile(data, interpolation, quantile, request): @pytest.mark.parametrize( "take_idx, exp_idx", - [[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]], + [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]], ids=["multi_mode", "single_mode"], ) def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx): @@ -1452,7 +1492,7 @@ def test_mode_dropna_false_mode_na(data): expected = pd.Series([None], dtype=data.dtype) tm.assert_series_equal(result, expected) - expected = pd.Series([None, data[0]], dtype=data.dtype) + expected = pd.Series([data[0], None], dtype=data.dtype) result = expected.mode(dropna=False) tm.assert_series_equal(result, expected) @@ -2438,7 +2478,7 @@ def test_dt_tz(tz): dtype=ArrowDtype(pa.timestamp("ns", tz=tz)), ) result = ser.dt.tz - assert result == tz + assert result == timezones.maybe_get_tz(tz) def test_dt_isocalendar(): @@ -2999,6 +3039,15 @@ def test_groupby_count_return_arrow_dtype(data_missing): tm.assert_frame_equal(result, expected) +def test_fixed_size_list(): + # GH#55000 + ser = pd.Series( + [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2)) + ) + result = ser.dtype.type + assert result == list + + def test_arrowextensiondtype_dataframe_repr(): # GH 54062 df = pd.DataFrame( @@ -3011,3 +3060,43 @@ def test_arrowextensiondtype_dataframe_repr(): # pyarrow.ExtensionType values are displayed expected = " col\n0 15340\n1 15341\n2 15342" assert result == expected + + +@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES) +def test_duration_fillna_numpy(pa_type): + # GH 54707 + ser1 = pd.Series([None, 2], dtype=ArrowDtype(pa_type)) + ser2 = pd.Series(np.array([1, 3], dtype=f"m8[{pa_type.unit}]")) + result = ser1.fillna(ser2) + expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type)) + tm.assert_series_equal(result, expected) + + +def test_comparison_not_propagating_arrow_error(): + # GH#54944 + a = pd.Series([1 << 63], dtype="uint64[pyarrow]") + b = pd.Series([None], dtype="int64[pyarrow]") + with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"): + a < b + + +def test_factorize_chunked_dictionary(): + # GH 54844 + pa_array = pa.chunked_array( + [pa.array(["a"]).dictionary_encode(), pa.array(["b"]).dictionary_encode()] + ) + ser = pd.Series(ArrowExtensionArray(pa_array)) + res_indices, res_uniques = ser.factorize() + exp_indicies = np.array([0, 1], dtype=np.intp) + exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks())) + tm.assert_numpy_array_equal(res_indices, exp_indicies) + tm.assert_index_equal(res_uniques, exp_uniques) + + +def test_arrow_floordiv(): + # GH 55561 + a = pd.Series([-7], dtype="int64[pyarrow]") + b = pd.Series([4], dtype="int64[pyarrow]") + expected = pd.Series([-2], dtype="int64[pyarrow]") + result = a // b + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index c4195be8ea121..588a2fb58d9be 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -20,6 +20,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas as pd import pandas._testing as tm @@ -40,7 +41,7 @@ ) from 
pandas.tests.extension import base -is_windows_or_32bit = is_platform_windows() or not IS64 +is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64 pytestmark = [ pytest.mark.filterwarnings( @@ -325,7 +326,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if tm.is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: @@ -335,16 +336,32 @@ def _get_expected_reduction_dtype(self, arr, op_name: str): elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + # TODO: Why does Windows Numpy 2.0 dtype depend on skipna? + cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) elif tm.is_unsigned_integer_dtype(arr.dtype): - cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" + cmp_dtype = ( + "UInt32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "UInt64" + ) elif arr.dtype.kind == "b": if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) else: raise TypeError("not supposed to reach this") else: @@ -360,7 +377,7 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 length = 64 - if not IS64 or is_platform_windows(): + if is_windows_or_32bit: # Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "itemsize" if not ser.dtype.itemsize == 8: # type: ignore[union-attr] diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6597ff84e3ca4..5176289994033 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -103,8 +103,8 @@ def test_is_not_string_type(self, dtype): class TestInterface(base.BaseInterfaceTests): - def test_view(self, data, request): - if data.dtype.storage == "pyarrow": + def test_view(self, data, request, arrow_string_storage): + if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -116,8 +116,8 @@ def test_from_dtype(self, data): class TestReshaping(base.BaseReshapingTests): - def test_transpose(self, data, request): - if data.dtype.storage == "pyarrow": + def test_transpose(self, data, request, arrow_string_storage): + if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) @@ -127,8 +127,8 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - def test_setitem_preserves_views(self, data, request): - if data.dtype.storage == "pyarrow": + def test_setitem_preserves_views(self, data, request, arrow_string_storage): + if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for 
ArrowStringArray") super().test_setitem_preserves_views(data) @@ -157,6 +157,12 @@ def test_fillna_no_op_returns_copy(self, data): class TestReduce(base.BaseReduceTests): + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + return ( + ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + and op_name in ("any", "all") + ) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions @@ -184,6 +190,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" + elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" return pointwise_result.astype(cast_to) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 95f9f2ba4051e..bb4aed2163dac 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -42,7 +42,7 @@ def test_from_records_with_datetimes(self): arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] dtypes = [("EXPIRY", "<M8[ns]")] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py +def test_adding_new_conditional_column() -> None: + # https://github.com/pandas-dev/pandas/issues/55025 + df = DataFrame({"x": [1]}) + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame({"x": [1], "y": ["1"]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"x": [1]}) + # try inserting something which numpy would store as 'object' + value = lambda x: x + df.loc[df["x"] == 1, "y"] = value + expected = DataFrame({"x": [1], "y": [value]}) + tm.assert_frame_equal(df, expected) + + +def test_add_new_column_infer_string(): + # GH#55366 + pytest.importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. 
- def _check_setitem_invalid(self, df, invalid, indexer): + def _check_setitem_invalid(self, df, invalid, indexer, warn): msg = "Setting an item of incompatible dtype is deprecated" msg = re.escape(msg) orig_df = df.copy() # iloc - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): df.iloc[indexer, 0] = invalid df = orig_df.copy() # loc - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): df.loc[indexer, "a"] = invalid df = orig_df.copy() @@ -1934,16 +1954,20 @@ def _check_setitem_invalid(self, df, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): df = DataFrame({"a": [True, False, False]}, dtype="bool") - self._check_setitem_invalid(df, invalid, indexer) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer) + if isna(invalid) and invalid is not pd.NaT: + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(df, invalid, indexer, warn) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index cd9ffa0f129a2..32312868adacb 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -26,10 +26,7 @@ def test_set_value_resize(self, float_frame): assert float_frame._get_value("foobar", "qux") == 0 res = float_frame.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - res._set_value("foobar", "baz", "sam") + res._set_value("foobar", "baz", "sam") assert res["baz"].dtype == np.object_ res = float_frame.copy() diff --git a/pandas/tests/frame/methods/test_assign.py b/pandas/tests/frame/methods/test_assign.py index 0ae501d43e742..5c652be411987 100644 --- a/pandas/tests/frame/methods/test_assign.py +++ b/pandas/tests/frame/methods/test_assign.py @@ -1,5 +1,5 @@ import pytest - +import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -65,7 +65,7 @@ def test_assign_bad(self): df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # non-keyword argument - msg = r"assign\(\) takes 1 positional argument but 2 were given" + msg = r"function' object is not iterable" with pytest.raises(TypeError, match=msg): df.assign(lambda x: x.A) msg = "'DataFrame' object has no attribute 'C'" @@ -82,3 +82,9 @@ def test_assign_dependent(self): result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) tm.assert_frame_equal(result, expected) + + def test_assign_with_tuple_column_key_raises_typeerror(self): + df = DataFrame(columns=pd.MultiIndex.from_tuples([('A', 'x'), ('B', 'y')])) + + with pytest.raises(TypeError, match="assign\\(\\) 
only supports string column names"): + df.assign(__kwargs_dict__={('C', 'one'): [1, 2, 3]}) \ No newline at end of file diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 34cbebe1b3d3f..6590f10c6b967 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -164,7 +164,7 @@ def test_astype_str(self): def test_astype_str_float(self): # see GH#11302 - result = DataFrame([np.NaN]).astype(str) + result = DataFrame([np.nan]).astype(str) expected = DataFrame(["nan"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 710978057460a..f7b221d84ab78 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -166,7 +166,7 @@ def test_clip_with_na_args(self, float_frame): # GH#40420 data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} df = DataFrame(data) - t = Series([2, -4, np.NaN, 6, 3]) + t = Series([2, -4, np.nan, 6, 3]) result = df.clip(lower=t, axis=0) expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) tm.assert_frame_equal(result, expected) @@ -177,3 +177,14 @@ def test_clip_int_data_with_float_bound(self): result = df.clip(lower=1.5) expected = DataFrame({"a": [1.5, 2.0, 3.0]}) tm.assert_frame_equal(result, expected) + + def test_clip_with_list_bound(self): + # GH#54817 + df = DataFrame([1, 5]) + expected = DataFrame([3, 5]) + result = df.clip([3]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([1, 3]) + result = df.clip(upper=[3]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e2f92a1e04cb5..f56a7896c753e 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -332,7 +332,7 @@ def test_describe_percentiles_integer_idx(self): result = df.describe(percentiles=pct) expected = DataFrame( - {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]}, + {"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]}, index=[ "count", "mean", diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 11edf665b5494..7899b4aeac3fd 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -231,7 +231,7 @@ def test_dropna_with_duplicate_columns(self): def test_set_single_column_subset(self): # GH 41021 - df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.NaN, 5]}) + df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame( {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2] ) @@ -248,7 +248,7 @@ def test_single_column_not_present_in_axis(self): def test_subset_is_nparray(self): # GH 41021 - df = DataFrame({"A": [1, 2, np.NaN], "B": list("abc"), "C": [4, np.NaN, 5]}) + df = DataFrame({"A": [1, 2, np.nan], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame({"A": [1.0], "B": ["a"], "C": [4.0]}) result = df.dropna(subset=np.array(["A", "C"])) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 6f21bd4c4b438..4bdf16977dae6 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -62,15 +62,15 @@ def test_datetime_with_tz_dtypes(self): def 
test_dtypes_are_correct_after_column_slice(self): # GH6525 - df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64) tm.assert_series_equal( df.dtypes, - Series({"a": np.float_, "b": np.float_, "c": np.float_}), + Series({"a": np.float64, "b": np.float64, "c": np.float64}), ) - tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float_})) + tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64})) tm.assert_series_equal( df.dtypes, - Series({"a": np.float_, "b": np.float_, "c": np.float_}), + Series({"a": np.float64, "b": np.float64, "c": np.float64}), ) @pytest.mark.parametrize( diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 1a2fbf8a65a55..9d5e6876bb08c 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -137,3 +137,17 @@ def test_filter_regex_non_string(self): result = df.filter(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) + + def test_filter_keep_order(self): + # GH#54980 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[["B", "A"]] + tm.assert_frame_equal(result, expected) + + def test_filter_different_dtype(self): + # GH#54980 + df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[[]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..67aa07dd83764 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -497,3 +497,9 @@ def test_interpolate_empty_df(self): result = df.interpolate(inplace=True) assert result is None tm.assert_frame_equal(df, expected) + + def test_interpolate_ea_raise(self): + # GH#55347 + df = DataFrame({"a": [1, None, 2]}, dtype="Int64") + with pytest.raises(NotImplementedError, match="does not implement"): + df.interpolate() diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 98f3926968ad0..3b6b86056f8c7 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -158,9 +158,14 @@ def test_join_invalid_validate(left_no_dup, right_no_dup): left_no_dup.merge(right_no_dup, on="a", validate="invalid") -def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups): +@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"]) +def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype): # GH 46622 # Dups on right allowed by one_to_many constraint + if dtype == "string[pyarrow]": + pytest.importorskip("pyarrow") + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) left_no_dup.join( right_w_dups, on="a", diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index d0153da038a75..92b66e12d4356 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -29,7 +29,7 @@ def test_pct_change_with_nas( obj = frame_or_series(vals) msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " f"{type(obj).__name__}.pct_change are deprecated" ) with tm.assert_produces_warning(FutureWarning, match=msg): @@ -46,7 +46,7 @@ def 
test_pct_change_numeric(self): pnl.iat[2, 3] = 60 msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) @@ -59,12 +59,11 @@ def test_pct_change_numeric(self): def test_pct_change(self, datetime_frame): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = datetime_frame.pct_change(fill_method=None) + rs = datetime_frame.pct_change(fill_method=None) tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) rs = datetime_frame.pct_change(2) @@ -110,7 +109,7 @@ def test_pct_change_periods_freq( self, datetime_frame, freq, periods, fill_method, limit ): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) @@ -144,11 +143,12 @@ def test_pct_change_with_duplicated_indices(fill_method): {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 ) + warn = None if fill_method is None else FutureWarning msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): result = data.pct_change(fill_method=fill_method) if fill_method is None: @@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method): index=["a", "b"] * 3, ) tm.assert_frame_equal(result, expected) + + +def test_pct_change_none_beginning_no_warning(): + # GH#54481 + df = DataFrame( + [ + [1, None], + [2, 1], + [3, 2], + [4, 3], + [5, 4], + ] + ) + result = df.pct_change() + expected = DataFrame( + {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b5b5e42691e59 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH#55362 + pytest.importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..678fec835aa82 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -12,6 +12,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -131,7 +132,7 @@ class TestDataFrameSelectReindex: # test_indexing @pytest.mark.xfail( - not IS64 or is_platform_windows(), + not IS64 or (is_platform_windows() and not np_version_gt2), reason="Passes int32 values to DatetimeArray in make_na_array on " "windows, 32bit linux 
builds", ) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3203482ddf724..61e44b4e24c08 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -666,7 +666,7 @@ def test_replace_NA_with_None(self): def test_replace_NAT_with_None(self): # gh-45836 df = DataFrame([pd.NaT, pd.NaT]) - result = df.replace({pd.NaT: None, np.NaN: None}) + result = df.replace({pd.NaT: None, np.nan: None}) expected = DataFrame([None, None]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 3bfb1af423bdd..a38d2c6fd016a 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -339,9 +339,7 @@ def test_select_dtypes_datetime_with_tz(self): expected = df3.reindex(columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype", [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"] - ) + @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 35941e9f24a4e..1e88152137b1f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -32,19 +32,23 @@ def test_shift_axis1_with_valid_fill_value_one_array(self): expected2 = DataFrame([12345] * 5, dtype="Float64") tm.assert_frame_equal(res2, expected2) - def test_shift_disallow_freq_and_fill_value(self, frame_or_series): + def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): # Can't pass both! 
obj = frame_or_series( np.random.default_rng(2).standard_normal(5), index=date_range("1/1/2000", periods=5, freq="H"), ) - msg = "Cannot pass both 'freq' and 'fill_value' to (Series|DataFrame).shift" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): obj.shift(1, fill_value=1, freq="H") if frame_or_series is DataFrame: - with pytest.raises(ValueError, match=msg): + obj.columns = date_range("1/1/2000", periods=1, freq="H") + with tm.assert_produces_warning(FutureWarning, match=msg): obj.shift(1, axis=1, fill_value=1, freq="H") @pytest.mark.parametrize( @@ -681,10 +685,10 @@ def test_shift_with_iterable_basic_functionality(self): { "a_0": [1, 2, 3], "b_0": [4, 5, 6], - "a_1": [np.NaN, 1.0, 2.0], - "b_1": [np.NaN, 4.0, 5.0], - "a_2": [np.NaN, np.NaN, 1.0], - "b_2": [np.NaN, np.NaN, 4.0], + "a_1": [np.nan, 1.0, 2.0], + "b_1": [np.nan, 4.0, 5.0], + "a_2": [np.nan, np.nan, 1.0], + "b_2": [np.nan, np.nan, 4.0], } ) tm.assert_frame_equal(expected, shifted) @@ -716,8 +720,11 @@ def test_shift_with_iterable_freq_and_fill_value(self): df.shift(1, freq="H"), ) - msg = r"Cannot pass both 'freq' and 'fill_value' to.*" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.shift([1, 2], fill_value=1, freq="H") def test_shift_with_iterable_check_other_arguments(self): diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 228b62a418813..f1465c9ba9df8 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -955,3 +955,42 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): ) tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 8853d718270f4..27939856503e5 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -253,7 +253,7 @@ def test_to_records_with_categorical(self): ), # Pass in a dtype instance. 
( - {"column_dtypes": np.dtype("unicode")}, + {"column_dtypes": np.dtype(np.str_)}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[ @@ -391,7 +391,7 @@ def test_to_records_dtype(self, kwargs, expected): # see GH#18146 df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) - if not isinstance(expected, np.recarray): + if not isinstance(expected, np.rec.recarray): with pytest.raises(expected[0], match=expected[1]): df.to_records(**kwargs) else: diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 355f05cd5156c..c05a929360478 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -175,3 +175,17 @@ def test_data_frame_value_counts_subset(nulls_fixture, columns): ) tm.assert_series_equal(result, expected) + + +def test_value_counts_categorical_future_warning(): + # GH#54775 + df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category") + result = df.value_counts() + expected = pd.Series( + 1, + index=pd.MultiIndex.from_arrays( + [pd.Index([1, 2, 3], name="a", dtype="category")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..aa7aa8964a059 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,8 +7,6 @@ from pandas._config.config import option_context -from pandas.util._test_decorators import async_mark - import pandas as pd from pandas import ( DataFrame, @@ -288,8 +286,7 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - @async_mark() - async def test_tab_complete_warning(self, ip, frame_or_series): + def test_tab_complete_warning(self, ip, frame_or_series): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter @@ -299,8 +296,7 @@ async def test_tab_complete_warning(self, ip, frame_or_series): else: code = "from pandas import Series; obj = Series(dtype=object)" - await ip.run_code(code) - + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 with tm.assert_produces_warning(None, raise_on_extra_warnings=False): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 878e94c15e16b..e5a8feb7a89d3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -22,14 +22,12 @@ ) import pandas._testing as tm from pandas.core.computation import expressions as expr -from pandas.core.computation.expressions import ( - _MIN_ELEMENTS, - NUMEXPR_INSTALLED, -) +from pandas.core.computation.expressions import _MIN_ELEMENTS from pandas.tests.frame.common import ( _check_mixed_float, _check_mixed_int, ) +from pandas.util.version import Version @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) @@ -502,10 +500,19 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.skipif(not NUMEXPR_INSTALLED, reason="numexpr not installed") @pytest.mark.parametrize("opname", ["floordiv", "pow"]) - def test_floordiv_axis0_numexpr_path(self, opname): + def test_floordiv_axis0_numexpr_path(self, opname, request): # case that goes through numexpr and has to fall back to masked_arith_op + ne = pytest.importorskip("numexpr") + if ( + Version(ne.__version__) >= Version("2.8.7") + and opname 
== "pow" + and "python" in request.node.callspec.id + ): + request.node.add_marker( + pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454") + ) + op = getattr(operator, opname) arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100 @@ -2071,6 +2078,9 @@ def test_frame_sub_nullable_int(any_int_ea_dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) def test_frame_op_subclass_nonclass_constructor(): # GH#43201 subclass._constructor is a function, not the subclass itself diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 008d7a023576a..9e8d92e832d01 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -131,22 +131,22 @@ def test_constructor_with_convert(self): df = DataFrame({"A": [None, 1]}) result = df["A"] - expected = Series(np.asarray([np.nan, 1], np.float_), name="A") + expected = Series(np.asarray([np.nan, 1], np.float64), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0, 2]}) result = df["A"] - expected = Series(np.asarray([1.0, 2], np.float_), name="A") + expected = Series(np.asarray([1.0, 2], np.float64), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, 3]}) result = df["A"] - expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A") + expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex128), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, 3.0]}) result = df["A"] - expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A") + expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex128), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, True]}) @@ -156,12 +156,12 @@ def test_constructor_with_convert(self): df = DataFrame({"A": [1.0, None]}) result = df["A"] - expected = Series(np.asarray([1.0, np.nan], np.float_), name="A") + expected = Series(np.asarray([1.0, np.nan], np.float64), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, None]}) result = df["A"] - expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A") + expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex128), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [2.0, 1, True, None]}) @@ -343,9 +343,9 @@ def test_stale_cached_series_bug_473(self, using_copy_on_write): Y["e"] = Y["e"].astype("object") if using_copy_on_write: with tm.raises_chained_assignment_error(): - Y["g"]["c"] = np.NaN + Y["g"]["c"] = np.nan else: - Y["g"]["c"] = np.NaN + Y["g"]["c"] = np.nan repr(Y) Y.sum() Y["g"].sum() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a493084142f7b..a291b906d6710 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1781,8 +1781,6 @@ def test_constructor_empty_with_string_dtype(self): tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) - df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_) - tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) @@ -1825,8 +1823,8 @@ def test_constructor_single_value(self): DataFrame("a", [1, 2], ["a", "c"], 
float) def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name - floatname = np.dtype(np.float_).name + intname = np.dtype(int).name + floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name # single item @@ -2687,8 +2685,8 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2720,6 +2718,38 @@ def test_frame_string_inference(self): df = DataFrame({"a": ["a", "b"]}, dtype="object") tm.assert_frame_equal(df, expected) + def test_frame_string_inference_array_string_dtype(self): + # GH#54496 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": np.array(["a", "b"])}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]])) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"], "b": ["c", "d"]}, + dtype=dtype, + columns=Index(["a", "b"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) + tm.assert_frame_equal(df, expected) + + def test_frame_string_inference_block_dim(self): + # GH#55363 + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3768298156550..bec1fcd1e7462 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -32,6 +33,7 @@ nanops, ) +is_windows_np2_or_is32 = (is_platform_windows() and not np_version_gt2) or not IS64 is_windows_or_is32 = is_platform_windows() or not IS64 @@ -134,7 +136,7 @@ def wrapper(x): # all NA case if has_skipna: - all_na = frame * np.NaN + all_na = frame * np.nan r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) if opname in ["sum", "prod"]: @@ -834,9 +836,9 @@ def test_sum_nanops_min_count(self): @pytest.mark.parametrize( "kwargs, expected_result", [ - ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.NaN]), - ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]), - ({"axis": 1, "skipna": False}, [3.2, 5.3, np.NaN]), + ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.nan]), + ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]), + ({"axis": 1, "skipna": False}, [3.2, 5.3, np.nan]), ], ) def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result): @@ -850,9 +852,9 @@ def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result): @pytest.mark.parametrize( "kwargs, expected_result", [ - ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.NaN]), - ({"axis": 1, "min_count": 3}, 
[np.NaN, np.NaN, np.NaN]), - ({"axis": 1, "skipna": False}, [2.0, 4.0, np.NaN]), + ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.nan]), + ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]), + ({"axis": 1, "skipna": False}, [2.0, 4.0, np.nan]), ], ) def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result): @@ -1056,6 +1058,28 @@ def test_idxmax_numeric_only(self, numeric_only): expected = Series([1, 0, 1], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + def test_idxmax_arrow_types(self): + # GH#55368 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]") + result = df.idxmax() + expected = Series([1, 0], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([2, 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + df = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]") + result = df.idxmax(numeric_only=False) + expected = Series([1], index=["a"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin(numeric_only=False) + expected = Series([2], index=["a"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" @@ -1189,7 +1213,7 @@ def wrapper(x): f(axis=2) # all NA case - all_na = frame * np.NaN + all_na = frame * np.nan r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) if opname == "any": @@ -1744,13 +1768,13 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): @pytest.mark.parametrize( "opname, dtype, exp_value, exp_dtype", [ - ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), - ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), - ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), ("sum", "UInt64", 0, "UInt64"), ("prod", "UInt64", 1, "UInt64"), ("sum", "Float32", 0, "Float32"), @@ -1765,6 +1789,8 @@ def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype expected = Series([exp_value, exp_value], dtype=exp_dtype) tm.assert_series_equal(result, expected) + # TODO: why does min_count=1 impact the resulting Windows dtype + # differently than min_count=0? 
@pytest.mark.parametrize( "opname, dtype, exp_dtype", [ @@ -1938,3 +1964,80 @@ def test_fails_on_non_numeric(kernel): msg = "|".join([msg1, msg2]) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) + + +@pytest.mark.parametrize( + "method", + [ + "all", + "any", + "count", + "idxmax", + "idxmin", + "kurt", + "kurtosis", + "max", + "mean", + "median", + "min", + "nunique", + "prod", + "product", + "sem", + "skew", + "std", + "sum", + "var", + ], +) +@pytest.mark.parametrize("min_count", [0, 2]) +def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): + # GH 54341 + df = DataFrame( + { + "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype), + "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype), + }, + ) + expected_df = DataFrame( + { + "a": [0.0, 1.0, 2.0, 3.0], + "b": [0.0, 1.0, np.nan, 3.0], + }, + ) + if method in ("count", "nunique"): + expected_dtype = "int64" + elif method in ("all", "any"): + expected_dtype = "boolean" + elif method in ( + "kurt", + "kurtosis", + "mean", + "median", + "sem", + "skew", + "std", + "var", + ) and not any_numeric_ea_dtype.startswith("Float"): + expected_dtype = "Float64" + else: + expected_dtype = any_numeric_ea_dtype + + kwargs = {} + if method not in ("count", "nunique", "quantile"): + kwargs["skipna"] = skipna + if method in ("prod", "product", "sum"): + kwargs["min_count"] = min_count + + warn = None + msg = None + if not skipna and method in ("idxmax", "idxmin"): + warn = FutureWarning + msg = f"The behavior of DataFrame.{method} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(df, method)(axis=1, **kwargs) + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(expected_df, method)(axis=1, **kwargs) + if method not in ("idxmax", "idxmin"): + expected = expected.astype(expected_dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 49375658abfee..64d516e484991 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -455,3 +455,14 @@ def test_masked_ea_with_formatter(self): 0 0.12 1.00 1 1.12 2.00""" assert result == expected + + def test_repr_ea_columns(self, any_string_dtype): + # GH#54797 + pytest.importorskip("pyarrow") + df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]}) + df.columns = df.columns.astype(any_string_dtype) + expected = """ long_column_name col2 +0 1 4 +1 2 5 +2 3 6""" + assert repr(df) == expected diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index cb8e8c5025e3b..dbd1f96fc17c9 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -72,7 +72,7 @@ def test_stack_mixed_level(self, future_stack): def test_unstack_not_consolidated(self, using_array_manager): # Gh#34708 - df = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + df = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]}) df2 = df[["x"]] df2["y"] = df["y"] if not using_array_manager: @@ -584,7 +584,7 @@ def test_unstack_to_series(self, float_frame): tm.assert_frame_equal(undo, float_frame) # check NA handling - data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + data = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]}) data.index = Index(["a", "b", "c"]) result = data.unstack() @@ -592,7 +592,7 @@ def test_unstack_to_series(self, float_frame): levels=[["x", "y"], ["a", "b", "c"]], codes=[[0, 0, 0, 1, 1, 
1], [0, 1, 2, 0, 1, 2]], ) - expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) + expected = Series([1, 2, np.nan, 3, 4, np.nan], index=midx) tm.assert_series_equal(result, expected) @@ -902,9 +902,9 @@ def cast(val): def test_unstack_nan_index2(self): # GH7403 df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) - # Explicit cast to avoid implicit cast when setting to np.NaN + # Explicit cast to avoid implicit cast when setting to np.nan df = df.astype({"B": "float"}) - df.iloc[3, 1] = np.NaN + df.iloc[3, 1] = np.nan left = df.set_index(["A", "B"]).unstack(0) vals = [ @@ -921,9 +921,9 @@ def test_unstack_nan_index2(self): tm.assert_frame_equal(left, right) df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) - # Explicit cast to avoid implicit cast when setting to np.NaN + # Explicit cast to avoid implicit cast when setting to np.nan df = df.astype({"B": "float"}) - df.iloc[2, 1] = np.NaN + df.iloc[2, 1] = np.nan left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] @@ -935,9 +935,9 @@ def test_unstack_nan_index2(self): tm.assert_frame_equal(left, right) df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) - # Explicit cast to avoid implicit cast when setting to np.NaN + # Explicit cast to avoid implicit cast when setting to np.nan df = df.astype({"B": "float"}) - df.iloc[3, 1] = np.NaN + df.iloc[3, 1] = np.nan left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] @@ -958,7 +958,7 @@ def test_unstack_nan_index3(self, using_array_manager): } ) - df.iloc[3, 1] = np.NaN + df.iloc[3, 1] = np.nan left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) @@ -1754,7 +1754,7 @@ def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack) result = df["foo"].stack(future_stack=future_stack).sort_index() tm.assert_series_equal(stacked["foo"], result, check_names=False) assert result.name is None - assert stacked["bar"].dtype == np.float_ + assert stacked["bar"].dtype == np.float64 def test_unstack_bug(self, future_stack): df = DataFrame( @@ -2508,3 +2508,19 @@ def test_unstack_mixed_level_names(self): index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]), ) tm.assert_frame_equal(result, expected) + + +def test_stack_tuple_columns(future_stack): + # GH#54948 - test stack when the input has a non-MultiIndex with tuples + df = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=[("a", 1), ("a", 2), ("b", 1)] + ) + result = df.stack(future_stack=future_stack) + expected = Series( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=MultiIndex( + levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]], + codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 37ca52eba6451..ef78ae62cb4d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -10,6 +10,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.fixture() def gpd_style_subclass_df(): @@ -734,8 +738,77 @@ def test_replace_list_method(self): # https://github.com/pandas-dev/pandas/pull/46018 df = tm.SubclassedDataFrame({"A": [0, 1, 2]}) msg = "The 'method' keyword in 
SubclassedDataFrame.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=False + ): result = df.replace([1, 2], method="ffill") expected = tm.SubclassedDataFrame({"A": [0, 0, 0]}) assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) + + +class MySubclassWithMetadata(DataFrame): + _metadata = ["my_metadata"] + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + my_metadata = kwargs.pop("my_metadata", None) + if args and isinstance(args[0], MySubclassWithMetadata): + my_metadata = args[0].my_metadata # type: ignore[has-type] + self.my_metadata = my_metadata + + @property + def _constructor(self): + return MySubclassWithMetadata + + +def test_constructor_with_metadata(): + # https://github.com/pandas-dev/pandas/pull/54922 + # https://github.com/pandas-dev/pandas/issues/55120 + df = MySubclassWithMetadata( + np.random.default_rng(2).random((5, 3)), columns=["A", "B", "C"] + ) + subset = df[["A", "B"]] + assert isinstance(subset, MySubclassWithMetadata) + + +class SimpleDataFrameSubClass(DataFrame): + """A subclass of DataFrame that does not define a constructor.""" + + +class SimpleSeriesSubClass(Series): + """A subclass of Series that does not define a constructor.""" + + +class TestSubclassWithoutConstructor: + def test_copy_df(self): + expected = DataFrame({"a": [1, 2, 3]}) + result = SimpleDataFrameSubClass(expected).copy() + + assert ( + type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_copy_series(self): + expected = Series([1, 2, 3]) + result = SimpleSeriesSubClass(expected).copy() + + tm.assert_series_equal(result, expected) + + def test_series_to_frame(self): + orig = Series([1, 2, 3]) + expected = orig.to_frame() + result = SimpleSeriesSubClass(orig).to_frame() + + assert ( + type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_groupby(self): + df = SimpleDataFrameSubClass(DataFrame({"a": [1, 2, 3]})) + + for _, v in df.groupby("a"): + assert type(v) is DataFrame diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index cdfa80c8c7cb5..406f4b9b127e7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -513,6 +513,18 @@ def test_groupby_agg_dict_with_getitem(): tm.assert_frame_equal(result, expected) +def test_groupby_agg_dict_dup_columns(): + # GH#55006 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": "sum"}) + expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "op", [ diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index f917f567e1ce3..ff50628aad17d 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -229,7 +229,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = DataFrame([11, 12, 13], columns=["a"]) - grps = np.arange(0, 25, 5, dtype=np.int_) + grps = np.arange(0, 25, 5, 
dtype=int) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "sum", alt=None, numeric_only=True diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d0ae9eeed394f..68ce58ad23690 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -18,7 +18,7 @@ from pandas.tests.groupby import get_groupby_method_args -def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): +def cartesian_product_for_groupers(result, args, names, fill_value=np.nan): """Reindex to a cartesian product for the groupers, preserving the nature (Categorical) of each grouper """ @@ -42,28 +42,28 @@ def f(a): # These expected values can be used across several tests (i.e. they are # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be # hardcoded in one place. - "all": np.NaN, - "any": np.NaN, "count": 0, - "corrwith": np.NaN, - "first": np.NaN, - "idxmax": np.NaN, - "idxmin": np.NaN, - "last": np.NaN, - "max": np.NaN, - "mean": np.NaN, - "median": np.NaN, - "min": np.NaN, - "nth": np.NaN, + "all": np.nan, + "any": np.nan, "count": 0, + "corrwith": np.nan, + "first": np.nan, + "idxmax": np.nan, + "idxmin": np.nan, + "last": np.nan, + "max": np.nan, + "mean": np.nan, + "median": np.nan, + "min": np.nan, + "nth": np.nan, "nunique": 0, - "prod": np.NaN, - "quantile": np.NaN, - "sem": np.NaN, + "prod": np.nan, + "quantile": np.nan, + "sem": np.nan, "size": 0, - "skew": np.NaN, - "std": np.NaN, + "skew": np.nan, + "std": np.nan, "sum": 0, - "var": np.NaN, + "var": np.nan, } @@ -1750,8 +1750,8 @@ def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( cat2 = Categorical([0, 1]) idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"]) expected_dict = { - "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"), - "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"), + "first": Series([0, np.nan, np.nan, 1], idx, name="c"), + "last": Series([1, np.nan, np.nan, 0], idx, name="c"), } expected = expected_dict[func] @@ -1775,8 +1775,8 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( cat2 = Categorical([0, 1]) idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"]) expected_dict = { - "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"), - "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"), + "first": Series([0, np.nan, np.nan, 1], idx, name="c"), + "last": Series([1, np.nan, np.nan, 0], idx, name="c"), } expected = expected_dict[func].to_frame() diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index fd5018d05380c..885e7848b76cb 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -232,7 +232,7 @@ def test_count_with_only_nans_in_first_group(self): def test_count_groupby_column_with_nan_in_groupby_column(self): # https://github.com/pandas-dev/pandas/issues/32841 - df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) + df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]}) res = df.groupby(["B"]).count() expected = DataFrame( index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} @@ -379,3 +379,14 @@ def __eq__(self, other): result = df.groupby("grp").count() expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp")) tm.assert_frame_equal(result, expected) + + +def test_count_arrow_string_array(any_string_dtype): + # GH#54751 + pytest.importorskip("pyarrow") + df =
DataFrame( + {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)} + ) + result = df.groupby("a").count() + expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0abf6428730ff..ac58701f5fa39 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -822,7 +822,7 @@ def test_nlargest_and_smallest_noop(data, groups, dtype, method): data = list(reversed(data)) ser = Series(data, name="a") result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 24a550b4602a6..49ae217513018 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,6 +9,7 @@ PerformanceWarning, SpecificationError, ) +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2513,13 +2514,21 @@ def test_groupby_column_index_name_lost(func): tm.assert_index_equal(result, expected) -def test_groupby_duplicate_columns(): +@pytest.mark.parametrize( + "infer_string", + [ + False, + pytest.param(True, marks=td.skip_if_no("pyarrow")), + ], +) +def test_groupby_duplicate_columns(infer_string): # GH: 31735 df = DataFrame( {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} ).astype(object) df.columns = ["A", "B", "B"] - result = df.groupby([0, 0, 0, 0]).min() + with pd.option_context("future.infer_string", infer_string): + result = df.groupby([0, 0, 0, 0]).min() expected = DataFrame( [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object ) @@ -2739,13 +2748,20 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -def test_by_column_values_with_same_starting_value(): +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_by_column_values_with_same_starting_value(dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": ["sad", "happy", "happy"], + "Mood": Series(["sad", "happy", "happy"], dtype=dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} @@ -2977,12 +2993,12 @@ def test_groupby_reduce_period(): res = gb.max() expected = ser[-10:] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) res = gb.min() expected = ser[:10] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) @@ -3161,27 +3177,24 @@ def test_groupby_series_with_datetimeindex_month_name(): tm.assert_series_equal(result, expected) -def test_rolling_corr_grouping_column_contains_tuples_1(df): - # GH 44078 - df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, 
index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_rolling_corr_case_grrouping_column_contains_tuples_2(df): - # GH 44078 - df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples( - [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] +def test_get_group_axis_1(): + # GH#54858 + df = DataFrame( + { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + "col3": [3, 8, 2, 10], + "col4": [1, 13, 6, 15], + "col5": [-4, 5, 6, -7], + } ) + with tm.assert_produces_warning(FutureWarning, match="deprecated"): + grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1]) + result = grouped.get_group(1) expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + { + "col1": [0, 3, 2, 3], + "col5": [-4, 5, 6, -7], + } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index 495f3fcd359c7..bb4b9aa866ac9 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -166,12 +166,14 @@ def test_shift_periods_freq(): tm.assert_frame_equal(result, expected) -def test_shift_disallow_freq_and_fill_value(): +def test_shift_deprecate_freq_and_fill_value(): # GH 53832 data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} df = DataFrame(data, index=date_range(start="20100101", periods=6)) - msg = "Cannot pass both 'freq' and 'fill_value' to (Series|DataFrame).shift" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") @@ -238,12 +240,15 @@ def test_group_shift_with_multiple_periods_and_fill_value(): tm.assert_frame_equal(shifted_df, expected_df) -def test_group_shift_with_multiple_periods_and_both_fill_and_freq_fails(): +def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, index=date_range("1/1/2000", periods=5, freq="H"), ) - msg = r"Cannot pass both 'freq' and 'fill_value' to.*" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="H") diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..678211ea4a053 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -11,6 +11,10 @@ import pandas._testing as tm from pandas.tests.groupby import get_groupby_method_args +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.mark.parametrize( "obj", diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 37eb52be0b37b..30c7e1df1e691 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -108,7 +108,7 @@ def test_max_inat_not_all_na(): # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_series_equal(result, expected, check_exact=True) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index e36c248c99ad6..ee7d342472493 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, Series, + option_context, ) import pandas._testing as tm @@ -66,3 +67,14 @@ def test_axis_1_unsupported(self, numba_supported_reductions): gb = df.groupby("a", axis=1) with pytest.raises(NotImplementedError, match="axis=1"): getattr(gb, func)(engine="numba", **kwargs) + + def test_no_engine_doesnt_raise(self): + # GH55520 + df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) + gb = df.groupby("a") + # Make sure functions w/out an engine argument don't raise + # when the global use_numba option is set + with option_context("compute.use_numba", True): + res = gb.agg({"b": "first"}) + expected = gb.agg({"b": "first"}) + tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 165d72bf3e878..5a12f9a8e0e35 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -468,7 +468,7 @@ def test_groupby_quantile_dt64tz_period(): # Check that we match the group-by-group result exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} expected = DataFrame(exp).T.infer_objects() - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 26881bdd18274..5d85a0783e024 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -578,7 +578,7 @@ def test_rank_min_int(): result = df.groupby("grp").rank() expected = DataFrame( - {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.NaN, 1.0, np.NaN]} + {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]} ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 76b26c04f9f3a..93a4e743d0d71 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -39,7 +41,7 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): if sort: expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -104,3 +106,25 @@ def test_size_series_masked_type_returns_Int64(dtype): result = ser.groupby(level=0).size() expected = Series([2, 1], dtype="Int64", index=["a", "b"]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_size_strings(dtype): + # GH#55627 + df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) + result =
df.groupby("a")["b"].size() + exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + expected = Series( + [2, 1], + index=Index(["a", "b"], name="a", dtype=dtype), + name="b", + dtype=exp_dtype, + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 7c50124e57e29..070bdda976dc4 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,6 +9,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( Categorical, CategoricalIndex, @@ -366,6 +368,14 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -383,7 +393,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, + dtype, ): + education_df = education_df.astype(dtype) + education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) result = gp["education"].value_counts( @@ -392,11 +405,17 @@ def test_compound( expected = DataFrame() for column in ["country", "gender", "education"]: expected[column] = [education_df[column][row] for row in expected_rows] + expected = expected.astype(dtype) + expected.columns = expected.columns.astype(dtype) if normalize: expected["proportion"] = expected_count expected["proportion"] /= expected_group_size + if dtype == "string[pyarrow]": + expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count + if dtype == "string[pyarrow]": + expected["count"] = expected["count"].convert_dtypes() tm.assert_frame_equal(result, expected) @@ -993,7 +1012,7 @@ def test_mixed_groupings(normalize, expected_label, expected_values): result = gp.value_counts(sort=True, normalize=normalize) expected = DataFrame( { - "level_0": np.array([4, 4, 5], dtype=np.int_), + "level_0": np.array([4, 4, 5], dtype=int), "A": [1, 1, 2], "level_2": [8, 8, 7], "B": [1, 3, 2], @@ -1143,3 +1162,14 @@ def test_value_counts_time_grouper(utc): ) expected = Series(1, index=index, name="count") tm.assert_series_equal(result, expected) + + +def test_value_counts_integer_columns(): + # GH#55627 + df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}) + gp = df.groupby([1, 2], as_index=False, sort=False) + result = gp[3].value_counts() + expected = DataFrame( + {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 062dfe3931423..d51c618b3c29d 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1063,7 +1063,7 @@ def test_pct_change(frame_or_series, freq, periods, fill_method, limit): expected = expected.to_frame("vals") msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " f"{type(gb).__name__}.pct_change are deprecated" ) with 
tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 638124ac20e06..60abbfc441e8e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Index(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 5ecb2c753644d..6586f5f9de480 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -54,6 +54,14 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) + def test_insert_none_into_string_numpy(self): + # GH#55365 + pytest.importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "pos,expected", [ diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 21d1630af9de2..488f79eea0d11 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -182,7 +182,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), False, ), @@ -190,7 +190,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (1, "B"), (2, "A"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -198,7 +198,7 @@ def test_symmetric_difference(self): "union", np.array( [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -208,13 +208,13 @@ def test_tuple_union_bug(self, method, expected, sort): index1 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) index2 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..a8353f301a3c3 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -88,3 +88,9 @@ def test_equals_multiindex(self): ci = mi.to_flat_index().astype("category") assert not ci.equals(mi) + + def test_equals_string_dtype(self, any_string_dtype): + # GH#55364 + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 94cf86b7fb9c5..d339639dc5def 100644 
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -20,7 +20,7 @@ class TestDatetimeIndex: def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx") + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") result = idx.astype(object) expected = Index( @@ -84,7 +84,7 @@ def test_astype_str_nat(self): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) @@ -141,7 +141,7 @@ def test_astype_str_freq_and_tz(self): def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx") + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) @@ -151,7 +151,7 @@ def test_astype_datetime64(self): tm.assert_index_equal(result, idx) assert result is idx - idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST", name="idx") + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], tz="EST", name="idx") msg = "Cannot use .astype to convert from timezone-aware" with pytest.raises(TypeError, match=msg): # dt64tz->dt64 deprecated @@ -202,7 +202,7 @@ def test_astype_object_with_nat(self): ) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) msg = "Cannot cast DatetimeIndex to dtype" if dtype == "datetime64": msg = "Casting to unit-less dtype 'datetime64' is not supported" diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 733c14f33567a..fc7ce19c42b38 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -300,7 +300,7 @@ def test_construction_index_with_mixed_timezones(self): assert not isinstance(result, DatetimeIndex) msg = "DatetimeIndex has mixed timezones" - msg_depr = "parsing datetimes with mixed time zones will raise a warning" + msg_depr = "parsing datetimes with mixed time zones will raise an error" with pytest.raises(TypeError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg_depr): DatetimeIndex(["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"]) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index aa4954ff0ba85..e5e6d99c13e94 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DataFrame, @@ -54,7 +56,7 @@ def test_time_overflow_for_32bit_machines(self): # (which has value 1e9) and since the max value for np.int32 is ~2e9, # and since those machines won't promote np.int32 to np.int64, we get # overflow. 
- periods = np.int_(1000) + periods = np_long(1000) idx1 = date_range(start="2000", periods=periods, freq="S") assert len(idx1) == periods diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 8e1b41095e056..cfbf1a75b25a8 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DatetimeIndex, @@ -91,7 +93,7 @@ def test_dti_business_getitem(self, freq): assert fancy_indexed.freq is None # 32-bit vs. 64-bit platforms - assert rng[4] == rng[np.int_(4)] + assert rng[4] == rng[np_long(4)] @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_getitem_matplotlib_hackaround(self, freq): diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 6f3c83b999e94..09b06ecd5630d 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -538,8 +538,8 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): times = [ "11/06/2011 00:00", - np.NaN, - np.NaN, + np.nan, + np.nan, "11/06/2011 02:00", "11/06/2011 03:00", ] diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 49e8df2b71f22..aff4944e7bd55 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -247,12 +247,12 @@ def test_is_unique_interval(self, closed): assert idx.is_unique is True # unique NaN - idx = IntervalIndex.from_tuples([(np.NaN, np.NaN)], closed=closed) + idx = IntervalIndex.from_tuples([(np.nan, np.nan)], closed=closed) assert idx.is_unique is True # non-unique NaN idx = IntervalIndex.from_tuples( - [(np.NaN, np.NaN), (np.NaN, np.NaN)], closed=closed + [(np.nan, np.nan), (np.nan, np.nan)], closed=closed ) assert idx.is_unique is False diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index c5a3512113655..700af142958b3 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -217,7 +217,7 @@ def test_join_multi_with_nan(): ) df2 = DataFrame( data={"col2": [2.1, 2.2]}, - index=MultiIndex.from_product([["A"], [np.NaN, 2.0]], names=["id1", "id2"]), + index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]), ) result = df1.join(df2) expected = DataFrame( diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index da9838d4a2ed3..06dbb33aadf97 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -169,6 +169,28 @@ def test_append_names_dont_match(): tm.assert_index_equal(result, expected) +def test_append_overlapping_interval_levels(): + # GH 54934 + ivl1 = pd.IntervalIndex.from_breaks([0.0, 1.0, 2.0]) + ivl2 = pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5]) + mi1 = MultiIndex.from_product([ivl1, ivl1]) + mi2 = MultiIndex.from_product([ivl2, ivl2]) + result = mi1.append(mi2) + expected = MultiIndex.from_tuples( + [ + (pd.Interval(0.0, 1.0), pd.Interval(0.0, 1.0)), + (pd.Interval(0.0, 1.0), pd.Interval(1.0, 2.0)), + (pd.Interval(1.0, 2.0), pd.Interval(0.0, 1.0)), + (pd.Interval(1.0, 2.0), pd.Interval(1.0, 2.0)), + (pd.Interval(0.5, 1.5), pd.Interval(0.5, 1.5)), + (pd.Interval(0.5, 1.5), pd.Interval(1.5, 2.5)), + (pd.Interval(1.5, 2.5), 
pd.Interval(0.5, 1.5)), + (pd.Interval(1.5, 2.5), pd.Interval(1.5, 2.5)), + ] + ) + tm.assert_index_equal(result, expected) + + def test_repeat(): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 977c7da7d866f..8cd295802a5d1 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -227,6 +227,14 @@ def test_fillna_float64(self): exp = Index([1.0, "obj", 3.0], name="x") tm.assert_index_equal(idx.fillna("obj"), exp, exact=True) + def test_logical_compat(self, simple_index): + idx = simple_index + assert idx.all() == idx.values.all() + assert idx.any() == idx.values.any() + + assert idx.all() == idx.to_series().all() + assert idx.any() == idx.to_series().any() + class TestNumericInt: @pytest.fixture(params=[np.int64, np.int32, np.int16, np.int8, np.uint64]) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index 2a605d136175e..e54cd73a35f59 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -17,14 +17,14 @@ class TestPeriodIndexAsType: @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) def test_astype_raises(self, dtype): # GH#13149, GH#13209 - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D") msg = "Cannot cast PeriodIndex to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) def test_astype_conversion(self): # GH#13149, GH#13209 - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D", name="idx") + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") result = idx.astype(object) expected = Index( diff --git a/pandas/tests/indexes/period/test_pickle.py b/pandas/tests/indexes/period/test_pickle.py index 82f906d1e361f..cb981ab10064f 100644 --- a/pandas/tests/indexes/period/test_pickle.py +++ b/pandas/tests/indexes/period/test_pickle.py @@ -14,7 +14,7 @@ class TestPickle: @pytest.mark.parametrize("freq", ["D", "M", "A"]) def test_pickle_round_trip(self, freq): - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq=freq) result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b3fb5a26ca63f..da4b44227bef3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -10,7 +10,6 @@ from pandas.compat import IS64 from pandas.errors import InvalidIndexError -from pandas.util._test_decorators import async_mark from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, @@ -457,7 +456,7 @@ def test_fancy(self, simple_index): ["string", "int64", "int32", "uint64", "uint32", "float64", "float32"], indirect=True, ) - @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) + @pytest.mark.parametrize("dtype", [int, np.bool_]) def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) empty_index = type(index)([], dtype=index.dtype) @@ -473,7 +472,7 @@ def test_empty_fancy(self, index, dtype): def test_empty_fancy_raises(self, index): # DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. 
- empty_farr = np.array([], dtype=np.float_) + empty_farr = np.array([], dtype=np.float64) empty_index = type(index)([], dtype=index.dtype) assert index[[]].identical(empty_index) @@ -692,7 +691,12 @@ def test_format_missing(self, vals, nulls_fixture): @pytest.mark.parametrize("op", ["any", "all"]) def test_logical_compat(self, op, simple_index): index = simple_index - assert getattr(index, op)() == getattr(index.values, op)() + left = getattr(index, op)() + assert left == getattr(index.values, op)() + right = getattr(index.to_series(), op)() + # left might not match right exactly in e.g. string cases + # because we use np.any/all instead of .any/all + assert bool(left) == bool(right) @pytest.mark.parametrize( "index", ["string", "int64", "int32", "float64", "float32"], indirect=True @@ -1213,14 +1217,13 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 71cc7f29c62bc..5ad2e9b2f717e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -159,3 +159,11 @@ def test_where_cast_str(self, simple_index): result = index.where(mask, ["foo"]) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_diff(self, unit): + # GH 55080 + dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) + result = dti.diff(1) + expected = pd.TimedeltaIndex([pd.NaT, 10, 10], unit=unit).as_unit(unit) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index f8f5a543a9c19..79dc423f12a85 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -209,17 +209,25 @@ def test_numeric_compat(self, simple_index): 1 // idx def test_logical_compat(self, simple_index): - if ( - isinstance(simple_index, RangeIndex) - or is_numeric_dtype(simple_index.dtype) - or simple_index.dtype == object - ): + if simple_index.dtype == object: pytest.skip("Tested elsewhere.") idx = simple_index - with pytest.raises(TypeError, match="cannot perform all"): - idx.all() - with pytest.raises(TypeError, match="cannot perform any"): - idx.any() + if idx.dtype.kind in "iufcbm": + assert idx.all() == idx._values.all() + assert idx.all() == idx.to_series().all() + assert idx.any() == idx._values.any() + assert idx.any() == idx.to_series().any() + else: + msg = "cannot perform (any|all)" + if isinstance(idx, IntervalIndex): + msg = ( + r"'IntervalArray' with dtype interval\[.*\] does " + "not support reduction '(any|all)'" + ) + with pytest.raises(TypeError, match=msg): + idx.all() + with pytest.raises(TypeError, match=msg): + idx.any() def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 2fd203dbc77ed..a64994efec85a 100644 ---
a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -899,3 +899,10 @@ def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): result = idx.union(idx2) expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype) tm.assert_index_equal(result, expected) + + def test_union_string_array(self, any_string_dtype): + idx1 = Index(["a"], dtype=any_string_dtype) + idx2 = Index(["b"], dtype=any_string_dtype) + result = idx1.union(idx2) + expected = Index(["a", "b"], dtype=any_string_dtype) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 9b17a8af59ac5..f69f0fd3d78e2 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -45,7 +45,7 @@ def test_astype_object_with_nat(self): def test_astype(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN], name="idx") + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") result = idx.astype(object) expected = Index( @@ -78,7 +78,7 @@ def test_astype_uint(self): def test_astype_timedelta64(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) msg = ( r"Cannot convert from timedelta64\[ns\] to timedelta64. " @@ -98,7 +98,7 @@ def test_astype_timedelta64(self): @pytest.mark.parametrize("dtype", [float, "datetime64", "datetime64[ns]"]) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) msg = "Cannot cast TimedeltaIndex to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index de989ad550f2b..081da385ebcc3 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -166,7 +166,7 @@ def test_getitem_intkey_leading_level( mi = ser.index assert isinstance(mi, MultiIndex) if dtype is int: - assert mi.levels[0].dtype == np.int_ + assert mi.levels[0].dtype == np.dtype(int) else: assert mi.levels[0].dtype == np.float64 diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 960a8a59ba5a7..51b94c3beeb6f 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -556,3 +556,21 @@ def test_frame_setitem_copy_no_write( result = df tm.assert_frame_equal(result, expected) + + +def test_frame_setitem_partial_multiindex(): + # GH 54875 + df = DataFrame( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": 6, + "d": 7, + } + ).set_index(["a", "b", "c"]) + ser = Series(8, index=df.index.droplevel("c")) + result = df.copy() + result["d"] = ser + expected = df.copy() + expected["d"] = 8 + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d6d8a63797bb6..54e204c43dadd 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe.loc[0, ["foo"]] = None + 
start_dataframe.loc[0, ["foo"]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None + start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe.loc[ - start_dataframe["foo"] == start_dataframe["foo"][0] - ] = None + start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self): "d": ["a", "b", "c"], } ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): - start_dataframe.iloc[0] = None + start_dataframe.iloc[0] = None exp = DataFrame( { @@ -1132,3 +1124,19 @@ def test_scalar_setitem_series_with_nested_value_length1(value, indexer_sli): assert (ser.loc[0] == value).all() else: assert ser.loc[0] == value + + +def test_object_dtype_series_set_series_element(): + # GH 48933 + s1 = Series(dtype="O", index=["a", "b"]) + + s1["a"] = Series() + s1.loc["b"] = Series() + + tm.assert_series_equal(s1.loc["a"], Series()) + tm.assert_series_equal(s1.loc["b"], Series()) + + s2 = Series(dtype="O", index=["a", "b"]) + + s2.iloc[1] = Series() + tm.assert_series_equal(s2.iloc[1], Series()) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abbf22a7fc70a..8b2730b3ab082 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -201,7 +201,7 @@ def test_column_types_consistent(self): df = DataFrame( data={ "channel": [1, 2, 3], - "A": ["String 1", np.NaN, "String 2"], + "A": ["String 1", np.nan, "String 2"], "B": [ Timestamp("2019-06-11 11:00:00"), pd.NaT, @@ -442,7 +442,7 @@ def test_loc_to_fail(self): ) msg = ( - rf"\"None of \[Index\(\[1, 2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[1, 2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -460,7 +460,7 @@ def test_loc_to_fail2(self): s.loc[-1] msg = ( - rf"\"None of \[Index\(\[-1, -2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-1, -2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -476,7 +476,7 @@ def test_loc_to_fail2(self): s["a"] = 2 msg = ( - rf"\"None of \[Index\(\[-2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -493,7 +493,7 @@ def test_loc_to_fail3(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - rf"\"None of \[Index\(\[3\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[3\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with 
pytest.raises(KeyError, match=msg): @@ -510,7 +510,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.int_().dtype}')] are in the [index]" + msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] @@ -1209,7 +1209,7 @@ def test_loc_setitem_empty_append_raises(self): df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.int_().dtype}'\)\] " + rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 8e5cde42ec91b..8f499644f1013 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -402,7 +402,7 @@ def test_series_partial_set(self): # raises as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}'\)\] " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -483,7 +483,7 @@ def test_series_partial_set_with_name(self): # raises as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}', " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}', " r"name='idx'\)\] are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 2a65937a82200..97a388569e261 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -4,6 +4,10 @@ import pytest from pandas._libs.tslibs import iNaT +from pandas.compat import ( + is_ci_environment, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd @@ -14,6 +18,7 @@ DtypeKind, ) from pandas.core.interchange.from_dataframe import from_dataframe +from pandas.core.interchange.utils import ArrowCTypes @pytest.fixture @@ -32,7 +37,7 @@ def string_data(): "234,3245.67", "gSaf,qWer|Gre", "asd3,4sad|", - np.NaN, + np.nan, ] } @@ -309,11 +314,21 @@ def test_datetimetzdtype(tz, unit): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -def test_interchange_from_non_pandas_tz_aware(): +def test_interchange_from_non_pandas_tz_aware(request): # GH 54239, 54287 pa = pytest.importorskip("pyarrow", "11.0.0") import pyarrow.compute as pc + if is_platform_windows() and is_ci_environment(): + mark = pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for pyarrow." 
+ ), + ) + request.node.add_marker(mark) + arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)]) arr = pc.assume_timezone(arr, "Asia/Kathmandu") table = pa.table({"arr": arr}) @@ -326,3 +341,24 @@ def test_interchange_from_non_pandas_tz_aware(): dtype="datetime64[us, Asia/Kathmandu]", ) tm.assert_frame_equal(expected, result) + + +def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__() + interchange = df.__dataframe__() + column = interchange.get_column_by_name("a") + buffers = column.get_buffers() + buffers_data = buffers["data"] + buffer_dtype = buffers_data[1] + buffer_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + buffer_dtype[3], + ) + buffers["data"] = (buffers_data[0], buffer_dtype) + column.get_buffers = lambda: buffers + interchange.get_column_by_name = lambda _: column + monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) + pd.api.interchange.from_dataframe(df) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 97d9f13bd9e9e..4b23829a554aa 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -468,12 +468,12 @@ def test_set_change_dtype(self, mgr): np.random.default_rng(2).standard_normal(N).astype(int), ) idx = mgr2.items.get_loc("quux") - assert mgr2.iget(idx).dtype == np.int_ + assert mgr2.iget(idx).dtype == np.dtype(int) mgr2.iset( mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N) ) - assert mgr2.iget(idx).dtype == np.float_ + assert mgr2.iget(idx).dtype == np.float64 def test_copy(self, mgr): cp = mgr.copy(deep=False) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 170e2f61e7d4a..701bfe3767db4 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -234,3 +234,19 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. + + * 'python' + * 'pyarrow' + """ + return request.param diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 3b0dac21ef10c..7dfd35d5424b4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3320,6 +3320,14 @@ def format_func(x): result = formatter.get_result() assert result == ["10:10", "12:12"] + def test_datetime64formatter_tz_ms(self): + x = Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ).dt.tz_localize("US/Pacific") + result = fmt.Datetime64TZFormatter(x).get_result() + assert result[0].strip() == "2999-01-01 00:00:00-08:00" + assert result[1].strip() == "2999-01-02 00:00:00-08:00" + class TestNaTFormatting: def test_repr(self): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 32509a799fa69..c8e984a92f418 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -181,7 +181,7 @@ def test_to_csv_na_rep(self): # see gh-11553 # # Testing if NaN values are correctly represented in the index. 
- df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]}) + df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) @@ -189,7 +189,7 @@ def test_to_csv_na_rep(self): assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # now with an index containing only NaNs - df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]}) + df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 637d62b98a831..663d0a195fa55 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -599,7 +599,7 @@ def test_blocks_compat_GH9037(self): ) # JSON deserialisation always creates unicode strings - df_mixed.columns = df_mixed.columns.astype("unicode") + df_mixed.columns = df_mixed.columns.astype(np.str_) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( @@ -2099,7 +2099,7 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) @@ -2108,8 +2108,8 @@ def test_json_roundtrip_string_inference(orient): result = read_json(StringIO(out)) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())), - columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 00a26a755756f..442aa5ef87b10 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -86,7 +86,9 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -177,7 +179,9 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1c0f0939029ff..fbb026f98df6f 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = 
"string[pyarrow_numpy]" data = """a,b x,1 @@ -558,3 +558,20 @@ def test_string_inference(all_parsers): columns=pd.Index(["a", "b"], dtype=dtype), ) tm.assert_frame_equal(result, expected) + + +def test_dtypes_with_usecols(all_parsers): + # GH#54868 + + parser = all_parsers + data = """a,b,c +1,2,3 +4,5,6""" + + result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object}) + if parser.engine == "pyarrow": + values = [1, 4] + else: + values = ["1", "4"] + expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 5cb54bb4e2916..d72174c40478e 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -658,3 +658,29 @@ def test_header_missing_rows(all_parsers): msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=[0, 1, 2]) + + +@skip_pyarrow +def test_header_multiple_whitespaces(all_parsers): + # GH#54931 + parser = all_parsers + data = """aa bb(1,1) cc(1,1) + 0 2 3.5""" + + result = parser.read_csv(StringIO(data), sep=r"\s+") + expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_header_delim_whitespace(all_parsers): + # GH#54918 + parser = all_parsers + data = """a,b +1,2 +3,4 + """ + + result = parser.read_csv(StringIO(data), delim_whitespace=True) + expected = DataFrame({"a,b": ["1,2", "3,4"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index eecacf29de872..c6afff56de16b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -46,8 +46,8 @@ def test_read_csv_with_custom_date_parser(all_parsers): # GH36111 def __custom_date_parser(time): - time = time.astype(np.float_) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(np.float64) + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( @@ -86,8 +86,8 @@ def __custom_date_parser(time): def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): # GH44366 def __custom_date_parser(time): - time = time.astype(np.float_) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(np.float64) + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( @@ -979,12 +979,15 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -def test_parse_tz_aware(all_parsers, request): +def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -2231,6 +2234,9 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ diff --git 
a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 425828cb881a7..bc5ed4d208e0f 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -392,7 +392,7 @@ def test_read_py2_hdf_file_in_py3(datapath): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -400,7 +400,7 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype=pd.ArrowDtype(pa.string()), - columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a0fee6751bf53..cf43203466ef4 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self): def test_string_inference(self, tmp_path): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame( - data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string()) - ) + expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index c2d791ba24c87..d90f803f1e607 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -426,7 +426,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fcc1c218a149d..1d68f12270b55 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,5 +1,6 @@ """ test parquet compat """ import datetime +from decimal import Decimal from io import BytesIO import os import pathlib @@ -16,6 +17,7 @@ from pandas.compat.pyarrow import ( pa_version_under7p0, pa_version_under8p0, + pa_version_under11p0, pa_version_under13p0, ) @@ -404,12 +406,6 @@ def test_columns_dtypes(self, engine): @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): - if compression == "snappy": - pytest.importorskip("snappy") - - elif compression == "brotli": - pytest.importorskip("brotli") - df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine, write_kwargs={"compression": compression}) @@ -1105,8 +1101,6 @@ def test_df_attrs_persistence(self, tmp_path, pa): def test_string_inference(self, tmp_path, pa): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) df.to_parquet(path, engine="pyarrow") @@ -1114,11 +1108,54 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - 
dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(pa_version_under11p0, reason="not supported before 11.0") + def test_roundtrip_decimal(self, tmp_path, pa): + # GH#54768 + import pyarrow as pa + + path = tmp_path / "decimal.p" + df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") + df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) + result = read_parquet(path) + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + tm.assert_frame_equal(result, expected) + + def test_infer_string_large_string_type(self, tmp_path, pa): + # GH#54798 + import pyarrow as pa + import pyarrow.parquet as pq + + path = tmp_path / "large_string.p" + + table = pa.table({"a": pa.array([None, "b", "c"], pa.large_string())}) + pq.write_table(table, path) + + with pd.option_context("future.infer_string", True): + result = read_parquet(path) + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + # NOTE: this test is not run by default, because it requires a lot of memory (>5GB) + # @pytest.mark.slow + # def test_string_column_above_2GB(self, tmp_path, pa): + # # https://github.com/pandas-dev/pandas/issues/55606 + # # above 2GB of string data + # v1 = b"x" * 100000000 + # v2 = b"x" * 147483646 + # df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string") + # df.to_parquet(tmp_path / "test.parquet") + # result = read_parquet(tmp_path / "test.parquet") + # assert result["strings"].dtype == "string" + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 02736406e109b..5fd6a52031c52 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2849,13 +2849,14 @@ def setup_driver(cls): def test_keyword_deprecation(self): # GH 54397 msg = ( - "tarting with pandas version 3.0 all arguments of to_sql except for the " - "argument 'name' will be keyword-only." + "Starting with pandas version 3.0 all arguments of to_sql except for the " + "arguments 'name' and 'con' will be keyword-only." 
) df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + df.to_sql("example", self.conn) with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_sql("example", self.conn) + df.to_sql("example", self.conn, None, if_exists="replace") def test_default_type_conversion(self): df = sql.read_sql_table("types", self.conn) @@ -2946,7 +2947,7 @@ def test_read_sql_dtype_backend_table(self, string_storage, func): def test_read_sql_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=self.conn, index=False, if_exists="replace") @@ -2954,13 +2955,38 @@ def test_read_sql_string_inference(self): with pd.option_context("future.infer_string", True): result = read_sql_table(table, self.conn) - dtype = pd.ArrowDtype(pa.string()) + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) tm.assert_frame_equal(result, expected) + def test_roundtripping_datetimes(self): + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", self.conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", self.conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" + + +@pytest.fixture +def sqlite_builtin_detect_types(): + with contextlib.closing( + sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES) + ) as closing_conn: + with closing_conn as conn: + yield conn + + +def test_roundtripping_datetimes_detect_types(sqlite_builtin_detect_types): + # https://github.com/pandas-dev/pandas/issues/55554 + conn = sqlite_builtin_detect_types + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", conn).iloc[0, 0] + assert result == Timestamp("2020-12-31 12:00:00.000000") + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3cfd86049588b..7459aa1df8f3e 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1475,9 +1475,9 @@ def test_stata_111(self, datapath): df = read_stata(datapath("io", "data", "stata", "stata7_111.dta")) original = DataFrame( { - "y": [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0], - "x": [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6], - "w": [2, np.NaN, 5, 2, 4, 4, 3, 1, 2, 3], + "y": [1, 1, 1, 1, 1, 0, 0, np.nan, 0, 0], + "x": [1, 2, 1, 3, np.nan, 4, 3, 5, 1, 6], + "w": [2, np.nan, 5, 2, 4, 4, 3, 1, 2, 3], "z": ["a", "b", "c", "d", "e", "", "g", "h", "i", "j"], } ) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 11008646f0ad4..b97f1d64d57fd 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2487,6 +2487,15 @@ def test_secondary_y(self, secondary_y): assert ax.get_ylim() == (0, 100) assert ax.get_yticks()[0] == 99 + @pytest.mark.slow + def test_plot_no_warning(self): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + df = tm.makeTimeDataFrame() + with tm.assert_produces_warning(False): + _ = df.plot() + _ = df.T.plot() + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6f0afab53c267..768fce023e6e0 100644 --- 
a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -973,3 +973,10 @@ def test_series_none_color(self): ax = series.plot(color=None) expected = _unpack_cycler(mpl.pyplot.rcParams)[:1] _check_colors(ax.get_lines(), linecolors=expected) + + @pytest.mark.slow + def test_plot_no_warning(self, ts): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + with tm.assert_produces_warning(False): + _ = ts.plot() diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index afe5b3c66a611..560b2377ada70 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -867,7 +867,7 @@ def test_idxmin(self): string_series = tm.makeStringSeries().rename("series") # add some NaNs - string_series[5:15] = np.NaN + string_series[5:15] = np.nan # skipna or no assert string_series[string_series.idxmin()] == string_series.min() @@ -900,7 +900,7 @@ def test_idxmax(self): string_series = tm.makeStringSeries().rename("series") # add some NaNs - string_series[5:15] = np.NaN + string_series[5:15] = np.nan # skipna or no assert string_series[string_series.idxmax()] == string_series.max() @@ -1078,6 +1078,26 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() + def test_any_all_pyarrow_string(self): + # GH#54591 + pytest.importorskip("pyarrow") + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() + assert not ser.all(skipna=False) + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() + def test_timedelta64_analytics(self): # index min/max dti = date_range("2012-1-1", periods=3, freq="D") diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 58c5fc7269aee..55d78c516b6f3 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -99,7 +99,7 @@ def _check_stat_op( f = getattr(Series, name) # add some NaNs - string_series_[5:15] = np.NaN + string_series_[5:15] = np.nan # mean, idxmax, idxmin, min, and max are valid for dates if name not in ["max", "min", "mean", "median", "std"]: diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 1d72e6d3970ca..1b20b383c4eae 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -478,7 +478,7 @@ def test_resample_how_method(unit): ) s.index = s.index.as_unit(unit) expected = Series( - [11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], + [11, np.nan, np.nan, np.nan, np.nan, np.nan, 22], index=DatetimeIndex( [ Timestamp("2015-03-31 21:48:50"), @@ -796,34 +796,24 @@ def test_resample_offset(unit): @pytest.mark.parametrize( - "kwargs, expected", + "kwargs", [ - ( - {"origin": "1999-12-31 23:57:00"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": Timestamp("1970-01-01 00:02:00")}, - ["1970-01-01 00:02:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": "epoch", "offset": "2m"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), + {"origin": "1999-12-31 23:57:00"}, + {"origin": Timestamp("1970-01-01 00:02:00")}, + {"origin": "epoch", "offset": "2m"}, # 
origin of '1999-12-31 12:02:00' should be equivalent for this case - ( - {"origin": "1999-12-31 12:02:00"}, - ["1999-12-31 12:02:00", "2000-01-01 01:57:00"], - ), - ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]), + {"origin": "1999-12-31 12:02:00"}, + {"offset": "-3m"}, ], ) -def test_resample_origin(kwargs, unit, expected): +def test_resample_origin(kwargs, unit): # GH 31809 rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit) + exp_rng = date_range( + "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min" + ).as_unit(unit) resampled = ts.resample("5min", **kwargs).mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit): ts.resample("5min", offset=offset) -def test_resample_monthstart_origin(): - # GH 53662 - df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]}) - result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum() - excepted = Series( - [10.0], - index=DatetimeIndex( - ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS" - ), - ) - tm.assert_index_equal(result.index, excepted.index) - - df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]}) - result = df.resample( - "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1) - )["values"].sum() - expected = Series( - [0, 10.0], - index=DatetimeIndex( - ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS" - ), - ) - tm.assert_index_equal(result.index, expected.index) - - def test_resample_origin_prime_freq(unit): # GH 31809 start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" @@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit): tm.assert_index_equal(resampled.index, exp_rng) exp_rng = date_range( - "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min" + "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min" ).as_unit(unit) resampled = ts.resample("17min", origin="2000-01-01").mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit): exp_rng = date_range( "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz ).as_unit(unit) - resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean() tm.assert_index_equal(resampled.index, exp_rng) - resampled = ts.resample( - "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz) - ).mean() + # origin of '1999-12-31 12:02:00+03:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() tm.assert_index_equal(resampled.index, exp_rng) with pytest.raises(ValueError, match=msg): @@ -1356,7 +1323,7 @@ def test_resample_consistency(unit): i30 = date_range("2002-02-02", periods=4, freq="30T").as_unit(unit) s = Series(np.arange(4.0), index=i30) - s.iloc[2] = np.NaN + s.iloc[2] = np.nan # Upsample by factor 3 with reindex() and resample() methods: i10 = date_range(i30[0], i30[-1], freq="10T").as_unit(unit) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 0ded7d7e6bfc5..7559a85de7a6b 100644 --- a/pandas/tests/resample/test_period_index.py +++
b/pandas/tests/resample/test_period_index.py @@ -793,7 +793,7 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): @pytest.mark.parametrize( "freq, expected_values", [ - ("1s", [3, np.NaN, 7, 11]), + ("1s", [3, np.nan, 7, 11]), ("2s", [3, (7 + 11) / 2]), ("3s", [(3 + 7) / 2, 11]), ], ) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 62b0bc2012af1..b5288c793dafa 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -4,7 +4,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.util._test_decorators import async_mark import pandas as pd from pandas import ( @@ -26,8 +25,7 @@ def test_frame(): ) -@async_mark() -async def test_tab_complete_ipython6_warning(ip): +def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( @@ -37,7 +35,7 @@ async def test_tab_complete_ipython6_warning(ip): rs = s.resample("D") """ ) - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 @@ -141,19 +139,6 @@ def test_groupby_with_origin(): start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" middle = "1/15/2000 00:00:00" - # test origin on 1970-01-01 00:00:00 - rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] - - origin = Timestamp(0) - adjusted_grouper = pd.Grouper(freq=freq, origin=origin) - adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") - adjusted_count_ts = adjusted_count_ts[middle_ts:end] - adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") - tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end]) - rng = date_range(start, end, freq="1231min") # prime number ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ts2 = ts[middle:end] @@ -167,19 +152,26 @@ def test_groupby_with_origin(): with pytest.raises(AssertionError, match="Index are different"): tm.assert_index_equal(count_ts.index, count_ts2.index) - # test origin on 2049-10-18 20:00:00 + # test origin on 1970-01-01 00:00:00 + origin = Timestamp(0) + adjusted_grouper = pd.Grouper(freq=freq, origin=origin) + adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") + adjusted_count_ts = adjusted_count_ts[middle:end] + adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") + tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2) - rng = date_range(start, "2049-10-18 20:00:00", freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] + # test origin on 2049-10-18 20:00:00 origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000 adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future) adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count") - adjusted2_count_ts = adjusted2_count_ts[middle_ts:end] + adjusted2_count_ts = adjusted2_count_ts[middle:end] adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count") tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2) + # both groupers use an adjusted timestamp that is a multiple of 1399 min + # they should be equal even if the adjusted timestamp is in the future + tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2) + def
test_nearest(): # GH 17496 diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 386f363c81557..81ca227fb7afb 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -91,10 +91,10 @@ def test_append_length0_frame(self, sort): tm.assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1 = np.zeros((2,), dtype=("i4,f4,S10")) arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2 = np.zeros((3,), dtype=("i4,f4,S10")) arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 869bf3ace9492..7863aa55152f1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -507,10 +507,10 @@ def test_concat_duplicate_indices_raise(self): concat([df1, df2], axis=1) -@pytest.mark.parametrize("dt", np.sctypes["float"]) -def test_concat_no_unnecessary_upcast(dt, frame_or_series): +def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series): # GH 13247 dims = frame_or_series(dtype=object).ndim + dt = float_numpy_dtype dfs = [ frame_or_series(np.array([1], dtype=dt, ndmin=dims)), @@ -522,8 +522,8 @@ def test_concat_no_unnecessary_upcast(dt, frame_or_series): @pytest.mark.parametrize("pdt", [Series, DataFrame]) -@pytest.mark.parametrize("dt", np.sctypes["int"]) -def test_concat_will_upcast(dt, pdt): +def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype): + dt = any_signed_int_numpy_dtype dims = pdt().ndim dfs = [ pdt(np.array([1], dtype=dt, ndmin=dims)), @@ -591,6 +591,9 @@ def test_duplicate_keys_same_frame(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) @pytest.mark.parametrize( "obj", [ @@ -858,3 +861,12 @@ def test_concat_multiindex_with_category(): ) expected = expected.set_index(["c1", "c2"]) tm.assert_frame_equal(result, expected) + + +def test_concat_ea_upcast(): + # GH#54848 + df1 = DataFrame(["a"], dtype="string") + df2 = DataFrame([1], dtype="Int64") + result = concat([df1, df2]) + expected = DataFrame(["a", 1], index=[0, 0]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 91510fae0a9b1..6868f7344c153 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -366,7 +366,7 @@ def test_merge_join_key_dtype_cast(self): lkey = np.array([1]) rkey = np.array([2]) df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer") - assert df["key_0"].dtype == np.int_ + assert df["key_0"].dtype == np.dtype(int) def test_handle_join_key_pass_array(self): left = DataFrame( @@ -390,7 +390,7 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") - expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=np.int_, name="key_0") + expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=int, name="key_0") tm.assert_series_equal(merged["key_0"], expected) left = DataFrame({"value": np.arange(3)}) @@ -689,6 +689,9 @@ def test_merge_nan_right2(self): )[["i1", "i2", "i1_", "i3"]] tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:Passing a 
BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, df, df2): class NotADataFrame(DataFrame): @property @@ -2847,3 +2850,38 @@ def test_merge_multiindex_single_level(): result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) + + +def test_merge_ea_int_and_float_numpy(): + # GH#46178 + df1 = DataFrame([1.0, np.nan], dtype=pd.Int64Dtype()) + df2 = DataFrame([1.5]) + expected = DataFrame(columns=[0], dtype="Int64") + + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) + + df2 = DataFrame([1.0]) + expected = DataFrame([1], columns=[0], dtype="Int64") + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) + + +def test_merge_arrow_string_index(any_string_dtype): + # GH#54894 + pytest.importorskip("pyarrow") + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) + result = left.merge(right, left_on="a", right_index=True, how="left") + expected = DataFrame( + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index d4cb89dadde9b..a4ae81f86513c 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1341,6 +1341,34 @@ def test_by_mixed_tz_aware(self): expected["value_y"] = np.array([np.nan], dtype=object) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[us]"]) + def test_by_datelike(self, dtype): + # GH 55453 + left = pd.DataFrame( + { + "by_col": np.array([1], dtype=dtype), + "on_col": [2], + "value": ["a"], + } + ) + right = pd.DataFrame( + { + "by_col": np.array([1], dtype=dtype), + "on_col": [1], + "value": ["b"], + } + ) + result = merge_asof(left, right, by="by_col", on="on_col") + expected = pd.DataFrame( + { + "by_col": np.array([1], dtype=dtype), + "on_col": [2], + "value_x": ["a"], + "value_y": ["b"], + } + ) + tm.assert_frame_equal(result, expected) + def test_timedelta_tolerance_nearest(self, unit): # GH 27642 if unit == "s": diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 1d0d4e3eb554b..cfb4e92fb45cd 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -70,6 +70,9 @@ def test_multigroup(self, left, right): result = merge_ordered(left, right, on="key", left_by="group") assert result["group"].notna().all() + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, left, right): class NotADataFrame(DataFrame): @property diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 43786ee15d138..46da18445e135 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1950,7 +1950,7 @@ def test_pivot_table_not_series(self): result = df.pivot_table("col1", index="col3", columns="col2", aggfunc="sum") expected = DataFrame( - 
[[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], + [[3, np.nan, np.nan], [np.nan, 4, np.nan], [np.nan, np.nan, 5]], index=Index([1, 3, 9], name="col3"), columns=Index(["C", "D", "E"], name="col2"), ) @@ -2424,7 +2424,7 @@ def test_pivot_table_aggfunc_nunique_with_different_values(self): ], names=(None, None, "b"), ) - nparr = np.full((10, 10), np.NaN) + nparr = np.full((10, 10), np.nan) np.fill_diagonal(nparr, 1.0) expected = DataFrame(nparr, index=Index(range(10), name="a"), columns=columnval) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 213fa1791838d..287b7557f50f9 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -103,9 +103,9 @@ def test_comparison_ops(comparison_op, other): False, np.bool_(False), np.int_(0), - np.float_(0), + np.float64(0), np.int_(-0), - np.float_(-0), + np.float64(-0), ], ) @pytest.mark.parametrize("asarray", [True, False]) @@ -123,7 +123,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)] + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -133,14 +133,14 @@ def test_rpow_special(value, asarray): if asarray: result = result[0] - elif not isinstance(value, (np.float_, np.bool_, np.int_)): + elif not isinstance(value, (np.float64, np.bool_, np.int_)): # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) assert result == value -@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float_(-1)]) +@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 701cfdf157d26..884c3a648f395 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -921,7 +921,6 @@ def test_timedelta_hash_equality(self): @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", - raises=AssertionError, ) @given( st.integers( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index e7fea9aa597b8..dd810a31c25af 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -739,9 +739,9 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): "input_series, expected_output", [ [["2020-01-01"], [[2020, 1, 3]]], - [[pd.NaT], [[np.NaN, np.NaN, np.NaN]]], + [[pd.NaT], [[np.nan, np.nan, np.nan]]], [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], - [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]], + [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.nan, np.nan, np.nan]]], # see GH#36032 [["2016-01-08", "2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]], [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 8daaf087085c6..11d66d4820fe6 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -486,3 +486,12 @@ def test_getitem_str_second_with_datetimeindex(): msg = r"Timestamp\('2012-01-02 18:01:02-0600', 
tz='US/Central'\)" with pytest.raises(KeyError, match=msg): df[df.index[2]] + + +def test_compare_datetime_with_all_none(): + # GH#54870 + ser = Series(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]") + ser2 = Series([None, None]) + result = ser > ser2 + expected = Series([False, False]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 20f8dd1fc5b2a..0fa28920d41bd 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -19,6 +19,7 @@ Timestamp, concat, date_range, + isna, period_range, timedelta_range, ) @@ -196,9 +197,9 @@ def test_setitem_ambiguous_keyerror(indexer_sl): def test_setitem(datetime_series): - datetime_series[datetime_series.index[5]] = np.NaN - datetime_series.iloc[[1, 2, 17]] = np.NaN - datetime_series.iloc[6] = np.NaN + datetime_series[datetime_series.index[5]] = np.nan + datetime_series.iloc[[1, 2, 17]] = np.nan + datetime_series.iloc[6] = np.nan assert np.isnan(datetime_series.iloc[6]) assert np.isnan(datetime_series.iloc[2]) datetime_series[np.isnan(datetime_series)] = 5 @@ -304,7 +305,7 @@ def test_underlying_data_conversion(using_copy_on_write): def test_preserve_refs(datetime_series): seq = datetime_series.iloc[[5, 10, 15]] - seq.iloc[1] = np.NaN + seq.iloc[1] = np.nan assert not np.isnan(datetime_series.iloc[10]) @@ -456,25 +457,25 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. - def _check_setitem_invalid(self, ser, invalid, indexer): + def _check_setitem_invalid(self, ser, invalid, indexer, warn): msg = "Setting an item of incompatible dtype is deprecated" msg = re.escape(msg) orig_ser = ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser[:] = invalid _invalid_scalars = [ @@ -494,16 +495,20 @@ def _check_setitem_invalid(self, ser, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer) + if isna(invalid) and invalid is not NaT: + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(ser, invalid, indexer, warn) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = 
Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index bc596fb0a3abe..d42e884c1ac78 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -191,14 +191,11 @@ def test_setitem_series_object_dtype(self, indexer, ser_index): expected = Series([Series([42], index=[ser_index]), 0], dtype="object") tm.assert_series_equal(ser, expected) - @pytest.mark.parametrize( - "index, exp_value, warn", [(0, 42, None), (1, np.nan, FutureWarning)] - ) - def test_setitem_series(self, index, exp_value, warn): + @pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)]) + def test_setitem_series(self, index, exp_value): # GH#38303 ser = Series([0, 0]) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): - ser.loc[0] = Series([42], index=[index]) + ser.loc[0] = Series([42], index=[index]) expected = Series([exp_value, 0]) tm.assert_series_equal(ser, expected) @@ -575,7 +572,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): [ (NA, NA, "Int64", "Int64", 1, None), (NA, NA, "Int64", "Int64", 2, None), - (NA, np.nan, "int64", "float64", 1, FutureWarning), + (NA, np.nan, "int64", "float64", 1, None), (NA, np.nan, "int64", "float64", 2, None), (NaT, NaT, "int64", "object", 1, FutureWarning), (NaT, NaT, "int64", "object", 2, None), @@ -583,7 +580,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): (np.nan, NA, "Int64", "Int64", 2, None), (np.nan, NA, "Float64", "Float64", 1, None), (np.nan, NA, "Float64", "Float64", 2, None), - (np.nan, np.nan, "int64", "float64", 1, FutureWarning), + (np.nan, np.nan, "int64", "float64", 1, None), (np.nan, np.nan, "int64", "float64", 2, None), ], ) @@ -592,7 +589,7 @@ def test_setitem_enlarge_with_na( ): # GH#32346 ser = Series([1, 2], dtype=dtype) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + with tm.assert_produces_warning(warn, match="incompatible dtype"): ser[indexer] = na expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] expected = Series(expected_values, dtype=target_dtype) @@ -884,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - FutureWarning, + None, id="int_series_slice_key_step", ), pytest.param( @@ -899,7 +896,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - FutureWarning, + None, id="int_series_slice_key", ), pytest.param( @@ -907,7 +904,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - FutureWarning, + None, id="int_series_int_key", ), pytest.param( @@ -1134,7 +1131,7 @@ def warn(self): "obj,expected,warn", [ # For numeric series, we should coerce to NaN. - (Series([1, 2, 3]), Series([np.nan, 2, 3]), FutureWarning), + (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), # For datetime series, we should coerce to NaT. 
( @@ -1584,13 +1581,11 @@ def test_20643_comment(): expected = Series([np.nan, 1, 2], index=["a", "b", "c"]) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - ser.iat[0] = None + ser.iat[0] = None tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - ser.iloc[0] = None + ser.iloc[0] = None tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index bd8b7b34bd402..5bcf42aad1db4 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -27,7 +27,7 @@ def test_argsort_numpy(self, datetime_series): # with missing values ts = ser.copy() - ts[::2] = np.NaN + ts[::2] = np.nan msg = "The behavior of Series.argsort in the presence of NA values" with tm.assert_produces_warning( diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index d5f99f721d323..31c264d74d063 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -65,8 +65,8 @@ def test_scalar(self): rng = date_range("1/1/1990", periods=N, freq="53s") # Explicit cast to float avoid implicit cast when setting nan ts = Series(np.arange(N), index=rng, dtype="float") - ts.iloc[5:10] = np.NaN - ts.iloc[15:20] = np.NaN + ts.iloc[5:10] = np.nan + ts.iloc[15:20] = np.nan val1 = ts.asof(ts.index[7]) val2 = ts.asof(ts.index[19]) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b6c409397c9fb..03fc6cba2902a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -403,12 +403,12 @@ def test_astype_unicode(self): # bytes with obj.decode() instead of str(obj) item = "野菜食べないとやばい" ser = Series([item.encode()]) - result = ser.astype("unicode") + result = ser.astype(np.str_) expected = Series([item]) tm.assert_series_equal(result, expected) for ser in test_series: - res = ser.astype("unicode") + res = ser.astype(np.str_) expec = ser.map(str) tm.assert_series_equal(res, expec) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index c7ca73da9ae66..d2d8eab1cb38b 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -36,7 +36,7 @@ def test_combine_first(self): series = Series(values, index=tm.makeIntIndex(20)) series_copy = series * 2 - series_copy[::2] = np.NaN + series_copy[::2] = np.nan # nothing used from the input combined = series.combine_first(series_copy) @@ -70,14 +70,14 @@ def test_combine_first(self): tm.assert_series_equal(ser, result) def test_combine_first_dt64(self): - s0 = to_datetime(Series(["2010", np.NaN])) - s1 = to_datetime(Series([np.NaN, "2011"])) + s0 = to_datetime(Series(["2010", np.nan])) + s1 = to_datetime(Series([np.nan, "2011"])) rs = s0.combine_first(s1) xp = to_datetime(Series(["2010", "2011"])) tm.assert_series_equal(rs, xp) - s0 = to_datetime(Series(["2010", np.NaN])) - s1 = Series([np.NaN, "2011"]) + s0 = to_datetime(Series(["2010", np.nan])) + s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 9cd0ce250b5df..d1c79d0f00365 100644 --- 
a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -206,17 +206,7 @@ def test_convert_dtypes( # Test that it is a copy copy = series.copy(deep=True) - if result.notna().sum() > 0 and result.dtype in [ - "int8", - "uint8", - "int16", - "uint16", - "int32", - "uint32", - "int64", - "uint64", - "interval[int64, right]", - ]: + if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): result[result.notna()] = np.nan else: diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 5ebf45090d7b8..77600e0e7d293 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -27,7 +27,7 @@ def test_copy(self, deep, using_copy_on_write): else: assert not np.may_share_memory(ser.values, ser2.values) - ser2[::2] = np.NaN + ser2[::2] = np.nan if deep is not False or using_copy_on_write: # Did not modify original Series diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 90984a2e65cba..9ba163f347198 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -12,7 +12,7 @@ class TestSeriesCount: def test_count(self, datetime_series): assert datetime_series.count() == len(datetime_series) - datetime_series[::2] = np.NaN + datetime_series[::2] = np.nan assert datetime_series.count() == np.isfinite(datetime_series).sum() diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 7e4503be2ec47..10b2e98586365 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Categorical, Series, @@ -71,7 +72,7 @@ def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): class TestSeriesDropDuplicates: @pytest.fixture( - params=["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"] + params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"] ) def dtype(self, request): return request.param @@ -249,3 +250,18 @@ def test_drop_duplicates_ignore_index(self): result = ser.drop_duplicates(ignore_index=True) expected = Series([1, 2, 3]) tm.assert_series_equal(result, expected) + + def test_duplicated_arrow_dtype(self): + pytest.importorskip("pyarrow") + ser = Series([True, False, None, False], dtype="bool[pyarrow]") + result = ser.drop_duplicates() + expected = Series([True, False, None], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_drop_duplicates_arrow_strings(self): + # GH#54904 + pa = pytest.importorskip("pyarrow") + ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string())) + result = ser.drop_duplicates() + expected = Series(["a"], dtype=pd.ArrowDtype(pa.string())) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 96c3674541e6b..46bc14da59eb0 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -75,7 +75,7 @@ def test_fillna(self): tm.assert_series_equal(ts, ts.fillna(method="ffill")) - ts.iloc[2] = np.NaN + ts.iloc[2] = np.nan exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(method="ffill"),
exp) @@ -881,7 +881,7 @@ def test_fillna_bug(self): def test_ffill(self): ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts.iloc[2] = np.NaN + ts.iloc[2] = np.nan tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) def test_ffill_mixed_dtypes_without_missing_data(self): @@ -892,7 +892,7 @@ def test_ffill_mixed_dtypes_without_missing_data(self): def test_bfill(self): ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts.iloc[2] = np.NaN + ts.iloc[2] = np.nan tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) def test_pad_nan(self): diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index a984cd16997aa..549f429f09d35 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -94,7 +94,7 @@ def test_interpolate(self, datetime_series): ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() - ts_copy[5:10] = np.NaN + ts_copy[5:10] = np.nan linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) @@ -104,7 +104,7 @@ def test_interpolate(self, datetime_series): ).astype(float) ord_ts_copy = ord_ts.copy() - ord_ts_copy[5:10] = np.NaN + ord_ts_copy[5:10] = np.nan time_interp = ord_ts_copy.interpolate(method="time") tm.assert_series_equal(time_interp, ord_ts) @@ -112,7 +112,7 @@ def test_interpolate(self, datetime_series): def test_interpolate_time_raises_for_non_timeseries(self): # When method='time' is used on a non-TimeSeries that contains a null # value, a ValueError should be raised. - non_ts = Series([0, 1, 2, np.NaN]) + non_ts = Series([0, 1, 2, np.nan]) msg = "time-weighted interpolation only works on Series.* with a DatetimeIndex" with pytest.raises(ValueError, match=msg): non_ts.interpolate(method="time") @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self): with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg2): ser.interpolate(method="asfreq") + + def test_interpolate_fill_value(self): + # GH#54920 + pytest.importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + expected = Series([np.nan, 0, 1, 1, 3, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 00d1ad99332e9..783e18e541ad8 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -104,7 +104,7 @@ def test_map_series_stringdtype(any_string_dtype): @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.NaN], "category"), (["1-1", "1-2", np.NaN], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], ) def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 @@ -114,7 +114,7 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - expected = Series(["1", "1", np.NaN], dtype=expected_dtype) + expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -229,11 +229,11 @@ def test_map_int(): left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) right = Series({1: 11, 2: 22, 3: 33}) - assert left.dtype == np.float_ + assert left.dtype == np.float64 assert issubclass(right.dtype.type, np.integer) merged = left.map(right) - assert merged.dtype == 
np.float_ + assert merged.dtype == np.float64 assert isna(merged["d"]) assert not isna(merged["c"]) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 38a42062b275e..9727ef3d5c27c 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -11,12 +11,11 @@ class TestSeriesPctChange: def test_pct_change(self, datetime_series): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "Series.pct_change are deprecated" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = datetime_series.pct_change(fill_method=None) + rs = datetime_series.pct_change(fill_method=None) tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) rs = datetime_series.pct_change(2) @@ -40,7 +39,7 @@ def test_pct_change_with_duplicate_axis(self): result = Series(range(5), common_idx).pct_change(freq="B") # the reason that the expected should be like this is documented at PR 28681 - expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) + expected = Series([np.nan, np.inf, np.nan, np.nan, 3.0], common_idx) tm.assert_series_equal(result, expected) @@ -69,7 +68,7 @@ def test_pct_change_periods_freq( self, freq, periods, fill_method, limit, datetime_series ): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "Series.pct_change are deprecated" ) @@ -101,9 +100,21 @@ def test_pct_change_with_duplicated_indices(fill_method): # GH30463 s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) - msg = "The 'fill_method' and 'limit' keywords in Series.pct_change are deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + warn = None if fill_method is None else FutureWarning + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "Series.pct_change are deprecated" + ) + with tm.assert_produces_warning(warn, match=msg): result = s.pct_change(fill_method=fill_method) expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) tm.assert_series_equal(result, expected) + + +def test_pct_change_no_warning_na_beginning(): + # GH#54981 + ser = Series([None, None, 1, 2, 3]) + result = ser.pct_change() + expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 766a2415d89fb..24cf97c05c0a8 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -185,7 +185,7 @@ def test_rank_categorical(self): # Test na_option for rank data na_ser = Series( - ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN] + ["first", "second", "third", "fourth", "fifth", "sixth", np.nan] ).astype( CategoricalDtype( ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"], @@ -195,7 +195,7 @@ def test_rank_categorical(self): exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0]) exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) - exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN]) + exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan]) tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top) tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot) @@ -204,7 +204,7 @@ def test_rank_categorical(self): # Test na_option for rank data with ascending False exp_top 
= Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0]) - exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN]) + exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan]) tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top) tm.assert_series_equal( @@ -223,12 +223,12 @@ def test_rank_categorical(self): na_ser.rank(na_option=True, ascending=False) # Test with pct=True - na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype( + na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype( CategoricalDtype(["first", "second", "third", "fourth"], True) ) exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2]) exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0]) - exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN]) + exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan]) tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top) tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 52446f96009d5..c29c3297eb63b 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -194,11 +194,11 @@ def test_reindex_int(datetime_series): reindexed_int = int_ts.reindex(datetime_series.index) # if NaNs introduced - assert reindexed_int.dtype == np.float_ + assert reindexed_int.dtype == np.float64 # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) - assert reindexed_int.dtype == np.int_ + assert reindexed_int.dtype == np.dtype(int) def test_reindex_bool(datetime_series): @@ -425,11 +425,11 @@ def test_reindexing_with_float64_NA_log(): s = Series([1.0, NA], dtype=Float64Dtype()) s_reindex = s.reindex(range(3)) result = s_reindex.values._data - expected = np.array([1, np.NaN, np.NaN]) + expected = np.array([1, np.nan, np.nan]) tm.assert_numpy_array_equal(result, expected) with tm.assert_produces_warning(None): result_log = np.log(s_reindex) - expected_log = Series([0, np.NaN, np.NaN], dtype=Float64Dtype()) + expected_log = Series([0, np.nan, np.nan], dtype=Float64Dtype()) tm.assert_series_equal(result_log, expected_log) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index c3e074dc68c82..4808272879071 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -18,7 +18,7 @@ def test_sort_values(self, datetime_series, using_copy_on_write): tm.assert_series_equal(expected, result) ts = datetime_series.copy() - ts[:5] = np.NaN + ts[:5] = np.nan vals = ts.values result = ts.sort_values() diff --git a/pandas/tests/series/methods/test_to_frame.py b/pandas/tests/series/methods/test_to_frame.py index 01e547aa34b47..0eadf696b34cc 100644 --- a/pandas/tests/series/methods/test_to_frame.py +++ b/pandas/tests/series/methods/test_to_frame.py @@ -1,3 +1,5 @@ +import pytest + from pandas import ( DataFrame, Index, @@ -40,6 +42,9 @@ def test_to_frame(self, datetime_series): ) tm.assert_frame_equal(rs, xp) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_to_frame_expanddim(self): # GH#9762 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 331afc4345616..b74ee5cf8f2bc 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -165,7 +165,7 @@ def 
test_constructor(self, datetime_series): assert id(datetime_series.index) == id(derived.index) # Mixed type Series - mixed = Series(["hello", np.NaN], index=[0, 1]) + mixed = Series(["hello", np.nan], index=[0, 1]) assert mixed.dtype == np.object_ assert np.isnan(mixed[1]) @@ -1464,8 +1464,8 @@ def test_fromDict(self): assert series.dtype == np.float64 def test_fromValue(self, datetime_series): - nans = Series(np.NaN, index=datetime_series.index, dtype=np.float64) - assert nans.dtype == np.float_ + nans = Series(np.nan, index=datetime_series.index, dtype=np.float64) + assert nans.dtype == np.float64 assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) @@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) @@ -2092,8 +2092,8 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) @@ -2101,12 +2101,44 @@ def test_series_string_with_na_inference(self, na_value): def test_series_string_inference_scalar(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string())) + pytest.importorskip("pyarrow") + expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) tm.assert_series_equal(ser, expected) + def test_series_string_inference_array_string_dtype(self): + # GH#54496 + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + ser = Series(np.array(["a", "b"])) + tm.assert_series_equal(ser, expected) + + def test_series_string_inference_storage_definition(self): + # GH#54793 + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="string") + tm.assert_series_equal(result, expected) + + def test_series_constructor_infer_string_scalar(self): + # GH#55537 + with pd.option_context("future.infer_string", True): + ser = Series("a", index=[1, 2], dtype="string[python]") + expected = Series(["a", "a"], index=[1, 2], dtype="string[python]") + tm.assert_series_equal(ser, expected) + assert ser.dtype.storage == "python" + + def test_series_string_inference_na_first(self): + # GH#55655 + pytest.importorskip("pyarrow") + expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series([pd.NA, "b"]) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 4c5fd2d44e4f4..e6f7b2a5e69e0 100644 --- 
a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -31,7 +31,7 @@ def test_datetime_series(self, datetime_series, func): # with missing values ts = datetime_series.copy() - ts[::2] = np.NaN + ts[::2] = np.nan result = func(ts)[1::2] expected = func(np.array(ts.dropna())) @@ -47,7 +47,7 @@ def test_cummin_cummax(self, datetime_series, method): tm.assert_numpy_array_equal(result, expected) ts = datetime_series.copy() - ts[::2] = np.NaN + ts[::2] = np.nan result = getattr(ts, method)()[1::2] expected = ufunc(ts.dropna()) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 4dab3e8f62598..26046ef9ba295 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -93,7 +93,7 @@ def test_logical_operators_int_dtype_with_float(self): msg = "Cannot perform.+with a dtyped.+array and scalar of type" with pytest.raises(TypeError, match=msg): - s_0123 & np.NaN + s_0123 & np.nan with pytest.raises(TypeError, match=msg): s_0123 & 3.14 msg = "unsupported operand type.+for &:" @@ -149,11 +149,11 @@ def test_logical_operators_int_dtype_with_object(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") - result = s_0123 & Series([False, np.NaN, False, False]) + result = s_0123 & Series([False, np.nan, False, False]) expected = Series([False] * 4) tm.assert_series_equal(result, expected) - s_abNd = Series(["a", "b", np.NaN, "d"]) + s_abNd = Series(["a", "b", np.nan, "d"]) with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): s_0123 & s_abNd diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 9f17f6d86cf93..cafc69c4d0f20 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -84,7 +84,7 @@ def test_logical_range_select(self, datetime_series): def test_valid(self, datetime_series): ts = datetime_series.copy() ts.index = ts.index._with_freq(None) - ts[::2] = np.NaN + ts[::2] = np.nan result = ts.dropna() assert len(result) == ts.count() diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 4c92b5694c43b..be68918d2a380 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -83,7 +83,7 @@ def test_string(self, string_series): str(string_series.astype(int)) # with NaNs - string_series[5:7] = np.NaN + string_series[5:7] = np.nan str(string_series) def test_object(self, object_series): @@ -348,7 +348,7 @@ def test_categorical_series_repr(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" +Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" assert repr(s) == exp @@ -374,7 +374,7 @@ def test_categorical_series_repr_ordered(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" +Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 
6 < 7 < 8 < 9]""" assert repr(s) == exp diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index a3550c6de6780..c2d5afcf884b1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -4,6 +4,10 @@ import pandas as pd import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + class TestSeriesSubclassing: @pytest.mark.parametrize( diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e69de29bb2d1d..01b49b5e5b633 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -0,0 +1,15 @@ +import numpy as np + +import pandas as pd + +object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + + +def _convert_na_value(ser, expected): + if ser.dtype != object: + if ser.dtype.storage == "pyarrow_numpy": + expected = expected.fillna(np.nan) + else: + # GH#18463 + expected = expected.fillna(pd.NA) + return expected diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index ced941187f548..1dee25e631648 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -278,6 +278,11 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): + if any_string_dtype == "string[pyarrow_numpy]": + pytest.skip( + "Arrow logic is different, " + "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", + ) s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 4773c13aad376..b8319e90e09a8 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import ArrowDtype + from pandas import ( DataFrame, Index, @@ -706,3 +708,12 @@ def test_extractall_same_as_extract_subject_index(any_string_dtype): has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_match_index) + + +def test_extractall_preserves_dtype(): + # Ensure that when extractall is called on a series with specific dtypes set, that + # the dtype is preserved in the resulting DataFrame's column. 
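
The constructor tests above all pin the expected dtype to "string[pyarrow_numpy]". A minimal sketch of the option they exercise, assuming pandas 2.1+ with pyarrow installed:

```python
import pandas as pd

# Under future.infer_string, plain Python strings are inferred as the
# Arrow-backed, NumPy-semantics string dtype instead of object dtype
# (missing values round-trip as np.nan rather than pd.NA).
with pd.option_context("future.infer_string", True):
    ser = pd.Series(["a", "b"])

assert ser.dtype == "string[pyarrow_numpy]"
```
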
+ pa = pytest.importorskip("pyarrow") + + result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)") + assert result.dtypes[0] == "string[pyarrow]" diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c3cc8b3643ed2..78f0730d730e8 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,12 +11,20 @@ Series, _testing as tm, ) +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) # -------------------------------------------------------------------------------------- # str.contains # -------------------------------------------------------------------------------------- +def using_pyarrow(dtype): + return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + + def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -25,7 +33,7 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -44,7 +52,7 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -71,14 +79,14 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -163,7 +171,7 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -204,7 +212,7 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -215,12 +223,14 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], 
dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -378,9 +388,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -401,9 +409,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -423,7 +429,7 @@ def test_replace_callable_raises(any_string_dtype, repl): ) with pytest.raises(TypeError, match=msg): with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, using_pyarrow(any_string_dtype) ): values.str.replace("a", repl, regex=True) @@ -433,9 +439,7 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -447,16 +451,12 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -476,9 +476,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with
tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -506,9 +504,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -557,9 +553,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("A", "YYY", case=False) expected = Series( [ @@ -578,9 +572,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ @@ -604,16 +596,12 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -648,7 +636,7 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -703,12 +691,12 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) 
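
The repeated maybe_produces_warning edits above widen the pyarrow check from the single literal "string[pyarrow]" to the using_pyarrow() helper, because the fallback warning fires for both Arrow-backed string dtypes. A sketch of the behavior being guarded, assuming pyarrow is installed:

```python
import warnings

import numpy as np
import pandas as pd
from pandas.errors import PerformanceWarning

# str.replace with a callable repl has no native pyarrow kernel, so pandas
# falls back to an elementwise Python loop and signals that with a
# PerformanceWarning; the same applies to "string[pyarrow_numpy]".
ser = pd.Series(["fooBAD__barBAD", np.nan], dtype="string[pyarrow]")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    ser.str.replace("[a-z][A-Z]{2}", lambda m: m.group(0).swapcase(), n=2, regex=True)

assert any(issubclass(w.category, PerformanceWarning) for w in caught)
```
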
tm.assert_series_equal(result, expected) @@ -716,7 +704,7 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -732,7 +720,7 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -742,14 +730,14 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) @@ -761,9 +749,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -777,9 +763,7 @@ def test_findall(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = ser.str.findall("BAD[_]*") expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) - if ser.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(ser, expected) tm.assert_series_equal(result, expected) @@ -825,7 +809,7 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -877,7 +861,7 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) @@ -944,16 +928,16 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - 
using_pyarrow = any_string_dtype == "string[pyarrow]" + use_pyarrow = using_pyarrow(any_string_dtype) result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] @@ -965,7 +949,7 @@ def test_flags_kwarg(any_string_dtype): msg = "has match groups" with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not using_pyarrow + UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow ): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0298694ccaf71..0a7d409773dd6 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -12,6 +12,10 @@ Series, _testing as tm, ) +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) @pytest.mark.parametrize("method", ["split", "rsplit"]) @@ -20,9 +24,7 @@ def test_split(any_string_dtype, method): result = getattr(values.str, method)("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -32,9 +34,7 @@ def test_split_more_than_one_char(any_string_dtype, method): values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = getattr(values.str, method)("__") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) result = getattr(values.str, method)("__", expand=False) @@ -46,9 +46,7 @@ def test_split_more_regex_split(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -118,8 +116,8 @@ def test_split_object_mixed(expand, method): def test_split_n(any_string_dtype, method, n): s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - result = getattr(s.str, method)(" ", n=n) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -128,9 +126,7 @@ def test_rsplit(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -139,9 +135,7 @@ def test_rsplit_max_number(any_string_dtype): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, 
["f_g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -390,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) @@ -455,9 +449,7 @@ def test_partition_series_more_than_one_char(method, exp, any_string_dtype): s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) result = getattr(s.str, method)("__", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -480,9 +472,7 @@ def test_partition_series_none(any_string_dtype, method, exp): s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype) result = getattr(s.str, method)(expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -505,9 +495,7 @@ def test_partition_series_not_split(any_string_dtype, method, exp): s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype) result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -531,9 +519,7 @@ def test_partition_series_unicode(any_string_dtype, method, exp): result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1e573bdfe8fb5..4315835b70a40 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,6 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods +from pandas.tests.strings import object_pyarrow_numpy @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -40,7 +41,7 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -91,7 +92,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -205,7 +206,7 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if 
any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -230,7 +231,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -280,7 +281,7 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype == "object" else "Int64" + expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -309,7 +310,7 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -350,7 +351,7 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 856c31b9ccb06..cb703d3439d44 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1412,6 +1412,19 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) + def test_value_counts_series(self): + # GH#54857 + values = np.array([3, 1, 2, 3, 4, np.nan]) + result = Series(values).value_counts(bins=3) + expected = Series( + [2, 2, 1], + index=IntervalIndex.from_tuples( + [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)], dtype="interval[float64, right]" + ), + name="count", + ) + tm.assert_series_equal(result, expected) + class TestDuplicated: def test_duplicated_with_nas(self): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 76784ec726afe..a0062d2b6dd44 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -323,7 +323,7 @@ def check_fun_data( res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) if ( - isinstance(targ, np.complex_) + isinstance(targ, 
np.complex128) and isinstance(res, float) and np.isnan(targ) and np.isnan(res) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5e51edfee17f1..580f0605efdbc 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -518,7 +518,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_utc_false_deprecated( self, fmt, dates, expected_dates ): # GH 13486, 50887 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(dates, format=fmt) expected = Index(expected_dates) @@ -693,7 +693,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format_mixed_offsets_utc_fal args = ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"] ts1 = constructor(args[0]) ts2 = args[1] - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" expected = Index( [ @@ -734,7 +734,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format_mixed_offsets_utc_fal ) def test_to_datetime_mixed_offsets_with_none_tz(self, fmt, expected): # https://github.com/pandas-dev/pandas/issues/50071 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime( @@ -1236,7 +1236,7 @@ def test_to_datetime_different_offsets(self, cache): ts_string_2 = "March 1, 2018 12:00:00+0500" arr = [ts_string_1] * 5 + [ts_string_2] * 5 expected = Index([parse(x) for x in arr]) - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(arr, cache=cache) tm.assert_index_equal(result, expected) @@ -1570,8 +1570,8 @@ def test_convert_object_to_datetime_with_cache( (Series([""] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), (Series([pd.NA] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), (Series([pd.NA] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([np.NaN] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([np.NaN] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), + (Series([np.nan] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), + (Series([np.nan] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), ), ) def test_to_datetime_converts_null_like_to_nat(self, cache, input, expected): @@ -1604,7 +1604,7 @@ def test_to_datetime_coerce(self): "March 1, 2018 12:00:00+0500", "20100240", ] - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(ts_strings, errors="coerce") expected = Index( @@ -1689,7 +1689,7 @@ def test_iso_8601_strings_with_same_offset(self): def test_iso_8601_strings_with_different_offsets(self): # GH 17697, 11736, 50887 ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(ts_strings) expected = np.array( @@ -1729,7 +1729,7 @@ def 
test_mixed_offsets_with_native_datetime_raises(self): now = Timestamp("now") today = Timestamp("today") - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): mixed = to_datetime(ser) expected = Series( @@ -1810,7 +1810,7 @@ def test_to_datetime_fixed_offset(self): ) def test_to_datetime_mixed_offsets_with_utc_false_deprecated(self, date): # GH 50887 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): to_datetime(date, utc=False) @@ -3680,7 +3680,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): # GH 50887 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): to_datetime( diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 435fe5f4b90d8..829bb140e6e96 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -85,7 +85,7 @@ def test_parsing_different_timezone_offsets(): data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"] data = np.array(data, dtype=object) - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result, result_tz = tslib.array_to_datetime(data) expected = np.array( diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index a86302f158005..8527efdbf7867 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -293,7 +293,7 @@ def test_assert_almost_equal_null(): _assert_almost_equal_both(None, None) -@pytest.mark.parametrize("a,b", [(None, np.NaN), (None, 0), (np.NaN, 0)]) +@pytest.mark.parametrize("a,b", [(None, np.nan), (None, 0), (np.nan, 0)]) def test_assert_not_almost_equal_null(a, b): _assert_not_almost_equal(a, b) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 2dd4458172593..73ab470ab97a7 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -126,7 +126,7 @@ def series(): """Make mocked series as fixture.""" arr = np.random.default_rng(2).standard_normal(100) locs = np.arange(20, 40) - arr[locs] = np.NaN + arr[locs] = np.nan series = Series(arr, index=bdate_range(datetime(2009, 1, 1), periods=100)) return series diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index d901fe58950e3..33858e10afd75 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -223,9 +223,9 @@ def test_count_nonnumeric_types(step): Period("2012-02"), Period("2012-03"), ], - "fl_inf": [1.0, 2.0, np.Inf], - "fl_nan": [1.0, 2.0, np.NaN], - "str_nan": ["aa", "bb", np.NaN], + "fl_inf": [1.0, 2.0, np.inf], + "fl_nan": [1.0, 2.0, np.nan], + "str_nan": ["aa", "bb", np.nan], "dt_nat": dt_nat_col, "periods_nat": [ Period("2012-01"), diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 6af5a41e96e0a..4e4eca6e772e7 100644 --- 
a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -184,8 +184,8 @@ def numpysum(x, par): def test_nans(raw): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(50, min_periods=30).apply(f, raw=raw) tm.assert_almost_equal(result.iloc[-1], np.mean(obj[10:-10])) @@ -210,12 +210,12 @@ def test_nans(raw): def test_center(raw): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(20, min_periods=15, center=True).apply(f, raw=raw) expected = ( - concat([obj, Series([np.NaN] * 9)]) + concat([obj, Series([np.nan] * 9)]) .rolling(20, min_periods=15) .apply(f, raw=raw) .iloc[9:] diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 45d481fdd2e44..c5c395414b450 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -417,7 +417,7 @@ def test_ewm_alpha(): # GH 10789 arr = np.random.default_rng(2).standard_normal(100) locs = np.arange(20, 40) - arr[locs] = np.NaN + arr[locs] = np.nan s = Series(arr) a = s.ewm(alpha=0.61722699889169674).mean() @@ -433,7 +433,7 @@ def test_ewm_domain_checks(): # GH 12492 arr = np.random.default_rng(2).standard_normal(100) locs = np.arange(20, 40) - arr[locs] = np.NaN + arr[locs] = np.nan s = Series(arr) msg = "comass must satisfy: comass >= 0" @@ -484,8 +484,8 @@ def test_ew_empty_series(method): def test_ew_min_periods(min_periods, name): # excluding NaNs correctly arr = np.random.default_rng(2).standard_normal(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN + arr[:10] = np.nan + arr[-10:] = np.nan s = Series(arr) # check min_periods @@ -515,11 +515,11 @@ def test_ew_min_periods(min_periods, name): else: # ewm.std, ewm.var with bias=False require at least # two values - tm.assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.nan])) # pass in ints result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ + assert result2.dtype == np.float64 @pytest.mark.parametrize("name", ["cov", "corr"]) @@ -527,8 +527,8 @@ def test_ewm_corr_cov(name): A = Series(np.random.default_rng(2).standard_normal(50), index=range(50)) B = A[2:] + np.random.default_rng(2).standard_normal(48) - A[:10] = np.NaN - B.iloc[-10:] = np.NaN + A[:10] = np.nan + B.iloc[-10:] = np.nan result = getattr(A.ewm(com=20, min_periods=5), name)(B) assert np.isnan(result.values[:14]).all() @@ -542,8 +542,8 @@ def test_ewm_corr_cov_min_periods(name, min_periods): A = Series(np.random.default_rng(2).standard_normal(50), index=range(50)) B = A[2:] + np.random.default_rng(2).standard_normal(48) - A[:10] = np.NaN - B.iloc[-10:] = np.NaN + A[:10] = np.nan + B.iloc[-10:] = np.nan result = getattr(A.ewm(com=20, min_periods=min_periods), name)(B) # binary functions (ewmcov, ewmcorr) with bias=False require at @@ -560,13 +560,13 @@ def test_ewm_corr_cov_min_periods(name, min_periods): result = getattr(Series([1.0]).ewm(com=50, min_periods=min_periods), name)( Series([1.0]) ) - tm.assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.nan])) @pytest.mark.parametrize("name", ["cov", "corr"]) def test_different_input_array_raise_exception(name): A = Series(np.random.default_rng(2).standard_normal(50), index=range(50)) - A[:10] = np.NaN + A[:10] = np.nan msg = "other must be a DataFrame or Series" # exception raised is 
Exception diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index eb4fc06aa523e..ab00e18fc4812 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1230,3 +1230,47 @@ def test_dont_mutate_obj_after_slicing(self): # This is the key test result = grp.count() tm.assert_frame_equal(result, expected_df) + + +def test_rolling_corr_with_single_integer_in_index(): + # GH 44078 + df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_rolling_corr_with_tuples_in_index(): + # GH 44078 + df = DataFrame( + { + "a": [ + ( + 1, + 2, + ), + ( + 1, + 2, + ), + ( + 1, + 2, + ), + ], + "b": [4, 5, 6], + } + ) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples( + [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] + ) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 8dac6d271510a..b6f2365afb457 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -410,7 +410,7 @@ def test_cov_mulittindex(self): expected = DataFrame( np.vstack( ( - np.full((8, 8), np.NaN), + np.full((8, 8), np.nan), np.full((8, 8), 32.000000), np.full((8, 8), 63.881919), ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4df20282bbfa6..1beab5340484f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -142,7 +142,7 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): index=date_range("2017-08-08", periods=n, freq="D"), ) expected = DataFrame( - {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, + {"value": np.append([np.nan, 1.0], np.arange(3.0, 27.0, 3))}, index=date_range("2017-08-08", periods=n, freq="D"), ) result_roll_sum = df.rolling(window=window, min_periods=2).sum() @@ -1461,15 +1461,15 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): 0.03, 0.03, 0.001, - np.NaN, + np.nan, 0.002, 0.008, - np.NaN, - np.NaN, - np.NaN, - np.NaN, - np.NaN, - np.NaN, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, 0.005, 0.2, ] @@ -1480,8 +1480,8 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): 0.005, 0.005, 0.008, - np.NaN, - np.NaN, + np.nan, + np.nan, 0.005, 0.102500, ] @@ -1495,7 +1495,7 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): def test_rolling_sum_all_nan_window_floating_artifacts(): # GH#41053 - df = DataFrame([0.002, 0.008, 0.005, np.NaN, np.NaN, np.NaN]) + df = DataFrame([0.002, 0.008, 0.005, np.nan, np.nan, np.nan]) result = df.rolling(3, min_periods=0).sum() expected = DataFrame([0.002, 0.010, 0.015, 0.013, 0.005, 0.0]) tm.assert_frame_equal(result, expected) @@ -1878,3 +1878,35 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): op2 = getattr(rolling2, kernel) expected = op2(*arg2, numeric_only=numeric_only) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) 
+@pytest.mark.parametrize("tz", [None, "UTC", "Europe/Prague"]) +def test_rolling_timedelta_window_non_nanoseconds(unit, tz): + # Test Sum, GH#55106 + df_time = DataFrame( + {"A": range(5)}, index=date_range("2013-01-01", freq="1s", periods=5, tz=tz) + ) + sum_in_nanosecs = df_time.rolling("1s").sum() + # microseconds / milliseconds should not break the correct rolling + df_time.index = df_time.index.as_unit(unit) + sum_in_microsecs = df_time.rolling("1s").sum() + sum_in_microsecs.index = sum_in_microsecs.index.as_unit("ns") + tm.assert_frame_equal(sum_in_nanosecs, sum_in_microsecs) + + # Test max, GH#55026 + ref_dates = date_range("2023-01-01", "2023-01-10", unit="ns", tz=tz) + ref_series = Series(0, index=ref_dates) + ref_series.iloc[0] = 1 + ref_max_series = ref_series.rolling(Timedelta(days=4)).max() + + dates = date_range("2023-01-01", "2023-01-10", unit=unit, tz=tz) + series = Series(0, index=dates) + series.iloc[0] = 1 + max_series = series.rolling(Timedelta(days=4)).max() + + ref_df = DataFrame(ref_max_series) + df = DataFrame(max_series) + df.index = df.index.as_unit("ns") + + tm.assert_frame_equal(ref_df, df) diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index bc0b3e496038c..940f0845befa2 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -150,8 +150,8 @@ def test_time_rule_frame(raw, frame, compare_func, roll_func, kwargs, minp): ) def test_nans(compare_func, roll_func, kwargs): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(50, min_periods=30), roll_func)(**kwargs) tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) @@ -177,8 +177,8 @@ def test_nans(compare_func, roll_func, kwargs): def test_nans_count(): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(50, min_periods=30).count() tm.assert_almost_equal( result.iloc[-1], np.isfinite(obj[10:-10]).astype(float).sum() @@ -241,15 +241,15 @@ def test_min_periods_count(series, step): ) def test_center(roll_func, kwargs, minp): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(20, min_periods=minp, center=True), roll_func)( **kwargs ) expected = ( getattr( - concat([obj, Series([np.NaN] * 9)]).rolling(20, min_periods=minp), roll_func + concat([obj, Series([np.nan] * 9)]).rolling(20, min_periods=minp), roll_func )(**kwargs) .iloc[9:] .reset_index(drop=True) diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index 32296ae3f2470..d5a7010923563 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -89,8 +89,8 @@ def test_time_rule_frame(raw, frame, q): def test_nans(q): compare_func = partial(scoreatpercentile, per=q) obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(50, min_periods=30).quantile(q) tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) @@ -128,12 +128,12 @@ def test_min_periods(series, minp, q, step): @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) def test_center(q): obj = 
Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(20, center=True).quantile(q) expected = ( - concat([obj, Series([np.NaN] * 9)]) + concat([obj, Series([np.nan] * 9)]) .rolling(20) .quantile(q) .iloc[9:] diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index ada726401c4a0..79c14f243e7cc 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -79,8 +79,8 @@ def test_nans(sp_func, roll_func): compare_func = partial(getattr(sp_stats, sp_func), bias=False) obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(50, min_periods=30), roll_func)() tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) @@ -122,12 +122,12 @@ def test_min_periods(series, minp, roll_func, step): @pytest.mark.parametrize("roll_func", ["kurt", "skew"]) def test_center(roll_func): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(20, center=True), roll_func)() expected = ( - getattr(concat([obj, Series([np.NaN] * 9)]).rolling(20), roll_func)() + getattr(concat([obj, Series([np.nan] * 9)]).rolling(20), roll_func)() .iloc[9:] .reset_index(drop=True) ) @@ -170,14 +170,14 @@ def test_center_reindex_frame(frame, roll_func): def test_rolling_skew_edge_cases(step): - expected = Series([np.NaN] * 4 + [0.0])[::step] + expected = Series([np.nan] * 4 + [0.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).skew() # index 4 should be 0 as it contains 5 same obs tm.assert_series_equal(expected, x) - expected = Series([np.NaN] * 5)[::step] + expected = Series([np.nan] * 5)[::step] # yields all NaN (window too small) d = Series(np.random.default_rng(2).standard_normal(5)) x = d.rolling(window=2, step=step).skew() @@ -185,13 +185,13 @@ def test_rolling_skew_edge_cases(step): # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824])[::step] + expected = Series([np.nan, np.nan, np.nan, 0.177994, 1.548824])[::step] x = d.rolling(window=4, step=step).skew() tm.assert_series_equal(expected, x) def test_rolling_kurt_edge_cases(step): - expected = Series([np.NaN] * 4 + [-3.0])[::step] + expected = Series([np.nan] * 4 + [-3.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) @@ -199,14 +199,14 @@ def test_rolling_kurt_edge_cases(step): tm.assert_series_equal(expected, x) # yields all NaN (window too small) - expected = Series([np.NaN] * 5)[::step] + expected = Series([np.nan] * 5)[::step] d = Series(np.random.default_rng(2).standard_normal(5)) x = d.rolling(window=3, step=step).kurt() tm.assert_series_equal(expected, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499])[::step] + expected = Series([np.nan, np.nan, np.nan, 1.224307, 2.671499])[::step] x = d.rolling(window=4, step=step).kurt() tm.assert_series_equal(expected, x) diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 2ca02fef796ed..5052019ddb726 100644 --- 
a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -666,7 +666,7 @@ def test_weighted_var_big_window_no_segfault(win_types, center): pytest.importorskip("scipy") x = Series(0) result = x.rolling(window=16, center=center, win_type=win_types).var() - expected = Series(np.NaN) + expected = Series(np.nan) tm.assert_series_equal(result, expected) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 44c21bc284121..75cb7f7850013 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -283,11 +283,11 @@ def dates( holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: holiday_dates = holiday_dates[ - np.in1d( + np.isin( # error: "DatetimeIndex" has no attribute "dayofweek" holiday_dates.dayofweek, # type: ignore[attr-defined] self.days_of_week, - ) + ).ravel() ] if self.start_date is not None: diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 03011a1ffe622..93c46274fdc1b 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,11 +38,11 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F + from pandas.compat import ( IS64, is_platform_windows, ) -from pandas.compat._optional import import_optional_dependency from pandas.core.computation.expressions import ( NUMEXPR_INSTALLED, @@ -214,16 +214,6 @@ def documented_fixture(fixture): return documented_fixture -def async_mark(): - try: - import_optional_dependency("pytest_asyncio") - async_mark = pytest.mark.asyncio - except ImportError: - async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") - - return async_mark - - def mark_array_manager_not_yet_implemented(request) -> None: mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") request.node.add_marker(mark) diff --git a/pyproject.toml b/pyproject.toml index 1034196baa15e..d7634894b4835 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,14 +3,14 @@ # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. 
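
Two NumPy-facing cleanups converge just above: the remaining np.NaN spellings become np.nan (the capitalized aliases are removed in NumPy 2.0, which is also why the packaging changes below pin numpy<2), and the tseries/holiday.py hunk swaps the deprecated np.in1d for np.isin. A small sketch of the latter, with hypothetical day-of-week data:

```python
import numpy as np

# np.isin is the modern spelling of np.in1d (deprecated in NumPy 1.25).
# isin preserves the shape of its first argument, so the holiday code
# ravels the mask to guarantee a flat boolean index.
dayofweek = np.array([0, 1, 2, 3, 4, 5, 6])
mask = np.isin(dayofweek, [5, 6]).ravel()

assert mask.tolist() == [False, False, False, False, False, True, True]
```
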
requires = [ "meson-python==0.13.1", - "meson==1.0.1", + "meson==1.2.1", "wheel", "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json # Note: numpy 1.25 has a backwards compatible C API by default # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.22.4; python_version>='3.12'", + "numpy>=1.26.0,<2; python_version>='3.12'", "versioneer[toml]" ] @@ -29,8 +29,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version>='3.11'", + "numpy>=1.22.4,<2; python_version<'3.11'", + "numpy>=1.23.2,<2; python_version=='3.11'", + "numpy>=1.26.0,<2; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" @@ -60,7 +61,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.4', 'numba>=0.55.2', 'numexpr>=2.8.0'] computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] @@ -79,15 +80,14 @@ sql-other = ['SQLAlchemy>=1.4.36'] html = ['beautifulsoup4>=4.11.1', 'html5lib>=1.1', 'lxml>=4.8.0'] xml = ['lxml>=4.8.0'] plot = ['matplotlib>=3.6.1'] -output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] +output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] -compression = ['brotlipy>=0.7.0', 'python-snappy>=0.6.1', 'zstandard>=0.17.0'] +compression = ['zstandard>=0.17.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] all = ['beautifulsoup4>=4.11.1', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.0', 'bottleneck>=1.3.4', - 'brotlipy>=0.7.0', 'dataframe-api-compat>=0.1.7', 'fastparquet>=0.8.1', 'fsspec>=2022.05.0', @@ -109,8 +109,6 @@ all = ['beautifulsoup4>=4.11.1', 'pyreadstat>=1.1.5', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', - 'python-snappy>=0.6.1', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', @@ -152,7 +150,7 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x *-musllinux_aarch64" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ @@ -502,6 +500,8 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", + # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged + "ignore:.*In the future `np.long` will be defined as.*:FutureWarning", ] junit_family = "xunit2" markers = [ @@ -513,7 +513,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 0944acbc36c9b..af4f82b068d31 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,19 +4,17 @@
 pip
 versioneer[toml]
 cython==0.29.33
-meson[ninja]==1.0.1
+meson[ninja]==1.2.1
 meson-python==0.13.1
 pytest>=7.3.2
 pytest-cov
 pytest-xdist>=2.2.0
-pytest-asyncio>=0.17.0
 coverage
 python-dateutil
-numpy
+numpy<2
 pytz
 beautifulsoup4>=4.11.1
 blosc
-brotlipy>=0.7.0
 bottleneck>=1.3.4
 fastparquet>=0.8.1
 fsspec>=2022.05.0
@@ -26,7 +24,7 @@ gcsfs>=2022.05.0
 ipython
 jinja2>=3.1.2
 lxml>=4.8.0
-matplotlib>=3.6.1
+matplotlib>=3.6.1, <3.8
 numba>=0.55.2
 numexpr>=2.8.0
 openpyxl>=3.0.10
@@ -37,7 +35,6 @@ pyarrow>=7.0.0
 pymysql>=1.0.2
 pyreadstat>=1.1.5
 tables>=3.7.0
-python-snappy>=0.6.1
 pyxlsb>=1.0.9
 s3fs>=2022.05.0
 scipy>=1.8.1
@@ -61,7 +58,7 @@ gitdb
 google-auth
 natsort
 numpydoc
-pydata-sphinx-theme
+pydata-sphinx-theme==0.13
 pytest-cython
 sphinx
 sphinx-design
diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml
index 35d7fe74806a9..bebb25b9e1bb8 100644
--- a/scripts/tests/data/deps_expected_random.yaml
+++ b/scripts/tests/data/deps_expected_random.yaml
@@ -14,7 +14,6 @@ dependencies:
   - pytest-cov
   - pytest-xdist>=2.2.0
   - psutil
-  - pytest-asyncio>=0.17.0
   - boto3
 
   # required dependencies
@@ -26,7 +25,6 @@ dependencies:
   - beautifulsoup4>=5.9.3
   - blosc
   - bottleneck>=1.3.2
-  - brotlipy>=0.7.0
   - fastparquet>=0.6.3
   - fsspec>=2021.07.0
   - html5lib>=1.1
@@ -45,7 +43,6 @@ dependencies:
   - pymysql>=1.0.2
   - pyreadstat>=1.1.2
   - pytables>=3.6.1
-  - python-snappy>=0.6.0
   - pyxlsb>=1.0.8
   - s3fs>=2021.08.0
   - scipy>=1.7.1
diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml
index 6f56ca498794b..5fb51856e841e 100644
--- a/scripts/tests/data/deps_minimum.toml
+++ b/scripts/tests/data/deps_minimum.toml
@@ -55,7 +55,7 @@ repository = 'https://github.com/pandas-dev/pandas'
 matplotlib = "pandas:plotting._matplotlib"
 
 [project.optional-dependencies]
-test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0']
+test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0']
 performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1']
 timezone = ['tzdata>=2022.1']
 computation = ['scipy>=1.7.1', 'xarray>=0.21.0']
@@ -77,12 +77,11 @@ xml = ['lxml>=4.6.3']
 plot = ['matplotlib>=3.6.1']
 output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9']
 clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.2.0']
-compression = ['brotlipy>=0.7.0', 'python-snappy>=0.6.0', 'zstandard>=0.15.2']
+compression = ['zstandard>=0.15.2']
 all = ['beautifulsoup4>=5.9.3',
        # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
        #'blosc>=1.21.0',
        'bottleneck>=1.3.2',
-       'brotlipy>=0.7.0',
        'fastparquet>=0.6.3',
        'fsspec>=2021.07.0',
        'gcsfs>=2021.07.0',
@@ -103,8 +102,6 @@ all = ['beautifulsoup4>=5.9.3',
        'pyreadstat>=1.1.2',
        'pytest>=7.3.2',
        'pytest-xdist>=2.2.0',
-       'pytest-asyncio>=0.17.0',
-       'python-snappy>=0.6.0',
        'pyxlsb>=1.0.8',
        'qtpy>=2.2.0',
        'scipy>=1.7.1',
@@ -143,7 +140,7 @@ parentdir_prefix = "pandas-"
 [tool.cibuildwheel]
 skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*"
 build-verbosity = "3"
-test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17"
+test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0"
 test-command = "python {project}/ci/test_wheels.py"
 
 [tool.cibuildwheel.macos]
@@ -387,7 +384,6 @@ markers = [
   "arm_slow: mark a test as slow for arm64 architecture",
   "arraymanager: mark a test to run with ArrayManager enabled",
 ]
-asyncio_mode = "strict"
 
 [tool.mypy]
 # Import discovery
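The extra renamed earlier in this patch (`output_formatting` to `output-formatting` in pyproject.toml) follows PEP 685, which normalizes extra names with the same rule PEP 503 applies to project names: lowercase, with runs of `-`, `_`, and `.` collapsed to a single `-`. A short sketch of that rule; the `normalize` helper is ours for illustration, not part of pandas or any packaging tool:

# Sketch of the PEP 503/685 normalization rule that motivates using the
# canonical "output-formatting" spelling. Illustrative helper only.
import re

def normalize(name: str) -> str:
    # Collapse runs of '-', '_' and '.' to a single '-' and lowercase.
    return re.sub(r"[-_.]+", "-", name).lower()

assert normalize("output_formatting") == "output-formatting"
assert normalize("Output.Formatting") == "output-formatting"

Tools that normalize both sides treat the old and new spellings as equivalent, but declaring the canonical form avoids deprecation warnings from newer packaging tooling. The fixture file deps_minimum.toml above deliberately keeps the old spelling, since it is test data rather than shipped metadata.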
diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml
index 405762d33f53e..e54482282fcc8 100644
--- a/scripts/tests/data/deps_unmodified_random.yaml
+++ b/scripts/tests/data/deps_unmodified_random.yaml
@@ -14,7 +14,6 @@ dependencies:
   - pytest-cov
   - pytest-xdist>=2.2.0
   - psutil
-  - pytest-asyncio>=0.17
   - boto3
 
   # required dependencies
@@ -26,7 +25,6 @@ dependencies:
   - beautifulsoup4
   - blosc
   - bottleneck>=1.3.2
-  - brotlipy
   - fastparquet>=0.6.3
   - fsspec>=2021.07.0
   - html5lib>=1.1
@@ -45,7 +43,6 @@ dependencies:
   - pymysql>=1.0.2
   - pyreadstat>=1.1.2
   - pytables>=3.6.1
-  - python-snappy>=0.6.0
   - pyxlsb>=1.0.8
   - s3fs>=2021.08.0
   - scipy>=1.7.1
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index f8b49923cf466..47534226f972f 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -49,6 +49,7 @@
     "_global_config",
     "_chained_assignment_msg",
     "_chained_assignment_method_msg",
+    "_version_meson",
 }
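The final hunk extends an allowlist in validate_unwanted_patterns.py. A rough sketch, not the actual script's logic, of how such an allowlist typically short-circuits a private-import check; the `is_flagged` helper and the second asserted name are hypothetical:

# Illustrative sketch (not the real script) of how an allowlist like
# PRIVATE_IMPORTS_TO_IGNORE is usually consulted: allowlisted private
# names are skipped by the cross-module private-import check, which is
# why "_version_meson" is added above.
PRIVATE_IMPORTS_TO_IGNORE = {
    "_global_config",
    "_chained_assignment_msg",
    "_chained_assignment_method_msg",
    "_version_meson",
}

def is_flagged(imported_name: str) -> bool:
    # Flag a name only if it looks private and is not allowlisted.
    return imported_name.startswith("_") and imported_name not in PRIVATE_IMPORTS_TO_IGNORE

assert not is_flagged("_version_meson")    # allowlisted: passes the check
assert is_flagged("_some_private_helper")  # hypothetical name: would be flagged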