diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf76033f769b5..a5a802c678e20 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,13 +2,11 @@ name: CI on: push: - branches: - - master - - 1.3.x + branches: [master] pull_request: branches: - master - - 1.3.x + - 1.2.x env: ENV_FILE: environment.yml @@ -22,10 +20,6 @@ jobs: run: shell: bash -l {0} - concurrency: - group: ${{ github.ref }}-checks - cancel-in-progress: ${{github.event_name == 'pull_request'}} - steps: - name: Checkout uses: actions/checkout@v2 @@ -99,12 +93,8 @@ jobs: web_and_docs: name: Web and docs runs-on: ubuntu-latest - - concurrency: - group: ${{ github.ref }}-web-docs - cancel-in-progress: true - steps: + - name: Checkout uses: actions/checkout@v2 with: @@ -133,15 +123,15 @@ jobs: echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts - if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}} + if: github.event_name == 'push' - name: Upload web run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas - if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}} + if: github.event_name == 'push' - name: Upload dev docs run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev - if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}} + if: github.event_name == 'push' - name: Move docs into site directory run: mv doc/build/html web/build/docs @@ -158,11 +148,8 @@ jobs: strategy: matrix: pattern: ["not slow and not network and not clipboard", "slow"] - concurrency: - group: ${{ github.ref }}-data_manager-${{ matrix.pattern }} - cancel-in-progress: true - steps: + - name: Checkout uses: actions/checkout@v2 with: diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index 33a8d623e358f..b15889351386a 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -2,14 +2,11 @@ name: Database on: push: - branches: - - master + branches: [master] pull_request: branches: - master - - 1.3.x - paths-ignore: - - "doc/**" + - 1.2.x env: PYTEST_WORKERS: "auto" @@ -18,7 +15,7 @@ env: COVERAGE: true jobs: - Linux_py38_IO: + Linux_py37_IO: runs-on: ubuntu-latest defaults: run: @@ -26,13 +23,9 @@ jobs: strategy: matrix: - ENV_FILE: [ci/deps/actions-38-db-min.yaml, ci/deps/actions-38-db.yaml] + ENV_FILE: [ci/deps/actions-37-db-min.yaml, ci/deps/actions-37-db.yaml] fail-fast: false - concurrency: - group: ${{ github.ref }}-${{ matrix.ENV_FILE }} - cancel-in-progress: ${{github.event_name == 'pull_request'}} - services: mysql: image: mysql diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index 6b5ac3cd17304..3a4d3c106f851 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -2,15 +2,11 @@ name: Posix on: push: - branches: - - master - - 1.3.x + branches: [master] pull_request: branches: - master - - 1.3.x - paths-ignore: - - "doc/**" + - 1.2.x env: PYTEST_WORKERS: "auto" @@ -25,13 +21,14 @@ jobs: strategy: matrix: settings: [ - [actions-38-minimum_versions.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], - [actions-38-locale_slow.yaml, "slow", 
"language-pack-it xsel", "it_IT.utf8", "it_IT.utf8", "", ""], + [actions-37-minimum_versions.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], + [actions-37.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], + [actions-37-locale_slow.yaml, "slow", "language-pack-it xsel", "it_IT.utf8", "it_IT.utf8", "", ""], + [actions-37-slow.yaml, "slow", "", "", "", "", ""], [actions-38.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], [actions-38-slow.yaml, "slow", "", "", "", "", ""], [actions-38-locale.yaml, "not slow and not network", "language-pack-zh-hans xsel", "zh_CN.utf8", "zh_CN.utf8", "", ""], - [actions-39-slow.yaml, "slow", "", "", "", "", ""], - [actions-39-numpydev.yaml, "not slow and not network", "xsel", "", "", "deprecate", "-W error"], + [actions-38-numpydev.yaml, "not slow and not network", "xsel", "", "", "deprecate", "-W error"], [actions-39.yaml, "not slow and not network and not clipboard", "", "", "", "", ""] ] fail-fast: false @@ -44,9 +41,6 @@ jobs: LC_ALL: ${{ matrix.settings[4] }} PANDAS_TESTING_MODE: ${{ matrix.settings[5] }} TEST_ARGS: ${{ matrix.settings[6] }} - concurrency: - group: ${{ github.ref }}-${{ matrix.settings[0] }} - cancel-in-progress: ${{github.event_name == 'pull_request'}} steps: - name: Checkout diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 0609755678d78..723347913ac38 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -3,16 +3,11 @@ name: pre-commit on: pull_request: push: - branches: - - master - - 1.3.x + branches: [master] jobs: pre-commit: runs-on: ubuntu-latest - concurrency: - group: ${{ github.ref }}-pre-commit - cancel-in-progress: ${{github.event_name == 'pull_request'}} steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index ac06ddab4ade4..2643dc5ec656e 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -4,19 +4,9 @@ on: push: branches: - master - - 1.3.x pull_request: branches: - master - - 1.3.x - paths-ignore: - - "doc/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - PATTERN: "not slow and not network and not clipboard" - COVERAGE: true jobs: build: @@ -24,10 +14,6 @@ jobs: name: actions-310-dev timeout-minutes: 60 - concurrency: - group: ${{ github.ref }}-dev - cancel-in-progress: ${{github.event_name == 'pull_request'}} - steps: - uses: actions/checkout@v2 with: @@ -44,7 +30,7 @@ jobs: pip install git+https://github.com/numpy/numpy.git pip install git+https://github.com/pytest-dev/pytest.git pip install git+https://github.com/nedbat/coveragepy.git - pip install cython python-dateutil pytz hypothesis pytest-xdist pytest-cov + pip install cython python-dateutil pytz hypothesis pytest-xdist pip list - name: Build Pandas @@ -58,8 +44,7 @@ jobs: - name: Test with pytest run: | - ci/run_tests.sh - # GH 41935 + coverage run -m pytest -m 'not slow and not network and not clipboard' pandas continue-on-error: true - name: Publish test results diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml deleted file mode 100644 index 2e890506073a8..0000000000000 --- a/.github/workflows/sdist.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: sdist - -on: - push: - branches: - - master - - 1.3.x - pull_request: - branches: - - master - - 1.3.x - paths-ignore: - - "doc/**" - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 60 - 
defaults: - run: - shell: bash -l {0} - - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9"] - concurrency: - group: ${{github.ref}}-${{matrix.python-version}}-sdist - cancel-in-progress: ${{github.event_name == 'pull_request'}} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip setuptools wheel - - # GH 39416 - pip install numpy - - - name: Build pandas sdist - run: | - pip list - python setup.py sdist --formats=gztar - - - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: pandas-sdist - python-version: ${{ matrix.python-version }} - - - name: Install pandas from sdist - run: | - conda list - python -m pip install dist/*.gz - - - name: Import pandas - run: | - cd .. - conda list - python -c "import pandas; pandas.show_versions();" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3a07aae11dc1b..d580fcf4fc545 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,11 +9,11 @@ repos: - id: absolufy-imports files: ^pandas/ - repo: https://github.com/python/black - rev: 21.6b0 + rev: 21.5b2 hooks: - id: black - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.0.0 hooks: - id: codespell types_or: [python, rst, markdown] @@ -53,16 +53,16 @@ repos: types: [text] args: [--append-config=flake8/cython-template.cfg] - repo: https://github.com/PyCQA/isort - rev: 5.9.2 + rev: 5.8.0 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.21.0 + rev: v2.18.3 hooks: - id: pyupgrade - args: [--py38-plus] + args: [--py37-plus] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.9.0 + rev: v1.8.0 hooks: - id: rst-backticks - id: rst-directive-colons diff --git a/MANIFEST.in b/MANIFEST.in index f616fad6b1557..d0d93f2cdba8c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,19 +17,18 @@ global-exclude *.h5 global-exclude *.html global-exclude *.json global-exclude *.jsonl -global-exclude *.msgpack global-exclude *.pdf global-exclude *.pickle global-exclude *.png global-exclude *.pptx +global-exclude *.pyc +global-exclude *.pyd global-exclude *.ods global-exclude *.odt -global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so global-exclude *.xls -global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt @@ -40,13 +39,6 @@ global-exclude .DS_Store global-exclude .git* global-exclude \#* -global-exclude *.c -global-exclude *.cpp -global-exclude *.h - -global-exclude *.py[ocd] -global-exclude *.pxi - # GH 39321 # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist @@ -55,6 +47,3 @@ prune pandas/tests/io/parser/data include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl - -graft pandas/_libs/src -graft pandas/_libs/tslibs/src diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 5d7a76bc01d49..296101c9f9800 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -1,5 +1,7 @@ import numpy as np +from pandas.compat.numpy import np_version_under1p20 + from pandas import ( Categorical, NaT, @@ -278,6 +280,10 @@ class IsInLongSeriesLookUpDominates: def setup(self, dtype, MaxNumber, 
series_type): N = 10 ** 7 + # https://github.com/pandas-dev/pandas/issues/39844 + if not np_version_under1p20 and dtype in ("Int64", "Float64"): + raise NotImplementedError + if series_type == "random_hits": array = np.random.randint(0, MaxNumber, N) if series_type == "random_misses": @@ -288,8 +294,7 @@ def setup(self, dtype, MaxNumber, series_type): array = np.arange(N) + MaxNumber self.series = Series(array).astype(dtype) - - self.values = np.arange(MaxNumber).astype(dtype.lower()) + self.values = np.arange(MaxNumber).astype(dtype) def time_isin(self, dtypes, MaxNumber, series_type): self.series.isin(self.values) @@ -305,24 +310,18 @@ class IsInLongSeriesValuesDominate: def setup(self, dtype, series_type): N = 10 ** 7 + # https://github.com/pandas-dev/pandas/issues/39844 + if not np_version_under1p20 and dtype in ("Int64", "Float64"): + raise NotImplementedError + if series_type == "random": vals = np.random.randint(0, 10 * N, N) if series_type == "monotone": vals = np.arange(N) - self.values = vals.astype(dtype.lower()) + self.values = vals.astype(dtype) M = 10 ** 6 + 1 self.series = Series(np.arange(M)).astype(dtype) def time_isin(self, dtypes, series_type): self.series.isin(self.values) - - -class IsInWithLongTupples: - def setup(self): - t = tuple(range(1000)) - self.series = Series([t] * 1000) - self.values = [t] - - def time_isin(self): - self.series.isin(self.values) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e5834f311d259..c32eda4928da7 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -232,22 +232,6 @@ def time_to_html_mixed(self): self.df2.to_html() -class ToDict: - params = [["dict", "list", "series", "split", "records", "index"]] - param_names = ["orient"] - - def setup(self, orient): - data = np.random.randint(0, 1000, size=(10000, 4)) - self.int_df = DataFrame(data) - self.datetimelike_df = self.int_df.astype("timedelta64[ns]") - - def time_to_dict_ints(self, orient): - self.int_df.to_dict(orient=orient) - - def time_to_dict_datetimelike(self, orient): - self.datetimelike_df.to_dict(orient=orient) - - class ToNumpy: def setup(self): N = 10000 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ca951e946bad..1648985a56b91 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -832,18 +832,4 @@ def function(values): self.grouper.agg(function, engine="cython") -class Sample: - def setup(self): - N = 10 ** 3 - self.df = DataFrame({"a": np.zeros(N)}) - self.groups = np.arange(0, N) - self.weights = np.ones(N) - - def time_sample(self): - self.df.groupby(self.groups).sample(n=1) - - def time_sample_weights(self): - self.df.groupby(self.groups).sample(n=1, weights=self.weights) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 852c1e0d139e5..5ff9431fbf8e4 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -291,8 +291,7 @@ class ReadCSVFloatPrecision(StringIORewind): def setup(self, sep, decimal, float_precision): floats = [ - "".join([random.choice(string.digits) for _ in range(28)]) - for _ in range(15) + "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) ] rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" data = rows * 5 @@ -396,7 +395,7 @@ class ReadCSVCachedParseDates(StringIORewind): param_names = ["do_cache", "engine"] def setup(self, 
do_cache, engine): - data = ("\n".join([f"10/{year}" for year in range(2000, 2100)]) + "\n") * 10 + data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache, engine): diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 97294fc02834b..d35770b720f7a 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -296,8 +296,5 @@ def time_apply(self, method): table_method_func, raw=True, engine="numba" ) - def time_ewm_mean(self, method): - self.df.ewm(1, method=method).mean(engine="numba") - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4cdd495fe0c31..008033df3c833 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,24 +1,17 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - branches: - include: - - master - - 1.3.x - paths: - exclude: - - 'doc/*' +- master +- 1.2.x pr: - master - 1.2.x -- 1.3.x variables: PYTEST_WORKERS: auto jobs: -# Mac and Linux use the same template -- template: ci/azure/posix.yml +- template: ci/azure/macos.yml parameters: name: macOS vmImage: macOS-10.15 @@ -45,7 +38,6 @@ jobs: python -m pip install --no-build-isolation -e . && \ pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' - - task: PublishTestResults@2 condition: succeededOrFailed() inputs: diff --git a/ci/azure/macos.yml b/ci/azure/macos.yml new file mode 100644 index 0000000000000..e9bd41b9d1d50 --- /dev/null +++ b/ci/azure/macos.yml @@ -0,0 +1,61 @@ +parameters: + name: '' + vmImage: '' + +jobs: +- job: ${{ parameters.name }} + pool: + vmImage: ${{ parameters.vmImage }} + strategy: + matrix: + py38_macos: + ENV_FILE: ci/deps/azure-macos-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network" + + steps: + - bash: echo '##vso[task.prependpath]$CONDA/bin' + displayName: Add conda to PATH + + - bash: sudo chown -R $USER $CONDA + displayName: Take ownership of conda installation + + - bash: conda update --quiet --yes conda + displayName: Update conda + + - bash: conda env create --quiet --file $(ENV_FILE) + displayName: Create Anaconda environment + + - bash: | + eval "$(conda shell.bash hook)" + conda activate pandas-dev + conda list + python setup.py build_ext -q -j 4 + python -m pip install -e . 
--no-build-isolation --no-use-pep517 + displayName: Build pandas + + - bash: | + eval "$(conda shell.bash hook)" + conda activate pandas-dev + ci/run_tests.sh + displayName: Test + + - bash: | + eval "$(conda shell.bash hook)" + conda activate pandas-dev + python -c "import pandas; pandas.show_versions();" + displayName: Build versions + + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + failTaskOnFailedTests: true + testResultsFiles: 'test-data.xml' + testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} + displayName: Publish test results + + - bash: | + eval "$(conda shell.bash hook)" + conda activate pandas-dev + python ci/print_skipped.py + displayName: Print skipped tests diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml deleted file mode 100644 index 8b0167e52b813..0000000000000 --- a/ci/azure/posix.yml +++ /dev/null @@ -1,43 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - ${{ if eq(parameters.name, 'macOS') }}: - py38_macos: - ENV_FILE: ci/deps/azure-macos-38.yaml - CONDA_PY: "38" - PATTERN: "not slow and not network" - - steps: - - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - displayName: 'Set conda path' - - - script: ci/setup_env.sh - displayName: 'Setup environment and build pandas' - - - script: | - source activate pandas-dev - ci/run_tests.sh - displayName: 'Test' - - - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - displayName: 'Build versions' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' - - - script: | - source activate pandas-dev - python ci/print_skipped.py - displayName: 'Print skipped tests' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 05ba7c57ad6c4..28f8d2d349753 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -9,13 +9,13 @@ jobs: strategy: matrix: py38_np18: - ENV_FILE: ci/deps/azure-windows-38.yaml + ENV_FILE: ci\\deps\\azure-windows-38.yaml CONDA_PY: "38" PATTERN: "not slow and not network" PYTEST_WORKERS: 2 # GH-42236 py39: - ENV_FILE: ci/deps/azure-windows-39.yaml + ENV_FILE: ci\\deps\\azure-windows-39.yaml CONDA_PY: "39" PATTERN: "not slow and not network and not high_memory" PYTEST_WORKERS: 2 # GH-42236 @@ -23,33 +23,44 @@ jobs: steps: - powershell: | Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" - Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" - displayName: 'Add conda to PATH' - - script: conda update -q -n base conda - displayName: 'Update conda' + displayName: Add conda to PATH + + - script: conda update --quiet --yes conda + displayName: Update conda + + - script: conda env create --quiet --file $(ENV_FILE) + displayName: Create anaconda environment - bash: | - conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml - displayName: 'Create anaconda environment' - - bash: | - source activate pandas-dev + eval "$(conda shell.bash hook)" + conda activate pandas-dev conda list python setup.py build_ext -q -j 4 - python -m pip install --no-build-isolation -e . - displayName: 'Build' + python -m pip install -e . 
--no-build-isolation --no-use-pep517 + displayName: Build pandas + - bash: | - source activate pandas-dev + eval "$(conda shell.bash hook)" + conda activate pandas-dev ci/run_tests.sh - displayName: 'Test' + displayName: Test + + - bash: | + eval "$(conda shell.bash hook)" + conda activate pandas-dev + python -c "import pandas; pandas.show_versions();" + displayName: Build versions + - task: PublishTestResults@2 condition: succeededOrFailed() inputs: failTaskOnFailedTests: true testResultsFiles: 'test-data.xml' testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' + displayName: Publish test results - bash: | - source activate pandas-dev + eval "$(conda shell.bash hook)" + conda activate pandas-dev python ci/print_skipped.py - displayName: 'Print skipped tests' + displayName: Print skipped tests diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 59548ecd3c710..1844cb863c183 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -38,7 +38,10 @@ function invgrep { } if [[ "$GITHUB_ACTIONS" == "true" ]]; then + FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code)s:%(text)s" INVGREP_PREPEND="##[error]" +else + FLAKE8_FORMAT="default" fi ### LINTING ### diff --git a/ci/deps/actions-38-db-min.yaml b/ci/deps/actions-37-db-min.yaml similarity index 66% rename from ci/deps/actions-38-db-min.yaml rename to ci/deps/actions-37-db-min.yaml index c93f791b7dba7..cae4361ca37a7 100644 --- a/ci/deps/actions-38-db-min.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -2,14 +2,14 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.7.* # tools - cython>=0.29.21 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # required - numpy<1.20 # GH#39541 compat for pyarrow<3 @@ -18,30 +18,31 @@ dependencies: # optional - beautifulsoup4 - - blosc=1.20.1 + - blosc=1.17.0 - python-blosc - fastparquet=0.4.0 - html5lib - ipython - jinja2 - - lxml=4.5.0 + - lxml=4.3.0 - matplotlib - nomkl - numexpr - openpyxl - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 - protobuf>=3.12.4 - pyarrow=0.17.1 # GH 38803 - - pytables>=3.6.1 + - pytables>=3.5.1 - scipy - - xarray=0.15.1 - - xlrd + - xarray=0.12.3 + - xlrd<2.0 - xlsxwriter - xlwt - moto - flask # sql - - psycopg2=2.8.4 - - pymysql=0.10.1 - - sqlalchemy=1.3.11 + - psycopg2=2.7 + - pymysql=0.8.1 + - sqlalchemy=1.3.0 diff --git a/ci/deps/actions-38-db.yaml b/ci/deps/actions-37-db.yaml similarity index 82% rename from ci/deps/actions-38-db.yaml rename to ci/deps/actions-37-db.yaml index b4495fa6887f4..e568f8615a8df 100644 --- a/ci/deps/actions-38-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -2,13 +2,13 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.7.* # tools - cython>=0.29.21 - pytest>=6.0 - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies @@ -16,7 +16,7 @@ dependencies: - botocore>=1.11 - dask - fastparquet>=0.4.0 - - fsspec>=0.7.4, <2021.6.0 + - fsspec>=0.7.4 - gcsfs>=0.6.0 - geopandas - html5lib @@ -25,10 +25,11 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.18 + - numpy=1.17.* - odfpy - openpyxl - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 - psycopg2 - pyarrow>=0.17.0 - pymysql @@ -42,7 +43,7 @@ dependencies: - sqlalchemy - statsmodels - xarray - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - pip diff --git 
a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml new file mode 100644 index 0000000000000..c6eb3b00a63ac --- /dev/null +++ b/ci/deps/actions-37-locale_slow.yaml @@ -0,0 +1,30 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.7.* + + # tools + - cython>=0.29.21 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - beautifulsoup4=4.6.0 + - bottleneck=1.2.* + - lxml + - matplotlib=3.0.0 + - numpy=1.17.* + - openpyxl=3.0.0 + - python-dateutil + - python-blosc + - pytz=2017.3 + - scipy + - sqlalchemy=1.3.0 + - xlrd=1.2.0 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 + - html5lib=1.0.1 diff --git a/ci/deps/actions-37-minimum_versions.yaml b/ci/deps/actions-37-minimum_versions.yaml new file mode 100644 index 0000000000000..b97601d18917c --- /dev/null +++ b/ci/deps/actions-37-minimum_versions.yaml @@ -0,0 +1,31 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.7.1 + + # tools + - cython=0.29.21 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - psutil + + # pandas dependencies + - beautifulsoup4=4.6.0 + - bottleneck=1.2.1 + - jinja2=2.10 + - numba=0.46.0 + - numexpr=2.7.0 + - numpy=1.17.3 + - openpyxl=3.0.0 + - pytables=3.5.1 + - python-dateutil=2.7.3 + - pytz=2017.3 + - pyarrow=0.17.0 + - scipy=1.2 + - xlrd=1.2.0 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 + - html5lib=1.0.1 diff --git a/ci/deps/actions-39-slow.yaml b/ci/deps/actions-37-slow.yaml similarity index 63% rename from ci/deps/actions-39-slow.yaml rename to ci/deps/actions-37-slow.yaml index a39504bae1bca..166f2237dcad3 100644 --- a/ci/deps/actions-39-slow.yaml +++ b/ci/deps/actions-37-slow.yaml @@ -3,40 +3,37 @@ channels: - defaults - conda-forge dependencies: - - python=3.9 + - python=3.7.* # tools - cython>=0.29.21 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # pandas dependencies - beautifulsoup4 - - bottleneck - - fsspec>=0.8.0, <2021.6.0 - - gcsfs + - fsspec>=0.7.4 - html5lib - - jinja2 - lxml - matplotlib - - moto>=1.3.14 - - flask - numexpr - numpy - openpyxl - - pyarrow + - patsy + - psycopg2 + - pymysql - pytables - python-dateutil - pytz - - s3fs>=0.4.2 + - s3fs>=0.4.0 + - moto>=1.3.14 - scipy - sqlalchemy - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - - pyreadstat - - pip - - pip: - - pyxlsb + - moto + - flask + - numba diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml new file mode 100644 index 0000000000000..0effe6f80df86 --- /dev/null +++ b/ci/deps/actions-37.yaml @@ -0,0 +1,28 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.7.* + + # tools + - cython>=0.29.21 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - botocore>=1.11 + - fsspec>=0.7.4 + - numpy=1.19 + - python-dateutil + - nomkl + - pyarrow + - pytz + - s3fs>=0.4.0 + - moto>=1.3.14 + - flask + - tabulate + - pyreadstat + - pip diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml index 28584de509f34..34a6860936550 100644 --- a/ci/deps/actions-38-locale.yaml +++ b/ci/deps/actions-38-locale.yaml @@ -2,15 +2,15 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.8.* # tools - cython>=0.29.21 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.31 + - pytest-xdist>=1.21 - pytest-asyncio>=0.12.0 - - hypothesis>=5.5.3 + - hypothesis>=3.58.0 # pandas dependencies - beautifulsoup4 @@ -31,7 
+31,7 @@ dependencies: - pytz - scipy - xarray - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - moto diff --git a/ci/deps/actions-38-locale_slow.yaml b/ci/deps/actions-38-locale_slow.yaml deleted file mode 100644 index e7276027f2a41..0000000000000 --- a/ci/deps/actions-38-locale_slow.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.8 - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 - - # pandas dependencies - - beautifulsoup4=4.8.2 - - bottleneck=1.3.1 - - lxml - - matplotlib=3.3.2 - - numpy=1.18 - - openpyxl=3.0.2 - - python-dateutil - - python-blosc - - pytz=2020.1 - - scipy - - sqlalchemy=1.3.11 - - xlrd=2.0.1 - - xlsxwriter=1.2.2 - - xlwt=1.3.0 - - html5lib=1.1 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml deleted file mode 100644 index d666bc3b555f5..0000000000000 --- a/ci/deps/actions-38-minimum_versions.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8.0 - - # tools - - cython=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 - - psutil - - # pandas dependencies - - beautifulsoup4=4.8.2 - - bottleneck=1.3.1 - - jinja2=2.11 - - numba=0.50.1 - - numexpr=2.7.1 - - numpy=1.18.5 - - openpyxl=3.0.2 - - pytables=3.6.1 - - python-dateutil=2.8.1 - - pytz=2020.1 - - pyarrow=0.17.0 - - scipy=1.4.1 - - xlrd=2.0.1 - - xlsxwriter=1.2.2 - - xlwt=1.3.0 - - html5lib=1.1 diff --git a/ci/deps/actions-39-numpydev.yaml b/ci/deps/actions-38-numpydev.yaml similarity index 84% rename from ci/deps/actions-39-numpydev.yaml rename to ci/deps/actions-38-numpydev.yaml index 466ca6215f46a..6eed2daac0c3b 100644 --- a/ci/deps/actions-39-numpydev.yaml +++ b/ci/deps/actions-38-numpydev.yaml @@ -2,13 +2,13 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.9 + - python=3.8.* # tools - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # pandas dependencies - pytz diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml index 08900a31fe27c..afba60e451b90 100644 --- a/ci/deps/actions-38-slow.yaml +++ b/ci/deps/actions-38-slow.yaml @@ -2,18 +2,18 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.8.* # tools - cython>=0.29.21 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # pandas dependencies - beautifulsoup4 - - fsspec>=0.7.4, <2021.6.0 + - fsspec>=0.7.4 - html5lib - lxml - matplotlib @@ -30,7 +30,7 @@ dependencies: - moto>=1.3.14 - scipy - sqlalchemy - - xlrd + - xlrd>=2.0 - xlsxwriter - xlwt - moto diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 86b038ff7d4b6..11daa92046eb4 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -3,14 +3,14 @@ channels: - defaults - conda-forge dependencies: - - python=3.8 + - python=3.8.* # tools - cython>=0.29.21 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # pandas dependencies - numpy diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 41456572e3bf7..b74f1af8ee0f6 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -2,40 +2,21 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9 + - python=3.9.* # tools - cython>=0.29.21 - pytest>=6.0 - pytest-cov - - 
pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # pandas dependencies - - beautifulsoup4 - - bottleneck - - fsspec>=0.8.0, <2021.6.0 - - gcsfs - - html5lib - - jinja2 - - lxml - - matplotlib - - moto>=1.3.14 - - flask - - numexpr - numpy - - openpyxl - - pyarrow - - pytables - python-dateutil - pytz - - s3fs>=0.4.2 + + # optional dependencies + - pytables - scipy - - sqlalchemy - - xlrd - - xlsxwriter - - xlwt - - pyreadstat - - pip - - pip: - - pyxlsb + - pyarrow=1.0 diff --git a/ci/deps/azure-macos-38.yaml b/ci/deps/azure-macos-37.yaml similarity index 73% rename from ci/deps/azure-macos-38.yaml rename to ci/deps/azure-macos-37.yaml index 029d088362aa9..43e1055347f17 100644 --- a/ci/deps/azure-macos-38.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -3,12 +3,12 @@ channels: - defaults - conda-forge dependencies: - - python=3.8 + - python=3.7.* # tools - pytest>=6.0 - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies @@ -17,17 +17,17 @@ dependencies: - html5lib - jinja2 - lxml - - matplotlib=3.3.2 + - matplotlib=2.2.3 - nomkl - numexpr - - numpy=1.18.5 + - numpy=1.17.3 - openpyxl - pyarrow=0.17 - pytables - - python-dateutil==2.8.1 + - python-dateutil==2.7.3 - pytz - xarray - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - pip diff --git a/ci/deps/azure-windows-39.yaml b/ci/deps/azure-windows-37.yaml similarity index 72% rename from ci/deps/azure-windows-39.yaml rename to ci/deps/azure-windows-37.yaml index 57b2a4a984f92..5cbc029f8c03d 100644 --- a/ci/deps/azure-windows-39.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -1,39 +1,39 @@ name: pandas-dev channels: - - conda-forge - defaults + - conda-forge dependencies: - - python=3.9 + - python=3.7.* # tools - cython>=0.29.21 - pytest>=6.0 - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies - beautifulsoup4 - bottleneck - - fsspec>=0.8.0, <2021.6.0 - - gcsfs + - fsspec>=0.8.0 + - gcsfs>=0.6.0 - html5lib - jinja2 - lxml - - matplotlib + - matplotlib=2.2.* - moto>=1.3.14 - flask - numexpr - - numpy + - numpy=1.17.* - openpyxl - - pyarrow + - pyarrow=0.17.0 - pytables - python-dateutil - pytz - s3fs>=0.4.2 - scipy - sqlalchemy - - xlrd + - xlrd>=2.0 - xlsxwriter - xlwt - pyreadstat diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index c56496bce7d6c..7fdecae626f9d 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -3,13 +3,13 @@ channels: - conda-forge - defaults dependencies: - - python=3.8 + - python=3.8.* # tools - cython>=0.29.21 - pytest>=6.0 - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies @@ -17,20 +17,20 @@ dependencies: - bottleneck - fastparquet>=0.4.0 - flask - - fsspec>=0.8.0, <2021.6.0 - - matplotlib=3.3.2 + - fsspec>=0.8.0 + - matplotlib=3.1.3 - moto>=1.3.14 - numba - numexpr - - numpy=1.18 + - numpy=1.18.* - openpyxl - jinja2 - - pyarrow=0.17.0 + - pyarrow>=0.17.0 - pytables - python-dateutil - pytz - s3fs>=0.4.0 - scipy - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-37-arm64.yaml similarity index 78% rename from ci/deps/circle-38-arm64.yaml rename to ci/deps/circle-37-arm64.yaml index 17fe5b4b7b77b..995ebda1f97e7 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-37-arm64.yaml @@ -2,13 +2,13 @@ name: pandas-dev channels: - 
conda-forge dependencies: - - python=3.8 + - python=3.7.* # tools - cython>=0.29.21 - pytest>=6.0 - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 # pandas dependencies - botocore>=1.11 diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 721b1af126709..e812aaa760a8f 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -812,21 +812,7 @@ Changes should be reflected in the release notes located in ``doc/source/whatsne This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the -issue/pull request number). Your entry should be written using full sentences and proper -grammar. - -When mentioning parts of the API, use a Sphinx ``:func:``, ``:meth:``, or ``:class:`` -directive as appropriate. Not all public API functions and methods have a -documentation page; ideally links would only be added if they resolve. You can -usually find similar examples by checking the release notes for one of the previous -versions. - -If your code is a bugfix, add your entry to the relevant bugfix section. Avoid -adding to the ``Other`` section; only in rare cases should entries go there. -Being as concise as possible, the description of the bug should include how the -user may encounter it and an indication of the bug itself, e.g. -"produces incorrect results" or "incorrectly raises". It may be necessary to also -indicate the new behavior. +issue/pull request number). If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index f3e6f6129f5d7..bc0a3556b9ac1 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -72,7 +72,7 @@ These packages will automatically be installed by using the ``pandas`` **Windows** -You will need `Build Tools for Visual Studio 2019 +You will need `Build Tools for Visual Studio 2017 `_. .. warning:: diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index e58779c090d8f..ee061e7b7d3e6 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -445,12 +445,6 @@ provides a familiar ``DataFrame`` interface for out-of-core, parallel and distri Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow. -`Ibis `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Ibis offers a standard way to write analytics code, that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). 
- - `Koalas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index e5c6a69ce0e30..88e54421daa11 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -20,7 +20,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.8, and 3.9. +Officially Python 3.7.1 and above, 3.8, and 3.9. Installing pandas ----------------- @@ -221,9 +221,9 @@ Dependencies ================================================================ ========================== Package Minimum supported version ================================================================ ========================== -`NumPy `__ 1.18.5 -`python-dateutil `__ 2.8.1 -`pytz `__ 2020.1 +`NumPy `__ 1.17.3 +`python-dateutil `__ 2.7.3 +`pytz `__ 2017.3 ================================================================ ========================== .. _install.recommended_dependencies: @@ -233,11 +233,11 @@ Recommended dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.7.1 or higher. + If installed, must be Version 2.7.0 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, - must be Version 1.3.1 or higher. + must be Version 1.2.1 or higher. .. note:: @@ -262,8 +262,9 @@ Visualization ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -matplotlib 3.3.2 Plotting library -Jinja2 2.11 Conditional formatting with DataFrame.style +setuptools 38.6.0 Utils for entry points of plotting backend +matplotlib 2.2.3 Plotting library +Jinja2 2.10 Conditional formatting with DataFrame.style tabulate 0.8.7 Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ============================================================= @@ -273,10 +274,10 @@ Computation ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -SciPy 1.14.1 Miscellaneous statistical functions -numba 0.50.1 Alternative execution engine for rolling operations +SciPy 1.12.0 Miscellaneous statistical functions +numba 0.46.0 Alternative execution engine for rolling operations (see :ref:`Enhancing Performance `) -xarray 0.15.1 pandas-like API for N-dimensional data +xarray 0.12.3 pandas-like API for N-dimensional data ========================= ================== ============================================================= Excel files @@ -285,10 +286,10 @@ Excel files ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -xlrd 2.0.1 Reading Excel +xlrd 1.2.0 Reading Excel xlwt 1.3.0 Writing Excel -xlsxwriter 1.2.2 Writing Excel -openpyxl 3.0.2 Reading / writing for xlsx files +xlsxwriter 1.0.2 Writing Excel +openpyxl 3.0.0 Reading / 
writing for xlsx files pyxlsb 1.0.6 Reading for xlsb files ========================= ================== ============================================================= @@ -298,9 +299,9 @@ HTML ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -BeautifulSoup4 4.8.2 HTML parser for read_html -html5lib 1.1 HTML parser for read_html -lxml 4.5.0 HTML parser for read_html +BeautifulSoup4 4.6.0 HTML parser for read_html +html5lib 1.0.1 HTML parser for read_html +lxml 4.3.0 HTML parser for read_html ========================= ================== ============================================================= One of the following combinations of libraries is needed to use the @@ -333,7 +334,7 @@ XML ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -lxml 4.5.0 XML parser for read_xml and tree builder for to_xml +lxml 4.3.0 XML parser for read_xml and tree builder for to_xml ========================= ================== ============================================================= SQL databases @@ -342,9 +343,9 @@ SQL databases ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -SQLAlchemy 1.3.11 SQL support for databases other than sqlite -psycopg2 2.8.4 PostgreSQL engine for sqlalchemy -pymysql 0.10.1 MySQL engine for sqlalchemy +SQLAlchemy 1.3.0 SQL support for databases other than sqlite +psycopg2 2.7 PostgreSQL engine for sqlalchemy +pymysql 0.8.1 MySQL engine for sqlalchemy ========================= ================== ============================================================= Other data sources @@ -353,8 +354,8 @@ Other data sources ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -PyTables 3.6.1 HDF5-based reading / writing -blosc 1.20.1 Compression for HDF5 +PyTables 3.5.1 HDF5-based reading / writing +blosc 1.17.0 Compression for HDF5 zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing pyarrow 0.17.0 Parquet, ORC, and feather reading / writing @@ -384,7 +385,7 @@ Dependency Minimum Version Notes ========================= ================== ============================================================= fsspec 0.7.4 Handling files aside from simple local and HTTP gcsfs 0.6.0 Google Cloud Storage access -pandas-gbq 0.14.0 Google Big Query access +pandas-gbq 0.12.0 Google Big Query access s3fs 0.4.0 Amazon S3 access ========================= ================== ============================================================= diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index a349251bdfca6..b8940d2efed2f 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -18,19 +18,6 @@ entails. For the table of contents, see the `pandas-cookbook GitHub repository `_. 
-pandas workshop by Stefanie Molin ---------------------------------- - -An introductory workshop by `Stefanie Molin `_ -designed to quickly get you up to speed with pandas using real-world datasets. -It covers getting started with pandas, data wrangling, and data visualization -(with some exposure to matplotlib and seaborn). The -`pandas-workshop GitHub repository `_ -features detailed environment setup instructions (including a Binder environment), -slides and notebooks for following along, and exercises to practice the concepts. -There is also a lab with new exercises on a dataset not covered in the workshop for -additional practice. - Learn pandas by Hernan Rojas ---------------------------- diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 82d4ec4950ef1..442631de50c7a 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -13,7 +13,6 @@ Pickling :toctree: api/ read_pickle - DataFrame.to_pickle Flat file ~~~~~~~~~ @@ -22,7 +21,6 @@ Flat file read_table read_csv - DataFrame.to_csv read_fwf Clipboard @@ -31,7 +29,6 @@ Clipboard :toctree: api/ read_clipboard - DataFrame.to_clipboard Excel ~~~~~ @@ -39,33 +36,23 @@ Excel :toctree: api/ read_excel - DataFrame.to_excel ExcelFile.parse -.. currentmodule:: pandas.io.formats.style - -.. autosummary:: - :toctree: api/ - - Styler.to_excel - -.. currentmodule:: pandas - .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst ExcelWriter -.. currentmodule:: pandas.io.json - JSON ~~~~ .. autosummary:: :toctree: api/ read_json - to_json + json_normalize + +.. currentmodule:: pandas.io.json .. autosummary:: :toctree: api/ @@ -80,16 +67,6 @@ HTML :toctree: api/ read_html - DataFrame.to_html - -.. currentmodule:: pandas.io.formats.style - -.. autosummary:: - :toctree: api/ - - Styler.to_html - -.. currentmodule:: pandas XML ~~~~ @@ -97,23 +74,6 @@ XML :toctree: api/ read_xml - DataFrame.to_xml - -Latex -~~~~~ -.. autosummary:: - :toctree: api/ - - DataFrame.to_latex - -.. currentmodule:: pandas.io.formats.style - -.. autosummary:: - :toctree: api/ - - Styler.to_latex - -.. currentmodule:: pandas HDFStore: PyTables (HDF5) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -141,7 +101,6 @@ Feather :toctree: api/ read_feather - DataFrame.to_feather Parquet ~~~~~~~ @@ -149,7 +108,6 @@ Parquet :toctree: api/ read_parquet - DataFrame.to_parquet ORC ~~~ @@ -180,7 +138,6 @@ SQL read_sql_table read_sql_query read_sql - DataFrame.to_sql Google BigQuery ~~~~~~~~~~~~~~~ @@ -195,7 +152,6 @@ STATA :toctree: api/ read_stata - DataFrame.to_stata .. 
currentmodule:: pandas.io.stata diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 7b790daea37ff..5a2ff803f0323 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -24,8 +24,6 @@ Styler properties Styler.env Styler.template_html - Styler.template_html_style - Styler.template_html_table Styler.template_latex Styler.loader @@ -36,15 +34,13 @@ Style application Styler.apply Styler.applymap + Styler.where Styler.format - Styler.hide_index - Styler.hide_columns Styler.set_td_classes Styler.set_table_styles Styler.set_table_attributes Styler.set_tooltips Styler.set_caption - Styler.set_sticky Styler.set_properties Styler.set_uuid Styler.clear diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 6f9d8eb3474c2..f65638cd78a2b 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -777,8 +777,8 @@ value is included in the ``categories``: df try: df.iloc[2:4, :] = [["c", 3], ["c", 3]] - except TypeError as e: - print("TypeError:", str(e)) + except ValueError as e: + print("ValueError:", str(e)) Setting values by assigning categorical data will also check that the ``categories`` match: @@ -788,8 +788,8 @@ Setting values by assigning categorical data will also check that the ``categori df try: df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"]) - except TypeError as e: - print("TypeError:", str(e)) + except ValueError as e: + print("ValueError:", str(e)) Assigning a ``Categorical`` to parts of a column of other types will use the values: diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index c78d972f33d65..aa9a1ba6d6bf0 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -302,63 +302,28 @@ For more about ``boundscheck`` and ``wraparound``, see the Cython docs on .. _enhancingperf.numba: -Numba (JIT compilation) ------------------------ +Using Numba +----------- -An alternative to statically compiling Cython code is to use a dynamic just-in-time (JIT) compiler with `Numba `__. +A recent alternative to statically compiling Cython code, is to use a *dynamic jit-compiler*, Numba. -Numba allows you to write a pure Python function which can be JIT compiled to native machine instructions, similar in performance to C, C++ and Fortran, -by decorating your function with ``@jit``. +Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters. -Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). -Numba supports compilation of Python to run on either CPU or GPU hardware and is designed to integrate with the Python scientific software stack. +Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack. .. 
note::

-   The ``@jit`` compilation will add overhead to the runtime of the function, so performance benefits may not be realized especially when using small data sets.
-   Consider `caching `__ your function to avoid compilation overhead each time your function is run.
+   You will need to install Numba. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`.

-Numba can be used in 2 ways with pandas:
-
-#. Specify the ``engine="numba"`` keyword in select pandas methods
-#. Define your own Python function decorated with ``@jit`` and pass the underlying NumPy array of :class:`Series` or :class:`Dataframe` (using ``to_numpy()``) into the function
-
-pandas Numba Engine
-~~~~~~~~~~~~~~~~~~~
-
-If Numba is installed, one can specify ``engine="numba"`` in select pandas methods to execute the method using Numba.
-Methods that support ``engine="numba"`` will also have an ``engine_kwargs`` keyword that accepts a dictionary that allows one to specify
-``"nogil"``, ``"nopython"`` and ``"parallel"`` keys with boolean values to pass into the ``@jit`` decorator.
-If ``engine_kwargs`` is not specified, it defaults to ``{"nogil": False, "nopython": True, "parallel": False}`` unless otherwise specified.
-
-In terms of performance, **the first time a function is run using the Numba engine will be slow**
-as Numba will have some function compilation overhead. However, the JIT compiled functions are cached,
-and subsequent calls will be fast. In general, the Numba engine is performant with
-a larger amount of data points (e.g. 1+ million).
-
-.. code-block:: ipython
-
-   In [1]: data = pd.Series(range(1_000_000)) # noqa: E225
-
-   In [2]: roll = data.rolling(10)
+.. note::

-   In [3]: def f(x):
-      ...:     return np.sum(x) + 5
-   # Run the first time, compilation time will affect performance
-   In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True)
-   1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
-   # Function is cached and performance will improve
-   In [5]: %timeit roll.apply(f, engine='numba', raw=True)
-   188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   As of Numba version 0.20, pandas objects cannot be passed directly to Numba-compiled functions. Instead, one must pass the NumPy array underlying the pandas object to the Numba-compiled function as demonstrated below.

-   In [6]: %timeit roll.apply(f, engine='cython', raw=True)
-   3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+Jit
+~~~
-Custom Function Examples
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-A custom Python function decorated with ``@jit`` can be used with pandas objects by passing their NumPy array
-representations with ``to_numpy()``.
+We demonstrate how to use Numba to just-in-time compile our code. We simply
+take the plain Python code from above and annotate with the ``@jit`` decorator.

 .. code-block:: python

@@ -395,6 +360,8 @@ representations with ``to_numpy()``.
         )
         return pd.Series(result, index=df.index, name="result")

+Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a
+nicer interface by passing/returning pandas objects.

 .. code-block:: ipython

@@ -403,9 +370,19 @@ representations with ``to_numpy()``.

 In this example, using Numba was faster than Cython.

+Numba as an argument
+~~~~~~~~~~~~~~~~~~~~
+
+Additionally, we can leverage the power of `Numba `__
+by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools
+` for an extensive example.
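A minimal sketch of the pattern described above, assuming Numba is installed; the window size and the function body are illustrative placeholders only:

.. code-block:: python

   import numpy as np
   import pandas as pd

   def mean_plus_one(window):
       # plain NumPy operations, so Numba can compile this in nopython mode
       return np.mean(window) + 1.0

   s = pd.Series(np.arange(1_000_000, dtype="float64"))
   # raw=True hands each window to the function as a NumPy array,
   # which is what the Numba engine requires
   result = s.rolling(100).apply(mean_plus_one, engine="numba", raw=True)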
+
+Vectorize
+~~~~~~~~~
+
 Numba can also be used to write vectorized functions that do not require the user to explicitly loop over the observations of a vector; a vectorized function will be applied to each row automatically.
-Consider the following example of doubling each observation:
+Consider the following toy example of doubling each observation:

 .. code-block:: python

@@ -437,23 +414,25 @@ Consider the following example of doubling each observation:
 Caveats
 ~~~~~~~

+.. note::
+
+   Numba will execute on any function, but can only accelerate certain classes of functions.
+
 Numba is best at accelerating functions that apply numerical functions to NumPy
-arrays. If you try to ``@jit`` a function that contains unsupported `Python `__
-or `NumPy `__
-code, compilation will revert `object mode `__ which
-will mostly likely not speed up your function. If you would
+arrays. When passed a function that only uses operations it knows how to
+accelerate, it will execute in ``nopython`` mode.
+
+If Numba is passed a function that includes something it doesn't know how to
+work with -- a category that currently includes sets, lists, dictionaries, or
+string functions -- it will revert to ``object mode``. In ``object mode``,
+Numba will execute but your code will not speed up significantly. If you would
 prefer that Numba throw an error if it cannot compile a function in a way that
 speeds up your code, pass Numba the argument
-``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on
+``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on
 troubleshooting Numba modes, see the `Numba troubleshooting page `__.

-Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe
-behavior. You can first `specify a safe threading layer `__
-before running a JIT function with ``parallel=True``.
-
-Generally if the you encounter a segfault (``SIGSEGV``) while using Numba, please report the issue
-to the `Numba issue tracker. `__
+Read more in the `Numba docs `__.

 .. _enhancingperf.eval:

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 2cd6efe592277..870ec6763c72f 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -391,6 +391,7 @@ something different for each of the columns. Thus, using ``[]`` similar to
 getting a column from a DataFrame, you can do:

 .. ipython:: python
+   :suppress:

    df = pd.DataFrame(
        {
@@ -401,7 +402,7 @@ getting a column from a DataFrame, you can do:
        }
    )

-   df
+.. ipython:: python

    grouped = df.groupby(["A"])
    grouped_C = grouped["C"]
@@ -1105,9 +1106,11 @@ Numba Accelerated Routines

 .. versionadded:: 1.1

 If `Numba `__ is installed as an optional dependency, the ``transform`` and
-``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments.
-See :ref:`enhancing performance with Numba ` for general usage of the arguments
-and performance considerations.
+``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
+argument is a dictionary of keyword arguments that will be passed into the
+`numba.jit decorator `__.
+These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
+and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.
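A minimal sketch of how those keyword arguments are passed through; the column names and the aggregation body here are illustrative only:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"key": ["a", "b"] * 500, "value": range(1000)})

   def f(values, index):
       # the Numba engine requires this (values, index) signature
       total = 0.0
       for v in values:
           total += v
       return total

   result = df.groupby("key")["value"].aggregate(
       f,
       engine="numba",
       engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
   )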
The function signature must start with ``values, index`` **exactly** as the data belonging to each group will be passed into ``values``, and the group index will be passed into ``index``. @@ -1118,6 +1121,52 @@ will be passed into ``values``, and the group index will be passed into ``index` data and group index will be passed as NumPy arrays to the JITed user defined function, and no alternative execution attempts will be tried. +.. note:: + + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, the compiled functions are cached, + and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: N = 10 ** 3 + + In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N} + + In [3]: df = pd.DataFrame(data, columns=[0, 1]) + + In [4]: def f_numba(values, index): + ...: total = 0 + ...: for i, value in enumerate(values): + ...: if i % 2: + ...: total += value + 5 + ...: else: + ...: total += value * 2 + ...: return total + ...: + + In [5]: def f_cython(values): + ...: total = 0 + ...: for i, value in enumerate(values): + ...: if i % 2: + ...: total += value + 5 + ...: else: + ...: total += value * 2 + ...: return total + ...: + + In [6]: groupby = df.groupby(0) + # Run the first time, compilation time will affect performance + In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225 + 2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [8]: %timeit groupby.aggregate(f_numba, engine='numba') + 4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [9]: %timeit groupby.aggregate(f_cython, engine='cython') + 18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + Other useful features --------------------- diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 584dd0f52ae28..dc66303a44f53 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -701,7 +701,7 @@ Having a duplicated index will raise for a ``.reindex()``: .. code-block:: ipython In [17]: s.reindex(labels) - ValueError: cannot reindex on an axis with duplicate labels + ValueError: cannot reindex from a duplicate axis Generally, you can intersect the desired labels with the current axis, and then reindex. @@ -717,7 +717,7 @@ However, this would *still* raise if your resulting index is duplicated. In [41]: labels = ['a', 'd'] In [42]: s.loc[s.index.intersection(labels)].reindex(labels) - ValueError: cannot reindex on an axis with duplicate labels + ValueError: cannot reindex from a duplicate axis .. _indexing.basics.partial_setting: @@ -1523,8 +1523,8 @@ Looking up values by index/column labels ---------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels -and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. -For instance: +and column labels, this can be achieved by ``DataFrame.melt`` combined by filtering the corresponding +rows with ``DataFrame.loc``. For instance: .. 
ipython:: python @@ -1532,8 +1532,9 @@ For instance: 'A': [80, 23, np.nan, 22], 'B': [80, 55, 76, 67]}) df - idx, cols = pd.factorize(df['col']) - df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] + melt = df.melt('col') + melt = melt.loc[melt['col'] == melt['variable'], 'value'] + melt.reset_index(drop=True) Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method which was deprecated in version 1.2.0. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 939fd5b832cef..c2b030d732ba9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5526,23 +5526,13 @@ below and the SQLAlchemy `documentation `__ -for an explanation of how the database connection is handled. +If you want to manage your own connections you can pass one of those instead: .. code-block:: python with engine.connect() as conn, conn.begin(): data = pd.read_sql_table("data", conn) -.. warning:: - - When you open a connection to a database you are also responsible for closing it. - Side effects of leaving a connection open may include locking the database or - other breaking behaviour. - Writing DataFrames '''''''''''''''''' @@ -5699,7 +5689,7 @@ Example of a callable using PostgreSQL `COPY clause writer.writerows(data_iter) s_buf.seek(0) - columns = ', '.join(['"{}"'.format(k) for k in keys]) + columns = ', '.join('"{}"'.format(k) for k in keys) if table.schema: table_name = '{}.{}'.format(table.schema, table.name) else: diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f77d134d75988..7d8d8e90dfbda 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -152,7 +152,7 @@ "\n", "Before adding styles it is useful to show that the [Styler][styler] can distinguish the *display* value from the *actual* value. To control the display value, the text is printed in each cell, and we can use the [.format()][formatfunc] method to manipulate this according to a [format spec string][format] or a callable that takes a single value and returns a string. It is possible to define this for the whole table or for individual columns. \n", "\n", - "Additionally, the format function has a **precision** argument to specifically help formatting floats, as well as **decimal** and **thousands** separators to support other locales, an **na_rep** argument to display missing data, and an **escape** argument to help displaying safe-HTML or safe-LaTeX. The default formatter is configured to adopt pandas' regular `display.precision` option, controllable using `with pd.option_context('display.precision', 2):`\n", + "Additionally, the format function has a **precision** argument to specifically help formatting floats, an **na_rep** argument to display missing data, and an **escape** argument to help displaying safe-HTML. 
The default formatter is configured to adopt pandas' regular `display.precision` option, controllable using `with pd.option_context('display.precision', 2):`\n", "\n", "Here is an example of using the multiple options to control the formatting generally and with specific column formatters.\n", "\n", @@ -167,9 +167,9 @@ "metadata": {}, "outputs": [], "source": [ - "df.style.format(precision=0, na_rep='MISSING', thousands=\" \",\n", + "df.style.format(precision=0, na_rep='MISSING', \n", " formatter={('Decision Tree', 'Tumour'): \"{:.2f}\",\n", - " ('Regression', 'Non-Tumour'): lambda x: \"$ {:,.1f}\".format(x*-1e6)\n", + " ('Regression', 'Non-Tumour'): lambda x: \"$ {:,.1f}\".format(x*-1e3)\n", " })" ] }, @@ -179,11 +179,9 @@ "source": [ "### Hiding Data\n", "\n", - "The index and column headers can be completely hidden, as well subselecting rows or columns that one wishes to exclude. Both these options are performed using the same methods.\n", + "The index can be hidden from rendering by calling [.hide_index()][hideidx], which might be useful if your index is integer based.\n", "\n", - "The index can be hidden from rendering by calling [.hide_index()][hideidx] without any arguments, which might be useful if your index is integer based. Similarly column headers can be hidden by calling [.hide_columns()][hidecols] without any arguments.\n", - "\n", - "Specific rows or columns can be hidden from rendering by calling the same [.hide_index()][hideidx] or [.hide_columns()][hidecols] methods and passing in a row/column label, a list-like or a slice of row/column labels to for the ``subset`` argument.\n", + "Columns can be hidden from rendering by calling [.hide_columns()][hidecols] and passing in the name of a column, or a slice of columns.\n", "\n", "Hiding does not change the integer arrangement of CSS classes, e.g. hiding the first two columns of a DataFrame means the column class indexing will start at `col2`, since `col0` and `col1` are simply ignored.\n", "\n", @@ -1190,9 +1188,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Additional keyword arguments give more control on centering and positioning, and you can pass a list of `[color_negative, color_positive]` to highlight lower and higher values.\n", + "In version 0.20.0 the ability to customize the bar chart further was given. 
You can now have the `df.style.bar` be centered on zero or midpoint value (in addition to the already existing way of having the min value at the left side of the cell), and you can pass a list of `[color_negative, color_positive]`.\n", "\n", - "Here's how you can change the above with the new `align` option, combined with setting `vmin` and `vmax` limits, the `width` of the figure, and underlying css `props` of cells, leaving space to display the text and the bars:" + "Here's how you can change the above with the new `align='mid'` option:" ] }, { @@ -1201,8 +1199,7 @@ "metadata": {}, "outputs": [], "source": [ - "df2.style.bar(align=0, vmin=-2.5, vmax=2.5, color=['#d65f5f', '#5fba7d'], height=50,\n", - " width=60, props=\"width: 120px; border-right: 1px solid black;\").format('{:.3f}', na_rep=\"\")" + "df2.style.bar(subset=['A', 'B'], align='mid', color=['#d65f5f', '#5fba7d'])" ] }, { @@ -1226,31 +1223,28 @@ "\n", "# Test series\n", "test1 = pd.Series([-100,-60,-30,-20], name='All Negative')\n", - "test2 = pd.Series([-10,-5,0,90], name='Both Pos and Neg')\n", - "test3 = pd.Series([10,20,50,100], name='All Positive')\n", - "test4 = pd.Series([100, 103, 101, 102], name='Large Positive')\n", - "\n", + "test2 = pd.Series([10,20,50,100], name='All Positive')\n", + "test3 = pd.Series([-10,-5,0,90], name='Both Pos and Neg')\n", "\n", "head = \"\"\"\n", "\n", " \n", " \n", " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", "\n", "\"\"\"\n", "\n", - "aligns = ['left', 'right', 'zero', 'mid', 'mean', 99]\n", + "aligns = ['left','zero','mid']\n", "for align in aligns:\n", " row = \"\".format(align)\n", - " for series in [test1,test2,test3, test4]:\n", + " for series in [test1,test2,test3]:\n", " s = series.copy()\n", " s.name=''\n", - " row += \"\".format(s.to_frame().style.hide_index().bar(align=align, \n", + " row += \"\".format(s.to_frame().style.bar(align=align, \n", " color=['#d65f5f', '#5fba7d'], \n", " width=100).render()) #testn['width']\n", " row += ''\n", @@ -1409,9 +1403,7 @@ "source": [ "### Sticky Headers\n", "\n", - "If you display a large matrix or DataFrame in a notebook, but you want to always see the column and row headers you can use the [.set_sticky][sticky] method which manipulates the table styles CSS.\n", - "\n", - "[sticky]: ../reference/api/pandas.io.formats.style.Styler.set_sticky.rst" + "If you display a large matrix or DataFrame in a notebook, but you want to always see the column and row headers you can use the following CSS to make them stick. We might make this into an API function later." ] }, { @@ -1420,15 +1412,20 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf = pd.DataFrame(np.random.randn(16, 100))\n", - "bigdf.style.set_sticky(axis=\"index\")" + "bigdf = pd.DataFrame(np.random.randn(15, 100))\n", + "bigdf.style.set_table_styles([\n", + " {'selector': 'thead th', 'props': 'position: sticky; top:0; background-color:salmon;'},\n", + " {'selector': 'tbody th', 'props': 'position: sticky; left:0; background-color:lightgreen;'} \n", + "])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "It is also possible to stick MultiIndexes and even only specific levels." 
+ "### Hiding Headers\n", + "\n", + "We don't yet have any API to hide headers so a quick fix is:" ] }, { @@ -1437,8 +1434,7 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf.index = pd.MultiIndex.from_product([[\"A\",\"B\"],[0,1],[0,1,2,3]])\n", - "bigdf.style.set_sticky(axis=\"index\", pixel_size=18, levels=[1,2])" + "df3.style.set_table_styles([{'selector': 'thead tr', 'props': 'display: none;'}]) # or 'thead th'" ] }, { @@ -1528,17 +1524,6 @@ "![Excel spreadsheet with styled DataFrame](../_static/style-excel.png)\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Export to LaTeX\n", - "\n", - "There is support (*since version 1.3.0*) to export `Styler` to LaTeX. The documentation for the [.to_latex][latex] method gives further detail and numerous examples.\n", - "\n", - "[latex]: ../reference/api/pandas.io.formats.style.Styler.to_latex.rst" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1784,7 +1769,7 @@ " Styler.loader, # the default\n", " ])\n", " )\n", - " template_html_table = env.get_template(\"myhtml.tpl\")" + " template_html = env.get_template(\"myhtml.tpl\")" ] }, { @@ -1837,63 +1822,14 @@ "outputs": [], "source": [ "EasyStyler = Styler.from_custom_template(\"templates\", \"myhtml.tpl\")\n", - "HTML(EasyStyler(df3).render(table_title=\"Another Title\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Template Structure\n", - "\n", - "Here's the template structure for the both the style generation template and the table generation template:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Style template:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "with open(\"templates/html_style_structure.html\") as f:\n", - " style_structure = f.read()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "HTML(style_structure)" + "EasyStyler(df3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Table template:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "with open(\"templates/html_table_structure.html\") as f:\n", - " table_structure = f.read()" + "Here's the template structure:" ] }, { @@ -1902,7 +1838,10 @@ "metadata": {}, "outputs": [], "source": [ - "HTML(table_structure)" + "with open(\"templates/template_structure.html\") as f:\n", + " structure = f.read()\n", + " \n", + "HTML(structure)" ] }, { diff --git a/doc/source/user_guide/templates/html_style_structure.html b/doc/source/user_guide/templates/html_style_structure.html deleted file mode 100644 index dc0c03ac363a9..0000000000000 --- a/doc/source/user_guide/templates/html_style_structure.html +++ /dev/null @@ -1,35 +0,0 @@ - - - -
before_style
-
style -
<style type="text/css">
-
table_styles
-
before_cellstyle
-
cellstyle
-
</style>
-
diff --git a/doc/source/user_guide/templates/myhtml.tpl b/doc/source/user_guide/templates/myhtml.tpl index 1e204d0bd4568..1170fd3def653 100644 --- a/doc/source/user_guide/templates/myhtml.tpl +++ b/doc/source/user_guide/templates/myhtml.tpl @@ -1,4 +1,4 @@ -{% extends "html_table.tpl" %} +{% extends "html.tpl" %} {% block table %}
<h1>{{ table_title|default("My Table") }}</h1>
{{ super() }} diff --git a/doc/source/user_guide/templates/html_table_structure.html b/doc/source/user_guide/templates/template_structure.html similarity index 80% rename from doc/source/user_guide/templates/html_table_structure.html rename to doc/source/user_guide/templates/template_structure.html index e03f9591d2a35..0778d8e2e6f18 100644 --- a/doc/source/user_guide/templates/html_table_structure.html +++ b/doc/source/user_guide/templates/template_structure.html @@ -25,6 +25,15 @@ } +
before_style
+
style +
<style type="text/css">
+
table_styles
+
before_cellstyle
+
cellstyle
+
</style>
+
+
before_table
table diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 180de1df53f9e..0b4ddaaa8a42a 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -88,19 +88,13 @@ or a list/array of strings: pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"]) -The ``unit`` keyword argument specifies the unit of the Timedelta if the input -is numeric: +The ``unit`` keyword argument specifies the unit of the Timedelta: .. ipython:: python pd.to_timedelta(np.arange(5), unit="s") pd.to_timedelta(np.arange(5), unit="d") -.. warning:: - If a string or array of strings is passed as an input then the ``unit`` keyword - argument will be ignored. If a string without units is passed then the default - unit of nanoseconds is assumed. - .. _timedeltas.limitations: Timedelta limitations diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index 3e533cbadc5f7..f7e219ab23e39 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -37,14 +37,14 @@ pandas supports 4 types of windowing operations: #. Expanding window: Accumulating window over the values. #. Exponentially Weighted window: Accumulating and exponentially weighted window over the values. -============================= ================= =========================== =========================== ======================== =================================== =========================== -Concept Method Returned Object Supports time-based windows Supports chained groupby Supports table method Supports online operations -============================= ================= =========================== =========================== ======================== =================================== =========================== -Rolling window ``rolling`` ``Rolling`` Yes Yes Yes (as of version 1.3) No -Weighted window ``rolling`` ``Window`` No No No No -Expanding window ``expanding`` ``Expanding`` No Yes Yes (as of version 1.3) No -Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) No Yes (as of version 1.3) -============================= ================= =========================== =========================== ======================== =================================== =========================== +============================= ================= =========================== =========================== ======================== =================================== +Concept Method Returned Object Supports time-based windows Supports chained groupby Supports table method +============================= ================= =========================== =========================== ======================== =================================== +Rolling window ``rolling`` ``Rolling`` Yes Yes Yes (as of version 1.3) +Weighted window ``rolling`` ``Window`` No No No +Expanding window ``expanding`` ``Expanding`` No Yes Yes (as of version 1.3) +Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) No +============================= ================= =========================== =========================== ======================== =================================== As noted above, some operations support specifying a window based on a time offset: @@ -98,26 +98,6 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) df.rolling(2, method="table", min_periods=0).apply(weighted_mean, 
raw=True, engine="numba") # noqa:E501 -.. versionadded:: 1.3 - -Some windowing operations also support an ``online`` method after constructing a windowing object -which returns a new object that supports passing in new :class:`DataFrame` or :class:`Series` objects -to continue the windowing calculation with the new values (i.e. online calculations). - -The methods on this new windowing objects must call the aggregation method first to "prime" the initial -state of the online calculation. Then, new :class:`DataFrame` or :class:`Series` objects can be passed in -the ``update`` argument to continue the windowing calculation. - -.. ipython:: python - - df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) - df.ewm(0.5).mean() - -.. ipython:: python - - online_ewm = df.head(2).ewm(0.5).online() - online_ewm.mean() - online_ewm.mean(update=df.tail(1)) All windowing operations support a ``min_periods`` argument that dictates the minimum amount of non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``. @@ -262,24 +242,26 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other .. code-block:: ipython In [2]: from pandas.api.indexers import BaseIndexer - - In [3]: class CustomIndexer(BaseIndexer): - ...: def get_window_bounds(self, num_values, min_periods, center, closed): - ...: start = np.empty(num_values, dtype=np.int64) - ...: end = np.empty(num_values, dtype=np.int64) - ...: for i in range(num_values): - ...: if self.use_expanding[i]: - ...: start[i] = 0 - ...: end[i] = i + 1 - ...: else: - ...: start[i] = i - ...: end[i] = i + self.window_size - ...: return start, end - - In [4]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - - In [5]: df.rolling(indexer).sum() - Out[5]: + ...: + ...: class CustomIndexer(BaseIndexer): + ...: + ...: def get_window_bounds(self, num_values, min_periods, center, closed): + ...: start = np.empty(num_values, dtype=np.int64) + ...: end = np.empty(num_values, dtype=np.int64) + ...: for i in range(num_values): + ...: if self.use_expanding[i]: + ...: start[i] = 0 + ...: end[i] = i + 1 + ...: else: + ...: start[i] = i + ...: end[i] = i + self.window_size + ...: return start, end + ...: + + In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + + In [4]: df.rolling(indexer).sum() + Out[4]: values 0 0.0 1 1.0 @@ -363,21 +345,45 @@ Numba engine Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying ``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). -See :ref:`enhancing performance with Numba ` for general usage of the arguments and performance considerations. - Numba will be applied in potentially two routines: #. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. #. The engine will JIT the for loop where the apply function is applied to each window. +.. versionadded:: 1.3.0 + +``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments. + The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the `numba.jit decorator `__. These keyword arguments will be applied to *both* the passed function (if a standard Python function) -and the apply for loop over each window. 
+and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and their default values are set to ``False``, ``True`` and ``False`` respectively. -.. versionadded:: 1.3.0 +.. note:: -``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments. + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, the compiled functions are cached, + and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: data = pd.Series(range(1_000_000)) + + In [2]: roll = data.rolling(10) + + In [3]: def f(x): + ...: return np.sum(x) + 5 + # Run the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225, E999 + 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) .. _window.cov_corr: diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 1f04eb6f68ae8..986cf43b80494 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,21 +10,12 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. -Version 1.4 ------------ - -.. toctree:: - :maxdepth: 2 - - v1.4.0 - Version 1.3 ----------- .. toctree:: :maxdepth: 2 - v1.3.1 v1.3.0 Version 1.2 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 03dfe475475a1..b87274307431b 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -338,20 +338,19 @@ maps labels to their new names along the default axis, is allowed to be passed b *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: df = pd.DataFrame([[1]]) - In [2]: df.rename({0: 1}, {0: 2}) - Out[2]: + >>> df = pd.DataFrame([[1]]) + >>> df.rename({0: 1}, {0: 2}) FutureWarning: ...Use named arguments to resolve ambiguity... 2 1 1 *pandas 1.0.0* -.. code-block:: ipython +.. code-block:: python - In [3]: df.rename({0: 1}, {0: 2}) + >>> df.rename({0: 1}, {0: 2}) Traceback (most recent call last): ... TypeError: rename() takes from 1 to 2 positional arguments but 3 were given @@ -360,28 +359,26 @@ Note that errors will now be raised when conflicting or potentially ambiguous ar *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [4]: df.rename({0: 1}, index={0: 2}) - Out[4]: + >>> df.rename({0: 1}, index={0: 2}) 0 1 1 - In [5]: df.rename(mapper={0: 1}, index={0: 2}) - Out[5]: + >>> df.rename(mapper={0: 1}, index={0: 2}) 0 2 1 *pandas 1.0.0* -.. code-block:: ipython +.. code-block:: python - In [6]: df.rename({0: 1}, index={0: 2}) + >>> df.rename({0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' - In [7]: df.rename(mapper={0: 1}, index={0: 2}) + >>> df.rename(mapper={0: 1}, index={0: 2}) Traceback (most recent call last): ... 
TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' @@ -408,12 +405,12 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: df = pd.DataFrame({"int_col": [1, 2, 3], + >>> df = pd.DataFrame({"int_col": [1, 2, 3], ... "text_col": ["a", "b", "c"], ... "float_col": [0.0, 0.1, 0.2]}) - In [2]: df.info(verbose=True) + >>> df.info(verbose=True) RangeIndex: 3 entries, 0 to 2 Data columns (total 3 columns): @@ -443,16 +440,14 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: pd.array(["a", None]) - Out[1]: + >>> pd.array(["a", None]) ['a', None] Length: 2, dtype: object - In [2]: pd.array([1, None]) - Out[2]: + >>> pd.array([1, None]) [1, None] Length: 2, dtype: object @@ -475,17 +470,15 @@ As a reminder, you can specify the ``dtype`` to disable all inference. *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: a = pd.array([1, 2, None], dtype="Int64") - In [2]: a - Out[2]: + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a [1, 2, NaN] Length: 3, dtype: Int64 - In [3]: a[2] - Out[3]: + >>> a[2] nan *pandas 1.0.0* @@ -506,10 +499,9 @@ will now raise. *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: np.asarray(a, dtype="float") - Out[1]: + >>> np.asarray(a, dtype="float") array([ 1., 2., nan]) *pandas 1.0.0* @@ -533,10 +525,9 @@ will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: pd.Series(a).sum(skipna=False) - Out[1]: + >>> pd.Series(a).sum(skipna=False) nan *pandas 1.0.0* @@ -552,10 +543,9 @@ integer dtype for the values. *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype - Out[1]: + >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype dtype('int64') *pandas 1.0.0* @@ -575,17 +565,15 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]: a = pd.array([1, 2, None], dtype="Int64") - In [2]: a - Out[2]: + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a [1, 2, NaN] Length: 3, dtype: Int64 - In [3]: a > 1 - Out[3]: + >>> a > 1 array([False, True, False]) *pandas 1.0.0* @@ -652,10 +640,9 @@ scalar values in the result are instances of the extension dtype's scalar type. *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1]> df.resample("2D").agg(lambda x: 'a').A.dtype - Out[1]: + >>> df.resample("2D").agg(lambda x: 'a').A.dtype CategoricalDtype(categories=['a', 'b'], ordered=False) *pandas 1.0.0* @@ -670,10 +657,9 @@ depending on how the results are cast back to the original dtype. *pandas 0.25.x* -.. code-block:: ipython +.. code-block:: python - In [1] df.resample("2D").agg(lambda x: 'c') - Out[1]: + >>> df.resample("2D").agg(lambda x: 'c') A 0 NaN @@ -885,10 +871,10 @@ matplotlib directly rather than :meth:`~DataFrame.plot`. To use pandas formatters with a matplotlib plot, specify -.. code-block:: ipython +.. code-block:: python - In [1]: import pandas as pd - In [2]: pd.options.plotting.matplotlib.register_converters = True + >>> import pandas as pd + >>> pd.options.plotting.matplotlib.register_converters = True Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters automatically. 
The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot`` diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 34e28eab6d4bf..bfe30d52e2aff 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -52,23 +52,20 @@ DataFrame / Series combination) would ignore the indices, only match the inputs by shape, and use the index/columns of the first DataFrame for the result: -.. code-block:: ipython +.. code-block:: python - In [1]: df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) - In [2]: df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) - In [3]: df1 - Out[3]: + >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) + ... df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) + >>> df1 a b 0 1 3 1 2 4 - In [4]: df2 - Out[4]: + >>> df2 a b 1 1 3 2 2 4 - In [5]: np.add(df1, df2) - Out[5]: + >>> np.add(df1, df2) a b 0 2 6 1 4 8 @@ -76,10 +73,9 @@ the result: This contrasts with how other pandas operations work, which first align the inputs: -.. code-block:: ipython +.. code-block:: python - In [6]: df1 + df2 - Out[6]: + >>> df1 + df2 a b 0 NaN NaN 1 3.0 7.0 @@ -98,10 +94,9 @@ objects (eg ``np.add(s1, s2)``) already aligns and continues to do so. To avoid the warning and keep the current behaviour of ignoring the indices, convert one of the arguments to a NumPy array: -.. code-block:: ipython +.. code-block:: python - In [7]: np.add(df1, np.asarray(df2)) - Out[7]: + >>> np.add(df1, np.asarray(df2)) a b 0 2 6 1 4 8 @@ -109,11 +104,10 @@ convert one of the arguments to a NumPy array: To obtain the future behaviour and silence the warning, you can align manually before passing the arguments to the ufunc: -.. code-block:: ipython +.. code-block:: python - In [8]: df1, df2 = df1.align(df2) - In [9]: np.add(df1, df2) - Out[9]: + >>> df1, df2 = df1.align(df2) + >>> np.add(df1, df2) a b 0 NaN NaN 1 3.0 7.0 diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index d3ceb2b919b5d..d0af23b48b1f7 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -1,7 +1,7 @@ .. _whatsnew_125: -What's new in 1.2.5 (June 22, 2021) ------------------------------------ +What's new in 1.2.5 (May ??, 2021) +---------------------------------- These are the changes in pandas 1.2.5. See :ref:`release` for a full changelog including other versions of pandas. @@ -14,12 +14,32 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed regression in :func:`concat` between two :class:`DataFrame` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) - Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) -- Fixed regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) -- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) -- Fixed regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) -- Fixed regression in :meth:`DataFrame.astype` with ``dtype=str`` failing to convert ``NaN`` in categorical columns (:issue:`41797`) +- Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) +- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) +- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) + +.. --------------------------------------------------------------------------- + + +.. _whatsnew_125.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_125.other: + +Other +~~~~~ + +- +- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ed66861efad93..e2b923812a211 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_130: -What's new in 1.3.0 (July 2, 2021) ----------------------------------- +What's new in 1.3.0 (??) +------------------------ These are the changes in pandas 1.3.0. See :ref:`release` for a full changelog including other versions of pandas. 
@@ -124,7 +124,7 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404 - The methods :meth:`.Styler.highlight_null`, :meth:`.Styler.highlight_min`, and :meth:`.Styler.highlight_max` now allow custom CSS highlighting instead of the default background coloring (:issue:`40242`) - :meth:`.Styler.apply` now accepts functions that return an ``ndarray`` when ``axis=None``, making it now consistent with the ``axis=0`` and ``axis=1`` behavior (:issue:`39359`) - When incorrectly formatted CSS is given via :meth:`.Styler.apply` or :meth:`.Styler.applymap`, an error is now raised upon rendering (:issue:`39660`) - - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTeX escaping (:issue:`40388`, :issue:`41619`) + - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTex escaping (:issue:`40388`, :issue:`41619`) - :meth:`.Styler.background_gradient` has gained the argument ``gmap`` to supply a specific gradient map for shading (:issue:`22727`) - :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`) - Added the method :meth:`.Styler.highlight_between` (:issue:`39821`) @@ -136,9 +136,8 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404 - Many features of the :class:`.Styler` class are now either partially or fully usable on a DataFrame with a non-unique indexes or columns (:issue:`41143`) - One has greater control of the display through separate sparsification of the index or columns using the :ref:`new styler options `, which are also usable via :func:`option_context` (:issue:`41142`) - Added the option ``styler.render.max_elements`` to avoid browser overload when styling large DataFrames (:issue:`40712`) - - Added the method :meth:`.Styler.to_latex` (:issue:`21673`, :issue:`42320`), which also allows some limited CSS conversion (:issue:`40731`) + - Added the method :meth:`.Styler.to_latex` (:issue:`21673`) - Added the method :meth:`.Styler.to_html` (:issue:`13379`) - - Added the method :meth:`.Styler.set_sticky` to make index and column headers permanently visible in scrolling HTML frames (:issue:`29072`) .. _whatsnew_130.enhancements.dataframe_honors_copy_with_dict: @@ -240,19 +239,17 @@ For example: Other enhancements ^^^^^^^^^^^^^^^^^^ -- :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, and :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`15095`, :issue:`38995`) -- :class:`.ExponentialMovingWindow` now support a ``online`` method that can perform ``mean`` calculations in an online fashion. See :ref:`Window Overview ` (:issue:`41673`) +- :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, and :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire DataFrame. 
See :ref:`Window Overview ` for performance and functional benefits (:issue:`15095`, :issue:`38995`) - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) - Added ``end`` and ``end_day`` options for the ``origin`` argument in :meth:`DataFrame.resample` (:issue:`37804`) - Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) - Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods ` (:issue:`15969`) - :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) -- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when ``usecols`` is not specified (:issue:`21768`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) - :func:`read_excel` can now auto-detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) -- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behavior of append mode when writing to existing sheets (:issue:`40230`) +- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. 
``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -269,16 +266,12 @@ Other enhancements - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) -- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising with ``object`` data containing ``pd.NA`` even when ``skipna=True`` (:issue:`37501`) - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) -- :meth:`Series.between` can now accept ``left`` or ``right`` as arguments to ``inclusive`` to include only the left or right boundary (:issue:`40245`) -- :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`) -- :meth:`DataFrame.sample` now accepts the ``ignore_index`` argument to reset the index after sampling, similar to :meth:`DataFrame.drop_duplicates` and :meth:`DataFrame.sort_values` (:issue:`38581`) .. --------------------------------------------------------------------------- @@ -307,7 +300,7 @@ As an example of this, given: original = pd.Series(cat) unique = original.unique() -*Previous behavior*: +*pandas < 1.3.0*: .. code-block:: ipython @@ -317,7 +310,7 @@ As an example of this, given: In [2]: original.dtype == unique.dtype False -*New behavior*: +*pandas >= 1.3.0* .. ipython:: python @@ -339,7 +332,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first` df2 combined = df1.combine_first(df2) -*Previous behavior*: +*pandas 1.2.x* .. code-block:: ipython @@ -350,7 +343,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first` C float64 dtype: object -*New behavior*: +*pandas 1.3.0* .. ipython:: python @@ -373,7 +366,7 @@ values as measured by ``np.allclose``. Now no such casting occurs. df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]}) df -*Previous behavior*: +*pandas 1.2.x* .. code-block:: ipython @@ -383,7 +376,7 @@ values as measured by ``np.allclose``. Now no such casting occurs. key 1 True 2 -*New behavior*: +*pandas 1.3.0* .. ipython:: python @@ -401,7 +394,7 @@ Now, these methods will always return a float dtype. (:issue:`41137`) df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]}) -*Previous behavior*: +*pandas 1.2.x* .. code-block:: ipython @@ -410,7 +403,7 @@ Now, these methods will always return a float dtype. 
(:issue:`41137`) a b c 0 True 1 1.0 -*New behavior*: +*pandas 1.3.0* .. ipython:: python @@ -434,7 +427,7 @@ insert the values into the existing data rather than create an entirely new arra In both the new and old behavior, the data in ``values`` is overwritten, but in the old behavior the dtype of ``df["A"]`` changed to ``int64``. -*Previous behavior*: +*pandas 1.2.x* .. code-block:: ipython @@ -449,7 +442,7 @@ the old behavior the dtype of ``df["A"]`` changed to ``int64``. In pandas 1.3.0, ``df`` continues to share data with ``values`` -*New behavior*: +*pandas 1.3.0* .. ipython:: python @@ -476,7 +469,7 @@ never casting to the dtypes of the existing arrays. In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing array backing ``df``: -*Previous behavior*: +*pandas 1.2.x* .. code-block:: ipython @@ -486,7 +479,7 @@ array backing ``df``: In the new behavior, we get a new array, and retain an integer-dtyped ``5``: -*New behavior*: +*pandas 1.3.0* .. ipython:: python @@ -509,7 +502,7 @@ casts to ``dtype=object`` (:issue:`38709`) ser2 = orig.copy() ser2.iloc[1] = 2.0 -*Previous behavior*: +*pandas 1.2.x* .. code-block:: ipython @@ -525,7 +518,7 @@ casts to ``dtype=object`` (:issue:`38709`) 1 2.0 dtype: object -*New behavior*: +*pandas 1.3.0* .. ipython:: python @@ -643,7 +636,7 @@ If installed, we now require: +-----------------+-----------------+----------+---------+ | pytest (dev) | 6.0 | | X | +-----------------+-----------------+----------+---------+ -| mypy (dev) | 0.812 | | X | +| mypy (dev) | 0.800 | | X | +-----------------+-----------------+----------+---------+ | setuptools | 38.6.0 | | X | +-----------------+-----------------+----------+---------+ @@ -707,8 +700,6 @@ Other API changes - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as `turbodbc `_ (:issue:`36893`) - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) - :meth:`ExtensionDtype.construct_array_type` is now a required method instead of an optional one for :class:`ExtensionDtype` subclasses (:issue:`24860`) -- Calling ``hash`` on non-hashable pandas objects will now raise ``TypeError`` with the built-in error message (e.g. ``unhashable type: 'Series'``). Previously it would raise a custom message such as ``'Series' objects are mutable, thus they cannot be hashed``. Furthermore, ``isinstance(, abc.collections.Hashable)`` will now return ``False`` (:issue:`40013`) -- :meth:`.Styler.from_custom_template` now has two new arguments for template names, and removed the old ``name``, due to template inheritance having been introducing for better parsing (:issue:`42053`). Subclassing modifications to Styler attributes are also needed. .. 
_whatsnew_130.api_breaking.build: @@ -722,6 +713,64 @@ Build Deprecations ~~~~~~~~~~~~ +- Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`) +- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`) +- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`) +- Deprecated the :meth:`astype` method of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to convert to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) +- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`) +- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`) +- Deprecated comparison of :class:`Timestamp` objects with ``datetime.date`` objects. Instead of e.g. ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`) +- Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`) +- Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`) +- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`) +- Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`) +- Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`) +- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) +- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) +- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) +- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) +- Deprecated using :func:`merge`, :meth:`DataFrame.merge`, and :meth:`DataFrame.join` on a different number of levels (:issue:`34862`) +- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) +- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) +- Deprecated the ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` and will be removed in a 
future version (:issue:`37643`) +- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) +- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`) +- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) +- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) +- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`) +- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) +- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`) +- Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. ``freq.is_month_start(ts)`` (:issue:`15146`) +- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`) +- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) +- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) +- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). 
To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) +- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`) +- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`): + - :func:`concat` (other than ``objs``) + - :func:`read_csv` (other than ``filepath_or_buffer``) + - :func:`read_table` (other than ``filepath_or_buffer``) + - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``) + - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` + - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop` + - :meth:`DataFrame.dropna` and :meth:`Series.dropna` + - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` + - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``) + - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``) + - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``) + - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index` + - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``) + - :meth:`DataFrame.set_index` (other than ``keys``) + - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` + - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values` + - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``) + - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) + - :meth:`MultiIndex.codes` (except for ``codes``) + - :meth:`MultiIndex.set_levels` (except for ``levels``) + - :meth:`Resampler.interpolate` (other than ``method``) + .. _whatsnew_130.deprecations.nuisance_columns: @@ -790,8 +839,6 @@ For example: 1 2 2 12 -*Future behavior*: - .. code-block:: ipython In [5]: gb.prod(numeric_only=False) @@ -804,72 +851,6 @@ For example: 1 2 2 12 -.. _whatsnew_130.deprecations.other: - -Other Deprecations -^^^^^^^^^^^^^^^^^^ -- Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`) -- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`) -- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`) -- Deprecated the :meth:`astype` method of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to convert to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) -- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`) -- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`) -- Deprecated comparison of :class:`Timestamp` objects with ``datetime.date`` objects. Instead of e.g. 
``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`) -- Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`) -- Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`) -- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`) -- Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`) -- Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`) -- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) -- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favor of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) -- Deprecated :meth:`.Styler.where` in favor of using an alternative formulation with :meth:`Styler.applymap` (:issue:`40821`) -- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) -- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) -- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) -- Deprecated using :func:`merge`, :meth:`DataFrame.merge`, and :meth:`DataFrame.join` on a different number of levels (:issue:`34862`) -- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) -- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) -- Deprecated the ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` and will be removed in a future version (:issue:`37643`) -- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) -- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`) -- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) -- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) -- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`) -- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) -- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. 
In a future version, this will raise instead of being silently ignored (:issue:`24435`) -- Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. ``freq.is_month_start(ts)`` (:issue:`15146`) -- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`) -- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) -- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) -- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) -- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) -- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`) -- Deprecated boolean arguments of ``inclusive`` in :meth:`Series.between` to have ``{"left", "right", "neither", "both"}`` as standard argument values (:issue:`40628`) -- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`): - - - :func:`concat` (other than ``objs``) - - :func:`read_csv` (other than ``filepath_or_buffer``) - - :func:`read_table` (other than ``filepath_or_buffer``) - - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``) - - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` - - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop` - - :meth:`DataFrame.dropna` and :meth:`Series.dropna` - - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` - - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``) - - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``) - - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``) - - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index` - - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``) - - :meth:`DataFrame.set_index` (other than ``keys``) - - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` - - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values` - - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``) - - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) - - :meth:`MultiIndex.codes` (except for ``codes``) - - :meth:`MultiIndex.set_levels` (except for ``levels``) - - 
:meth:`Resampler.interpolate` (other than ``method``) - - .. --------------------------------------------------------------------------- @@ -891,7 +872,7 @@ Performance improvements - Performance improvement in :class:`.Styler` where render times are more than 50% reduced and now matches :meth:`DataFrame.to_html` (:issue:`39972` :issue:`39952`, :issue:`40425`) - The method :meth:`.Styler.set_td_classes` is now as performant as :meth:`.Styler.apply` and :meth:`.Styler.applymap`, and even more so in some cases (:issue:`40453`) - Performance improvement in :meth:`.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) -- Performance improvement in :meth:`.GroupBy.apply` when requiring the Python fallback implementation (:issue:`40176`) +- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) - Performance improvement in the conversion of a PyArrow Boolean array to a pandas nullable Boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) @@ -923,7 +904,6 @@ Datetimelike - Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise a ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`) - Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`) - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`) -- Bug in :meth:`DatetimeIndex.intersection` giving incorrect results with non-Tick frequencies with ``n != 1`` (:issue:`42104`) - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`) - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`) - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`) @@ -931,7 +911,6 @@ Datetimelike - Bug in :meth:`Timedelta.round`, :meth:`Timedelta.floor`, :meth:`Timedelta.ceil` for values near the implementation bounds of :class:`Timedelta` (:issue:`38964`) - Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`) - Bug in :func:`infer_freq` incorrectly fails to infer 'H' frequency of :class:`DatetimeIndex` if the latter has a timezone and crosses DST boundaries (:issue:`39556`) -- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) Timedelta ^^^^^^^^^ @@ -961,9 +940,6 @@ Numeric - Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) - Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`) - Bug in 
:meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`) -- Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`) -- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) -- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) Conversion ^^^^^^^^^^ @@ -979,12 +955,6 @@ Conversion - Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`) - Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`) - Bug in :class:`DataFrame` construction when given a two-dimensional object-dtype ``np.ndarray`` of :class:`Period` or :class:`Interval` objects failing to cast to :class:`PeriodDtype` or :class:`IntervalDtype`, respectively (:issue:`41812`) -- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) -- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) -- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) -- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`) -- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. 
It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) -- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`) Strings ^^^^^^^ @@ -1005,7 +975,6 @@ Indexing ^^^^^^^^ - Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) -- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`41934`) - Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) @@ -1037,17 +1006,12 @@ Indexing - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contained duplicates (:issue:`40096`) - Bug in :meth:`DataFrame.loc.__getitem__` with :class:`MultiIndex` casting to float when at least one index column has float dtype and we retrieve a scalar (:issue:`41369`) - Bug in :meth:`DataFrame.loc` incorrectly matching non-Boolean index elements (:issue:`20432`) -- Bug in indexing with ``np.nan`` on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` incorrectly raising ``KeyError`` when ``np.nan`` keys are present (:issue:`41933`) - Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) -- Bug in :meth:`DataFrame.at` with a :class:`CategoricalIndex` returning incorrect results when passed integer keys (:issue:`41846`) - Bug in :meth:`DataFrame.loc` returning a :class:`MultiIndex` in the wrong order if an indexer has duplicates (:issue:`40978`) - Bug in :meth:`DataFrame.__setitem__` raising a ``TypeError`` when using a ``str`` subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) - Bug in :meth:`PeriodIndex.get_loc` failing to raise a ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) - Bug ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`) - Bug in :meth:`Index.get_indexer` failing to raise ``ValueError`` in some cases with invalid ``method``, ``limit``, or ``tolerance`` arguments (:issue:`41918`) -- Bug when slicing a :class:`Series` or :class:`DataFrame` with a :class:`TimedeltaIndex` when passing an invalid string raising ``ValueError`` instead of a ``TypeError`` (:issue:`41821`) -- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) -- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. 
``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) Missing ^^^^^^^ @@ -1056,7 +1020,6 @@ Missing - Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`) - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`) - Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`) -- Bug in :meth:`Series.isin` and :meth:`MultiIndex.isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`) MultiIndex ^^^^^^^^^^ @@ -1064,7 +1027,6 @@ MultiIndex - Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in the result (:issue:`38623`) - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` contained ``NaN`` even when they are differently ordered (:issue:`38439`) - Bug in :meth:`MultiIndex.intersection` always returning an empty result when intersecting with :class:`CategoricalIndex` (:issue:`38653`) -- Bug in :meth:`MultiIndex.difference` incorrectly raising ``TypeError`` when indexes contain non-sortable entries (:issue:`41915`) - Bug in :meth:`MultiIndex.reindex` raising a ``ValueError`` when used on an empty :class:`MultiIndex` and indexing only a specific level (:issue:`41170`) - Bug in :meth:`MultiIndex.reindex` raising ``TypeError`` when reindexing against a flat :class:`Index` (:issue:`41707`) @@ -1104,7 +1066,6 @@ I/O - Bug in the conversion from PyArrow to pandas (e.g. for reading Parquet) with nullable dtypes and a PyArrow array whose data buffer size is not a multiple of the dtype size (:issue:`40896`) - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type even though the user specified the ``engine`` argument (:issue:`41225`) - Bug in :func:`read_clipboard` copying from an excel file shifts values into the wrong column if there are null values in first column (:issue:`41108`) -- Bug in :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` raising a ``TypeError`` when trying to append a string column to an incompatible column (:issue:`41897`) Period ^^^^^^ @@ -1164,8 +1125,6 @@ Groupby/resample/rolling - Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`) - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) -- Bug in :class:`DataFrameGroupBy` methods ``agg``, ``transform``, ``sum``, ``bfill``, ``ffill``, ``pad``, ``pct_change``, ``shift``, ``ohlc`` dropping ``.columns.names`` (:issue:`41497`) - Reshaping ^^^^^^^^^ @@ -1188,8 +1147,6 @@ Reshaping - Bug in :func:`to_datetime` raising an error when the input sequence contained unhashable items (:issue:`39756`) - Bug in :meth:`Series.explode` preserving the index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`) - Bug in :func:`to_datetime` raising a ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`) -- Bug in :meth:`Series.unstack` and :meth:`DataFrame.unstack` with object-dtype values containing 
timezone-aware datetime objects incorrectly raising ``TypeError`` (:issue:`41875`) -- Bug in :meth:`DataFrame.melt` raising ``InvalidIndexError`` when :class:`DataFrame` has duplicate columns used as ``value_vars`` (:issue:`41951`) Sparse ^^^^^^ @@ -1217,14 +1174,24 @@ Styler Other ^^^^^ +- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) +- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`) +- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) +- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) - ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`) - Bug in :meth:`Series.where` with numeric dtype and ``other=None`` not casting to ``nan`` (:issue:`39761`) +- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) - Bug in :func:`.assert_series_equal`, :func:`.assert_frame_equal`, :func:`.assert_index_equal` and :func:`.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) - Bug in :func:`.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, and :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`show_versions` where console JSON output was not proper JSON (:issue:`39701`) - pandas can now compile on z/OS when using `xlc `_ (:issue:`35826`) -- Bug in :func:`pandas.util.hash_pandas_object` not recognizing ``hash_key``, ``encoding`` and ``categorize`` when the input object type is a :class:`DataFrame` (:issue:`41404`) +- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`) +- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) +- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) +- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) +- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) +- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) .. --------------------------------------------------------------------------- @@ -1233,4 +1200,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.2.5..v1.3.0 +.. contributors:: v1.2.4..v1.3.0|HEAD diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst deleted file mode 100644 index 2ce146660f98c..0000000000000 --- a/doc/source/whatsnew/v1.3.1.rst +++ /dev/null @@ -1,54 +0,0 @@ -.. _whatsnew_131: - -What's new in 1.3.1 (July ??, 2021) ------------------------------------ - -These are the changes in pandas 1.3.1. See :ref:`release` for a full changelog -including other versions of pandas. 
- -{{ header }} - -.. --------------------------------------------------------------------------- - -.. _whatsnew_131.regressions: - -Fixed regressions -~~~~~~~~~~~~~~~~~ -- Pandas could not be built on PyPy (:issue:`42355`) -- :class:`DataFrame` constructed with an older version of pandas could not be unpickled (:issue:`42345`) -- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42248`) -- Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`) -- Fixed regression in :meth:`DataFrame.astype` changing the order of noncontiguous data (:issue:`42396`) -- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`) -- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when ``orient`` argument one of "records", "dict", or "split" (:issue:`42352`) -- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`) -- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`) -- Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`) - -.. --------------------------------------------------------------------------- - -.. _whatsnew_131.bug_fixes: - -Bug fixes -~~~~~~~~~ -- Fixed bug in :meth:`DataFrame.transpose` dropping values when the DataFrame had an Extension Array dtype and a duplicate index (:issue:`42380`) -- Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`) -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_131.other: - -Other -~~~~~ -- -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_131.contributors: - -Contributors -~~~~~~~~~~~~ - -.. contributors:: v1.3.0..v1.3.1|HEAD diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst deleted file mode 100644 index 68f1c78688b1d..0000000000000 --- a/doc/source/whatsnew/v1.4.0.rst +++ /dev/null @@ -1,295 +0,0 @@ -.. _whatsnew_140: - -What's new in 1.4.0 (??) ------------------------- - -These are the changes in pandas 1.4.0. See :ref:`release` for a full changelog -including other versions of pandas. - -{{ header }} - -.. --------------------------------------------------------------------------- - -.. _whatsnew_140.enhancements: - -Enhancements -~~~~~~~~~~~~ - -.. _whatsnew_140.enhancements.enhancement1: - -enhancement1 -^^^^^^^^^^^^ - -.. _whatsnew_140.enhancements.enhancement2: - -enhancement2 -^^^^^^^^^^^^ - -.. _whatsnew_140.enhancements.other: - -Other enhancements -^^^^^^^^^^^^^^^^^^ -- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`) -- :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. 
A generator will be more performant, especially with ``replace=False`` (:issue:`38100`) -- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`) -- :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`) -- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`42273`) -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_140.notable_bug_fixes: - -Notable bug fixes -~~~~~~~~~~~~~~~~~ - -These are bug fixes that might have notable behavior changes. - -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix1: - -notable_bug_fix1 -^^^^^^^^^^^^^^^^ - -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2: - -notable_bug_fix2 -^^^^^^^^^^^^^^^^ - -.. --------------------------------------------------------------------------- - -.. _whatsnew_140.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew_140.api_breaking.deps: - -Increased minimum versions for dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some minimum supported versions of dependencies were updated. -If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| numpy | 1.18.5 | X | X | -+-----------------+-----------------+----------+---------+ -| pytz | 2020.1 | X | X | -+-----------------+-----------------+----------+---------+ -| python-dateutil | 2.8.1 | X | X | -+-----------------+-----------------+----------+---------+ -| bottleneck | 1.3.1 | | X | -+-----------------+-----------------+----------+---------+ -| numexpr | 2.7.1 | | X | -+-----------------+-----------------+----------+---------+ -| pytest (dev) | 6.0 | | | -+-----------------+-----------------+----------+---------+ -| mypy (dev) | 0.910 | | X | -+-----------------+-----------------+----------+---------+ - -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. 
- -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| beautifulsoup4 | 4.8.2 | X | -+-----------------+-----------------+---------+ -| fastparquet | 0.4.0 | | -+-----------------+-----------------+---------+ -| fsspec | 0.7.4 | | -+-----------------+-----------------+---------+ -| gcsfs | 0.6.0 | | -+-----------------+-----------------+---------+ -| lxml | 4.5.0 | X | -+-----------------+-----------------+---------+ -| matplotlib | 3.3.2 | X | -+-----------------+-----------------+---------+ -| numba | 0.50.1 | X | -+-----------------+-----------------+---------+ -| openpyxl | 3.0.2 | X | -+-----------------+-----------------+---------+ -| pyarrow | 0.17.0 | | -+-----------------+-----------------+---------+ -| pymysql | 0.10.1 | X | -+-----------------+-----------------+---------+ -| pytables | 3.6.1 | X | -+-----------------+-----------------+---------+ -| s3fs | 0.4.0 | | -+-----------------+-----------------+---------+ -| scipy | 1.4.1 | X | -+-----------------+-----------------+---------+ -| sqlalchemy | 1.3.11 | X | -+-----------------+-----------------+---------+ -| tabulate | 0.8.7 | | -+-----------------+-----------------+---------+ -| xarray | 0.15.1 | X | -+-----------------+-----------------+---------+ -| xlrd | 2.0.1 | X | -+-----------------+-----------------+---------+ -| xlsxwriter | 1.2.2 | X | -+-----------------+-----------------+---------+ -| xlwt | 1.3.0 | | -+-----------------+-----------------+---------+ -| pandas-gbq | 0.14.0 | X | -+-----------------+-----------------+---------+ - -See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. - -.. _whatsnew_140.api_breaking.other: - -Other API changes -^^^^^^^^^^^^^^^^^ -- :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_140.deprecations: - -Deprecations -~~~~~~~~~~~~ -- Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`) -- Deprecated ``method`` argument in :meth:`Index.get_loc`, use ``index.get_indexer([label], method=...)`` instead (:issue:`42269`) -- Deprecated treating integer keys in :meth:`Series.__setitem__` as positional when the index is a :class:`Float64Index` not containing the key, a :class:`IntervalIndex` with no entries containing the key, or a :class:`MultiIndex` with leading :class:`Float64Index` level not containing the key (:issue:`33469`) -- Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`) -- Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`) - -.. --------------------------------------------------------------------------- - -.. _whatsnew_140.performance: - -Performance improvements -~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) -- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - -.. --------------------------------------------------------------------------- - -.. 
_whatsnew_140.bug_fixes: - -Bug fixes -~~~~~~~~~ - -Categorical -^^^^^^^^^^^ -- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) -- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`) -- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) -- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) -- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`) -- - -Datetimelike -^^^^^^^^^^^^ -- Bug in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`) -- Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`) -- - -Timedelta -^^^^^^^^^ -- -- - -Timezones -^^^^^^^^^ -- -- - -Numeric -^^^^^^^ -- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) -- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`) -- - -Conversion -^^^^^^^^^^ -- Bug in :class:`UInt64Index` constructor when passing a list containing both positive integers small enough to cast to int64 and integers too large to hold in int64 (:issue:`42201`) -- - -Strings -^^^^^^^ -- -- - -Interval -^^^^^^^^ -- -- - -Indexing -^^^^^^^^ -- Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` when the object's Index has a length greater than one but only one unique value (:issue:`42365`) -- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`) -- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`) -- - -Missing -^^^^^^^ -- -- - -MultiIndex -^^^^^^^^^^ -- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`) -- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`) -- - -I/O -^^^ -- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) -- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) -- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) -- - -Period -^^^^^^ -- -- - -Plotting -^^^^^^^^ -- -- - -Groupby/resample/rolling -^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the 
user passed function (:issue:`42287`) -- - -Reshaping -^^^^^^^^^ -- -- - -Sparse -^^^^^^ -- -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- - -Other -^^^^^ - -.. ***DO NOT USE THIS SECTION*** - -- -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_140.contributors: - -Contributors -~~~~~~~~~~~~ diff --git a/environment.yml b/environment.yml index 9396210da3635..788b88ef16ad6 100644 --- a/environment.yml +++ b/environment.yml @@ -3,9 +3,9 @@ channels: - conda-forge dependencies: # required - - numpy>=1.18.5 + - numpy>=1.17.3 - python=3.8 - - python-dateutil>=2.8.1 + - python-dateutil>=2.7.3 - pytz # benchmarks @@ -24,7 +24,7 @@ dependencies: - flake8-bugbear=21.3.2 # used by flake8, find likely bugs - flake8-comprehensions=3.1.0 # used by flake8, linting of unnecessary comprehensions - isort>=5.2.1 # check that imports are in the right order - - mypy=0.910 + - mypy=0.812 - pre-commit>=2.9.2 - pycodestyle # used by flake8 - pyupgrade @@ -55,12 +55,12 @@ dependencies: # testing - boto3 - botocore>=1.11 - - hypothesis>=5.5.3 + - hypothesis>=3.82 - moto # mock S3 - flask - - pytest>=6.0 + - pytest>=5.0.1 - pytest-cov - - pytest-xdist>=1.31 + - pytest-xdist>=1.21 - pytest-asyncio - pytest-instafail @@ -71,24 +71,24 @@ dependencies: # unused (required indirectly may be?) - ipywidgets - nbformat - - notebook>=6.0.3 + - notebook>=5.7.5 - pip # optional - blosc - - bottleneck>=1.3.1 + - bottleneck>=1.2.1 - ipykernel - ipython>=7.11.1 - jinja2 # pandas.Styler - - matplotlib>=3.3.2 # pandas.plotting, Series.plot, DataFrame.plot - - numexpr>=2.7.1 - - scipy>=1.4.1 - - numba>=0.50.1 + - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot + - numexpr>=2.7.0 + - scipy>=1.2 + - numba>=0.46.0 # optional for io # --------------- # pd.read_html - - beautifulsoup4>=4.8.2 + - beautifulsoup4>=4.6.0 - html5lib - lxml @@ -99,13 +99,14 @@ dependencies: - xlwt - odfpy - - fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow - - pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf + - pyqt>=5.9.2 # pandas.read_clipboard + - pytables>=3.5.1 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - - fsspec>=0.7.4, <2021.6.0 # for generic remote file operations + - fsspec>=0.7.4 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' 
path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray @@ -117,7 +118,3 @@ dependencies: - git+https://github.com/pydata/pydata-sphinx-theme.git@master - numpydoc < 1.2 # 2021-02-09 1.2dev breaking CI - pandas-dev-flaker==0.2.0 - - types-python-dateutil - - types-PyMySQL - - types-pytz - - types-setuptools diff --git a/pandas/__init__.py b/pandas/__init__.py index 43f05617584cc..db4043686bcbb 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -19,7 +19,10 @@ del hard_dependencies, dependency, missing_dependencies # numpy compat -from pandas.compat import is_numpy_dev as _is_numpy_dev +from pandas.compat import ( + np_version_under1p18 as _np_version_under1p18, + is_numpy_dev as _is_numpy_dev, +) try: from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib diff --git a/pandas/_config/config.py b/pandas/_config/config.py index ed48ff7ae08c6..be3498dc0829b 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -157,7 +157,7 @@ def _describe_option(pat: str = "", _print_desc: bool = True): if len(keys) == 0: raise OptionError("No such keys(s)") - s = "\n".join([_build_option_description(k) for k in keys]) + s = "\n".join(_build_option_description(k) for k in keys) if _print_desc: print(s) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 9da5534c51321..d0f664c323a89 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -123,7 +123,7 @@ def is_monotonic( def rank_1d( values: np.ndarray, # ndarray[rank_t, ndim=1] - labels: np.ndarray | None = ..., # const int64_t[:]=None + labels: np.ndarray, # const int64_t[:] is_datetimelike: bool = ..., ties_method=..., ascending: bool = ..., diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 172f2bfb49160..03f4ce273de6e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -389,8 +389,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr int64_t nobs = 0 bint no_nans float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor + const int64_t[:] labels_n, labels_nobs N, K = (mat).shape + # For compatibility when calling rank_1d + labels_n = np.zeros(N, dtype=np.int64) # Handle the edge case where we know all results will be nan # to keep conditional logic inside loop simpler @@ -409,7 +412,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr maskedx = np.empty(N, dtype=np.float64) maskedy = np.empty(N, dtype=np.float64) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i]) + ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) with nogil: for xi in range(K): @@ -448,8 +451,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr with gil: # We need to slice back to nobs because rank_1d will # require arrays of nobs length - rankedx = rank_1d(np.asarray(maskedx)[:nobs]) - rankedy = rank_1d(np.asarray(maskedy)[:nobs]) + labels_nobs = np.zeros(nobs, dtype=np.int64) + rankedx = rank_1d(np.array(maskedx)[:nobs], + labels=labels_nobs) + rankedy = rank_1d(np.array(maskedy)[:nobs], + labels=labels_nobs) for i in range(nobs): maskedx[i] = rankedx[i] maskedy[i] = rankedy[i] @@ -512,6 +518,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra int64_t total_discordant = 0 float64_t kendall_tau int64_t n_obs + const intp_t[:] labels_n N, K = (mat).shape @@ -519,9 +526,11 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra mask = np.isfinite(mat) ranked_mat = 
np.empty((N, K), dtype=np.float64) + # For compatibility when calling rank_1d + labels_n = np.zeros(N, dtype=np.intp) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i]) + ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) for xi in range(K): sorted_idxs = ranked_mat[:, xi].argsort() @@ -922,37 +931,11 @@ ctypedef fused rank_t: int64_t -cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): - """ - Return the value we'll use to represent missing values when sorting depending - on if we'd like missing values to end up at the top/bottom. (The second parameter - is unused, but needed for fused type specialization) - """ - if rank_nans_highest: - if rank_t is object: - return Infinity() - elif rank_t is int64_t: - return util.INT64_MAX - elif rank_t is uint64_t: - return util.UINT64_MAX - else: - return np.inf - else: - if rank_t is object: - return NegInfinity() - elif rank_t is int64_t: - return NPY_NAT - elif rank_t is uint64_t: - return 0 - else: - return -np.inf - - @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( ndarray[rank_t, ndim=1] values, - const intp_t[:] labels=None, + const intp_t[:] labels, bint is_datetimelike=False, ties_method="average", bint ascending=True, @@ -965,10 +948,10 @@ def rank_1d( Parameters ---------- values : array of rank_t values to be ranked - labels : np.ndarray[np.intp] or None + labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. If not called - from a groupby operation, will be None. + from a groupby operation, will be an array of 0's is_datetimelike : bool, default False True if `values` contains datetime-like entries. ties_method : {'average', 'min', 'max', 'first', 'dense'}, default @@ -997,7 +980,7 @@ def rank_1d( ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview uint8_t[:] mask - bint keep_na, nans_rank_highest, check_labels, check_mask + bint keep_na, check_labels, check_mask rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1008,15 +991,14 @@ def rank_1d( keep_na = na_option == 'keep' N = len(values) - if labels is not None: - # TODO Cython 3.0: cast won't be necessary (#2992) - assert len(labels) == N + # TODO Cython 3.0: cast won't be necessary (#2992) + assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N, dtype=np.int64) - # If we don't care about labels, can short-circuit later label + # If all 0 labels, can short-circuit later label # comparisons - check_labels = labels is not None + check_labels = np.any(labels) # For cases where a mask is not possible, we can avoid mask checks check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) @@ -1044,15 +1026,27 @@ def rank_1d( # If descending, fill with highest value since descending # will flip the ordering to still end up with lowest rank. 
# Symmetric logic applies to `na_option == 'bottom'` - nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) - if nans_rank_highest: - order = [masked_vals, mask] + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_fill_val = Infinity() + elif rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max + else: + nan_fill_val = np.inf + order = (masked_vals, mask, labels) else: - order = [masked_vals, ~(np.asarray(mask))] + if rank_t is object: + nan_fill_val = NegInfinity() + elif rank_t is int64_t: + nan_fill_val = NPY_NAT + elif rank_t is uint64_t: + nan_fill_val = 0 + else: + nan_fill_val = -np.inf - if check_labels: - order.append(labels) + order = (masked_vals, ~(np.array(mask, copy=False)), labels) np.putmask(masked_vals, mask, nan_fill_val) # putmask doesn't accept a memoryview, so we assign as a separate step @@ -1071,18 +1065,22 @@ def rank_1d( rank_sorted_1d( out, grp_sizes, + labels, lexsort_indexer, masked_vals_memview, mask, - check_mask=check_mask, - N=N, - tiebreak=tiebreak, - keep_na=keep_na, - pct=pct, - labels=labels, + tiebreak, + check_mask, + check_labels, + keep_na, + N, ) + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] - return np.asarray(out) + return np.array(out) @cython.wraparound(False) @@ -1090,18 +1088,16 @@ def rank_1d( cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, + const intp_t[:] labels, const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, const uint8_t[:] mask, + TiebreakEnumType tiebreak, bint check_mask, + bint check_labels, + bint keep_na, Py_ssize_t N, - TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, - bint keep_na=True, - bint pct=False, - # https://github.com/cython/cython/issues/1630, only trailing arguments can - # currently be omitted for cdef functions, which is why we keep this at the end - const intp_t[:] labels=None, ) nogil: """ See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should @@ -1112,36 +1108,33 @@ cdef void rank_sorted_1d( out : float64_t[::1] Array to store computed ranks grp_sizes : int64_t[::1] - Array to store group counts, only used if pct=True. Should only be None - if labels is None. + Array to store group counts. + labels : See rank_1d.__doc__ sort_indexer : intp_t[:] Array of indices which sorts masked_vals masked_vals : rank_t[:] The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] - Array where entries are True if the value is missing, False otherwise. - check_mask : bool + Array where entries are True if the value is missing, False otherwise + tiebreak : TiebreakEnumType + See rank_1d.__doc__ for the different modes + check_mask : bint If False, assumes the mask is all False to skip mask indexing + check_labels : bint + If False, assumes all labels are the same to skip group handling logic + keep_na : bint + Whether or not to keep nulls N : Py_ssize_t The number of elements to rank. 
Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) - tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE - See rank_1d.__doc__ for the different modes - keep_na : bool, default True - Whether or not to keep nulls - pct : bool, default False - Compute percentage rank of data within each group - labels : See rank_1d.__doc__, default None. None implies all labels are the same. """ cdef: Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 - bint at_end, next_val_diff, group_changed, check_labels + bint at_end, next_val_diff, group_changed int64_t grp_size - check_labels = labels is not None - # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer # array that we sorted previously, which gives us the location of @@ -1349,11 +1342,6 @@ cdef void rank_sorted_1d( grp_start = i + 1 grp_vals_seen = 1 - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] - def rank_2d( ndarray[rank_t, ndim=2] in_arr, @@ -1368,28 +1356,26 @@ def rank_2d( Fast NaN-friendly version of ``scipy.stats.rankdata``. """ cdef: - Py_ssize_t k, n, col - float64_t[::1, :] out # Column-major so columns are contiguous - int64_t[::1] grp_sizes + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + Py_ssize_t infs + ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - rank_t[:, :] masked_vals - intp_t[:, :] sort_indexer - uint8_t[:, :] mask - TiebreakEnumType tiebreak - bint check_mask, keep_na, nans_rank_highest - rank_t nan_fill_val + ndarray[intp_t, ndim=2] argsort_indexer + ndarray[uint8_t, ndim=2] mask + rank_t val, nan_value + float64_t count, sum_ranks = 0.0 + int tiebreak = 0 + int64_t idx + bint check_mask, condition, keep_na tiebreak = tiebreakers[ties_method] - if tiebreak == TIEBREAK_FIRST: - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING keep_na = na_option == 'keep' # For cases where a mask is not possible, we can avoid mask checks check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) - if axis == 1: + if axis == 0: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() @@ -1398,62 +1384,120 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') - nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + + # int64 and datetimelike + else: + nan_value = np.iinfo(np.int64).max + + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + + # int64 and datetimelike + else: + nan_value = NPY_NAT if rank_t is object: - mask = missing.isnaobj2d(values).view(np.uint8) + mask = missing.isnaobj2d(values) elif rank_t is float64_t: - mask = np.isnan(values).view(np.uint8) + mask = np.isnan(values) # int64 and datetimelike else: - mask = (values == NPY_NAT).view(np.uint8) - np.putmask(values, mask, nan_fill_val) - else: - mask = np.zeros_like(values, dtype=np.uint8) + mask = values == NPY_NAT - if nans_rank_highest: - order = (values, mask) + np.putmask(values, mask, nan_value) else: - order = (values, ~np.asarray(mask)) + mask = np.zeros_like(values, dtype=bool) n, k = (values).shape - out = np.empty((n, k), dtype='f8', order='F') - grp_sizes = np.ones(n, dtype=np.int64) + ranks = 
np.empty((n, k), dtype='f8') - # lexsort is slower, so only use if we need to worry about the mask - if check_mask: - sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + argsort_indexer = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING else: - kind = "stable" if ties_method == "first" else None - sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False) + argsort_indexer = values.argsort(1) if not ascending: - sort_indexer = sort_indexer[::-1, :] + argsort_indexer = argsort_indexer[:, ::-1] - # putmask doesn't accept a memoryview, so we assign in a separate step - masked_vals = values - with nogil: - for col in range(k): - rank_sorted_1d( - out[:, col], - grp_sizes, - sort_indexer[:, col], - masked_vals[:, col], - mask[:, col], - check_mask=check_mask, - N=n, - tiebreak=tiebreak, - keep_na=keep_na, - pct=pct, - ) - - if axis == 1: - return np.asarray(out.T) + values = _take_2d(values, argsort_indexer) + + for i in range(n): + dups = sum_ranks = infs = 0 + + total_tie_count = 0 + count = 0.0 + for j in range(k): + val = values[i, j] + idx = argsort_indexer[i, j] + if keep_na and check_mask and mask[i, idx]: + ranks[i, idx] = NaN + infs += 1 + continue + + count += 1.0 + + sum_ranks += (j - infs) + 1 + dups += 1 + + if rank_t is object: + condition = ( + j == k - 1 or + are_diff(values[i, j + 1], val) or + (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) + ) + else: + condition = ( + j == k - 1 or + values[i, j + 1] != val or + (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) + ) + + if condition: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsort_indexer[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsort_indexer[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsort_indexer[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsort_indexer[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsort_indexer[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count + if axis == 0: + return ranks.T else: - return np.asarray(out) + return ranks ctypedef fused diff_t: diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index ec041c03b05e1..11679fc432edc 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -9,6 +9,31 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # ---------------------------------------------------------------------- +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_intp_intp( + const intp_t[:] values, + const intp_t[:] indexer, + intp_t[::1] out, + intp_t fill_value=-1, +): + cdef: + Py_ssize_t i, n, idx + intp_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + 
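The new ``take_1d_intp_intp`` helper above follows the usual take-with-fill contract: indexer entries equal to ``-1`` are treated as missing and receive ``fill_value`` rather than indexing into ``values`` (the join routines further down call it as ``take_1d_intp_intp(sorter, indexer, res, -1)``). A minimal pure-NumPy sketch of the same semantics, with an illustrative name that is not part of the patch:

.. code-block:: python

    import numpy as np

    def take_1d_sketch(values: np.ndarray, indexer: np.ndarray, fill_value=-1) -> np.ndarray:
        # Gather values[indexer], but map indexer == -1 to fill_value instead
        # of letting -1 wrap around to the last element.
        out = np.empty(len(indexer), dtype=values.dtype)
        missing = indexer == -1
        out[~missing] = values[indexer[~missing]]
        out[missing] = fill_value
        return out

    # take_1d_sketch(np.array([10, 20, 30]), np.array([2, -1, 0])) -> array([30, -1, 10])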
out[i] = values[idx] + + {{py: # c_type_in, c_type_out @@ -109,33 +134,32 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, cdef: Py_ssize_t i, j, k, n, idx {{c_type_out}} fv - {{if c_type_in == c_type_out != "object"}} - const {{c_type_out}} *v - {{c_type_out}} *o - {{endif}} n = len(indexer) k = values.shape[1] fv = fill_value - {{if c_type_in == c_type_out != "object"}} - # GH#3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof({{c_type_out}}) and - sizeof({{c_type_out}}) * n >= 256): - - for i in range(n): - idx = indexer[i] - if idx == -1: - for j in range(k): - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof({{c_type_out}}) * k)) - return - {{endif}} + IF {{True if c_type_in == c_type_out != "object" else False}}: + cdef: + const {{c_type_out}} *v + {{c_type_out}} *o + + # GH#3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof({{c_type_out}}) and + sizeof({{c_type_out}}) * n >= 256): + + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof({{c_type_out}}) * k)) + return for i in range(n): idx = indexer[i] @@ -220,3 +244,33 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} {{endfor}} + +# ---------------------------------------------------------------------- +# take_2d internal function +# ---------------------------------------------------------------------- + +ctypedef fused take_t: + float64_t + uint64_t + int64_t + object + + +cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[intp_t, ndim=2, cast=True] indexer = idx + ndarray[take_t, ndim=2] result + + N, K = (values).shape + + if take_t is object: + # evaluated at compile-time + result = values.copy() + else: + result = np.empty_like(values) + + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 951703e04d5a3..5a1b98b190dbc 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -228,5 +228,3 @@ def ismember( arr: np.ndarray, values: np.ndarray, ) -> np.ndarray: ... # np.ndarray[bool] -def object_hash(obj) -> int: ... -def objects_are_equal(a, b) -> bool: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 132435701bddb..7df3f69337643 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -34,8 +34,6 @@ from pandas._libs.khash cimport ( are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, kh_needed_n_buckets, - kh_python_hash_equal, - kh_python_hash_func, kh_str_t, khcomplex64_t, khcomplex128_t, @@ -48,14 +46,6 @@ def get_hashtable_trace_domain(): return KHASH_TRACE_DOMAIN -def object_hash(obj): - return kh_python_hash_func(obj) - - -def objects_are_equal(a, b): - return kh_python_hash_equal(a, b) - - cdef int64_t NPY_NAT = util.get_nat() SIZE_HINT_LIMIT = (1 << 20) + 7 diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 3feefe7ac8ff4..d6fac14d3ee6e 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -1,7 +1,6 @@ from typing import ( Iterator, Sequence, - final, overload, ) @@ -51,12 +50,10 @@ class SharedBlock: class NumpyBlock(SharedBlock): values: np.ndarray - @final def getitem_block_index(self: T, slicer: slice) -> T: ... 
class NDArrayBackedBlock(SharedBlock): values: NDArrayBackedExtensionArray - @final def getitem_block_index(self: T, slicer: slice) -> T: ... class Block(SharedBlock): ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index ba59c50142550..6c1ca3deba047 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -517,6 +517,7 @@ cdef class NumpyBlock(SharedBlock): # set placement and ndim self.values = values + # @final # not useful in cython, but we _would_ annotate with @final cpdef NumpyBlock getitem_block_index(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. @@ -539,6 +540,7 @@ cdef class NDArrayBackedBlock(SharedBlock): # set placement and ndim self.values = values + # @final # not useful in cython, but we _would_ annotate with @final cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. @@ -567,12 +569,7 @@ cdef class BlockManager: public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs - def __cinit__(self, blocks=None, axes=None, verify_integrity=True): - # None as defaults for unpickling GH#42345 - if blocks is None: - # This adds 1-2 microseconds to DataFrame(np.array([])) - return - + def __cinit__(self, blocks, axes, verify_integrity=True): if isinstance(blocks, list): # Backward compat for e.g. pyarrow blocks = tuple(blocks) @@ -583,8 +580,12 @@ cdef class BlockManager: # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False self._is_consolidated = False - self._blknos = None - self._blklocs = None + # error: Incompatible types in assignment (expression has type "None", + # variable has type "ndarray") + self._blknos = None # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type "None", + # variable has type "ndarray") + self._blklocs = None # type: ignore[assignment] # ------------------------------------------------------------------- # Pickle diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index eefa16d23f576..b69b89c0de019 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -20,22 +20,27 @@ from numpy cimport ( cnp.import_array() -from pandas._libs.algos import groupsort_indexer +from pandas._libs.algos import ( + groupsort_indexer, + take_1d_int64_int64, + take_1d_intp_intp, +) -@cython.wraparound(False) @cython.boundscheck(False) def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - intp_t[::1] left_sorter, right_sorter - intp_t[::1] left_count, right_count - intp_t[::1] left_indexer, right_indexer + ndarray[intp_t] left_sorter, right_sorter + ndarray[intp_t] left_count, right_count + ndarray[intp_t] left_indexer, right_indexer intp_t lc, rc - Py_ssize_t left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset + # NA group in location 0 + left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) @@ -48,13 +53,14 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, if rc > 0 and lc > 0: count += lc * rc + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) with nogil: - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] for i in range(1, max_groups 
+ 1): lc = left_count[i] rc = right_count[i] @@ -69,27 +75,24 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, left_pos += lc right_pos += rc - # Will overwrite left/right indexer with the result - _get_result_indexer(left_sorter, left_indexer) - _get_result_indexer(right_sorter, right_indexer) - - return np.asarray(left_indexer), np.asarray(right_indexer) + return (_get_result_indexer(left_sorter, left_indexer), + _get_result_indexer(right_sorter, right_indexer)) -@cython.wraparound(False) @cython.boundscheck(False) def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[intp_t] rev - intp_t[::1] left_count, right_count - intp_t[::1] left_sorter, right_sorter - intp_t[::1] left_indexer, right_indexer + ndarray[intp_t] left_count, right_count + ndarray[intp_t] rev, left_sorter, right_sorter + ndarray[intp_t] left_indexer, right_indexer intp_t lc, rc - Py_ssize_t left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset + # NA group in location 0 + left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) @@ -101,13 +104,14 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, else: count += left_count[i] + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) with nogil: - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] for i in range(1, max_groups + 1): lc = left_count[i] rc = right_count[i] @@ -127,38 +131,40 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos += lc right_pos += rc - # Will overwrite left/right indexer with the result - _get_result_indexer(left_sorter, left_indexer) - _get_result_indexer(right_sorter, right_indexer) + left_indexer = _get_result_indexer(left_sorter, left_indexer) + right_indexer = _get_result_indexer(right_sorter, right_indexer) if not sort: # if not asked to sort, revert to original order - if len(left) == len(left_indexer): + # cast to avoid build warning GH#26757 + if len(left) == len(left_indexer): # no multiple matches for any row on the left # this is a short-cut to avoid groupsort_indexer # otherwise, the `else` path also works in this case rev = np.empty(len(left), dtype=np.intp) - rev.put(np.asarray(left_sorter), np.arange(len(left))) + rev.put(left_sorter, np.arange(len(left))) else: rev, _ = groupsort_indexer(left_indexer, len(left)) - return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev) - else: - return np.asarray(left_indexer), np.asarray(right_indexer) + right_indexer = right_indexer.take(rev) + left_indexer = left_indexer.take(rev) + + return left_indexer, right_indexer -@cython.wraparound(False) @cython.boundscheck(False) def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - intp_t[::1] left_sorter, right_sorter - intp_t[::1] left_count, right_count - intp_t[::1] left_indexer, right_indexer + ndarray[intp_t] left_sorter, right_sorter + ndarray[intp_t] left_count, right_count + ndarray[intp_t] left_indexer, right_indexer intp_t lc, rc intp_t left_pos = 0, right_pos = 0 Py_ssize_t offset, position = 0 + # NA group in location 0 + left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = 
groupsort_indexer(right, max_groups) @@ -173,13 +179,14 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, else: count += lc + rc + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) with nogil: - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] for i in range(1, max_groups + 1): lc = left_count[i] rc = right_count[i] @@ -204,33 +211,24 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos += lc right_pos += rc - # Will overwrite left/right indexer with the result - _get_result_indexer(left_sorter, left_indexer) - _get_result_indexer(right_sorter, right_indexer) - - return np.asarray(left_indexer), np.asarray(right_indexer) + return (_get_result_indexer(left_sorter, left_indexer), + _get_result_indexer(right_sorter, right_indexer)) -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) nogil: - """NOTE: overwrites indexer with the result to avoid allocating another array""" - cdef: - Py_ssize_t i, n, idx - +cdef ndarray[intp_t] _get_result_indexer( + ndarray[intp_t] sorter, ndarray[intp_t] indexer +): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` - n = indexer.shape[0] - for i in range(n): - idx = indexer[i] - if idx == -1: - indexer[i] = -1 - else: - indexer[i] = sorter[idx] + res = np.empty(len(indexer), dtype=np.intp) + take_1d_intp_intp(sorter, indexer, res, -1) else: # length-0 case - indexer[:] = -1 + res = np.empty(len(indexer), dtype=np.intp) + res[:] = -1 + + return res def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index b9c18d6c86039..ba805e9ff1251 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -41,9 +41,6 @@ cdef extern from "khash_python.h": bint are_equivalent_float32_t \ "kh_floats_hash_equal" (float32_t a, float32_t b) nogil - uint32_t kh_python_hash_func(object key) - bint kh_python_hash_equal(object a, object b) - ctypedef struct kh_pymap_t: khuint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 5be50f16af003..3f4623638c70e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -25,9 +25,6 @@ class NoDefault(Enum): ... no_default: NoDefault -i8max: int -u8max: int - def item_from_zerodim(val: object) -> object: ... def infer_dtype(value: object, skipna: bool = True) -> str: ... def is_iterator(obj: object) -> bool: ... @@ -51,7 +48,6 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ... def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... -def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... @@ -220,7 +216,8 @@ def array_equivalent_object( left: np.ndarray, # object[:] right: np.ndarray, # object[:] ) -> bool: ... -def has_infs(arr: np.ndarray) -> bool: ... # const floating[:] +def has_infs_f8(arr: np.ndarray) -> bool: ... 
# const float64_t[:] +def has_infs_f4(arr: np.ndarray) -> bool: ... # const float32_t[:] def get_reverse_indexer( indexer: np.ndarray, # const intp_t[:] length: int, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4ab2497be94d5..0aec7e5e5a363 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -25,7 +25,6 @@ from cpython.tuple cimport ( PyTuple_New, PyTuple_SET_ITEM, ) -from cython cimport floating PyDateTime_IMPORT @@ -119,10 +118,6 @@ cdef: float64_t NaN = np.NaN -# python-visible -i8max = INT64_MAX -u8max = UINT64_MAX - @cython.wraparound(False) @cython.boundscheck(False) @@ -520,22 +515,36 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: @cython.wraparound(False) @cython.boundscheck(False) -# Can add const once https://github.com/cython/cython/issues/1772 resolved -def has_infs(floating[:] arr) -> bool: +def has_infs_f4(const float32_t[:] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) - floating inf, neginf, val - bint ret = False + float32_t inf, neginf, val inf = np.inf neginf = -inf - with nogil: - for i in range(n): - val = arr[i] - if val == inf or val == neginf: - ret = True - break - return ret + + for i in range(n): + val = arr[i] + if val == inf or val == neginf: + return True + return False + + +@cython.wraparound(False) +@cython.boundscheck(False) +def has_infs_f8(const float64_t[:] arr) -> bool: + cdef: + Py_ssize_t i, n = len(arr) + float64_t inf, neginf, val + + inf = np.inf + neginf = -inf + + for i in range(n): + val = arr[i] + if val == inf or val == neginf: + return True + return False def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len): @@ -703,14 +712,6 @@ cpdef ndarray[object] ensure_string_array( Py_ssize_t i = 0, n = len(arr) if hasattr(arr, "to_numpy"): - - if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]: - # dtype check to exclude DataFrame - # GH#41409 TODO: not a great place for this - out = arr.astype(str).astype(object) - out[arr.isna()] = na_value - return out - arr = arr.to_numpy() elif not isinstance(arr, np.ndarray): arr = np.array(arr, dtype="object") @@ -893,13 +894,12 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts -@cython.wraparound(False) -@cython.boundscheck(False) def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start intp_t lab - int64_t[::1] starts, ends + object slobj + ndarray[int64_t] starts, ends n = len(labels) @@ -908,20 +908,19 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): start = 0 group_size = 0 - with nogil: - for i in range(n): - lab = labels[i] - if lab < 0: - start += 1 - else: - group_size += 1 - if i == n - 1 or lab != labels[i + 1]: - starts[lab] = start - ends[lab] = start + group_size - start += group_size - group_size = 0 + for i in range(n): + lab = labels[i] + if lab < 0: + start += 1 + else: + group_size += 1 + if i == n - 1 or lab != labels[i + 1]: + starts[lab] = start + ends[lab] = start + group_size + start += group_size + group_size = 0 - return np.asarray(starts), np.asarray(ends) + return starts, ends def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys, @@ -1700,7 +1699,7 @@ cdef class Validator: if not self.is_valid(values[i]): return False - return True + return self.finalize_validate() @cython.wraparound(False) @cython.boundscheck(False) @@ -1713,7 +1712,7 @@ cdef class Validator: if not self.is_valid_skipna(values[i]): return False - return True + return self.finalize_validate_skipna() cdef bint 
is_valid(self, object value) except -1: return self.is_value_typed(value) @@ -1731,6 +1730,18 @@ cdef class Validator: cdef bint is_array_typed(self) except -1: return False + cdef inline bint finalize_validate(self): + return True + + cdef bint finalize_validate_skipna(self): + """ + If we _only_ saw non-dtype-specific NA values, even if they are valid + for this dtype, we do not infer this dtype. + """ + # TODO(phillipc): Remove the existing validate methods and replace them + # with the skipna versions upon full deprecation of skipna=False + return True + @cython.internal cdef class BoolValidator(Validator): @@ -1882,14 +1893,14 @@ cdef bint is_bytes_array(ndarray values, bint skipna=False): @cython.internal cdef class TemporalValidator(Validator): cdef: - bint all_generic_na + Py_ssize_t generic_null_count def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), bint skipna=False): self.n = n self.dtype = dtype self.skipna = skipna - self.all_generic_na = True + self.generic_null_count = 0 cdef inline bint is_valid(self, object value) except -1: return self.is_value_typed(value) or self.is_valid_null(value) @@ -1902,16 +1913,15 @@ cdef class TemporalValidator(Validator): cdef: bint is_typed_null = self.is_valid_null(value) bint is_generic_null = value is None or util.is_nan(value) - if not is_generic_null: - self.all_generic_na = False + self.generic_null_count += is_typed_null and is_generic_null return self.is_value_typed(value) or is_typed_null or is_generic_null - cdef bint _validate_skipna(self, ndarray values) except -1: + cdef inline bint finalize_validate_skipna(self): """ If we _only_ saw non-dtype-specific NA values, even if they are valid for this dtype, we do not infer this dtype. """ - return Validator._validate_skipna(self, values) and not self.all_generic_na + return self.generic_null_count != self.n @cython.internal @@ -2955,28 +2965,6 @@ def to_object_array_tuples(rows: object) -> np.ndarray: return result -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: - cdef: - Py_ssize_t i, n = len(keys) - object val - ndarray[object] output = np.empty(n, dtype='O') - - if n == 0: - # kludge, for Series - return np.empty(0, dtype='f8') - - for i in range(n): - val = keys[i] - if val in mapping: - output[i] = mapping[val] - else: - output[i] = default - - return maybe_convert_objects(output) - - def is_bool_list(obj: list) -> bool: """ Check if this list contains only bool or np.bool_ objects. 
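The generate_slices change above rewrites the same loop with and without the GIL; the underlying algorithm (turning a sorted array of group labels into per-group start/end offsets) is easier to follow as a pure-Python sketch. The helper name generate_slices_py and the sample labels below are illustrative only, and the zero-initialization of starts/ends is assumed from context:

    import numpy as np

    def generate_slices_py(labels: np.ndarray, ngroups: int):
        # labels are assumed sorted; negative labels mark NA rows that only
        # shift the starting offset and never open a group of their own
        starts = np.zeros(ngroups, dtype=np.int64)
        ends = np.zeros(ngroups, dtype=np.int64)
        start = group_size = 0
        n = len(labels)
        for i in range(n):
            lab = labels[i]
            if lab < 0:
                start += 1
            else:
                group_size += 1
                if i == n - 1 or lab != labels[i + 1]:
                    starts[lab] = start
                    ends[lab] = start + group_size
                    start += group_size
                    group_size = 0
        return starts, ends

    # e.g. labels [0, 0, 1, 1, 1, 2] with ngroups=3
    # -> starts [0, 2, 5], ends [2, 5, 6]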
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3655d6efad66e..7d7074988e5f0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -108,7 +108,6 @@ from pandas.core.dtypes.common import ( is_object_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.inference import is_dict_like cdef: float64_t INF = np.inf @@ -690,7 +689,6 @@ cdef class TextReader: count = counts.get(name, 0) if ( self.dtype is not None - and is_dict_like(self.dtype) and self.dtype.get(old_name) is not None and self.dtype.get(name) is None ): @@ -1280,8 +1278,6 @@ cdef class TextReader: # generate extra (bogus) headers if there are more columns than headers if j >= len(self.header[0]): return j - elif self.has_mi_columns: - return tuple(header_row[j] for header_row in self.header) else: return self.header[0][j] else: diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index c0fca76ef701e..aee018262e3a6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -163,198 +163,28 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) #define kh_exist_complex128(h, k) (kh_exist(h, k)) -// NaN-floats should be in the same equivalency class, see GH 22119 -int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ - return ( - Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && - Py_IS_NAN(PyFloat_AS_DOUBLE(b)) - ) - || - ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) ); -} - - -// NaNs should be in the same equivalency class, see GH 41836 -// PyObject_RichCompareBool for complexobjects has a different behavior -// needs to be replaced -int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ - return ( - Py_IS_NAN(a->cval.real) && - Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag) - ) - || - ( - Py_IS_NAN(a->cval.real) && - Py_IS_NAN(b->cval.real) && - a->cval.imag == b->cval.imag - ) - || - ( - a->cval.real == b->cval.real && - Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag) - ) - || - ( - a->cval.real == b->cval.real && - a->cval.imag == b->cval.imag - ); -} - -int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); - - -// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN), -// which treats NaNs as equivalent -// see GH 41836 -int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ - Py_ssize_t i; - - if (Py_SIZE(a) != Py_SIZE(b)) { - return 0; - } - - for (i = 0; i < Py_SIZE(a); ++i) { - if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { - return 0; - } - } - return 1; -} - - int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { - if (a == b) { - return 1; - } - if (Py_TYPE(a) == Py_TYPE(b)) { - // special handling for some built-in types which could have NaNs - // as we would like to have them equivalent, but the usual - // PyObject_RichCompareBool would return False - if (PyFloat_CheckExact(a)) { - return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); - } - if (PyComplex_CheckExact(a)) { - return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b); - } - if (PyTuple_CheckExact(a)) { - return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b); - } - // frozenset isn't yet supported - } - int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { PyErr_Clear(); return 0; } - return result; -} - - -Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) { - //Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { - return 
0; - } -#if PY_VERSION_HEX < 0x030A0000 - return _Py_HashDouble(val); -#else - return _Py_HashDouble(NULL, val); -#endif -} - - -Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) { - return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); -} - - -#define _PandasHASH_IMAG 1000003UL - -// replaces _Py_HashDouble with _Pandas_HashDouble -Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { - Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); - Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); - if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { - return -1; - } - Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash; - if (combined == (Py_uhash_t)-1) { - return -2; - } - return (Py_hash_t)combined; -} - - -khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); - -//we could use any hashing algorithm, this is the original CPython's for tuples - -#if SIZEOF_PY_UHASH_T > 4 -#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) -#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) -#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) -#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ -#else -#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) -#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) -#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) -#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ -#endif - -Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { - Py_ssize_t i, len = Py_SIZE(key); - PyObject **item = key->ob_item; - - Py_uhash_t acc = _PandasHASH_XXPRIME_5; - for (i = 0; i < len; i++) { - Py_uhash_t lane = kh_python_hash_func(item[i]); - if (lane == (Py_uhash_t)-1) { - return -1; - } - acc += lane * _PandasHASH_XXPRIME_2; - acc = _PandasHASH_XXROTATE(acc); - acc *= _PandasHASH_XXPRIME_1; - } - - /* Add input length, mangled to keep the historical value of hash(()). 
*/ - acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); - - if (acc == (Py_uhash_t)-1) { - return 1546275796; + if (result == 0) { // still could be two NaNs + return PyFloat_CheckExact(a) && + PyFloat_CheckExact(b) && + Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && + Py_IS_NAN(PyFloat_AS_DOUBLE(b)); } - return acc; + return result; } -khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { - Py_hash_t hash; +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) - // yet for different nan-objects different hash-values - // are possible - if (PyFloat_CheckExact(key)) { - // we cannot use kh_float64_hash_func - // becase float(k) == k holds for any int-object k - // and kh_float64_hash_func doesn't respect it - hash = floatobject_hash((PyFloatObject*)key); - } - else if (PyComplex_CheckExact(key)) { - // we cannot use kh_complex128_hash_func - // becase complex(k,0) == k holds for any int-object k - // and kh_complex128_hash_func doesn't respect it - hash = complexobject_hash((PyComplexObject*)key); - } - else if (PyTuple_CheckExact(key)) { - hash = tupleobject_hash((PyTupleObject*)key); - } - else { - hash = PyObject_Hash(key); - } - + // hash(X) == 0 if X is a NaN-value + // so it is OK to use it directly for doubles + Py_hash_t hash = PyObject_Hash(key); if (hash == -1) { PyErr_Clear(); return 0; diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index f79ffd2d425c4..415bdf74db80a 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -27,11 +27,6 @@ cdef class PeriodDtypeBase: # See also: libperiod.get_freq_group return (self._dtype_code // 1000) * 1000 - @property - def resolution(self) -> "Resolution": - fgc = self.freq_group_code - return Resolution.from_freq_group(FreqGroup(fgc)) - @property def date_offset(self): """ @@ -264,14 +259,6 @@ class Resolution(Enum): return cls.from_attrname(attr_name) - @classmethod - def from_freq_group(cls, freq_group: FreqGroup) -> "Resolution": - abbrev = _reverse_period_code_map[freq_group.value].split("-")[0] - if abbrev == "B": - return cls.RESO_DAY - attrname = _abbrev_to_attrnames[abbrev] - return cls.from_attrname(attrname) - cdef dict _reso_str_map = { Resolution.RESO_NS.value: "nanosecond", diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 6596aebc1892e..ac7447420596a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1152,13 +1152,12 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): """ Standard kind of date increment used for a date range. - Works exactly like the keyword argument form of relativedelta. - Note that the positional argument form of relativedelata is not - supported. Use of the keyword n is discouraged-- you would be better + Works exactly like relativedelta in terms of the keyword args you + pass in, use of the keyword n is discouraged-- you would be better off specifying n in the keywords you use, but regardless it is there for you. n is needed for DateOffset subclasses. - DateOffset works as follows. Each offset specify a set of dates + DateOffset work as follows. Each offset specify a set of dates that conform to the DateOffset. For example, Bday defines this set to be the set of dates that are weekdays (M-F). 
To test if a date is in the set of a DateOffset dateOffset we can use the diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 212e40b30848a..9892671f5c18c 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -470,7 +470,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default, except ValueError: pass - if date_len == 6 and freq == 'M': + if date_len == 6 and (freq == 'M' or + getattr(freq, 'rule_code', None) == 'M'): year = int(date_string[:4]) month = int(date_string[4:6]) try: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index e4e9df5176459..edd3b58867e87 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -129,13 +129,6 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base -def _unpickle_timestamp(value, freq, tz): - # GH#41949 dont warn on unpickle if we have a freq - ts = Timestamp(value, tz=tz) - ts._set_freq(freq) - return ts - - # ---------------------------------------------------------------------- def integer_op_not_supported(obj): @@ -732,7 +725,7 @@ cdef class _Timestamp(ABCTimestamp): def __reduce__(self): object_state = self.value, self._freq, self.tzinfo - return (_unpickle_timestamp, object_state) + return (Timestamp, object_state) # ----------------------------------------------------------------- # Rendering Methods @@ -1329,19 +1322,6 @@ class Timestamp(_Timestamp): "the tz parameter. Use tz_convert instead.") tzobj = maybe_get_tz(tz) - if tzobj is not None and is_datetime64_object(ts_input): - # GH#24559, GH#42288 In the future we will treat datetime64 as - # wall-time (consistent with DatetimeIndex) - warnings.warn( - "In a future version, when passing a np.datetime64 object and " - "a timezone to Timestamp, the datetime64 will be interpreted " - "as a wall time, not a UTC time. To interpret as a UTC time, " - "use `Timestamp(dt64).tz_localize('UTC').tz_convert(tz)`", - FutureWarning, - stacklevel=1, - ) - # Once this deprecation is enforced, we can do - # return Timestamp(ts_input).tz_localize(tzobj) ts = convert_to_tsobject(ts_input, tzobj, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 5f01996d0390d..5153118e9b142 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -106,7 +106,6 @@ def _assert_caught_expected_warning( """Assert that there was the expected warning among the caught warnings.""" saw_warning = False matched_message = False - unmatched_messages = [] for actual_warning in caught_warnings: if issubclass(actual_warning.category, expected_warning): @@ -117,11 +116,8 @@ def _assert_caught_expected_warning( ): _assert_raised_with_correct_stacklevel(actual_warning) - if match is not None: - if re.search(match, str(actual_warning.message)): - matched_message = True - else: - unmatched_messages.append(actual_warning.message) + if match is not None and re.search(match, str(actual_warning.message)): + matched_message = True if not saw_warning: raise AssertionError( @@ -132,8 +128,7 @@ def _assert_caught_expected_warning( if match and not matched_message: raise AssertionError( f"Did not see warning {repr(expected_warning.__name__)} " - f"matching '{match}'. 
The emitted warning messages are " - f"{unmatched_messages}" + f"matching {match}" ) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index d0957b1814213..1942e07d1b562 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -107,7 +107,6 @@ def assert_almost_equal( FutureWarning, stacklevel=2, ) - # https://github.com/python/mypy/issues/7642 # error: Argument 1 to "_get_tol_from_less_precise" has incompatible # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]" rtol = atol = _get_tol_from_less_precise( @@ -315,16 +314,18 @@ def _check_types(left, right, obj="Index") -> None: return assert_class_equal(left, right, exact=exact, obj=obj) - assert_attr_equal("inferred_type", left, right, obj=obj) # Skip exact dtype checking when `check_categorical` is False - if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype): - if check_categorical: - assert_attr_equal("dtype", left, right, obj=obj) + if check_categorical: + assert_attr_equal("dtype", left, right, obj=obj) + if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype): assert_index_equal(left.categories, right.categories, exact=exact) - return - assert_attr_equal("dtype", left, right, obj=obj) + # allow string-like to have different inferred_types + if left.inferred_type in ("string"): + assert right.inferred_type in ("string") + else: + assert_attr_equal("inferred_type", left, right, obj=obj) def _get_ilevel_values(index, level): # accept level number only @@ -341,7 +342,6 @@ def _get_ilevel_values(index, level): FutureWarning, stacklevel=2, ) - # https://github.com/python/mypy/issues/7642 # error: Argument 1 to "_get_tol_from_less_precise" has incompatible # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]" rtol = atol = _get_tol_from_less_precise( @@ -437,8 +437,6 @@ def assert_class_equal(left, right, exact: bool | str = True, obj="Input"): """ Checks classes are equal. 
""" - from pandas.core.indexes.numeric import NumericIndex - __tracebackhide__ = True def repr_class(x): @@ -448,16 +446,17 @@ def repr_class(x): return type(x).__name__ - if type(left) == type(right): - return - if exact == "equiv": - # accept equivalence of NumericIndex (sub-)classes - if isinstance(left, NumericIndex) and isinstance(right, NumericIndex): - return - - msg = f"{obj} classes are different" - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + if type(left) != type(right): + # allow equivalence of Int64Index/RangeIndex + types = {type(left).__name__, type(right).__name__} + if len(types - {"Int64Index", "RangeIndex"}): + msg = f"{obj} classes are not equivalent" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + elif exact: + if type(left) != type(right): + msg = f"{obj} classes are different" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"): diff --git a/pandas/_typing.py b/pandas/_typing.py index 6583a9f60ee15..12d23786c3387 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -21,7 +21,6 @@ Dict, Hashable, List, - Literal, Mapping, Optional, Sequence, @@ -37,9 +36,11 @@ # and use a string literal forward reference to it in subsequent types # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: - from typing import TypedDict - - import numpy.typing as npt + from typing import ( + Literal, + TypedDict, + final, + ) from pandas._libs import ( Period, @@ -72,7 +73,8 @@ from pandas.io.formats.format import EngFormatter from pandas.tseries.offsets import DateOffset else: - npt: Any = None + # typing.final does not exist until py38 + final = lambda x: x # typing.TypedDict does not exist until py38 TypedDict = dict @@ -99,6 +101,12 @@ ] Timezone = Union[str, tzinfo] +# FrameOrSeriesUnion means either a DataFrame or a Series. E.g. +# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series +# is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed +# in, either a DataFrame or a Series is returned. +FrameOrSeriesUnion = Union["DataFrame", "Series"] + # FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is # used. E.g. 
`def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a # Series is passed into a function, a Series is always returned and if a DataFrame is @@ -115,14 +123,6 @@ Frequency = Union[str, "DateOffset"] Axes = Collection[Any] -RandomState = Union[ - int, - ArrayLike, - np.random.Generator, - np.random.BitGenerator, - np.random.RandomState, -] - # dtypes NpDtype = Union[str, np.dtype] Dtype = Union[ @@ -193,7 +193,10 @@ ] # Arguments for fillna() -FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"] +if TYPE_CHECKING: + FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"] +else: + FillnaOptions = str # internals Manager = Union[ diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3233de8e3b6d1..369832e9bc05c 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -16,6 +16,7 @@ is_numpy_dev, np_array_datetime64_compat, np_datetime64_compat, + np_version_under1p18, np_version_under1p19, np_version_under1p20, ) @@ -26,6 +27,7 @@ pa_version_under4p0, ) +PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) PYPY = platform.python_implementation() == "PyPy" @@ -149,6 +151,7 @@ def get_lzma_file(lzma): "is_numpy_dev", "np_array_datetime64_compat", "np_datetime64_compat", + "np_version_under1p18", "np_version_under1p19", "np_version_under1p20", "pa_version_under1p0", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 651729cd0ad44..941c59592dbbd 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -10,30 +10,30 @@ # Update install.rst when updating versions! VERSIONS = { - "bs4": "4.8.2", - "bottleneck": "1.3.1", + "bs4": "4.6.0", + "bottleneck": "1.2.1", "fsspec": "0.7.4", "fastparquet": "0.4.0", "gcsfs": "0.6.0", - "lxml.etree": "4.5.0", - "matplotlib": "3.3.2", - "numexpr": "2.7.1", - "odfpy": "1.4.1", - "openpyxl": "3.0.2", - "pandas_gbq": "0.14.0", + "lxml.etree": "4.3.0", + "matplotlib": "2.2.3", + "numexpr": "2.7.0", + "odfpy": "1.3.0", + "openpyxl": "3.0.0", + "pandas_gbq": "0.12.0", "pyarrow": "0.17.0", "pytest": "6.0", "pyxlsb": "1.0.6", "s3fs": "0.4.0", - "scipy": "1.4.1", - "sqlalchemy": "1.3.11", - "tables": "3.6.1", + "scipy": "1.2.0", + "sqlalchemy": "1.3.0", + "tables": "3.5.1", "tabulate": "0.8.7", - "xarray": "0.15.1", - "xlrd": "2.0.1", + "xarray": "0.12.3", + "xlrd": "1.2.0", "xlwt": "1.3.0", - "xlsxwriter": "1.2.2", - "numba": "0.50.1", + "xlsxwriter": "1.0.2", + "numba": "0.46.0", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 5b87257651a2d..619713f28ee2d 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -9,10 +9,11 @@ # numpy versioning _np_version = np.__version__ _nlv = Version(_np_version) +np_version_under1p18 = _nlv < Version("1.18") np_version_under1p19 = _nlv < Version("1.19") np_version_under1p20 = _nlv < Version("1.20") is_numpy_dev = _nlv.dev is not None -_min_numpy_ver = "1.18.5" +_min_numpy_ver = "1.17.3" if _nlv < Version(_min_numpy_ver): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index df9f3d07ce7fd..177dfee0c03ab 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,11 +8,14 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, - Literal, Union, cast, ) -from warnings import warn +from warnings import ( + catch_warnings, + simplefilter, + warn, +) import numpy as np @@ -26,8 +29,8 @@ AnyArrayLike, 
ArrayLike, DtypeObj, + FrameOrSeriesUnion, Scalar, - npt, ) from pandas.util._decorators import doc @@ -81,6 +84,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: + from typing import Literal from pandas import ( Categorical, @@ -136,11 +140,7 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: return np.asarray(values).view("uint8"), values.dtype else: # i.e. all-bool Categorical, BooleanArray - try: - return np.asarray(values).astype("uint8", copy=False), values.dtype - except TypeError: - # GH#42107 we have pd.NAs present - return np.asarray(values), values.dtype + return np.asarray(values).astype("uint8", copy=False), values.dtype elif is_integer_dtype(values.dtype): return np.asarray(values), values.dtype @@ -155,10 +155,12 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: return np.asarray(values), values.dtype elif is_complex_dtype(values.dtype): - # Incompatible return value type (got "Tuple[Union[Any, ExtensionArray, - # ndarray[Any, Any]], Union[Any, ExtensionDtype]]", expected - # "Tuple[ndarray[Any, Any], Union[dtype[Any], ExtensionDtype]]") - return values, values.dtype # type: ignore[return-value] + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(): + simplefilter("ignore", np.ComplexWarning) + values = ensure_float64(values) + return values, np.dtype("float64") # datetimelike elif needs_i8_conversion(values.dtype): @@ -240,8 +242,6 @@ def _ensure_arraylike(values) -> ArrayLike: _hashtables = { - "complex128": htable.Complex128HashTable, - "complex64": htable.Complex64HashTable, "float64": htable.Float64HashTable, "float32": htable.Float32HashTable, "uint64": htable.UInt64HashTable, @@ -529,9 +529,9 @@ def factorize_array( size_hint: int | None = None, na_value=None, mask: np.ndarray | None = None, -) -> tuple[npt.NDArray[np.intp], np.ndarray]: +) -> tuple[np.ndarray, np.ndarray]: """ - Factorize a numpy array to codes and uniques. + Factorize an array-like to codes and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -910,7 +910,7 @@ def duplicated( Parameters ---------- - values : nd.array, ExtensionArray or Series + values : ndarray-like Array over which to check for duplicate values. keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first @@ -1008,6 +1008,7 @@ def rank( if values.ndim == 1: ranks = algos.rank_1d( values, + labels=np.zeros(len(values), dtype=np.intp), is_datetimelike=is_datetimelike, ties_method=method, ascending=ascending, @@ -1091,19 +1092,18 @@ def checked_add_with_arr( # it is negative, we then check whether its sum with the element in # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow # error as well. 
- i8max = lib.i8max - i8min = iNaT - mask1 = b2 > 0 mask2 = b2 < 0 if not mask1.any(): - to_raise = ((i8min - b2 > arr) & not_nan).any() + to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() elif not mask2.any(): - to_raise = ((i8max - b2 < arr) & not_nan).any() + to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: - to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or ( - (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2] + to_raise = ( + (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] + ).any() or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] ).any() if to_raise: @@ -1188,10 +1188,13 @@ def _get_score(at): if is_scalar(q): return _get_score(q) - - q = np.asarray(q, np.float64) - result = [_get_score(x) for x in q] - return np.array(result, dtype=np.float64) + else: + q = np.asarray(q, np.float64) + result = [_get_score(x) for x in q] + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "List[Any]") + result = np.array(result, dtype=np.float64) # type: ignore[assignment] + return result # --------------- # @@ -1208,7 +1211,7 @@ def __init__(self, obj, n: int, keep: str): if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') - def compute(self, method: str) -> DataFrame | Series: + def compute(self, method: str) -> FrameOrSeriesUnion: raise NotImplementedError def nlargest(self): @@ -1412,8 +1415,8 @@ def take( Parameters ---------- - arr : array-like or scalar value - Non array-likes (sequences/scalars without a dtype) are coerced + arr : sequence + Non array-likes (sequences without a dtype) are coerced to an ndarray. indices : sequence of integers Indices to be taken. @@ -1523,17 +1526,17 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: Parameters ---------- - arr: np.ndarray, ExtensionArray, Series + arr: array-like Input array. If `sorter` is None, then it must be sorted in ascending order, otherwise `sorter` must be an array of indices that sort it. - value : array-like or scalar + value : array_like Values to insert into `arr`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array-like, optional + sorter : 1-D array_like, optional Optional array of integer indices that sort array a into ascending order. They are typically the result of argsort. diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ff3fc30b870dc..388c1881afed7 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,7 +7,6 @@ Any, Dict, Hashable, - Iterable, Iterator, List, cast, @@ -26,6 +25,7 @@ AggObjType, Axis, FrameOrSeries, + FrameOrSeriesUnion, ) from pandas.util._decorators import cache_readonly @@ -137,10 +137,10 @@ def f(x): self.f: AggFuncType = f @abc.abstractmethod - def apply(self) -> DataFrame | Series: + def apply(self) -> FrameOrSeriesUnion: pass - def agg(self) -> DataFrame | Series | None: + def agg(self) -> FrameOrSeriesUnion | None: """ Provide an implementation for the aggregators. @@ -171,7 +171,7 @@ def agg(self) -> DataFrame | Series | None: # caller can react return None - def transform(self) -> DataFrame | Series: + def transform(self) -> FrameOrSeriesUnion: """ Transform a DataFrame or Series. 
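The FrameOrSeriesUnion annotations in these apply.py hunks refer to the alias described in the _typing.py hunk above: a plain Union, as opposed to the stricter FrameOrSeries TypeVar. A rough sketch of the distinction, with the two function names invented for illustration and the TypeVar bound assumed from the surrounding comments:

    from typing import TypeVar, Union

    from pandas import DataFrame, Series
    from pandas.core.generic import NDFrame

    FrameOrSeriesUnion = Union[DataFrame, Series]
    FrameOrSeries = TypeVar("FrameOrSeries", bound=NDFrame)

    def returns_union(obj: FrameOrSeriesUnion) -> FrameOrSeriesUnion:
        # a Series may legitimately come back as a DataFrame, or vice versa
        ...

    def returns_same(obj: FrameOrSeries) -> FrameOrSeries:
        # whatever NDFrame subclass goes in is the type that comes out
        ...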
@@ -252,7 +252,7 @@ def transform_dict_like(self, func): func = self.normalize_dictlike_arg("transform", obj, func) - results: dict[Hashable, DataFrame | Series] = {} + results: dict[Hashable, FrameOrSeriesUnion] = {} failed_names = [] all_type_errors = True for name, how in func.items(): @@ -283,7 +283,7 @@ def transform_dict_like(self, func): ) return concat(results, axis=1) - def transform_str_or_callable(self, func) -> DataFrame | Series: + def transform_str_or_callable(self, func) -> FrameOrSeriesUnion: """ Compute transform in the case of a string or callable func """ @@ -305,7 +305,7 @@ def transform_str_or_callable(self, func) -> DataFrame | Series: except Exception: return func(obj, *args, **kwargs) - def agg_list_like(self) -> DataFrame | Series: + def agg_list_like(self) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a list-like argument. @@ -348,7 +348,6 @@ def agg_list_like(self) -> DataFrame | Series: # multiples else: - indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: @@ -370,9 +369,7 @@ def agg_list_like(self) -> DataFrame | Series: raise else: results.append(new_res) - indices.append(index) - - keys = selected_obj.columns.take(indices) + keys.append(col) # if we are empty if not len(results): @@ -402,7 +399,7 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) - def agg_dict_like(self) -> DataFrame | Series: + def agg_dict_like(self) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a dict-like argument. @@ -410,7 +407,6 @@ def agg_dict_like(self) -> DataFrame | Series: ------- Result of aggregation. """ - from pandas import Index from pandas.core.reshape.concat import concat obj = self.obj @@ -444,20 +440,11 @@ def agg_dict_like(self) -> DataFrame | Series: # combine results if all(is_ndframe): - keys_to_use: Iterable[Hashable] keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys - if selected_obj.ndim == 2: - # keys are columns, so we can preserve names - ktu = Index(keys_to_use) - ktu._set_names(selected_obj.columns.names) - keys_to_use = ktu - axis = 0 if isinstance(obj, ABCSeries) else 1 - result = concat( - {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use - ) + result = concat({k: results[k] for k in keys_to_use}, axis=axis) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError( @@ -480,7 +467,7 @@ def agg_dict_like(self) -> DataFrame | Series: return result - def apply_str(self) -> DataFrame | Series: + def apply_str(self) -> FrameOrSeriesUnion: """ Compute apply in case of a string. @@ -505,7 +492,7 @@ def apply_str(self) -> DataFrame | Series: raise ValueError(f"Operation {f} does not support axis=1") return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs) - def apply_multiple(self) -> DataFrame | Series: + def apply_multiple(self) -> FrameOrSeriesUnion: """ Compute apply in case of a list-like or dict-like. @@ -517,7 +504,7 @@ def apply_multiple(self) -> DataFrame | Series: return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) def normalize_dictlike_arg( - self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict + self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict ) -> AggFuncTypeDict: """ Handler for dict-like argument. 
@@ -630,7 +617,7 @@ def series_generator(self) -> Iterator[Series]: @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: pass # --------------------------------------------------------------- @@ -651,7 +638,7 @@ def values(self): def dtypes(self) -> Series: return self.obj.dtypes - def apply(self) -> DataFrame | Series: + def apply(self) -> FrameOrSeriesUnion: """compute the results""" # dispatch to agg if is_list_like(self.f): @@ -825,7 +812,7 @@ def apply_series_generator(self) -> tuple[ResType, Index]: return results, res_index - def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: + def wrap_results(self, results: ResType, res_index: Index) -> FrameOrSeriesUnion: from pandas import Series # see if we can infer the results @@ -848,14 +835,14 @@ def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series return result - def apply_str(self) -> DataFrame | Series: + def apply_str(self) -> FrameOrSeriesUnion: # Caller is responsible for checking isinstance(self.f, str) # TODO: GH#39993 - Avoid special-casing by replacing with lambda if self.f == "size": # Special-cased because DataFrame.size returns a single scalar obj = self.obj value = obj.shape[self.axis] - return obj._constructor_sliced(value, index=self.agg_axis) + return obj._constructor_sliced(value, index=self.agg_axis, name="size") return super().apply_str() @@ -879,7 +866,7 @@ def result_columns(self) -> Index: def wrap_results_for_axis( self, results: ResType, res_index: Index - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: """return the results for the rows""" if self.result_type == "reduce": @@ -962,9 +949,9 @@ def result_columns(self) -> Index: def wrap_results_for_axis( self, results: ResType, res_index: Index - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: """return the results for the columns""" - result: DataFrame | Series + result: FrameOrSeriesUnion # we have requested to expand if self.result_type == "expand": @@ -1018,7 +1005,7 @@ def __init__( kwargs=kwargs, ) - def apply(self) -> DataFrame | Series: + def apply(self) -> FrameOrSeriesUnion: obj = self.obj if len(obj) == 0: @@ -1069,13 +1056,17 @@ def apply_empty_result(self) -> Series: obj, method="apply" ) - def apply_standard(self) -> DataFrame | Series: + def apply_standard(self) -> FrameOrSeriesUnion: f = self.f obj = self.obj with np.errstate(all="ignore"): if isinstance(f, np.ufunc): - return f(obj) + # error: Argument 1 to "__call__" of "ufunc" has incompatible type + # "Series"; expected "Union[Union[int, float, complex, str, bytes, + # generic], Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], _SupportsArray]" + return f(obj) # type: ignore[arg-type] # row-wise access if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"): diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index df4407067b131..e800f5ac748ec 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -45,21 +45,21 @@ def compare_or_regex_search( a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: np.ndarray ) -> ArrayLike | bool: """ - Compare two array-like inputs of the same shape or two scalar values + Compare two array_like inputs of the same shape or two scalar values Calls operator.eq or re.search, depending on regex argument. If regex is True, perform an element-wise regex matching. 
Parameters ---------- - a : array-like + a : array_like b : scalar or regex pattern regex : bool mask : np.ndarray[bool] Returns ------- - mask : array-like of bool + mask : array_like of bool """ if isna(b): return ~mask diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 3909875e5660a..cac9fcd40fa52 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -6,7 +6,6 @@ import numpy as np -from pandas._libs.lib import i8max from pandas._libs.tslibs import ( BaseOffset, OutOfBoundsDatetime, @@ -104,7 +103,7 @@ def _generate_range_overflow_safe( # GH#14187 raise instead of incorrectly wrapping around assert side in ["start", "end"] - i64max = np.uint64(i8max) + i64max = np.uint64(np.iinfo(np.int64).max) msg = f"Cannot generate range with {side}={endpoint} and periods={periods}" with np.errstate(over="raise"): @@ -181,7 +180,7 @@ def _generate_range_overflow_safe_signed( # error: Incompatible types in assignment (expression has type # "unsignedinteger[_64Bit]", variable has type "signedinteger[_64Bit]") result = np.uint64(endpoint) + np.uint64(addend) # type: ignore[assignment] - i64max = np.uint64(i8max) + i64max = np.uint64(np.iinfo(np.int64).max) assert result > i64max if result <= i64max + np.uint64(stride): # error: Incompatible return value type (got "unsignedinteger", expected diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b362769f50fa8..a6d1986937d2b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -14,7 +14,6 @@ Any, Callable, Iterator, - Literal, Sequence, TypeVar, cast, @@ -73,6 +72,7 @@ ) if TYPE_CHECKING: + from typing import Literal class ExtensionArraySupportsAnyAll("ExtensionArray"): def any(self, *, skipna: bool = True) -> bool: @@ -826,13 +826,13 @@ def searchsorted(self, value, side="left", sorter=None): Parameters ---------- - value : array-like, list or scalar - Value(s) to insert into `self`. + value : array_like + Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array-like, optional + sorter : 1-D array_like, optional Optional array of integer indices that sort array a into ascending order. They are typically the result of argsort. 
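The searchsorted docstring tweaks above only change wording ("array-like" vs "array_like"); the behaviour they describe is the standard insertion-point search. A short usage sketch with made-up data (Series shown, but the extension-array method documented above behaves the same way):

    import pandas as pd

    s = pd.Series([1, 3, 5, 7])           # must already be sorted ascending
    s.searchsorted(4)                     # 2: insert before the 5 to keep order
    s.searchsorted([1, 6], side="right")  # array([1, 3]): rightmost valid slots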
@@ -1296,10 +1296,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ raise TypeError(f"cannot perform {name} with type {self.dtype}") - # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 - # Incompatible types in assignment (expression has type "None", base class - # "object" defined the type as "Callable[[object], int]") - __hash__: None # type: ignore[assignment] + def __hash__(self) -> int: + raise TypeError(f"unhashable type: {repr(type(self).__name__)}") # ------------------------------------------------------------------------ # Non-Optimized Default Methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b46679c2fca18..ecc45357db8c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -26,7 +26,6 @@ NaT, algos as libalgos, hashtable as htable, - lib, ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import no_default @@ -37,7 +36,6 @@ Ordered, Scalar, Shape, - npt, type_t, ) from pandas.compat.numpy import function as nv @@ -525,7 +523,6 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: try: new_cats = np.asarray(self.categories) new_cats = new_cats.astype(dtype=dtype, copy=copy) - fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, @@ -533,9 +530,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - result = take_nd( - new_cats, ensure_platform_int(self._codes), fill_value=fill_value - ) + result = take_nd(new_cats, ensure_platform_int(self._codes)) return result @@ -1394,14 +1389,17 @@ def map(self, mapper): # ------------------------------------------------------------- # Validators; ideally these can be de-duplicated - def _validate_setitem_value(self, value): - if not is_hashable(value): - # wrap scalars and hashable-listlikes in list - return self._validate_listlike(value) + def _validate_searchsorted_value(self, value): + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. 
+ if is_scalar(value): + codes = self._unbox_scalar(value) else: - return self._validate_scalar(value) - - _validate_searchsorted_value = _validate_setitem_value + locs = [self.categories.get_loc(x) for x in value] + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "int") + codes = np.array(locs, dtype=self.codes.dtype) # type: ignore[assignment] + return codes def _validate_scalar(self, fill_value): """ @@ -1427,8 +1425,8 @@ def _validate_scalar(self, fill_value): fill_value = self._unbox_scalar(fill_value) else: raise TypeError( - "Cannot setitem on a Categorical with a new " - f"category ({fill_value}), set the categories first" + f"'fill_value={fill_value}' is not present " + "in this Categorical's categories" ) return fill_value @@ -2013,14 +2011,13 @@ def __getitem__(self, key): deprecate_ndim_indexing(result) return result - def _validate_listlike(self, value): - # NB: here we assume scalar-like tuples have already been excluded + def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) # require identical categories set if isinstance(value, Categorical): if not is_dtype_equal(self.dtype, value.dtype): - raise TypeError( + raise ValueError( "Cannot set a Categorical with another, " "without identical categories" ) @@ -2028,23 +2025,26 @@ def _validate_listlike(self, value): value = self._encode_with_my_categories(value) return value._codes + # wrap scalars and hashable-listlikes in list + rvalue = value if not is_hashable(value) else [value] + from pandas import Index # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 - to_add = Index(value, tupleize_cols=False).difference(self.categories) + to_add = Index(rvalue, tupleize_cols=False).difference(self.categories) # no assignments of values not in categories, but it's always ok to set # something to np.nan if len(to_add) and not isna(to_add).all(): - raise TypeError( + raise ValueError( "Cannot setitem on a Categorical with a new " "category, set the categories first" ) - codes = self.categories.get_indexer(value) + codes = self.categories.get_indexer(rvalue) return codes.astype(self._ndarray.dtype, copy=False) - def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: + def _reverse_indexer(self) -> dict[Hashable, np.ndarray]: """ Compute the inverse of a categorical, returning a dict of categories -> indexers. 
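Both sides of the categorical.py hunks above enforce the same rule for __setitem__: assigning a value that is not an existing category (and is not NA) is rejected; they only disagree on whether that raises TypeError or ValueError. A small usage sketch with illustrative data, catching either exception so it matches both variants:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "a"])
    cat[0] = "b"        # fine: "b" is already a category
    try:
        cat[0] = "c"    # "c" is not among the categories
    except (TypeError, ValueError) as err:
        # message: cannot set a new category, set the categories first
        print(err)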
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8b5bda4629506..08cb12a1373bb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -9,7 +9,6 @@ TYPE_CHECKING, Any, Callable, - Literal, Sequence, TypeVar, Union, @@ -122,6 +121,7 @@ from pandas.tseries import frequencies if TYPE_CHECKING: + from typing import Literal from pandas.core.arrays import ( DatetimeArray, diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8513bbb044e83..92a906e9fd8b0 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -8,7 +8,6 @@ ) from typing import ( TYPE_CHECKING, - Literal, cast, overload, ) @@ -82,6 +81,7 @@ ) if TYPE_CHECKING: + from typing import Literal from pandas import DataFrame from pandas.core.arrays import ( @@ -509,11 +509,6 @@ def _check_compatible_with(self, other, setitem: bool = False): # Descriptive Properties def _box_func(self, x) -> Timestamp | NaTType: - if isinstance(x, np.datetime64): - # GH#42228 - # Argument 1 to "signedinteger" has incompatible type "datetime64"; - # expected "Union[SupportsInt, Union[str, bytes], SupportsIndex]" - x = np.int64(x) # type: ignore[arg-type] ts = Timestamp(x, tz=self.tz) # Non-overlapping identity check (left operand type: "Timestamp", # right operand type: "NaTType") diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index dd45029336f63..2318cae004c5a 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -64,7 +64,6 @@ from pandas.core.algorithms import ( isin, take, - unique, value_counts, ) from pandas.core.arrays.base import ( @@ -1611,29 +1610,6 @@ def _combined(self) -> ArrayLike: comb = np.concatenate([left, right], axis=1) return comb - def _from_combined(self, combined: np.ndarray) -> IntervalArray: - """ - Create a new IntervalArray with our dtype from a 1D complex128 ndarray. 
- """ - nc = combined.view("i8").reshape(-1, 2) - - dtype = self._left.dtype - if needs_i8_conversion(dtype): - new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype) - new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype) - else: - new_left = nc[:, 0].view(dtype) - new_right = nc[:, 1].view(dtype) - return self._shallow_copy(left=new_left, right=new_right) - - def unique(self) -> IntervalArray: - # Invalid index type "Tuple[slice, int]" for "Union[ExtensionArray, - # ndarray[Any, Any]]"; expected type "Union[int, integer[Any], slice, - # Sequence[int], ndarray[Any, Any]]" - nc = unique(self._combined.view("complex128")[:, 0]) # type: ignore[index] - nc = nc[:, None] - return self._from_combined(nc) - def _maybe_convert_platform_interval(values) -> ArrayLike: """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3a152bd5889b7..d274501143916 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -403,21 +403,15 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray - # algorithms.isin will eventually convert values to an ndarray, so no extra - # cost to doing it here first - values_arr = np.asarray(values) - result = isin(self._data, values_arr) - + result = isin(self._data, values) if self._hasna: - values_have_NA = is_object_dtype(values_arr.dtype) and any( - val is self.dtype.na_value for val in values_arr - ) - - # For now, NA does not propagate so set result according to presence of NA, - # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion - result[self._mask] = values_have_NA - - mask = np.zeros_like(self, dtype=bool) + if libmissing.NA in values: + result += self._mask + else: + result *= np.invert(self._mask) + # error: No overload variant of "zeros_like" matches argument types + # "BaseMaskedArray", "Type[bool]" + mask = np.zeros_like(self, dtype=bool) # type: ignore[call-overload] return BooleanArray(result, mask, copy=False) def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 471ee295ebd2f..04db06ee9fb66 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -341,9 +341,7 @@ def freq(self) -> BaseOffset: def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: if dtype == "i8": return self.asi8 - # error: Non-overlapping equality check (left operand type: "Optional[Union[str, - # dtype[Any]]]", right operand type: "Type[bool]") - elif dtype == bool: # type: ignore[comparison-overlap] + elif dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b1cfcbd69a30b..7d3917203d7b6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -226,7 +226,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): Parameters ---------- - data : array-like or scalar + data : array-like A dense array of values to store in the SparseArray. This may contain `fill_value`. 
sparse_index : SparseIndex, optional @@ -1448,7 +1448,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) ) - result = getattr(ufunc, method)(*(np.asarray(x) for x in inputs), **kwargs) + result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs) if out: if len(out) == 1: out = out[0] @@ -1463,7 +1463,11 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return type(self)(result) def __abs__(self): - return np.abs(self) + # error: Argument 1 to "__call__" of "ufunc" has incompatible type + # "SparseArray"; expected "Union[Union[int, float, complex, str, bytes, + # generic], Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], _SupportsArray]" + return np.abs(self) # type: ignore[arg-type] # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index f399d3230d897..7ebda1f17ba56 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -58,7 +58,7 @@ def _get_label_to_i_dict(labels, sort_labels=False): return {k: i for i, k in enumerate(labels)} def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): - ilabels = list(zip(*(index._get_level_values(i) for i in subset))) + ilabels = list(zip(*[index._get_level_values(i) for i in subset])) labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) labels_to_i = Series(labels_to_i) if len(subset) > 1: diff --git a/pandas/core/base.py b/pandas/core/base.py index 4d380c6831071..ae7e1a1062cfb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -10,10 +10,8 @@ Any, Generic, Hashable, - Literal, TypeVar, cast, - final, ) import numpy as np @@ -21,11 +19,12 @@ import pandas._libs.lib as lib from pandas._typing import ( ArrayLike, + Dtype, DtypeObj, FrameOrSeries, IndexLabel, Shape, - npt, + final, ) from pandas.compat import PYPY from pandas.compat.numpy import function as nv @@ -65,6 +64,7 @@ import pandas.core.nanops as nanops if TYPE_CHECKING: + from typing import Literal from pandas import Categorical @@ -411,7 +411,7 @@ def array(self) -> ExtensionArray: def to_numpy( self, - dtype: npt.DTypeLike | None = None, + dtype: Dtype | None = None, copy: bool = False, na_value=lib.no_default, **kwargs, @@ -510,16 +510,8 @@ def to_numpy( """ if is_extension_array_dtype(self.dtype): # error: Too many arguments for "to_numpy" of "ExtensionArray" - - # error: Argument 1 to "to_numpy" of "ExtensionArray" has incompatible type - # "Optional[Union[dtype[Any], None, type, _SupportsDType[dtype[Any]], str, - # Union[Tuple[Any, int], Tuple[Any, Union[SupportsIndex, - # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]"; - # expected "Optional[Union[ExtensionDtype, Union[str, dtype[Any]], - # Type[str], Type[float], Type[int], Type[complex], Type[bool], - # Type[object]]]" return self.array.to_numpy( # type: ignore[call-arg] - dtype, copy=copy, na_value=na_value, **kwargs # type: ignore[arg-type] + dtype, copy=copy, na_value=na_value, **kwargs ) elif kwargs: bad_keys = list(kwargs.keys())[0] @@ -527,7 +519,12 @@ def to_numpy( f"to_numpy() got an unexpected keyword argument '{bad_keys}'" ) - result = np.asarray(self._values, dtype=dtype) + # error: Argument "dtype" to "asarray" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], 
Type[int], + # Type[complex], Type[bool], Type[object], None]"; expected "Union[dtype[Any], + # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, + # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + result = np.asarray(self._values, dtype=dtype) # type: ignore[arg-type] # TODO(GH-24345): Avoid potential double copy if copy or na_value is not lib.no_default: result = result.copy() @@ -1094,7 +1091,6 @@ def _memory_usage(self, deep: bool = False) -> int: are not components of the array if deep=False or if used on PyPy """ if hasattr(self.array, "memory_usage"): - # https://github.com/python/mypy/issues/1424 # error: "ExtensionArray" has no attribute "memory_usage" return self.array.memory_usage(deep=deep) # type: ignore[attr-defined] @@ -1137,13 +1133,13 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1): Parameters ---------- - value : array-like or scalar + value : array_like Values to insert into `self`. side : {{'left', 'right'}}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array-like, optional + sorter : 1-D array_like, optional Optional array of integer indices that sort `self` into ascending order. They are typically the result of ``np.argsort``. diff --git a/pandas/core/common.py b/pandas/core/common.py index b32614577393d..183607ebb489d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -21,7 +21,6 @@ Iterable, Iterator, cast, - overload, ) import warnings @@ -30,12 +29,11 @@ from pandas._libs import lib from pandas._typing import ( AnyArrayLike, - ArrayLike, NpDtype, - RandomState, Scalar, T, ) +from pandas.compat import np_version_under1p18 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -145,11 +143,7 @@ def is_bool_indexer(key: Any) -> bool: return True elif isinstance(key, list): # check if np.array(key).dtype would be bool - if len(key) > 0: - if type(key) is not list: - # GH#42461 cython will raise TypeError if we pass a subclass - key = list(key) - return lib.is_bool_list(key) + return len(key) > 0 and lib.is_bool_list(key) return False @@ -394,70 +388,44 @@ def standardize_mapping(into): return into -@overload -def random_state(state: np.random.Generator) -> np.random.Generator: - ... - - -@overload -def random_state( - state: int | ArrayLike | np.random.BitGenerator | np.random.RandomState | None, -) -> np.random.RandomState: - ... - - -def random_state(state: RandomState | None = None): +def random_state(state=None): """ Helper function for processing random_state arguments. Parameters ---------- - state : int, array-like, BitGenerator, Generator, np.random.RandomState, None. + state : int, array-like, BitGenerator (NumPy>=1.17), np.random.RandomState, None. If receives an int, array-like, or BitGenerator, passes to np.random.RandomState() as seed. - If receives an np.random RandomState or Generator, just returns that unchanged. + If receives an np.random.RandomState object, just returns object. If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. .. versionchanged:: 1.1.0 - array-like and BitGenerator object now passed to np.random.RandomState() - as seed + array-like and BitGenerator (for NumPy>=1.18) object now passed to + np.random.RandomState() as seed Default None. 
Returns ------- - np.random.RandomState or np.random.Generator. If state is None, returns np.random + np.random.RandomState """ if ( is_integer(state) or is_array_like(state) - or isinstance(state, np.random.BitGenerator) + or (not np_version_under1p18 and isinstance(state, np.random.BitGenerator)) ): - # error: Argument 1 to "RandomState" has incompatible type "Optional[Union[int, - # Union[ExtensionArray, ndarray[Any, Any]], Generator, RandomState]]"; expected - # "Union[None, Union[Union[_SupportsArray[dtype[Union[bool_, integer[Any]]]], - # Sequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]], - # Sequence[Sequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[Union[bool_, - # integer[Any]]]]]]], - # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[Union[bool_, - # integer[Any]]]]]]]]], Union[bool, int, Sequence[Union[bool, int]], - # Sequence[Sequence[Union[bool, int]]], Sequence[Sequence[Sequence[Union[bool, - # int]]]], Sequence[Sequence[Sequence[Sequence[Union[bool, int]]]]]]], - # BitGenerator]" - return np.random.RandomState(state) # type: ignore[arg-type] + return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state - elif isinstance(state, np.random.Generator): - return state elif state is None: return np.random else: raise ValueError( - "random_state must be an integer, array-like, a BitGenerator, Generator, " + "random_state must be an integer, array-like, a BitGenerator, " "a numpy RandomState, or None" ) diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index ec3548c9efc6c..62732402dbeea 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -37,7 +37,7 @@ def _check_ne_builtin_clash(expr: Expr) -> None: overlap = names & _ne_builtins if overlap: - s = ", ".join([repr(x) for x in overlap]) + s = ", ".join(repr(x) for x in overlap) raise NumExprClobberingError( f'Variables in expression "{expr}" overlap with builtins: ({s})' ) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 5e000116d19f2..b0f817d2c1ff3 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -57,7 +57,7 @@ def create_valid_python_identifier(name: str) -> str: } ) - name = "".join([special_characters_replacements.get(char, char) for char in name]) + name = "".join(special_characters_replacements.get(char, char) for char in name) name = "BACKTICK_QUOTED_STRING_" + name if not name.isidentifier(): diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index ad76a76a954b1..f733a5c43dfb3 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -226,7 +226,11 @@ def stringify(value): if v not in metadata: result = -1 else: - result = metadata.searchsorted(v, side="left") + # error: Incompatible types in assignment (expression has type + # "Union[Any, ndarray]", variable has type "int") + result = metadata.searchsorted( # type: ignore[assignment] + v, side="left" + ) return TermValue(result, result, "integer") elif kind == "integer": v = int(float(v)) @@ -575,7 +579,7 @@ def __init__( else: w = _validate_where(w) where[idx] = w - _where = " & ".join([f"({w})" for w in com.flatten(where)]) + _where = " & ".join(f"({w})" for w in com.flatten(where)) else: # _validate_where ensures we otherwise have a string _where = where diff --git a/pandas/core/computation/scope.py 
b/pandas/core/computation/scope.py index 426cd8fd81f28..09067e7eba6e5 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -50,7 +50,7 @@ def _raw_hex_id(obj) -> str: """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns packed = struct.pack("@P", id(obj)) - return "".join([_replacer(x) for x in packed]) + return "".join(_replacer(x) for x in packed) DEFAULT_GLOBALS = { diff --git a/pandas/core/describe.py b/pandas/core/describe.py index fd45da4a3ccc7..dfb18b2c40698 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -11,9 +11,7 @@ ) from typing import ( TYPE_CHECKING, - Any, Callable, - Hashable, Sequence, cast, ) @@ -22,7 +20,11 @@ import numpy as np from pandas._libs.tslibs import Timestamp -from pandas._typing import FrameOrSeries +from pandas._typing import ( + FrameOrSeries, + FrameOrSeriesUnion, + Hashable, +) from pandas.util._validators import validate_percentile from pandas.core.dtypes.common import ( @@ -49,7 +51,7 @@ def describe_ndframe( include: str | Sequence[str] | None, exclude: str | Sequence[str] | None, datetime_is_numeric: bool, - percentiles: Sequence[float] | np.ndarray | None, + percentiles: Sequence[float] | None, ) -> FrameOrSeries: """Describe series or dataframe. @@ -105,12 +107,12 @@ class NDFrameDescriberAbstract(ABC): Whether to treat datetime dtypes as numeric. """ - def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool): + def __init__(self, obj: FrameOrSeriesUnion, datetime_is_numeric: bool): self.obj = obj self.datetime_is_numeric = datetime_is_numeric @abstractmethod - def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series: + def describe(self, percentiles: Sequence[float]) -> FrameOrSeriesUnion: """Do describe either series or dataframe. Parameters @@ -125,7 +127,7 @@ class SeriesDescriber(NDFrameDescriberAbstract): obj: Series - def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series: + def describe(self, percentiles: Sequence[float]) -> Series: describe_func = select_describe_func( self.obj, self.datetime_is_numeric, @@ -164,7 +166,7 @@ def __init__( super().__init__(obj, datetime_is_numeric=datetime_is_numeric) - def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame: + def describe(self, percentiles: Sequence[float]) -> DataFrame: data = self._select_data() ldesc: list[Series] = [] @@ -386,11 +388,8 @@ def select_describe_func( return describe_categorical_1d -def refine_percentiles( - percentiles: Sequence[float] | np.ndarray | None, -) -> np.ndarray[Any, np.dtype[np.float64]]: - """ - Ensure that percentiles are unique and sorted. +def refine_percentiles(percentiles: Sequence[float] | None) -> Sequence[float]: + """Ensure that percentiles are unique and sorted. Parameters ---------- @@ -398,7 +397,9 @@ def refine_percentiles( The percentiles to include in the output. 
""" if percentiles is None: - return np.array([0.25, 0.5, 0.75]) + # error: Incompatible return value type (got "ndarray", expected + # "Sequence[float]") + return np.array([0.25, 0.5, 0.75]) # type: ignore[return-value] # explicit conversion of `percentiles` to list percentiles = list(percentiles) @@ -410,7 +411,9 @@ def refine_percentiles( if 0.5 not in percentiles: percentiles.append(0.5) - percentiles = np.asarray(percentiles) + # error: Incompatible types in assignment (expression has type "ndarray", variable + # has type "Optional[Sequence[float]]") + percentiles = np.asarray(percentiles) # type: ignore[assignment] # sort and check for duplicates unique_pcts = np.unique(percentiles) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 5b7dadac5d914..e52b318c0b4f7 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,7 +12,6 @@ import numpy as np -from pandas._libs.hashtable import object_hash from pandas._typing import ( DtypeObj, type_t, @@ -129,9 +128,7 @@ def __eq__(self, other: Any) -> bool: return False def __hash__(self) -> int: - # for python>=3.10, different nan objects have different hashes - # we need to avoid that und thus use hash function with old behavior - return object_hash(tuple(getattr(self, attr) for attr in self._metadata)) + return hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other: Any) -> bool: return not self.__eq__(other) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4f4276ceddcf9..433d45d94167d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -15,7 +15,6 @@ TYPE_CHECKING, Any, Sized, - TypeVar, cast, overload, ) @@ -58,6 +57,7 @@ is_complex_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, is_dtype_equal, is_extension_array_dtype, is_float, @@ -92,6 +92,7 @@ ) if TYPE_CHECKING: + from typing import Literal from pandas.core.arrays import ( DatetimeArray, @@ -106,8 +107,6 @@ _int32_max = np.iinfo(np.int32).max _int64_max = np.iinfo(np.int64).max -NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray) - def maybe_convert_platform( values: list | tuple | range | np.ndarray | ExtensionArray, @@ -180,7 +179,9 @@ def maybe_box_native(value: Scalar) -> Scalar: ------- scalar or Series """ - if is_float(value): + if is_datetime_or_timedelta_dtype(value): + value = maybe_box_datetimelike(value) + elif is_float(value): # error: Argument 1 to "float" has incompatible type # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]"; # expected "Union[SupportsFloat, _SupportsIndex, str]" @@ -192,8 +193,6 @@ def maybe_box_native(value: Scalar) -> Scalar: value = int(value) # type: ignore[arg-type] elif is_bool(value): value = bool(value) - elif isinstance(value, (np.datetime64, np.timedelta64)): - value = maybe_box_datetimelike(value) return value @@ -660,10 +659,7 @@ def _ensure_dtype_type(value, dtype: np.dtype): object """ # Start with exceptions in which we do _not_ cast to numpy types - - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[object_]") - if dtype == np.object_: # type: ignore[comparison-overlap] + if dtype == np.object_: return value # Note: before we get here we have already excluded isna(value) @@ -779,21 +775,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val -def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: - """ - Convert datetimelike-keyed 
dicts to a Timestamp-keyed dict. - - Parameters - ---------- - d: dict-like object - - Returns - ------- - dict - """ - return {maybe_box_datetimelike(key): value for key, value in d.items()} - - def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: @@ -885,10 +866,10 @@ def maybe_infer_dtype_type(element): def maybe_upcast( - values: NumpyArrayT, + values: np.ndarray, fill_value: Scalar = np.nan, copy: bool = False, -) -> tuple[NumpyArrayT, Scalar]: +) -> tuple[np.ndarray, Scalar]: """ Provide explicit type promotion and coercion. @@ -1092,11 +1073,14 @@ def astype_nansafe( The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ if arr.ndim > 1: - flat = arr.ravel() + # Make sure we are doing non-copy ravel and reshape. + flags = arr.flags + flat = arr.ravel("K") result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) + order: Literal["C", "F"] = "F" if flags.f_contiguous else "C" # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no # attribute "reshape" - return result.reshape(arr.shape) # type: ignore[union-attr] + return result.reshape(arr.shape, order=order) # type: ignore[union-attr] # We get here with 0-dim from sparse arr = np.atleast_1d(arr) @@ -1109,10 +1093,7 @@ def astype_nansafe( raise ValueError("dtype must be np.dtype or ExtensionDtype") if arr.dtype.kind in ["m", "M"] and ( - issubclass(dtype.type, str) - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[object]") - or dtype == object # type: ignore[comparison-overlap] + issubclass(dtype.type, str) or dtype == object ): from pandas.core.construction import ensure_wrapped_if_datetimelike @@ -1123,9 +1104,7 @@ def astype_nansafe( return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) elif is_datetime64_dtype(arr): - # Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - if dtype == np.int64: # type: ignore[comparison-overlap] + if dtype == np.int64: warnings.warn( f"casting {arr.dtype} values to int64 with .astype(...) " "is deprecated and will raise in a future version. " @@ -1145,9 +1124,7 @@ def astype_nansafe( raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - if dtype == np.int64: # type: ignore[comparison-overlap] + if dtype == np.int64: warnings.warn( f"casting {arr.dtype} values to int64 with .astype(...) " "is deprecated and will raise in a future version. 
" @@ -1421,9 +1398,10 @@ def convert_dtypes( if is_string_dtype(inferred_dtype): if not convert_string: - return input_array.dtype + inferred_dtype = input_array.dtype else: - return pandas_dtype("string") + inferred_dtype = pandas_dtype("string") + return inferred_dtype if convert_integer: target_int_dtype = pandas_dtype("Int64") @@ -1476,9 +1454,7 @@ def convert_dtypes( else: return input_array.dtype - # error: Incompatible return value type (got "Union[str, Union[dtype[Any], - # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") - return inferred_dtype # type: ignore[return-value] + return inferred_dtype def maybe_infer_to_datetimelike( @@ -1855,9 +1831,7 @@ def construct_2d_arraylike_from_scalar( if dtype.kind in ["m", "M"]: value = maybe_unbox_datetimelike_tz_deprecation(value, dtype, stacklevel=4) - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[object]") - elif dtype == object: # type: ignore[comparison-overlap] + elif dtype == object: if isinstance(value, (np.timedelta64, np.datetime64)): # calling np.array below would cast to pytimedelta/pydatetime out = np.empty(shape, dtype=object) @@ -2232,9 +2206,7 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: return tipo.kind == "b" return lib.is_bool(element) - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[object]") - elif dtype == object: # type: ignore[comparison-overlap] + elif dtype == object: return True elif dtype.kind == "S": diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 08287cc296006..34b9a3f1f14ad 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -20,6 +20,7 @@ from pandas._typing import ( ArrayLike, DtypeObj, + Optional, ) from pandas.core.dtypes.base import _registry as registry @@ -162,7 +163,7 @@ def is_object_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -316,7 +317,7 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -349,7 +350,7 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -390,7 +391,7 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -424,7 +425,7 @@ def is_period_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -460,7 +461,7 @@ def is_interval_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -498,7 +499,7 @@ def is_categorical_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype to check. Returns @@ -534,7 +535,7 @@ def is_string_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. 
Returns @@ -635,7 +636,7 @@ def is_any_int_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -680,7 +681,7 @@ def is_integer_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -732,7 +733,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -784,7 +785,7 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -828,7 +829,7 @@ def is_int64_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -878,7 +879,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -920,7 +921,7 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -970,7 +971,7 @@ def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -999,7 +1000,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -1039,7 +1040,7 @@ def is_numeric_v_string_like(a: ArrayLike, b): Parameters ---------- - a : array-like, scalar + a : array-like The first object to check. b : array-like, scalar The second object to check. @@ -1146,7 +1147,7 @@ def needs_i8_conversion(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -1190,7 +1191,7 @@ def is_numeric_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -1234,7 +1235,7 @@ def is_float_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -1266,7 +1267,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -1337,7 +1338,7 @@ def is_extension_type(arr) -> bool: Parameters ---------- - arr : array-like, scalar + arr : array-like The array-like to check. Returns @@ -1405,7 +1406,7 @@ def is_1d_only_ea_obj(obj: Any) -> bool: ) -def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool: +def is_1d_only_ea_dtype(dtype: Optional[DtypeObj]) -> bool: """ Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. """ @@ -1470,7 +1471,7 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: return registry.find(dtype) is not None -def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: +def is_ea_or_datetimelike_dtype(dtype: Optional[DtypeObj]) -> bool: """ Check for ExtensionDtype, datetime64 dtype, or timedelta64 dtype. 
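The typing hunks in pandas/core/dtypes/common.py above only swap the spelling of the annotations: `Optional[DtypeObj]` and `DtypeObj | None` mean the same thing to a type checker, and with `from __future__ import annotations` the PEP 604 form is accepted even on interpreters that cannot evaluate it at runtime. A minimal illustrative sketch, not part of the patch (the function names here are hypothetical):

```python
from __future__ import annotations  # annotations become strings, so "X | None" parses on 3.7+

from typing import Optional

import numpy as np


def describe_dtype(dtype: Optional[np.dtype]) -> str:
    # typing.Optional spelling, usable on any supported Python version
    return "no dtype" if dtype is None else dtype.name


def describe_dtype_pep604(dtype: np.dtype | None) -> str:
    # PEP 604 spelling; identical meaning for a type checker, never evaluated at runtime here
    return "no dtype" if dtype is None else dtype.name


print(describe_dtype(np.dtype("int64")))   # -> int64
print(describe_dtype_pep604(None))         # -> no dtype
```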
@@ -1489,7 +1490,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array or dtype to check. Returns @@ -1546,7 +1547,7 @@ def get_dtype(arr_or_dtype) -> DtypeObj: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype object whose dtype we want to extract. Returns @@ -1580,7 +1581,7 @@ def _is_dtype_type(arr_or_dtype, condition) -> bool: Parameters ---------- - arr_or_dtype : array-like or dtype + arr_or_dtype : array-like The array-like or dtype object whose dtype we want to extract. condition : callable[Union[np.dtype, ExtensionDtypeType]] @@ -1714,7 +1715,7 @@ def _validate_date_like_dtype(dtype) -> None: ) -def validate_all_hashable(*args, error_name: str | None = None) -> None: +def validate_all_hashable(*args, error_name: Optional[str] = None) -> None: """ Return None if all args are hashable, else raise a TypeError. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d54a3047a3ab9..91b9bdd564676 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -27,7 +27,6 @@ Hashable, Iterable, Iterator, - Literal, Sequence, cast, overload, @@ -59,19 +58,18 @@ FillnaOptions, FloatFormatType, FormattersType, + FrameOrSeriesUnion, Frequency, IndexKeyFunc, IndexLabel, Level, + NpDtype, PythonFuncType, Renamer, Scalar, StorageOptions, Suffixes, - TimedeltaConvertibleTypes, - TimestampConvertibleTypes, ValueKeyFunc, - npt, ) from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -210,6 +208,12 @@ import pandas.plotting if TYPE_CHECKING: + from typing import Literal + + from pandas._typing import ( + TimedeltaConvertibleTypes, + TimestampConvertibleTypes, + ) from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.resample import Resampler @@ -643,7 +647,7 @@ def __init__( elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: # i.e. numpy structured array - data = cast(np.ndarray, data) + mgr = rec_array_to_mgr( data, index, @@ -1358,7 +1362,7 @@ def dot(self, other: Series) -> Series: def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ... - def dot(self, other: AnyArrayLike | DataFrame | Series) -> DataFrame | Series: + def dot(self, other: AnyArrayLike | FrameOrSeriesUnion) -> FrameOrSeriesUnion: """ Compute the matrix multiplication between the DataFrame and other. @@ -1474,13 +1478,13 @@ def __matmul__(self, other: Series) -> Series: @overload def __matmul__( - self, other: AnyArrayLike | DataFrame | Series - ) -> DataFrame | Series: + self, other: AnyArrayLike | FrameOrSeriesUnion + ) -> FrameOrSeriesUnion: ... def __matmul__( - self, other: AnyArrayLike | DataFrame | Series - ) -> DataFrame | Series: + self, other: AnyArrayLike | FrameOrSeriesUnion + ) -> FrameOrSeriesUnion: """ Matrix multiplication using binary `@` operator in Python>=3.5. 
""" @@ -1589,7 +1593,7 @@ def from_dict( def to_numpy( self, - dtype: npt.DTypeLike | None = None, + dtype: NpDtype | None = None, copy: bool = False, na_value=lib.no_default, ) -> np.ndarray: @@ -2275,7 +2279,9 @@ def to_records( if dtype_mapping is None: formats.append(v.dtype) elif isinstance(dtype_mapping, (type, np.dtype, str)): - formats.append(dtype_mapping) + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[type, dtype, str]"; expected "dtype" + formats.append(dtype_mapping) # type: ignore[arg-type] else: element = "row" if i < index_len else "column" msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" @@ -2537,7 +2543,8 @@ def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: | 0 | elk | dog | +----+------------+------------+ | 1 | pig | quetzal | - +----+------------+------------+""", + +----+------------+------------+ + """, ) def to_markdown( self, @@ -3338,8 +3345,8 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: values = self.values new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] - result = type(self)._from_arrays( - new_values, index=self.columns, columns=self.index + result = self._constructor( + dict(zip(self.index, new_values)), index=self.columns ) else: @@ -3452,7 +3459,7 @@ def __getitem__(self, key): else: if is_iterator(key): key = list(key) - indexer = self.columns._get_indexer_strict(key, "columns")[1] + indexer = self.loc._get_listlike_indexer(key, axis=1)[1] # take() does not accept boolean indexers if getattr(indexer, "dtype", None) == bool: @@ -3546,11 +3553,6 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: Returns ------- scalar - - Notes - ----- - Assumes that both `self.index._index_as_unique` and - `self.columns._index_as_unique`; Caller is responsible for checking. 
""" if takeable: series = self._ixs(col, axis=1) @@ -3559,17 +3561,20 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: series = self._get_item_cache(col) engine = self.index._engine - if not isinstance(self.index, MultiIndex): - # CategoricalIndex: Trying to use the engine fastpath may give incorrect - # results if our categories are integers that dont match our codes - # IntervalIndex: IntervalTree has no get_loc - row = self.index.get_loc(index) - return series._values[row] + try: + loc = engine.get_loc(index) + return series._values[loc] + except KeyError: + # GH 20629 + if self.index.nlevels > 1: + # partial indexing forbidden + raise - # For MultiIndex going through engine effectively restricts us to - # same-length tuples; see test_get_set_value_no_partial_indexing - loc = engine.get_loc(index) - return series._values[loc] + # we cannot handle direct indexing + # use positional + col = self.columns.get_loc(col) + index = self.index.get_loc(index) + return self._get_value(index, col, takeable=True) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) @@ -3804,7 +3809,8 @@ def _set_value( return series = self._get_item_cache(col) - loc = self.index.get_loc(index) + engine = self.index._engine + loc = engine.get_loc(index) validate_numeric_casting(series.dtype, value) series._values[loc] = value @@ -5274,28 +5280,45 @@ def shift( axis = self._get_axis_number(axis) ncols = len(self.columns) - if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: - # We will infer fill_value to match the closest column - # Use a column that we know is valid for our column's dtype GH#38434 - label = self.columns[0] + if ( + axis == 1 + and periods != 0 + and ncols > 0 + and (fill_value is lib.no_default or len(self._mgr.arrays) > 1) + ): + # Exclude single-array-with-fill_value case so we issue a FutureWarning + # if an integer is passed with datetimelike dtype GH#31971 + from pandas import concat + + # tail: the data that is still in our shifted DataFrame + if periods > 0: + tail = self.iloc[:, :-periods] + else: + tail = self.iloc[:, -periods:] + # pin a simple Index to avoid costly casting + tail.columns = range(len(tail.columns)) + + if fill_value is not lib.no_default: + # GH#35488 + # TODO(EA2D): with 2D EAs we could construct other directly + ser = Series(fill_value, index=self.index) + else: + # We infer fill_value to match the closest column + if periods > 0: + ser = self.iloc[:, 0].shift(len(self)) + else: + ser = self.iloc[:, -1].shift(len(self)) + + width = min(abs(periods), ncols) + other = concat([ser] * width, axis=1) if periods > 0: - result = self.iloc[:, :-periods] - for col in range(min(ncols, abs(periods))): - # TODO(EA2D): doing this in a loop unnecessary with 2D EAs - # Define filler inside loop so we get a copy - filler = self.iloc[:, 0].shift(len(self)) - result.insert(0, label, filler, allow_duplicates=True) + result = concat([other, tail], axis=1) else: - result = self.iloc[:, -periods:] - for col in range(min(ncols, abs(periods))): - # Define filler inside loop so we get a copy - filler = self.iloc[:, -1].shift(len(self)) - result.insert( - len(result.columns), label, filler, allow_duplicates=True - ) + result = concat([tail, other], axis=1) + result = cast(DataFrame, result) result.columns = self.columns.copy() return result @@ -6158,10 +6181,7 @@ def f(vals) -> tuple[np.ndarray, int]: return labels.astype("i8", copy=False), len(shape) if subset is None: - # 
https://github.com/pandas-dev/pandas/issues/28770 - # Incompatible types in assignment (expression has type "Index", variable - # has type "Sequence[Any]") - subset = self.columns # type: ignore[assignment] + subset = self.columns elif ( not np.iterable(subset) or isinstance(subset, str) @@ -6171,7 +6191,7 @@ def f(vals) -> tuple[np.ndarray, int]: subset = (subset,) # needed for mypy since can't narrow types using np.iterable - subset = cast(Sequence, subset) + subset = cast(Iterable, subset) # Verify all columns in subset exist in the queried dataframe # Otherwise, raise a KeyError, same as if you try to __getitem__ with a @@ -6238,7 +6258,6 @@ def sort_values( # type: ignore[override] keys, orders=ascending, na_position=na_position, key=key ) elif len(by): - # len(by) == 1 by = by[0] k = self._get_label_or_level_values(by, axis=axis) @@ -6720,16 +6739,23 @@ def nsmallest(self, n, columns, keep: str = "first") -> DataFrame: self, n=n, keep=keep, columns=columns ).nsmallest() - @doc( - Series.swaplevel, - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """axis : {0 or 'index', 1 or 'columns'}, default 0 + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + """ + Swap levels i and j in a MultiIndex on a particular axis. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to swap levels on. 0 or 'index' for row-wise, 1 or - 'columns' for column-wise.""" - ), - examples=dedent( - """Examples + 'columns' for column-wise. + + Returns + ------- + DataFrame + + Examples -------- >>> df = pd.DataFrame( ... {"Grade": ["A", "B", "A", "C"]}, @@ -6778,10 +6804,8 @@ def nsmallest(self, n, columns, keep: str = "first") -> DataFrame: History Final exam January A Geography Final exam February B History Coursework March A - Geography Coursework April C""" - ), - ) - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + Geography Coursework April C + """ result = self.copy() axis = self._get_axis_number(axis) @@ -7611,7 +7635,6 @@ def groupby( raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - # https://github.com/python/mypy/issues/7642 # error: Argument "squeeze" to "DataFrameGroupBy" has incompatible type # "Union[bool, NoDefault]"; expected "bool" return DataFrameGroupBy( @@ -8121,11 +8144,7 @@ def stack(self, level: Level = -1, dropna: bool = True): return result.__finalize__(self, method="stack") - def explode( - self, - column: str | tuple | list[str | tuple], - ignore_index: bool = False, - ) -> DataFrame: + def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: """ Transform each element of a list-like to a row, replicating index values. @@ -8133,15 +8152,8 @@ def explode( Parameters ---------- - column : str or tuple or list thereof - Column(s) to explode. - For multiple columns, specify a non-empty list with each element - be str or tuple, and all specified columns their list-like data - on same row of the frame must have matching length. - - .. versionadded:: 1.3.0 - Multi-column explode - + column : str or tuple + Column to explode. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. @@ -8156,10 +8168,7 @@ def explode( Raises ------ ValueError : - * If columns of the frame are not unique. - * If specified columns to explode is empty list. 
- * If specified columns to explode have not matching count of - elements rowwise in the frame. + if columns of the frame are not unique. See Also -------- @@ -8178,69 +8187,32 @@ def explode( Examples -------- - >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], - ... 'B': 1, - ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) >>> df - A B C - 0 [0, 1, 2] 1 [a, b, c] - 1 foo 1 NaN - 2 [] 1 [] - 3 [3, 4] 1 [d, e] - - Single-column explode. + A B + 0 [1, 2, 3] 1 + 1 foo 1 + 2 [] 1 + 3 [3, 4] 1 >>> df.explode('A') - A B C - 0 0 1 [a, b, c] - 0 1 1 [a, b, c] - 0 2 1 [a, b, c] - 1 foo 1 NaN - 2 NaN 1 [] - 3 3 1 [d, e] - 3 4 1 [d, e] - - Multi-column explode. - - >>> df.explode(list('AC')) - A B C - 0 0 1 a - 0 1 1 b - 0 2 1 c - 1 foo 1 NaN - 2 NaN 1 NaN - 3 3 1 d - 3 4 1 e - """ + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ + if not (is_scalar(column) or isinstance(column, tuple)): + raise ValueError("column must be a scalar") if not self.columns.is_unique: raise ValueError("columns must be unique") - columns: list[str | tuple] - if is_scalar(column) or isinstance(column, tuple): - assert isinstance(column, (str, tuple)) - columns = [column] - elif isinstance(column, list) and all( - map(lambda c: is_scalar(c) or isinstance(c, tuple), column) - ): - if not column: - raise ValueError("column must be nonempty") - if len(column) > len(set(column)): - raise ValueError("column must be unique") - columns = column - else: - raise ValueError("column must be a scalar, tuple, or list thereof") - df = self.reset_index(drop=True) - if len(columns) == 1: - result = df[columns[0]].explode() - else: - mylen = lambda x: len(x) if is_list_like(x) else -1 - counts0 = self[columns[0]].apply(mylen) - for c in columns[1:]: - if not all(counts0 == self[c].apply(mylen)): - raise ValueError("columns must have matching element counts") - result = DataFrame({c: df[c].explode() for c in columns}) - result = df.drop(columns, axis=1).join(result) + result = df[column].explode() + result = df.drop([column], axis=1).join(result) if ignore_index: result.index = ibase.default_index(len(result)) else: @@ -8429,8 +8401,8 @@ def _gotitem( self, key: IndexLabel, ndim: int, - subset: DataFrame | Series | None = None, - ) -> DataFrame | Series: + subset: FrameOrSeriesUnion | None = None, + ) -> FrameOrSeriesUnion: """ Sub-classes to define. Return a sliced object. 
@@ -8959,7 +8931,7 @@ def append( def join( self, - other: DataFrame | Series, + other: FrameOrSeriesUnion, on: IndexLabel | None = None, how: str = "left", lsuffix: str = "", @@ -9089,7 +9061,7 @@ def join( def _join_compat( self, - other: DataFrame | Series, + other: FrameOrSeriesUnion, on: IndexLabel | None = None, how: str = "left", lsuffix: str = "", @@ -9159,7 +9131,7 @@ def _join_compat( @Appender(_merge_doc, indents=2) def merge( self, - right: DataFrame | Series, + right: FrameOrSeriesUnion, how: str = "inner", on: IndexLabel | None = None, left_on: IndexLabel | None = None, @@ -9788,12 +9760,8 @@ def _reduce( FutureWarning, stacklevel=5, ) - # Non-copy equivalent to - # cols = self.columns[~dtype_is_dt] - # self = self[cols] - predicate = lambda x: not is_datetime64_any_dtype(x.dtype) - mgr = self._mgr._get_data_subset(predicate) - self = type(self)(mgr) + cols = self.columns[~dtype_is_dt] + self = self[cols] # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) @@ -10756,7 +10724,7 @@ def _from_nested_dict(data) -> collections.defaultdict: return new_data -def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: +def _reindex_for_setitem(value: FrameOrSeriesUnion, index: Index) -> ArrayLike: # reindex if necessary if value.index.equals(index) or not len(index): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c63aeb736d16a..99e4888d08be6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -14,11 +14,9 @@ AnyStr, Callable, Hashable, - Literal, Mapping, Sequence, cast, - final, overload, ) import warnings @@ -48,14 +46,14 @@ JSONSerializable, Level, Manager, - RandomState, + NpDtype, Renamer, StorageOptions, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, - npt, + final, ) from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -138,7 +136,6 @@ from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat -import pandas.core.sample as sample from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import get_indexer_indexer from pandas.core.window import ( @@ -156,6 +153,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from typing import Literal from pandas._libs.tslibs import BaseOffset @@ -1152,7 +1150,7 @@ def rename( ] raise KeyError(f"{missing_labels} not found in axis") - new_index = ax._transform_index(f, level=level) + new_index = ax._transform_index(f, level) result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) result._clear_item_cache() @@ -1875,10 +1873,11 @@ def _drop_labels_or_levels(self, keys, axis: int = 0): # ---------------------------------------------------------------------- # Iteration - # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 - # Incompatible types in assignment (expression has type "None", base class - # "object" defined the type as "Callable[[object], int]") - __hash__: None # type: ignore[assignment] + def __hash__(self) -> int: + raise TypeError( + f"{repr(type(self).__name__)} objects are mutable, " + f"thus they cannot be hashed" + ) def __iter__(self): """ @@ -1988,7 +1987,7 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__ = 1000 - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__(self, dtype: 
NpDtype | None = None) -> np.ndarray: return np.asarray(self._values, dtype=dtype) def __array_wrap__( @@ -3765,12 +3764,12 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): - loc, new_index = index._get_loc_level(key, level=0) - if not drop_level: - if lib.is_integer(loc): - new_index = index[loc : loc + 1] - else: - new_index = index[loc] + try: + loc, new_index = index._get_loc_level( + key, level=0, drop_level=drop_level + ) + except TypeError as e: + raise TypeError(f"Expected label or tuple of labels, got {key}") from e else: loc = index.get_loc(key) @@ -5143,13 +5142,12 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: @final def sample( self: FrameOrSeries, - n: int | None = None, - frac: float | None = None, - replace: bool_t = False, + n=None, + frac=None, + replace=False, weights=None, - random_state: RandomState | None = None, - axis: Axis | None = None, - ignore_index: bool_t = False, + random_state=None, + axis=None, ) -> FrameOrSeries: """ Return a random sample of items from an axis of object. @@ -5178,27 +5176,19 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. - random_state : int, array-like, BitGenerator, np.random.RandomState, - np.random.Generator, optional. If int, array-like, or BitGenerator, seed for - random number generator. If np.random.RandomState or np.random.Generator, - use as given. + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. .. versionchanged:: 1.1.0 - array-like and BitGenerator object now passed to np.random.RandomState() - as seed - - .. versionchanged:: 1.4.0 - - np.random.Generator objects now accepted + array-like and BitGenerator (for NumPy>=1.17) object now passed to + np.random.RandomState() as seed axis : {0 or ‘index’, 1 or ‘columns’, None}, default None Axis to sample. Accepts axis number or name. Default is stat axis for given data type (0 for Series and DataFrames). - ignore_index : bool, default False - If True, the resulting index will be labeled 0, 1, …, n - 1. - - .. 
versionadded:: 1.3.0 Returns ------- @@ -5275,26 +5265,92 @@ def sample( axis = self._stat_axis_number axis = self._get_axis_number(axis) - obj_len = self.shape[axis] + axis_length = self.shape[axis] # Process random_state argument rs = com.random_state(random_state) - size = sample.process_sampling_size(n, frac, replace) - if size is None: - assert frac is not None - size = round(frac * obj_len) - + # Check weights for compliance if weights is not None: - weights = sample.preprocess_weights(self, weights, axis) - sampled_indices = sample.sample(obj_len, size, replace, weights, rs) - result = self.take(sampled_indices, axis=axis) + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(self.axes[axis]) - if ignore_index: - result.index = ibase.default_index(len(result)) + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(self, ABCDataFrame): + if axis == 0: + try: + weights = self[weights] + except KeyError as err: + raise KeyError( + "String passed to weights not a valid column" + ) from err + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights " + "when sampling from a Series." + ) - return result + if isinstance(self, ABCSeries): + func = self._constructor + else: + func = self._constructor_sliced + weights = func(weights, dtype="float64") + + if len(weights) != axis_length: + raise ValueError( + "Weights and axis to be sampled must be of same length" + ) + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") + + weights = weights._values + + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + elif frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + n = round(frac * axis_length) + elif frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + + # Check for negative sizes + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide positive value." + ) + + locs = rs.choice(axis_length, size=n, replace=replace, p=weights) + return self.take(locs, axis=axis) @final @doc(klass=_shared_doc_kwargs["klass"]) @@ -7229,11 +7285,11 @@ def clip( Parameters ---------- - lower : float or array-like, default None + lower : float or array_like, default None Minimum threshold value. All values below this threshold will be set to it. A missing threshold (e.g `NA`) will not clip the value. - upper : float or array-like, default None + upper : float or array_like, default None Maximum threshold value. All values above this threshold will be set to it. A missing threshold (e.g `NA`) will not clip the value. @@ -7833,8 +7889,8 @@ def resample( Pass a custom function via ``apply`` - >>> def custom_resampler(arraylike): - ... 
return np.sum(arraylike) + 5 + >>> def custom_resampler(array_like): + ... return np.sum(array_like) + 5 ... >>> series.resample('3T').apply(custom_resampler) 2000-01-01 00:00:00 8 @@ -9382,7 +9438,7 @@ def truncate( if before is not None and after is not None and before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: + if len(ax) > 1 and ax.is_monotonic_decreasing: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN @@ -9695,9 +9751,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries: 2 6 30 -30 3 7 40 -50 """ - # error: Incompatible return value type (got "ndarray[Any, dtype[Any]]", - # expected "FrameOrSeries") - return np.abs(self) # type: ignore[return-value] + # error: Argument 1 to "__call__" of "ufunc" has incompatible type + # "FrameOrSeries"; expected "Union[Union[int, float, complex, str, bytes, + # generic], Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], _SupportsArray]" + return np.abs(self) # type: ignore[arg-type] @final def describe( @@ -10499,7 +10557,6 @@ def mad(self, axis=None, skipna=None, level=None): name1=name1, name2=name2, axis_descr=axis_descr, - notes="", ) def sem( self, @@ -10521,7 +10578,6 @@ def sem( name1=name1, name2=name2, axis_descr=axis_descr, - notes="", ) def var( self, @@ -10544,7 +10600,6 @@ def var( name1=name1, name2=name2, axis_descr=axis_descr, - notes=_std_notes, ) def std( self, @@ -10838,12 +10893,11 @@ def ewm( span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, - min_periods: int | None = 0, + min_periods: int = 0, adjust: bool_t = True, ignore_na: bool_t = False, axis: Axis = 0, times: str | np.ndarray | FrameOrSeries | None = None, - method: str = "single", ) -> ExponentialMovingWindow: axis = self._get_axis_number(axis) # error: Value of type variable "FrameOrSeries" of "ExponentialMovingWindow" @@ -10859,7 +10913,6 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, - method=method, ) # ---------------------------------------------------------------------- @@ -10983,7 +11036,7 @@ def last_valid_index(self) -> Hashable | None: def _doc_params(cls): """Return a tuple of the doc params.""" axis_descr = ( - f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}" + f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}" ) name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar" name2 = cls.__name__ @@ -11037,16 +11090,12 @@ def _doc_params(cls): Returns ------- -{name1} or {name2} (if level specified) \ -{notes} -""" - -_std_notes = """ +{name1} or {name2} (if level specified) Notes ----- To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the -default `ddof=1`)""" +default `ddof=1`)\n""" _bool_doc = """ {desc} diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 88d1baae86467..69f992f840c7c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -33,6 +33,7 @@ from pandas._typing import ( ArrayLike, FrameOrSeries, + FrameOrSeriesUnion, Manager2D, ) from pandas.util._decorators import ( @@ -295,7 +296,7 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: arg = zip(columns, arg) - results: dict[base.OutputKey, DataFrame | Series] = {} + results: dict[base.OutputKey, FrameOrSeriesUnion] = {} for idx, (name, func) in enumerate(arg): key = base.OutputKey(label=name, position=idx) @@ -421,7 
+422,7 @@ def _wrap_applied_output( keys: Index, values: list[Any] | None, not_indexed_same: bool = False, - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: """ Wrap the output of SeriesGroupBy.apply into the expected result. @@ -1019,15 +1020,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if isinstance(sobj, Series): # GH#35246 test_groupby_as_index_select_column_sum_empty_df - result.columns = self._obj_with_exclusions.columns.copy() + result.columns = [sobj.name] else: - # Retain our column names - result.columns._set_names( - sobj.columns.names, level=list(range(sobj.columns.nlevels)) - ) # select everything except for the last level, which is the one # containing the name of the function(s), see GH#32040 - result.columns = result.columns.droplevel(-1) + result.columns = result.columns.rename( + [sobj.columns.name] * result.columns.nlevels + ).droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1192,7 +1191,7 @@ def _wrap_applied_output_series( not_indexed_same: bool, first_not_none, key_index, - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() @@ -1308,15 +1307,10 @@ def _transform_general(self, func, *args, **kwargs): gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) - # Determine whether to use slow or fast path by evaluating on the first group. - # Need to handle the case of an empty generator and process the result so that - # it does not need to be computed again. - try: - name, group = next(gen) - except StopIteration: - pass - else: + for name, group in gen: object.__setattr__(group, "name", name) + + # Try slow path and fast path. 
try: path, res = self._choose_path(fast_path, slow_path, group) except TypeError: @@ -1324,18 +1318,30 @@ def _transform_general(self, func, *args, **kwargs): except ValueError as err: msg = "transform must return a scalar value for each group" raise ValueError(msg) from err - if group.size > 0: - res = _wrap_transform_general_frame(self.obj, group, res) - applied.append(res) - # Compute and process with the remaining groups - for name, group in gen: - if group.size == 0: - continue - object.__setattr__(group, "name", name) - res = path(group) - res = _wrap_transform_general_frame(self.obj, group, res) - applied.append(res) + if isinstance(res, Series): + + # we need to broadcast across the + # other dimension; this will preserve dtypes + # GH14457 + if not np.prod(group.shape): + continue + elif res.index.is_(obj.index): + r = concat([res] * len(group.columns), axis=1) + r.columns = group.columns + r.index = group.index + else: + r = self.obj._constructor( + np.concatenate([res.values] * len(group.index)).reshape( + group.shape + ), + columns=group.columns, + index=group.index, + ) + + applied.append(r) + else: + applied.append(res) concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 @@ -1659,7 +1665,7 @@ def _wrap_transformed_output( result.columns = self.obj.columns else: columns = Index(key.label for key in output) - columns._set_names(self.obj._get_axis(1 - self.axis).names) + columns.name = self.obj.columns.name result.columns = columns result.index = self.obj.index @@ -1668,9 +1674,7 @@ def _wrap_transformed_output( def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: if not self.as_index: - # GH 41998 - empty mgr always gets index of length 0 - rows = mgr.shape[1] if mgr.shape[0] > 0 else 0 - index = Index(range(rows)) + index = Index(range(mgr.shape[1])) mgr.set_axis(1, index) result = self.obj._constructor(mgr) @@ -1796,6 +1800,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: results = self._apply_to_column_groupbys( lambda sgb: sgb.nunique(dropna), obj=obj ) + results.columns.names = obj.columns.names # TODO: do at higher level? if not self.as_index: results.index = Index(range(len(results))) @@ -1846,28 +1851,3 @@ def func(df): return self._python_apply_general(func, self._obj_with_exclusions) boxplot = boxplot_frame_groupby - - -def _wrap_transform_general_frame( - obj: DataFrame, group: DataFrame, res: DataFrame | Series -) -> DataFrame: - from pandas import concat - - if isinstance(res, Series): - # we need to broadcast across the - # other dimension; this will preserve dtypes - # GH14457 - if res.index.is_(obj.index): - res_frame = concat([res] * len(group.columns), axis=1) - res_frame.columns = group.columns - res_frame.index = group.index - else: - res_frame = obj._constructor( - np.concatenate([res.values] * len(group.index)).reshape(group.shape), - columns=group.columns, - index=group.index, - ) - assert isinstance(res_frame, DataFrame) - return res_frame - else: - return res diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d3a86fa5950ed..f694dcce809ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -18,18 +18,17 @@ class providing the base-class of operations. 
from textwrap import dedent import types from typing import ( + TYPE_CHECKING, Callable, Hashable, Iterable, Iterator, List, - Literal, Mapping, Sequence, TypeVar, Union, cast, - final, ) import warnings @@ -46,10 +45,11 @@ class providing the base-class of operations. ArrayLike, F, FrameOrSeries, + FrameOrSeriesUnion, IndexLabel, - RandomState, Scalar, T, + final, ) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -101,7 +101,6 @@ class providing the base-class of operations. MultiIndex, ) from pandas.core.internals.blocks import ensure_block_shape -import pandas.core.sample as sample from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import ( @@ -109,6 +108,9 @@ class providing the base-class of operations. maybe_use_numba, ) +if TYPE_CHECKING: + from typing import Literal + _common_see_also = """ See Also -------- @@ -726,7 +728,7 @@ def pipe( plot = property(GroupByPlot) @final - def get_group(self, name, obj=None) -> DataFrame | Series: + def get_group(self, name, obj=None) -> FrameOrSeriesUnion: """ Construct DataFrame from group with provided name. @@ -1265,8 +1267,8 @@ def f(g): @final def _python_apply_general( - self, f: F, data: DataFrame | Series - ) -> DataFrame | Series: + self, f: F, data: FrameOrSeriesUnion + ) -> FrameOrSeriesUnion: """ Apply function f in python space @@ -1517,11 +1519,7 @@ def _bool_agg(self, val_test, skipna): def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: if is_object_dtype(vals): - # GH#37501: don't raise on pd.NA when skipna=True - if skipna: - vals = np.array([bool(x) if not isna(x) else True for x in vals]) - else: - vals = np.array([bool(x) for x in vals]) + vals = np.array([bool(x) for x in vals]) elif isinstance(vals, BaseMaskedArray): vals = vals._data.astype(bool, copy=False) else: @@ -1788,7 +1786,7 @@ def sem(self, ddof: int = 1): @final @Substitution(name="groupby") @Appender(_common_see_also) - def size(self) -> DataFrame | Series: + def size(self) -> FrameOrSeriesUnion: """ Compute group sizes. @@ -2637,7 +2635,7 @@ def cumcount(self, ascending: bool = True): @final @Substitution(name="groupby") - @Substitution(see_also=_common_see_also) + @Appender(_common_see_also) def rank( self, method: str = "average", @@ -2671,41 +2669,6 @@ def rank( Returns ------- DataFrame with ranking of values within each group - %(see_also)s - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], - ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], - ... } - ... ) - >>> df - group value - 0 a 2 - 1 a 4 - 2 a 2 - 3 a 3 - 4 a 5 - 5 b 1 - 6 b 2 - 7 b 4 - 8 b 1 - 9 b 5 - >>> for method in ['average', 'min', 'max', 'dense', 'first']: - ... 
df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) - >>> df - group value average_rank min_rank max_rank dense_rank first_rank - 0 a 2 1.5 1.0 2.0 1.0 1.0 - 1 a 4 4.0 4.0 4.0 3.0 4.0 - 2 a 2 1.5 1.0 2.0 1.0 2.0 - 3 a 3 3.0 3.0 3.0 2.0 3.0 - 4 a 5 5.0 5.0 5.0 4.0 5.0 - 5 b 1 1.5 1.0 2.0 1.0 1.0 - 6 b 2 3.0 3.0 3.0 2.0 3.0 - 7 b 4 4.0 4.0 4.0 3.0 4.0 - 8 b 1 1.5 1.0 2.0 1.0 2.0 - 9 b 5 5.0 5.0 5.0 4.0 5.0 """ if na_option not in {"keep", "top", "bottom"}: msg = "na_option must be one of 'keep', 'top', or 'bottom'" @@ -3217,7 +3180,7 @@ def sample( frac: float | None = None, replace: bool = False, weights: Sequence | Series | None = None, - random_state: RandomState | None = None, + random_state=None, ): """ Return a random sample of items from each group. @@ -3243,14 +3206,10 @@ def sample( sampling probabilities after normalization within each group. Values must be non-negative with at least one positive element within each group. - random_state : int, array-like, BitGenerator, np.random.RandomState, - np.random.Generator, optional. If int, array-like, or BitGenerator, seed for - random number generator. If np.random.RandomState or np.random.Generator, - use as given. - - .. versionchanged:: 1.4.0 - - np.random.Generator objects now accepted + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. Returns ------- @@ -3307,37 +3266,26 @@ def sample( 2 blue 2 0 red 0 """ - size = sample.process_sampling_size(n, frac, replace) + from pandas.core.reshape.concat import concat + if weights is not None: - weights_arr = sample.preprocess_weights( - self._selected_obj, weights, axis=self.axis - ) + weights = Series(weights, index=self._selected_obj.index) + ws = [weights.iloc[idx] for idx in self.indices.values()] + else: + ws = [None] * self.ngroups - random_state = com.random_state(random_state) + if random_state is not None: + random_state = com.random_state(random_state) group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) - - sampled_indices = [] - for labels, obj in group_iterator: - grp_indices = self.indices[labels] - group_size = len(grp_indices) - if size is not None: - sample_size = size - else: - assert frac is not None - sample_size = round(frac * group_size) - - grp_sample = sample.sample( - group_size, - size=sample_size, - replace=replace, - weights=None if weights is None else weights_arr[grp_indices], - random_state=random_state, + samples = [ + obj.sample( + n=n, frac=frac, replace=replace, weights=w, random_state=random_state ) - sampled_indices.append(grp_indices[grp_sample]) + for (_, obj), w in zip(group_iterator, ws) + ] - sampled_indices = np.concatenate(sampled_indices) - return self._selected_obj.take(sampled_indices, axis=self.axis) + return concat(samples, axis=self.axis) @doc(GroupBy) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3307558deec33..c5d5d5a301336 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -4,10 +4,7 @@ """ from __future__ import annotations -from typing import ( - Hashable, - final, -) +from typing import Hashable import warnings import numpy as np @@ -15,6 +12,7 @@ from pandas._typing import ( ArrayLike, FrameOrSeries, + final, ) from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -492,7 +490,7 @@ def __init__( self.grouping_vector, 
# Index self._codes, self._group_index, - ) = index._get_grouper_for_level(mapper, level=ilevel) + ) = index._get_grouper_for_level(mapper, ilevel) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 36fbda5974ea0..b65f26c7174fc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -14,7 +14,6 @@ Hashable, Iterator, Sequence, - final, overload, ) @@ -32,7 +31,7 @@ F, FrameOrSeries, Shape, - npt, + final, ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -162,9 +161,7 @@ def _get_cython_function( f = getattr(libgroupby, ftype) if is_numeric: return f - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Literal['object']") - elif dtype == object: # type: ignore[comparison-overlap] + elif dtype == object: if "object" not in f.__signatures__: # raise NotImplementedError here rather than TypeError later raise NotImplementedError( @@ -342,54 +339,95 @@ def _ea_wrap_cython_operation( comp_ids=comp_ids, **kwargs, ) + orig_values = values - if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): + if isinstance(orig_values, (DatetimeArray, PeriodArray)): # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents - npvalues = values._ndarray.view("M8[ns]") + npvalues = orig_values._ndarray.view("M8[ns]") + res_values = self._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + if self.how in ["rank"]: + # i.e. how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + # preserve float64 dtype + return res_values + + res_values = res_values.view("i8") + result = type(orig_values)(res_values, dtype=orig_values.dtype) + return result + + elif isinstance(orig_values, TimedeltaArray): + # We have an ExtensionArray but not ExtensionDtype + res_values = self._cython_op_ndim_compat( + orig_values._ndarray, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + if self.how in ["rank"]: + # i.e. how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + # preserve float64 dtype + return res_values + + # otherwise res_values has the same dtype as original values + return type(orig_values)(res_values) + elif isinstance(values.dtype, (BooleanDtype, _IntegerDtype)): # IntegerArray or BooleanArray npvalues = values.to_numpy("float64", na_value=np.nan) - elif isinstance(values.dtype, FloatingDtype): - # FloatingArray - npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) - else: - raise NotImplementedError( - f"function is not implemented for this dtype: {values.dtype}" + res_values = self._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, ) + if self.how in ["rank"]: + # i.e. how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + return res_values - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=None, - **kwargs, - ) - - if self.how in ["rank"]: - # i.e. 
how in WrappedCythonOp.cast_blocklist, since - # other cast_blocklist methods dont go through cython_operation - return res_values - - return self._reconstruct_ea_result(values, res_values) + dtype = self._get_result_dtype(orig_values.dtype) + cls = dtype.construct_array_type() + return cls._from_sequence(res_values, dtype=dtype) - def _reconstruct_ea_result(self, values, res_values): - """ - Construct an ExtensionArray result from an ndarray result. - """ - # TODO: allow EAs to override this logic + elif isinstance(values.dtype, FloatingDtype): + # FloatingArray + npvalues = values.to_numpy( + values.dtype.numpy_dtype, + na_value=np.nan, + ) + res_values = self._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + if self.how in ["rank"]: + # i.e. how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + return res_values - if isinstance(values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype)): - dtype = self._get_result_dtype(values.dtype) + dtype = self._get_result_dtype(orig_values.dtype) cls = dtype.construct_array_type() return cls._from_sequence(res_values, dtype=dtype) - elif needs_i8_conversion(values.dtype): - i8values = res_values.view("i8") - return type(values)(i8values, dtype=values.dtype) - - raise NotImplementedError + raise NotImplementedError( + f"function is not implemented for this dtype: {values.dtype}" + ) @final def _masked_ea_wrap_cython_operation( @@ -438,8 +476,6 @@ def _cython_op_ndim_compat( if values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] - if mask is not None: - mask = mask[None, :] res = self._call_cython_op( values2d, min_count=min_count, @@ -495,8 +531,9 @@ def _call_cython_op( values = ensure_float64(values) values = values.T + if mask is not None: - mask = mask.T + mask = mask.reshape(values.shape, order="C") out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) @@ -638,7 +675,7 @@ def __init__( sort: bool = True, group_keys: bool = True, mutated: bool = False, - indexer: npt.NDArray[np.intp] | None = None, + indexer: np.ndarray | None = None, dropna: bool = True, ): assert isinstance(axis, Index), axis @@ -1229,13 +1266,7 @@ def _is_indexed_like(obj, axes, axis: int) -> bool: class DataSplitter(Generic[FrameOrSeries]): - def __init__( - self, - data: FrameOrSeries, - labels: npt.NDArray[np.intp], - ngroups: int, - axis: int = 0, - ): + def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index eacc7960a82aa..ed4b1a3fbb39c 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -3,19 +3,16 @@ """ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Any, -) +from typing import TYPE_CHECKING import warnings import numpy as np from pandas._typing import ( + Any, AnyArrayLike, ArrayLike, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_array_like, @@ -378,7 +375,7 @@ def deprecate_ndim_indexing(result, stacklevel: int = 3): "is deprecated and will be removed in a future " "version. 
Convert to a numpy array before indexing instead.", FutureWarning, - stacklevel=find_stack_level(), + stacklevel=stacklevel, ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5866644860831..e4c21b3de2cac 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -9,11 +9,9 @@ Any, Callable, Hashable, - Literal, Sequence, TypeVar, cast, - final, overload, ) import warnings @@ -44,7 +42,8 @@ DtypeObj, F, Shape, - npt, + T, + final, ) from pandas.compat.numpy import function as nv from pandas.errors import ( @@ -160,6 +159,7 @@ ) if TYPE_CHECKING: + from typing import Literal from pandas import ( CategoricalIndex, @@ -306,7 +306,8 @@ class Index(IndexOpsMixin, PandasObject): # given the dtypes of the passed arguments @final - def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]: + def _left_indexer_unique(self: _IndexT, other: _IndexT) -> np.ndarray: + # -> np.ndarray[np.intp] # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() @@ -315,7 +316,7 @@ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]: @final def _left_indexer( self: _IndexT, other: _IndexT - ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> tuple[ArrayLike, np.ndarray, np.ndarray]: # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() @@ -326,7 +327,7 @@ def _left_indexer( @final def _inner_indexer( self: _IndexT, other: _IndexT - ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> tuple[ArrayLike, np.ndarray, np.ndarray]: # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() @@ -337,7 +338,7 @@ def _inner_indexer( @final def _outer_indexer( self: _IndexT, other: _IndexT - ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> tuple[ArrayLike, np.ndarray, np.ndarray]: # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() @@ -558,9 +559,7 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): return Int64Index - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[object]") - elif dtype == object: # type: ignore[comparison-overlap] + elif dtype == object: # NB: assuming away MultiIndex return Index @@ -1499,7 +1498,7 @@ def _validate_names( def _get_names(self) -> FrozenList: return FrozenList((self.name,)) - def _set_names(self, values, *, level=None) -> None: + def _set_names(self, values, level=None) -> None: """ Set new names on index. Each name has to be a hashable type. 
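Note: the `_ea_wrap_cython_operation` hunks above run the ordinal group reductions on the integer (i8) view of datetime-like values and re-wrap the result in the original array type afterwards. A minimal standalone sketch of that idea, using made-up data and plain NumPy rather than the pandas internals:

import numpy as np

# Two groups of timestamps; the group labels play the role of comp_ids here.
stamps = np.array(["2021-01-05", "2021-01-01", "2021-01-03"], dtype="M8[ns]")
codes = np.array([0, 0, 1])

i8 = stamps.view("i8")                        # operate on the tz-naive integer view
out = np.full(2, np.iinfo("int64").min, dtype="i8")
np.maximum.at(out, codes, i8)                 # per-group max on the ordinal values

result = out.view("M8[ns]")                   # view back to datetime64[ns]
print(result)                                 # group 0 -> 2021-01-05, group 1 -> 2021-01-03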
@@ -1886,21 +1885,14 @@ def _drop_level_numbers(self, levnums: list[int]): new_names.pop(i) if len(new_levels) == 1: - lev = new_levels[0] - - if len(lev) == 0: - # If lev is empty, lev.take will fail GH#42055 - res_values = algos.take(lev._values, new_codes[0], allow_fill=True) - result = type(lev)._simple_new(res_values, name=new_names[0]) - else: - # set nan if needed - mask = new_codes[0] == -1 - result = new_levels[0].take(new_codes[0]) - if mask.any(): - result = result.putmask(mask, np.nan) - result._name = new_names[0] + # set nan if needed + mask = new_codes[0] == -1 + result = new_levels[0].take(new_codes[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + result._name = new_names[0] return result else: from pandas.core.indexes.multi import MultiIndex @@ -1912,7 +1904,7 @@ def _drop_level_numbers(self, levnums: list[int]): verify_integrity=False, ) - def _get_grouper_for_level(self, mapper, *, level=None): + def _get_grouper_for_level(self, mapper, level=None): """ Get index grouper corresponding to an index level @@ -1921,7 +1913,7 @@ def _get_grouper_for_level(self, mapper, *, level=None): mapper: Group mapping function or None Function mapping index values to groups level : int or None - Index level, positional + Index level Returns ------- @@ -2760,6 +2752,16 @@ def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndar return np.zeros(len(self), dtype=bool) return self._duplicated(keep=keep) + def _get_unique_index(self: _IndexT) -> _IndexT: + """ + Returns an index containing unique values. + + Returns + ------- + Index + """ + return self.unique() + # -------------------------------------------------------------------- # Arithmetic & Logical Methods @@ -3076,30 +3078,6 @@ def intersection(self, other, sort=False): return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) - if len(self) == 0 or len(other) == 0: - # fastpath; we need to be careful about having commutativity - - if self._is_multi or other._is_multi: - # _convert_can_do_setop ensures that we have both or neither - # We retain self.levels - return self[:0].rename(result_name) - - dtype = self._find_common_type_compat(other) - if is_dtype_equal(self.dtype, dtype): - # Slicing allows us to retain DTI/TDI.freq, RangeIndex - - # Note: self[:0] vs other[:0] affects - # 1) which index's `freq` we get in DTI/TDI cases - # This may be a historical artifact, i.e. no documented - # reason for this choice. - # 2) The `step` we get in RangeIndex cases - if len(self) == 0: - return self[:0].rename(result_name) - else: - return other[:0].rename(result_name) - - return Index([], dtype=dtype, name=result_name) - elif not self._should_compare(other): # We can infer that the intersection is empty. if isinstance(self, ABCMultiIndex): @@ -3107,25 +3085,21 @@ def intersection(self, other, sort=False): return Index([], name=result_name) elif not is_dtype_equal(self.dtype, other.dtype): - dtype = self._find_common_type_compat(other) + dtype = find_common_type([self.dtype, other.dtype]) this = self.astype(dtype, copy=False) other = other.astype(dtype, copy=False) return this.intersection(other, sort=sort) result = self._intersection(other, sort=sort) - return self._wrap_intersection_result(other, result) + return self._wrap_setop_result(other, result) def _intersection(self, other: Index, sort=False): """ intersection specialized to the case with matching dtypes. 
""" - if ( - self.is_monotonic - and other.is_monotonic - and not is_interval_dtype(self.dtype) - ): - # For IntervalIndex _inner_indexer is not more performant than get_indexer, - # so don't take this fastpath + # TODO(EA): setops-refactor, clean all this up + + if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(other)[0] except TypeError: @@ -3139,11 +3113,6 @@ def _intersection(self, other: Index, sort=False): res_values = _maybe_try_sort(res_values, sort) return res_values - def _wrap_intersection_result(self, other, result): - # We will override for MultiIndex to handle empty results - return self._wrap_setop_result(other, result) - - @final def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike: """ Find the intersection of two Indexes using get_indexer. @@ -3153,8 +3122,10 @@ def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike: np.ndarray or ExtensionArray The returned array will be unique. """ - left_unique = self.unique() - right_unique = other.unique() + # Note: drop_duplicates vs unique matters for MultiIndex, though + # it should not, see GH#41823 + left_unique = self.drop_duplicates() + right_unique = other.drop_duplicates() # even though we are unique, we need get_indexer_for for IntervalIndex indexer = left_unique.get_indexer_for(right_unique) @@ -3219,12 +3190,11 @@ def difference(self, other, sort=None): return self.rename(result_name) result = self._difference(other, sort=sort) - return self._wrap_difference_result(other, result) + return self._wrap_setop_result(other, result) def _difference(self, other, sort): - # overridden by RangeIndex - this = self.unique() + this = self._get_unique_index() indexer = this.get_indexer_for(other) indexer = indexer.take((indexer != -1).nonzero()[0]) @@ -3235,10 +3205,6 @@ def _difference(self, other, sort): return the_diff - def _wrap_difference_result(self, other, result): - # We will override for MultiIndex to handle empty results - return self._wrap_setop_result(other, result) - def symmetric_difference(self, other, result_name=None, sort=None): """ Compute the symmetric difference of two Index objects. @@ -3280,47 +3246,12 @@ def symmetric_difference(self, other, result_name=None, sort=None): if result_name is None: result_name = result_name_update - if not self._should_compare(other): - return self.union(other, sort=sort).rename(result_name) - - elif not is_dtype_equal(self.dtype, other.dtype): - dtype = self._find_common_type_compat(other) - this = self.astype(dtype, copy=False) - that = other.astype(dtype, copy=False) - return this.symmetric_difference(that, sort=sort).rename(result_name) - - this = self.unique() - other = other.unique() - indexer = this.get_indexer_for(other) - - # {this} minus {other} - common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d( - np.arange(this.size), common_indexer, assume_unique=True - ) - left_diff = this._values.take(left_indexer) - - # {other} minus {this} - right_indexer = (indexer == -1).nonzero()[0] - right_diff = other._values.take(right_indexer) - - res_values = concat_compat([left_diff, right_diff]) - res_values = _maybe_try_sort(res_values, sort) - - result = Index(res_values, name=result_name) - - if self._is_multi: - self = cast("MultiIndex", self) - if len(result) == 0: - # On equal symmetric_difference MultiIndexes the difference is empty. 
- # Therefore, an empty MultiIndex is returned GH#13490 - return type(self)( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - names=result.name, - ) - return type(self).from_tuples(result, names=result.name) + left = self.difference(other, sort=False) + right = other.difference(self, sort=False) + result = left.union(right, sort=sort) + if result_name is not None: + result = result.rename(result_name) return result @final @@ -3388,15 +3319,6 @@ def get_loc(self, key, method=None, tolerance=None): except KeyError as err: raise KeyError(key) from err - # GH#42269 - warnings.warn( - f"Passing method to {type(self).__name__}.get_loc is deprecated " - "and will raise in a future version. Use " - "index.get_indexer([item], method=...) instead", - FutureWarning, - stacklevel=2, - ) - if is_scalar(key) and isna(key) and not self.hasnans: raise KeyError(key) @@ -3466,7 +3388,8 @@ def get_indexer( method: str_t | None = None, limit: int | None = None, tolerance=None, - ) -> npt.NDArray[np.intp]: + ) -> np.ndarray: + # returned ndarray is np.intp method = missing.clean_reindex_fill_method(method) target = self._maybe_cast_listlike_indexer(target) @@ -3475,67 +3398,17 @@ def get_indexer( if not self._index_as_unique: raise InvalidIndexError(self._requires_unique_msg) - if len(target) == 0: - return np.array([], dtype=np.intp) - - if not self._should_compare(target) and not self._should_partial_index(target): + if not self._should_compare(target) and not is_interval_dtype(self.dtype): # IntervalIndex get special treatment bc numeric scalars can be # matched to Interval scalars return self._get_indexer_non_comparable(target, method=method, unique=True) - if is_categorical_dtype(self.dtype): - # _maybe_cast_listlike_indexer ensures target has our dtype - # (could improve perf by doing _should_compare check earlier?) - assert is_dtype_equal(self.dtype, target.dtype) - - indexer = self._engine.get_indexer(target.codes) - if self.hasnans and target.hasnans: - loc = self.get_loc(np.nan) - mask = target.isna() - indexer[mask] = loc - return indexer - - if is_categorical_dtype(target.dtype): - # potential fastpath - # get an indexer for unique categories then propagate to codes via take_nd - # get_indexer instead of _get_indexer needed for MultiIndex cases - # e.g. 
test_append_different_columns_types - categories_indexer = self.get_indexer(target.categories) - - indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1) - - if (not self._is_multi and self.hasnans) and target.hasnans: - # Exclude MultiIndex because hasnans raises NotImplementedError - # we should only get here if we are unique, so loc is an integer - # GH#41934 - loc = self.get_loc(np.nan) - mask = target.isna() - indexer[mask] = loc - - return ensure_platform_int(indexer) - pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer( ptarget, method=method, limit=limit, tolerance=tolerance ) - if is_dtype_equal(self.dtype, target.dtype) and self.equals(target): - # Only call equals if we have same dtype to avoid inference/casting - return np.arange(len(target), dtype=np.intp) - - if not is_dtype_equal(self.dtype, target.dtype) and not is_interval_dtype( - self.dtype - ): - # IntervalIndex gets special treatment for partial-indexing - dtype = self._find_common_type_compat(target) - - this = self.astype(dtype, copy=False) - target = target.astype(dtype, copy=False) - return this._get_indexer( - target, method=method, limit=limit, tolerance=tolerance - ) - return self._get_indexer(target, method, limit, tolerance) def _get_indexer( @@ -3548,6 +3421,15 @@ def _get_indexer( if tolerance is not None: tolerance = self._convert_tolerance(tolerance, target) + if not is_dtype_equal(self.dtype, target.dtype): + dtype = self._find_common_type_compat(target) + + this = self.astype(dtype, copy=False) + target = target.astype(dtype, copy=False) + return this.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + if method in ["pad", "backfill"]: indexer = self._get_fill_indexer(target, method, limit, tolerance) elif method == "nearest": @@ -3557,16 +3439,6 @@ def _get_indexer( return ensure_platform_int(indexer) - @final - def _should_partial_index(self, target: Index) -> bool: - """ - Should we attempt partial-matching indexing? - """ - if is_interval_dtype(self.dtype): - # "Index" has no attribute "left" - return self.left._should_compare(target) # type: ignore[attr-defined] - return False - @final def _check_indexing_method( self, @@ -3625,13 +3497,6 @@ def _get_fill_indexer( self, target: Index, method: str_t, limit: int | None = None, tolerance=None ) -> np.ndarray: - if self._is_multi: - # TODO: get_indexer_with_fill docstring says values must be _sorted_ - # but that doesn't appear to be enforced - return self._engine.get_indexer_with_fill( - target=target._values, values=self._values, method=method, limit=limit - ) - target_values = target._get_engine_target() if self.is_monotonic_increasing and target.is_monotonic_increasing: @@ -3725,6 +3590,16 @@ def _filter_indexer_tolerance( # -------------------------------------------------------------------- # Indexer Conversion Methods + def _get_partial_string_timestamp_match_key(self, key: T) -> T: + """ + Translate any partial string timestamp matches in key, returning the + new key. + + Only relevant for MultiIndex. 
+ """ + # GH#10331 + return key + @final def _validate_positional_slice(self, key: slice) -> None: """ @@ -3833,11 +3708,11 @@ def _validate_can_reindex(self, indexer: np.ndarray) -> None: """ # trying to reindex on an axis with duplicates if not self._index_as_unique and len(indexer): - raise ValueError("cannot reindex on an axis with duplicate labels") + raise ValueError("cannot reindex from a duplicate axis") def reindex( self, target, method=None, level=None, limit=None, tolerance=None - ) -> tuple[Index, npt.NDArray[np.intp] | None]: + ) -> tuple[Index, np.ndarray | None]: """ Create index with target's values. @@ -3860,25 +3735,14 @@ def reindex( target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - if level is not None and self._is_multi: - # "Index" has no attribute "levels"; maybe "nlevels"? - idx = self.levels[level] # type: ignore[attr-defined] - else: - idx = self - target = idx[:0] + target = self[:0] else: target = ensure_index(target) if level is not None: if method is not None: raise TypeError("Fill method not supported if level passed") - - # TODO: tests where passing `keep_order=not self._is_multi` - # makes a difference for non-MultiIndex case - target, indexer, _ = self._join_level( - target, level, how="right", keep_order=not self._is_multi - ) - + _, indexer, _ = self._join_level(target, level, how="right") else: if self.equals(target): indexer = None @@ -3887,8 +3751,6 @@ def reindex( indexer = self.get_indexer( target, method=method, limit=limit, tolerance=tolerance ) - elif self._is_multi: - raise ValueError("cannot handle a non-unique multi-index!") else: if method is not None or limit is not None: raise ValueError( @@ -3897,23 +3759,15 @@ def reindex( ) indexer, _ = self.get_indexer_non_unique(target) - target = self._wrap_reindex_result(target, indexer, preserve_names) - return target, indexer - - def _wrap_reindex_result(self, target, indexer, preserve_names: bool): - target = self._maybe_preserve_names(target, preserve_names) - return target - - def _maybe_preserve_names(self, target: Index, preserve_names: bool): if preserve_names and target.nlevels == 1 and target.name != self.name: - target = target.copy(deep=False) + target = target.copy() target.name = self.name - return target - @final + return target, indexer + def _reindex_non_unique( self, target: Index - ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp] | None]: + ) -> tuple[Index, np.ndarray, np.ndarray | None]: """ Create a new index with target's values (move/add/delete values as necessary) use with non-unique Index and a possibly non-unique target. 
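Note: the reverted `symmetric_difference` above composes the result from the two one-way differences and their union. A small illustrative example of the same composition at the public API level (the data is made up):

import pandas as pd

left = pd.Index([1, 2, 3, 4], name="x")
right = pd.Index([3, 4, 5, 6], name="x")

only_left = left.difference(right, sort=False)    # elements only in left
only_right = right.difference(left, sort=False)   # elements only in right
result = only_left.union(only_right, sort=None)   # same as left.symmetric_difference(right)
print(result)                                     # 1, 2, 5, 6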
@@ -3942,15 +3796,14 @@ def _reindex_non_unique( new_indexer = None if len(missing): - length = np.arange(len(indexer), dtype=np.intp) + length = np.arange(len(indexer)) missing = ensure_platform_int(missing) missing_labels = target.take(missing) - missing_indexer = length[~check] + missing_indexer = ensure_platform_int(length[~check]) cur_labels = self.take(indexer[check]).values - cur_indexer = length[check] + cur_indexer = ensure_platform_int(length[check]) - # Index constructor below will do inference new_labels = np.empty((len(indexer),), dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels @@ -3988,7 +3841,6 @@ def _reindex_non_unique( # -------------------------------------------------------------------- # Join Methods - @final @_maybe_return_indexers def join( self, @@ -4020,19 +3872,6 @@ def join( self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) - if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): - if (self.tz is None) ^ (other.tz is None): - # Raise instead of casting to object below. - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not self._is_multi and not other._is_multi: - # We have specific handling for MultiIndex below - pself, pother = self._maybe_promote(other) - if pself is not self or pother is not other: - return pself.join( - pother, how=how, level=level, return_indexers=True, sort=sort - ) - lindexer: np.ndarray | None rindexer: np.ndarray | None @@ -4069,9 +3908,8 @@ def join( return join_index, lidx, ridx if not is_dtype_equal(self.dtype, other.dtype): - dtype = self._find_common_type_compat(other) - this = self.astype(dtype, copy=False) - other = other.astype(dtype, copy=False) + this = self.astype("O") + other = other.astype("O") return this.join(other, how=how, return_indexers=True) _validate_join_method(how) @@ -4201,7 +4039,8 @@ def _join_multi(self, other: Index, how: str_t): @final def _join_non_unique( self, other: Index, how: str_t = "left" - ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> tuple[Index, np.ndarray, np.ndarray]: + # returned ndarrays are np.intp from pandas.core.reshape.merge import get_join_indexers # We only get here if dtypes match @@ -4229,7 +4068,8 @@ def _join_non_unique( @final def _join_level( self, other: Index, level, how: str_t = "left", keep_order: bool = True - ) -> tuple[MultiIndex, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + ) -> tuple[MultiIndex, np.ndarray | None, np.ndarray | None]: + # Any returned ndarrays are np.intp """ The join method *only* affects the level of the resulting MultiIndex. Otherwise it just exactly aligns the Index data to the @@ -4241,7 +4081,7 @@ def _join_level( """ from pandas.core.indexes.multi import MultiIndex - def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: + def _get_leaf_sorter(labels: list[np.ndarray]) -> np.ndarray: """ Returns sorter for the inner most level while preserving the order of higher levels. @@ -4618,12 +4458,6 @@ def is_type_compatible(self, kind: str_t) -> bool: """ Whether the index type is compatible with the provided type. 
""" - warnings.warn( - "Index.is_type_compatible is deprecated and will be removed in a " - "future version", - FutureWarning, - stacklevel=2, - ) return kind == self.inferred_type def __contains__(self, key: Any) -> bool: @@ -4667,10 +4501,9 @@ def __contains__(self, key: Any) -> bool: except (OverflowError, TypeError, ValueError): return False - # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 - # Incompatible types in assignment (expression has type "None", base class - # "object" defined the type as "Callable[[object], int]") - __hash__: None # type: ignore[assignment] + @final + def __hash__(self): + raise TypeError(f"unhashable type: {repr(type(self).__name__)}") @final def __setitem__(self, key, value): @@ -4974,26 +4807,16 @@ def asof(self, label): Traceback (most recent call last): ValueError: index must be monotonic increasing or decreasing """ - self._searchsorted_monotonic(label) # validate sortedness try: - loc = self.get_loc(label) - except (KeyError, TypeError): - # KeyError -> No exact match, try for padded - # TypeError -> passed e.g. non-hashable, fall through to get - # the tested exception message - indexer = self.get_indexer([label], method="pad") - if indexer.ndim > 1 or indexer.size > 1: - raise TypeError("asof requires scalar valued input") - loc = indexer.item() - if loc == -1: - return self._na_value + loc = self.get_loc(label, method="pad") + except KeyError: + return self._na_value else: if isinstance(loc, slice): loc = loc.indices(len(self))[-1] + return self[loc] - return self[loc] - - def asof_locs(self, where: Index, mask: np.ndarray) -> npt.NDArray[np.intp]: + def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: """ Return the locations (indices) of labels in the index. @@ -5184,7 +5007,7 @@ def shift(self, periods=1, freq=None): f"TimedeltaIndex; Got type {type(self).__name__}" ) - def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: + def argsort(self, *args, **kwargs) -> np.ndarray: """ Return the integer indices that would sort the index. @@ -5251,7 +5074,7 @@ def get_value(self, series: Series, key): # try that loc = self.get_loc(key) except KeyError: - if not self._should_fallback_to_positional: + if not self._should_fallback_to_positional(): raise elif is_integer(key): # If the Index cannot hold integer, then this is unambiguously @@ -5268,7 +5091,6 @@ def _check_indexing_error(self, key): # would convert to numpy arrays and raise later any way) - GH29926 raise InvalidIndexError(key) - @cache_readonly def _should_fallback_to_positional(self) -> bool: """ Should an integer key be treated as positional? 
@@ -5335,11 +5157,9 @@ def set_value(self, arr, key, value): """ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique( - self, target - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + def get_indexer_non_unique(self, target) -> tuple[np.ndarray, np.ndarray]: + # both returned ndarrays are np.intp target = ensure_index(target) - target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not is_interval_dtype(self.dtype): # IntervalIndex get special treatment bc numeric scalars can be @@ -5359,15 +5179,13 @@ def get_indexer_non_unique( that = target.astype(dtype, copy=False) return this.get_indexer_non_unique(that) - # Note: _maybe_promote ensures we never get here with MultiIndex - # self and non-Multi target tgt_values = target._get_engine_target() indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) @final - def get_indexer_for(self, target) -> npt.NDArray[np.intp]: + def get_indexer_for(self, target, **kwargs) -> np.ndarray: """ Guaranteed return of an indexer even when non-unique. @@ -5380,115 +5198,35 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]: List of indices. """ if self._index_as_unique: - return self.get_indexer(target) + return self.get_indexer(target, **kwargs) indexer, _ = self.get_indexer_non_unique(target) return indexer - def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]: - """ - Analogue to get_indexer that raises if any elements are missing. - """ - keyarr = key - if not isinstance(keyarr, Index): - keyarr = com.asarray_tuplesafe(keyarr) - - if self._index_as_unique: - indexer = self.get_indexer_for(keyarr) - keyarr = self.reindex(keyarr)[0] - else: - keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr) - - self._raise_if_missing(keyarr, indexer, axis_name) - - if ( - needs_i8_conversion(self.dtype) - or is_categorical_dtype(self.dtype) - or is_interval_dtype(self.dtype) - ): - # For CategoricalIndex take instead of reindex to preserve dtype. - # For IntervalIndex this is to map integers to the Intervals they match to. - keyarr = self.take(indexer) - if keyarr.dtype.kind in ["m", "M"]: - # DTI/TDI.take can infer a freq in some cases when we dont want one - if isinstance(key, list) or ( - isinstance(key, type(self)) - # "Index" has no attribute "freq" - and key.freq is None # type: ignore[attr-defined] - ): - keyarr = keyarr._with_freq(None) - - return keyarr, indexer - - def _raise_if_missing(self, key, indexer, axis_name: str_t): - """ - Check that indexer can be used to return a result. - - e.g. at least one element was found, - unless the list of keys was actually empty. - - Parameters - ---------- - key : list-like - Targeted labels (only used to show correct error message). - indexer: array-like of booleans - Indices corresponding to the key, - (with -1 indicating not found). - axis_name : str - - Raises - ------ - KeyError - If at least one key was requested but none was found. 
- """ - if len(key) == 0: - return - - # Count missing values - missing_mask = indexer < 0 - nmissing = missing_mask.sum() - - if nmissing: - - # TODO: remove special-case; this is just to keep exception - # message tests from raising while debugging - use_interval_msg = is_interval_dtype(self.dtype) or ( - is_categorical_dtype(self.dtype) - # "Index" has no attribute "categories" [attr-defined] - and is_interval_dtype( - self.categories.dtype # type: ignore[attr-defined] - ) - ) - - if nmissing == len(indexer): - if use_interval_msg: - key = list(key) - raise KeyError(f"None of [{key}] are in the [{axis_name}]") - - not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) - raise KeyError(f"{not_found} not in index") - @overload def _get_indexer_non_comparable( self, target: Index, method, unique: Literal[True] = ... - ) -> npt.NDArray[np.intp]: + ) -> np.ndarray: + # returned ndarray is np.intp ... @overload def _get_indexer_non_comparable( self, target: Index, method, unique: Literal[False] - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> tuple[np.ndarray, np.ndarray]: + # both returned ndarrays are np.intp ... @overload def _get_indexer_non_comparable( self, target: Index, method, unique: bool = True - ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + # any returned ndarrays are np.intp ... @final def _get_indexer_non_comparable( self, target: Index, method, unique: bool = True - ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ) -> np.ndarray | tuple[np.ndarray, np.ndarray]: """ Called from get_indexer or get_indexer_non_unique when the target is of a non-comparable dtype. @@ -5514,16 +5252,6 @@ def _get_indexer_non_comparable( """ if method is not None: other = unpack_nested_dtype(target) - if self._is_multi ^ other._is_multi: - kind = other.dtype.type if self._is_multi else self.dtype.type - raise TypeError( - f"'<' not supported between instances of {kind} and 'tuple'" - ) - elif self._is_multi and other._is_multi: - assert self.nlevels != other.nlevels - # Python allows comparison between tuples of different lengths, - # but for our purposes such a comparison is not meaningful. 
- raise TypeError("'<' not supported between tuples of different lengths") raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) @@ -5575,21 +5303,6 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]: if not is_object_dtype(self.dtype): return self.astype("object"), other.astype("object") - elif self.dtype.kind == "u" and other.dtype.kind == "i": - # GH#41873 - if other.min() >= 0: - # lookup min as it may be cached - # TODO: may need itemsize check if we have non-64-bit Indexes - return self, other.astype(self.dtype) - - elif self._is_multi and not other._is_multi: - try: - # "Type[Index]" has no attribute "from_tuples" - other = type(self).from_tuples(other) # type: ignore[attr-defined] - except (TypeError, ValueError): - # let's instead try with a straight Index - self = Index(self._values) - if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_promote(self) @@ -5653,14 +5366,6 @@ def _should_compare(self, other: Index) -> bool: other = unpack_nested_dtype(other) dtype = other.dtype - if other._is_multi: - if not self._is_multi: - # other contains only tuples so unless we are object-dtype, - # there can never be any matches - return self._is_comparable_dtype(dtype) - return self.nlevels == other.nlevels - # TODO: we can get more specific requiring levels are comparable? - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: @@ -5740,7 +5445,7 @@ def map(self, mapper, na_action=None): # TODO: De-duplicate with map, xref GH#32349 @final - def _transform_index(self, func, *, level=None) -> Index: + def _transform_index(self, func, level=None) -> Index: """ Apply function to all values found in index. @@ -5749,7 +5454,6 @@ def _transform_index(self, func, *, level=None) -> Index: """ if isinstance(self, ABCMultiIndex): if level is not None: - # Caller is responsible for ensuring level is positional. items = [ tuple(func(y) if i == level else y for i, y in enumerate(x)) for x in self @@ -6192,7 +5896,8 @@ def insert(self, loc: int, item) -> Index: try: item = self._validate_fill_value(item) except TypeError: - dtype = self._find_common_type_compat(item) + inferred, _ = infer_dtype_from(item) + dtype = find_common_type([self.dtype, inferred]) return self.astype(dtype).insert(loc, item) arr = np.asarray(self) @@ -6208,7 +5913,7 @@ def drop(self, labels, errors: str_t = "raise") -> Index: Parameters ---------- - labels : array-like or scalar + labels : array-like errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. @@ -6307,9 +6012,6 @@ def __inv__(self): # TODO: __inv__ vs __invert__? return self._unary_method(lambda x: -x) - # -------------------------------------------------------------------- - # Reductions - def any(self, *args, **kwargs): """ Return whether any element is Truthy. @@ -6323,8 +6025,8 @@ def any(self, *args, **kwargs): Returns ------- - any : bool or array-like (if axis is specified) - A single element array-like may be converted to bool. + any : bool or array_like (if axis is specified) + A single element array_like may be converted to bool. See Also -------- @@ -6367,8 +6069,8 @@ def all(self, *args, **kwargs): Returns ------- - all : bool or array-like (if axis is specified) - A single element array-like may be converted to bool. 
+ all : bool or array_like (if axis is specified) + A single element array_like may be converted to bool. See Also -------- @@ -6430,84 +6132,6 @@ def _maybe_disable_logical_methods(self, opname: str_t): # This call will raise make_invalid_op(opname)(self) - @Appender(IndexOpsMixin.argmin.__doc__) - def argmin(self, axis=None, skipna=True, *args, **kwargs): - nv.validate_argmin(args, kwargs) - nv.validate_minmax_axis(axis) - - if not self._is_multi and self.hasnans: - # Take advantage of cache - mask = self._isnan - if not skipna or mask.all(): - return -1 - return super().argmin(skipna=skipna) - - @Appender(IndexOpsMixin.argmax.__doc__) - def argmax(self, axis=None, skipna=True, *args, **kwargs): - nv.validate_argmax(args, kwargs) - nv.validate_minmax_axis(axis) - - if not self._is_multi and self.hasnans: - # Take advantage of cache - mask = self._isnan - if not skipna or mask.all(): - return -1 - return super().argmax(skipna=skipna) - - @doc(IndexOpsMixin.min) - def min(self, axis=None, skipna=True, *args, **kwargs): - nv.validate_min(args, kwargs) - nv.validate_minmax_axis(axis) - - if not len(self): - return self._na_value - - if len(self) and self.is_monotonic_increasing: - # quick check - first = self[0] - if not isna(first): - return first - - if not self._is_multi and self.hasnans: - # Take advantage of cache - mask = self._isnan - if not skipna or mask.all(): - return self._na_value - - if not self._is_multi and not isinstance(self._values, np.ndarray): - # "ExtensionArray" has no attribute "min" - return self._values.min(skipna=skipna) # type: ignore[attr-defined] - - return super().min(skipna=skipna) - - @doc(IndexOpsMixin.max) - def max(self, axis=None, skipna=True, *args, **kwargs): - nv.validate_max(args, kwargs) - nv.validate_minmax_axis(axis) - - if not len(self): - return self._na_value - - if len(self) and self.is_monotonic_increasing: - # quick check - last = self[-1] - if not isna(last): - return last - - if not self._is_multi and self.hasnans: - # Take advantage of cache - mask = self._isnan - if not skipna or mask.all(): - return self._na_value - - if not self._is_multi and not isinstance(self._values, np.ndarray): - # "ExtensionArray" has no attribute "max" - return self._values.max(skipna=skipna) # type: ignore[attr-defined] - - return super().max(skipna=skipna) - - # -------------------------------------------------------------------- - @final @property def shape(self) -> Shape: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7339c82cbcc77..228f58d47b8ed 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -12,13 +12,17 @@ from pandas._libs import index as libindex from pandas._typing import ( + ArrayLike, Dtype, DtypeObj, - npt, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + doc, +) from pandas.core.dtypes.common import ( + ensure_platform_int, is_categorical_dtype, is_scalar, ) @@ -28,6 +32,7 @@ notna, ) +from pandas.core import accessor from pandas.core.arrays.categorical import ( Categorical, contains, @@ -36,6 +41,7 @@ import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, + _index_shared_docs, maybe_extract_name, ) from pandas.core.indexes.extension import ( @@ -63,8 +69,9 @@ ], Categorical, ) -@inherit_names( - [ +@accessor.delegate_names( + delegate=Categorical, + accessors=[ "rename_categories", "reorder_categories", "add_categories", @@ -74,10 +81,10 @@ "as_ordered", "as_unordered", ], - Categorical, - 
wrap=True, + typ="method", + overwrite=True, ) -class CategoricalIndex(NDArrayBackedExtensionIndex): +class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. @@ -186,12 +193,17 @@ def _can_hold_strings(self): def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need # to return the corresponding engine type (libindex.Int8Engine, etc.). + + # error: Invalid index type "Type[generic]" for "Dict[Type[signedinteger[Any]], + # Any]"; expected type "Type[signedinteger[Any]]" return { np.int8: libindex.Int8Engine, np.int16: libindex.Int16Engine, np.int32: libindex.Int32Engine, np.int64: libindex.Int64Engine, - }[self.codes.dtype.type] + }[ + self.codes.dtype.type # type: ignore[index] + ] _attributes = ["name"] @@ -369,7 +381,7 @@ def fillna(self, value, downcast=None): def reindex( self, target, method=None, level=None, limit=None, tolerance=None - ) -> tuple[Index, npt.NDArray[np.intp] | None]: + ) -> tuple[Index, np.ndarray | None]: """ Create index with target's values (move/add/delete values as necessary) @@ -400,9 +412,9 @@ def reindex( indexer = None missing = np.array([], dtype=np.intp) else: - indexer, missing = self.get_indexer_non_unique(target) + indexer, missing = self.get_indexer_non_unique(np.array(target)) - if len(self) and indexer is not None: + if len(self.codes) and indexer is not None: new_target = self.take(indexer) else: new_target = target @@ -411,8 +423,10 @@ def reindex( if len(missing): cats = self.categories.get_indexer(target) - if not isinstance(target, CategoricalIndex) or (cats == -1).any(): - new_target, indexer, _ = super()._reindex_non_unique(target) + if not isinstance(cats, CategoricalIndex) or (cats == -1).any(): + # coerce to a regular index here! + result = Index(np.array(self), name=self.name) + new_target, indexer, _ = result._reindex_non_unique(target) else: codes = new_target.codes.copy() @@ -425,32 +439,84 @@ def reindex( # coerce based on the actual values, only on the dtype) # unless we had an initial Categorical to begin with # in which case we are going to conform to the passed Categorical + new_target = np.asarray(new_target) if is_categorical_dtype(target): cat = Categorical(new_target, dtype=target.dtype) new_target = type(self)._simple_new(cat, name=self.name) else: - # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target - new_target = np.asarray(new_target) new_target = Index(new_target, name=self.name) return new_target, indexer + # error: Return type "Tuple[Index, Optional[ndarray], Optional[ndarray]]" + # of "_reindex_non_unique" incompatible with return type + # "Tuple[Index, ndarray, Optional[ndarray]]" in supertype "Index" + def _reindex_non_unique( # type: ignore[override] + self, target: Index + ) -> tuple[Index, np.ndarray | None, np.ndarray | None]: + """ + reindex from a non-unique; which CategoricalIndex's are almost + always + """ + # TODO: rule out `indexer is None` here to make the signature + # match the parent class's signature. This should be equivalent + # to ruling out `self.equals(target)` + new_target, indexer = self.reindex(target) + new_indexer = None + + check = indexer == -1 + # error: Item "bool" of "Union[Any, bool]" has no attribute "any" + if check.any(): # type: ignore[union-attr] + new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp) + new_indexer[check] = -1 + + cats = self.categories.get_indexer(target) + if not (cats == -1).any(): + # .reindex returns normal Index. 
Revert to CategoricalIndex if + # all targets are included in my categories + cat = Categorical(new_target, dtype=self.dtype) + new_target = type(self)._simple_new(cat, name=self.name) + + return new_target, indexer, new_indexer + # -------------------------------------------------------------------- # Indexing Methods def _maybe_cast_indexer(self, key) -> int: - # GH#41933: we have to do this instead of self._data._validate_scalar - # because this will correctly get partial-indexing on Interval categories - try: - return self._data._unbox_scalar(key) - except KeyError: - if is_valid_na_for_dtype(key, self.categories.dtype): - return -1 - raise - - def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex: - if isinstance(values, CategoricalIndex): - values = values._data + return self._data._unbox_scalar(key) + + def _get_indexer( + self, + target: Index, + method: str | None = None, + limit: int | None = None, + tolerance=None, + ) -> np.ndarray: + # returned ndarray is np.intp + + if self.equals(target): + return np.arange(len(self), dtype="intp") + + return self._get_indexer_non_unique(target._values)[0] + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target) -> tuple[np.ndarray, np.ndarray]: + # both returned ndarrays are np.intp + target = ibase.ensure_index(target) + return self._get_indexer_non_unique(target._values) + + def _get_indexer_non_unique( + self, values: ArrayLike + ) -> tuple[np.ndarray, np.ndarray]: + # both returned ndarrays are np.intp + """ + get_indexer_non_unique but after unrapping the target Index object. + """ + # Note: we use engine.get_indexer_non_unique for get_indexer in addition + # to get_indexer_non_unique because, even if `target` is unique, any + # non-category entries in it will be encoded as -1 so `codes` may + # not be unique. 
+ if isinstance(values, Categorical): # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of @@ -459,9 +525,9 @@ def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex: codes = cat._codes else: codes = self.categories.get_indexer(values) - codes = codes.astype(self.codes.dtype, copy=False) - cat = self._data._from_backing_data(codes) - return type(self)._simple_new(cat) + + indexer, missing = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer), ensure_platform_int(missing) # -------------------------------------------------------------------- @@ -560,3 +626,13 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: else: cat = self._data._from_backing_data(codes) return type(self)._simple_new(cat, name=name) + + def _delegate_method(self, name: str, *args, **kwargs): + """method delegation to the ._values""" + method = getattr(self._values, name) + if "inplace" in kwargs: + raise ValueError("cannot use inplace with CategoricalIndex") + res = method(*args, **kwargs) + if is_scalar(res): + return res + return CategoricalIndex(res, name=self.name) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 07c6a84f75302..df7fae0763c42 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -7,19 +7,17 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Sequence, TypeVar, cast, - final, ) -import warnings import numpy as np from pandas._libs import ( NaT, Timedelta, + iNaT, lib, ) from pandas._libs.tslibs import ( @@ -27,8 +25,8 @@ NaTType, Resolution, Tick, - parsing, ) +from pandas._typing import Callable from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -47,7 +45,6 @@ from pandas.core.arrays import ( DatetimeArray, - ExtensionArray, PeriodArray, TimedeltaArray, ) @@ -61,6 +58,7 @@ from pandas.core.indexes.extension import ( NDArrayBackedExtensionIndex, inherit_names, + make_wrapped_arith_op, ) from pandas.core.tools.timedeltas import to_timedelta @@ -96,6 +94,7 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): hasnans = cache_readonly( DatetimeLikeArrayMixin._hasnans.fget # type: ignore[attr-defined] ) + _hasnans = hasnans # for index / array -agnostic code @property def _is_all_dates(self) -> bool: @@ -195,6 +194,120 @@ def tolist(self) -> list: """ return list(self.astype(object)) + def min(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the minimum value of the Index or minimum along + an axis. + + See Also + -------- + numpy.ndarray.min + Series.min : Return the minimum value in a Series. + """ + nv.validate_min(args, kwargs) + nv.validate_minmax_axis(axis) + + if not len(self): + return self._na_value + + i8 = self.asi8 + + if len(i8) and self.is_monotonic_increasing: + # quick check + if i8[0] != iNaT: + return self._data._box_func(i8[0]) + + if self.hasnans: + if not skipna: + return self._na_value + i8 = i8[~self._isnan] + + if not len(i8): + return self._na_value + + min_stamp = i8.min() + return self._data._box_func(min_stamp) + + def argmin(self, axis=None, skipna=True, *args, **kwargs): + """ + Returns the indices of the minimum values along an axis. + + See `numpy.ndarray.argmin` for more information on the + `axis` parameter. 
+ + See Also + -------- + numpy.ndarray.argmin + """ + nv.validate_argmin(args, kwargs) + nv.validate_minmax_axis(axis) + + i8 = self.asi8 + if self.hasnans: + mask = self._isnan + if mask.all() or not skipna: + return -1 + i8 = i8.copy() + i8[mask] = np.iinfo("int64").max + return i8.argmin() + + def max(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the maximum value of the Index or maximum along + an axis. + + See Also + -------- + numpy.ndarray.max + Series.max : Return the maximum value in a Series. + """ + nv.validate_max(args, kwargs) + nv.validate_minmax_axis(axis) + + if not len(self): + return self._na_value + + i8 = self.asi8 + + if len(i8) and self.is_monotonic: + # quick check + if i8[-1] != iNaT: + return self._data._box_func(i8[-1]) + + if self.hasnans: + if not skipna: + return self._na_value + i8 = i8[~self._isnan] + + if not len(i8): + return self._na_value + + max_stamp = i8.max() + return self._data._box_func(max_stamp) + + def argmax(self, axis=None, skipna=True, *args, **kwargs): + """ + Returns the indices of the maximum values along an axis. + + See `numpy.ndarray.argmax` for more information on the + `axis` parameter. + + See Also + -------- + numpy.ndarray.argmax + """ + nv.validate_argmax(args, kwargs) + nv.validate_minmax_axis(axis) + + i8 = self.asi8 + if self.hasnans: + mask = self._isnan + if mask.all() or not skipna: + return -1 + i8 = i8.copy() + i8[mask] = 0 + return i8.argmax() + # -------------------------------------------------------------------- # Rendering Methods @@ -280,26 +393,12 @@ def _summary(self, name=None) -> str: # -------------------------------------------------------------------- # Indexing Methods - def _can_partial_date_slice(self, reso: Resolution) -> bool: + def _validate_partial_date_slice(self, reso: Resolution): raise NotImplementedError - def _parsed_string_to_bounds(self, reso: Resolution, parsed): + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): raise NotImplementedError - def _parse_with_reso(self, label: str): - # overridden by TimedeltaIndex - parsed, reso_str = parsing.parse_time_string(label, self.freq) - reso = Resolution.from_attrname(reso_str) - return parsed, reso - - def _get_string_slice(self, key: str): - parsed, reso = self._parse_with_reso(key) - try: - return self._partial_date_slice(reso, parsed) - except KeyError as err: - raise KeyError(key) from err - - @final def _partial_date_slice( self, reso: Resolution, @@ -315,8 +414,7 @@ def _partial_date_slice( ------- slice or ndarray[intp] """ - if not self._can_partial_date_slice(reso): - raise ValueError + self._validate_partial_date_slice(reso) t1, t2 = self._parsed_string_to_bounds(reso, parsed) vals = self._data._ndarray @@ -347,6 +445,23 @@ def _partial_date_slice( # -------------------------------------------------------------------- # Arithmetic Methods + __add__ = make_wrapped_arith_op("__add__") + __sub__ = make_wrapped_arith_op("__sub__") + __radd__ = make_wrapped_arith_op("__radd__") + __rsub__ = make_wrapped_arith_op("__rsub__") + __pow__ = make_wrapped_arith_op("__pow__") + __rpow__ = make_wrapped_arith_op("__rpow__") + __mul__ = make_wrapped_arith_op("__mul__") + __rmul__ = make_wrapped_arith_op("__rmul__") + __floordiv__ = make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") + __mod__ = make_wrapped_arith_op("__mod__") + __rmod__ = make_wrapped_arith_op("__rmod__") + __divmod__ = make_wrapped_arith_op("__divmod__") + __rdivmod__ = make_wrapped_arith_op("__rdivmod__") + 
__truediv__ = make_wrapped_arith_op("__truediv__") + __rtruediv__ = make_wrapped_arith_op("__rtruediv__") + def shift(self: _T, periods: int = 1, freq=None) -> _T: """ Shift index by desired number of time frequency increments. @@ -480,12 +595,7 @@ def _maybe_cast_listlike_indexer(self, keyarr): try: res = self._data._validate_listlike(keyarr, allow_object=True) except (ValueError, TypeError): - if not isinstance(keyarr, ExtensionArray): - # e.g. we don't want to cast DTA to ndarray[object] - res = com.asarray_tuplesafe(keyarr) - # TODO: com.asarray_tuplesafe shouldn't cast e.g. DatetimeArray - else: - res = keyarr + res = com.asarray_tuplesafe(keyarr) return Index(res, dtype=res.dtype) @@ -504,8 +614,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin): _is_monotonic_decreasing = Index.is_monotonic_decreasing _is_unique = Index.is_unique - _join_precedence = 10 - def _with_freq(self, freq): arr = self._data._with_freq(freq) return type(self)._simple_new(arr, name=self._name) @@ -516,12 +624,6 @@ def _has_complex_internals(self) -> bool: return False def is_type_compatible(self, kind: str) -> bool: - warnings.warn( - f"{type(self).__name__}.is_type_compatible is deprecated and will be " - "removed in a future version", - FutureWarning, - stacklevel=2, - ) return kind in self._data._infer_matches # -------------------------------------------------------------------- @@ -529,11 +631,15 @@ def is_type_compatible(self, kind: str) -> bool: def _intersection(self, other: Index, sort=False) -> Index: """ - intersection specialized to the case with matching dtypes and both non-empty. + intersection specialized to the case with matching dtypes. """ other = cast("DatetimeTimedeltaMixin", other) + if len(self) == 0: + return self.copy()._get_reconciled_name_object(other) + if len(other) == 0: + return other.copy()._get_reconciled_name_object(self) - if not self._can_fast_intersect(other): + elif not self._can_fast_intersect(other): result = Index._intersection(self, other, sort=sort) # We need to invalidate the freq because Index._intersection # uses _shallow_copy on a view of self._data, which will preserve @@ -543,11 +649,6 @@ def _intersection(self, other: Index, sort=False) -> Index: result = self._wrap_setop_result(other, result) return result._with_freq(None)._with_freq("infer") - else: - return self._fast_intersect(other, sort) - - def _fast_intersect(self, other, sort): - # to make our life easier, "sort" the two ranges if self[0] <= other[0]: left, right = self, other @@ -582,8 +683,7 @@ def _can_fast_intersect(self: _T, other: _T) -> bool: elif self.freq.is_anchored(): # this along with matching freqs ensure that we "line up", # so intersection will preserve freq - # GH#42104 - return self.freq.n == 1 + return True elif isinstance(self.freq, Tick): # We "line up" if and only if the difference between two of our points @@ -592,8 +692,7 @@ def _can_fast_intersect(self: _T, other: _T) -> bool: remainder = diff % self.freq.delta return remainder == Timedelta(0) - # GH#42104 - return self.freq.n == 1 + return True def _can_fast_union(self: _T, other: _T) -> bool: # Assumes that type(self) == type(other), as per the annotation @@ -625,7 +724,11 @@ def _can_fast_union(self: _T, other: _T) -> bool: return (right_start == left_end + freq) or right_start in left def _fast_union(self: _T, other: _T, sort=None) -> _T: - # Caller is responsible for ensuring self and other are non-empty + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) # to 
make our life easier, "sort" the two ranges if self[0] <= other[0]: @@ -675,4 +778,39 @@ def _union(self, other, sort): # that result.freq == self.freq return result else: - return super()._union(other, sort)._with_freq("infer") + return super()._union(other, sort=sort)._with_freq("infer") + + # -------------------------------------------------------------------- + # Join Methods + _join_precedence = 10 + + def join( + self, + other, + how: str = "left", + level=None, + return_indexers: bool = False, + sort: bool = False, + ): + """ + See Index.join + """ + pself, pother = self._maybe_promote(other) + if pself is not self or pother is not other: + return pself.join( + pother, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + self._maybe_utc_convert(other) # raises if we dont have tzawareness compat + return Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + + def _maybe_utc_convert(self: _T, other: Index) -> tuple[_T, Index]: + # Overridden by DatetimeIndex + return self, other diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9712a5d95a234..fbfee9a1f524c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -33,8 +33,8 @@ from pandas._typing import ( Dtype, DtypeObj, - npt, ) +from pandas.errors import InvalidIndexError from pandas.util._decorators import ( cache_readonly, doc, @@ -117,10 +117,16 @@ def _new_DatetimeIndex(cls, d): @inherit_names(["is_normalized", "_resolution_obj"], DatetimeArray, cache=True) @inherit_names( [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", "tz", "tzinfo", "dtype", "to_pydatetime", + "_has_same_tz", "_format_native_types", "date", "time", @@ -406,8 +412,7 @@ def union_many(self, others): this, other = this._maybe_utc_convert(other) - if len(self) and len(other) and this._can_fast_union(other): - # union already has fastpath handling for empty cases + if this._can_fast_union(other): this = this._fast_union(other) else: this = Index.union(this, other) @@ -569,6 +574,21 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): ------- lower, upper: pd.Timestamp """ + assert isinstance(reso, Resolution), (type(reso), reso) + valid_resos = { + "year", + "month", + "quarter", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + } + if reso.attrname not in valid_resos: + raise KeyError + grp = reso.freq_group per = Period(parsed, freq=grp.value) start, end = per.start_time, per.end_time @@ -577,22 +597,36 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): # If an incoming date string contained a UTC offset, need to localize # the parsed date to this offset first before aligning with the index's # timezone - start = start.tz_localize(parsed.tzinfo) - end = end.tz_localize(parsed.tzinfo) - if parsed.tzinfo is not None: if self.tz is None: raise ValueError( "The index must be timezone aware when indexing " "with a date string with a UTC offset" ) - start = self._maybe_cast_for_get_loc(start) - end = self._maybe_cast_for_get_loc(end) + start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) + end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) + elif self.tz is not None: + start = start.tz_localize(self.tz) + end = end.tz_localize(self.tz) return start, end - def _can_partial_date_slice(self, reso: Resolution) -> bool: - # History of conversation GH#3452, GH#3931, GH#2369, GH#14826 - return 
reso > self._resolution_obj + def _validate_partial_date_slice(self, reso: Resolution): + assert isinstance(reso, Resolution), (type(reso), reso) + if ( + self.is_monotonic + and reso.attrname in ["day", "hour", "minute", "second"] + and self._resolution_obj >= reso + ): + # These resolution/monotonicity validations came from GH3931, + # GH3452 and GH2369. + + # See also GH14826 + raise KeyError + + if reso.attrname == "microsecond": + # _partial_date_slice doesn't allow microsecond resolution, but + # _parsed_string_to_bounds allows it. + raise KeyError def _deprecate_mismatched_indexing(self, key) -> None: # GH#36148 @@ -624,7 +658,8 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ - self._check_indexing_error(key) + if not is_scalar(key): + raise InvalidIndexError(key) orig_key = key if is_valid_na_for_dtype(key, self.dtype): @@ -636,22 +671,14 @@ def get_loc(self, key, method=None, tolerance=None): key = self._maybe_cast_for_get_loc(key) elif isinstance(key, str): - try: - parsed, reso = self._parse_with_reso(key) - except ValueError as err: - raise KeyError(key) from err + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError, OverflowError): + pass - if self._can_partial_date_slice(reso): - try: - return self._partial_date_slice(reso, parsed) - except KeyError as err: - if method is None: - raise KeyError(key) from err try: key = self._maybe_cast_for_get_loc(key) except ValueError as err: - # FIXME: we get here because parse_with_reso doesn't raise on "t2m" raise KeyError(key) from err elif isinstance(key, timedelta): @@ -707,11 +734,13 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound") if isinstance(label, str): + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) try: - parsed, reso = self._parse_with_reso(label) + parsed, reso_str = parsing.parse_time_string(label, freq) except parsing.DateParseError as err: raise self._invalid_indexer("slice", label) from err + reso = Resolution.from_attrname(reso_str) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) @@ -729,6 +758,12 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): return self._maybe_cast_for_get_loc(label) + def _get_string_slice(self, key: str): + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) + parsed, reso_str = parsing.parse_time_string(key, freq) + reso = Resolution.from_attrname(reso_str) + return self._partial_date_slice(reso, parsed) + def slice_indexer(self, start=None, end=None, step=None, kind=None): """ Return indexer for specified label slice. @@ -808,7 +843,7 @@ def inferred_type(self) -> str: # sure we can't have ambiguous indexing return "datetime64" - def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: + def indexer_at_time(self, time, asof: bool = False) -> np.ndarray: """ Return index locations of values at particular time of day (e.g. 9:30AM). @@ -849,7 +884,7 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: def indexer_between_time( self, start_time, end_time, include_start: bool = True, include_end: bool = True - ) -> npt.NDArray[np.intp]: + ) -> np.ndarray: """ Return index locations of values between particular times of day (e.g., 9:00-9:30AM). 
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 1458ff1cdaa51..6ff20f7d009bc 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -102,8 +102,6 @@ def fset(self, value): else: def method(self, *args, **kwargs): - if "inplace" in kwargs: - raise ValueError(f"cannot use inplace with {type(self).__name__}") result = attr(self._data, *args, **kwargs) if wrap: if isinstance(result, type(self._data)): @@ -161,7 +159,7 @@ def wrapper(self, other): return wrapper -def _make_wrapped_arith_op(opname: str): +def make_wrapped_arith_op(opname: str): def method(self, other): if ( isinstance(other, Index) @@ -172,16 +170,7 @@ def method(self, other): # a chance to implement ops before we unwrap them. # See https://github.com/pandas-dev/pandas/issues/31109 return NotImplemented - - try: - meth = getattr(self._data, opname) - except AttributeError as err: - # e.g. Categorical, IntervalArray - cls = type(self).__name__ - raise TypeError( - f"cannot perform {opname} with this index type: {cls}" - ) from err - + meth = getattr(self._data, opname) result = meth(_maybe_unwrap_index(other)) return _wrap_arithmetic_op(self, other, result) @@ -278,23 +267,6 @@ def _simple_new( __le__ = _make_wrapped_comparison_op("__le__") __ge__ = _make_wrapped_comparison_op("__ge__") - __add__ = _make_wrapped_arith_op("__add__") - __sub__ = _make_wrapped_arith_op("__sub__") - __radd__ = _make_wrapped_arith_op("__radd__") - __rsub__ = _make_wrapped_arith_op("__rsub__") - __pow__ = _make_wrapped_arith_op("__pow__") - __rpow__ = _make_wrapped_arith_op("__rpow__") - __mul__ = _make_wrapped_arith_op("__mul__") - __rmul__ = _make_wrapped_arith_op("__rmul__") - __floordiv__ = _make_wrapped_arith_op("__floordiv__") - __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__") - __mod__ = _make_wrapped_arith_op("__mod__") - __rmod__ = _make_wrapped_arith_op("__rmod__") - __divmod__ = _make_wrapped_arith_op("__divmod__") - __rdivmod__ = _make_wrapped_arith_op("__rdivmod__") - __truediv__ = _make_wrapped_arith_op("__truediv__") - __rtruediv__ = _make_wrapped_arith_op("__rtruediv__") - @property def _has_complex_internals(self) -> bool: # used to avoid libreduction code paths, which raise or require conversion @@ -392,6 +364,13 @@ def _validate_fill_value(self, value): """ return self._data._validate_setitem_value(value) + def _get_unique_index(self): + if self.is_unique: + return self + + result = self._data.unique() + return type(self)._simple_new(result, name=self.name) + @doc(Index.map) def map(self, mapper, na_action=None): # Try to run function on index first, and then on elements of index @@ -418,13 +397,11 @@ def astype(self, dtype, copy: bool = True) -> Index: return self return self.copy() - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Literal['M8[ns]']") if ( isinstance(self.dtype, np.dtype) and isinstance(dtype, np.dtype) and dtype.kind == "M" - and dtype != "M8[ns]" # type: ignore[comparison-overlap] + and dtype != "M8[ns]" ): # For now Datetime supports this by unwrapping ndarray, but DTI doesn't raise TypeError(f"Cannot cast {type(self).__name__} to dtype") diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c401ad0c1e0d5..072ab7dff8e5b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -7,8 +7,10 @@ ) import textwrap from typing import ( + TYPE_CHECKING, Any, Hashable, + cast, ) import numpy as np @@ -28,7 +30,6 @@ from 
pandas._typing import ( Dtype, DtypeObj, - npt, ) from pandas.errors import InvalidIndexError from pandas.util._decorators import ( @@ -38,7 +39,6 @@ from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, find_common_type, infer_dtype_from_scalar, maybe_box_datetimelike, @@ -46,6 +46,7 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, + is_categorical_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ -62,7 +63,7 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.missing import is_valid_na_for_dtype -from pandas.core.algorithms import unique +from pandas.core.algorithms import take_nd from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -90,6 +91,9 @@ timedelta_range, ) +if TYPE_CHECKING: + from pandas import CategoricalIndex + _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( @@ -609,7 +613,9 @@ def get_loc( 0 """ self._check_indexing_method(method) - self._check_indexing_error(key) + + if not is_scalar(key): + raise InvalidIndexError(key) if isinstance(key, Interval): if self.closed != key.closed: @@ -645,40 +651,48 @@ def _get_indexer( method: str | None = None, limit: int | None = None, tolerance: Any | None = None, - ) -> npt.NDArray[np.intp]: + ) -> np.ndarray: + # returned ndarray is np.intp if isinstance(target, IntervalIndex): + # equal indexes -> 1:1 positional match + if self.equals(target): + return np.arange(len(self), dtype="intp") + + if not self._should_compare(target): + return self._get_indexer_non_comparable(target, method, unique=True) + # non-overlapping -> at most one match per interval in target # want exact matches -> need both left/right to match, so defer to # left/right get_indexer, compare elementwise, equality -> match left_indexer = self.left.get_indexer(target.left) right_indexer = self.right.get_indexer(target.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) - - elif not is_object_dtype(target.dtype): + elif is_categorical_dtype(target.dtype): + target = cast("CategoricalIndex", target) + # get an indexer for unique categories then propagate to codes via take_nd + categories_indexer = self.get_indexer(target.categories) + indexer = take_nd(categories_indexer, target.codes, fill_value=-1) + elif not is_object_dtype(target): # homogeneous scalar index: use IntervalTree - # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) indexer = self._engine.get_indexer(target.values) else: # heterogeneous scalar index: defer elementwise to get_loc - # we should always have self._should_partial_index(target) here return self._get_indexer_pointwise(target)[0] return ensure_platform_int(indexer) @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique( - self, target: Index - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + def get_indexer_non_unique(self, target: Index) -> tuple[np.ndarray, np.ndarray]: + # both returned ndarrays are np.intp target = ensure_index(target) - if not self._should_compare(target) and not self._should_partial_index(target): - # e.g. 
IntervalIndex with different closed or incompatible subtype - # -> no matches + if isinstance(target, IntervalIndex) and not self._should_compare(target): + # different closed or incompatible subtype -> no matches return self._get_indexer_non_comparable(target, None, unique=False) - elif is_object_dtype(target.dtype) or not self._should_partial_index(target): + elif is_object_dtype(target.dtype) or isinstance(target, IntervalIndex): # target might contain intervals: defer elementwise to get_loc return self._get_indexer_pointwise(target) @@ -690,9 +704,8 @@ def get_indexer_non_unique( return ensure_platform_int(indexer), ensure_platform_int(missing) - def _get_indexer_pointwise( - self, target: Index - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray]: + # both returned ndarrays are np.intp """ pointwise implementation for get_indexer and get_indexer_non_unique. """ @@ -740,7 +753,6 @@ def _convert_slice_indexer(self, key: slice, kind: str): return super()._convert_slice_indexer(key, kind) - @cache_readonly def _should_fallback_to_positional(self) -> bool: # integer lookups in Series.__getitem__ are unambiguously # positional in this case @@ -790,80 +802,6 @@ def _format_data(self, name=None) -> str: # name argument is unused here; just for compat with base / categorical return self._data._format_data() + "," + self._format_space() - # -------------------------------------------------------------------- - # Set Operations - - def _intersection(self, other, sort): - """ - intersection specialized to the case with matching dtypes. - """ - # For IntervalIndex we also know other.closed == self.closed - if self.left.is_unique and self.right.is_unique: - taken = self._intersection_unique(other) - elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: - # Swap other/self if other is unique and self does not have - # multiple NaNs - taken = other._intersection_unique(self) - else: - # duplicates - taken = self._intersection_non_unique(other) - - if sort is None: - taken = taken.sort_values() - - return taken - - def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex: - """ - Used when the IntervalIndex does not have any common endpoint, - no matter left or right. - Return the intersection with another IntervalIndex. - Parameters - ---------- - other : IntervalIndex - Returns - ------- - IntervalIndex - """ - # Note: this is much more performant than super()._intersection(other) - lindexer = self.left.get_indexer(other.left) - rindexer = self.right.get_indexer(other.right) - - match = (lindexer == rindexer) & (lindexer != -1) - indexer = lindexer.take(match.nonzero()[0]) - indexer = unique(indexer) - - return self.take(indexer) - - def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex: - """ - Used when the IntervalIndex does have some common endpoints, - on either sides. - Return the intersection with another IntervalIndex. 
- - Parameters - ---------- - other : IntervalIndex - - Returns - ------- - IntervalIndex - """ - # Note: this is about 3.25x faster than super()._intersection(other) - # in IntervalIndexMethod.time_intersection_both_duplicate(1000) - mask = np.zeros(len(self), dtype=bool) - - if self.hasnans and other.hasnans: - first_nan_loc = np.arange(len(self))[self.isna()][0] - mask[first_nan_loc] = True - - other_tups = set(zip(other.left, other.right)) - for i, tup in enumerate(zip(self.left, self.right)): - if tup in other_tups: - mask[i] = True - - return self[mask] - # -------------------------------------------------------------------- @property @@ -874,19 +812,6 @@ def _is_all_dates(self) -> bool: """ return False - def _get_join_target(self) -> np.ndarray: - # constructing tuples is much faster than constructing Intervals - tups = list(zip(self.left, self.right)) - target = construct_1d_object_array_from_listlike(tups) - return target - - def _from_join_target(self, result): - left, right = list(zip(*result)) - arr = type(self._data).from_arrays( - left, right, dtype=self.dtype, closed=self.closed - ) - return type(self)._simple_new(arr, name=self.name) - # TODO: arithmetic operations diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9b56e4cf89498..821d696200175 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -73,6 +73,7 @@ from pandas.core.arrays import Categorical from pandas.core.arrays.categorical import factorize_from_iterables import pandas.core.common as com +from pandas.core.indexers import is_empty_indexer import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -760,7 +761,6 @@ def levels(self) -> FrozenList: def _set_levels( self, levels, - *, level=None, copy: bool = False, validate: bool = True, @@ -954,7 +954,6 @@ def codes(self): def _set_codes( self, codes, - *, level=None, copy: bool = False, validate: bool = True, @@ -1393,7 +1392,7 @@ def format( def _get_names(self) -> FrozenList: return FrozenList(self._names) - def _set_names(self, names, *, level=None, validate: bool = True): + def _set_names(self, names, level=None, validate: bool = True): """ Set new names on index. Each name has to be a hashable type. @@ -1474,7 +1473,7 @@ def _set_names(self, names, *, level=None, validate: bool = True): # -------------------------------------------------------------------- @doc(Index._get_grouper_for_level) - def _get_grouper_for_level(self, mapper, *, level): + def _get_grouper_for_level(self, mapper, level): indexer = self.codes[level] level_index = self.levels[level] @@ -2477,7 +2476,53 @@ def sortlevel( return new_index, indexer - def _wrap_reindex_result(self, target, indexer, preserve_names: bool): + def reindex( + self, target, method=None, level=None, limit=None, tolerance=None + ) -> tuple[MultiIndex, np.ndarray | None]: + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.MultiIndex + Resulting index + indexer : np.ndarray[np.intp] or None + Indices of output values in original index. + + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, "names") + + if level is not None: + if method is not None: + raise TypeError("Fill method not supported if level passed") + + # GH7774: preserve dtype/tz if target is empty and not an Index. 
+ # target may be an iterator + target = ibase.ensure_has_len(target) + if len(target) == 0 and not isinstance(target, Index): + idx = self.levels[level] + attrs = idx._get_attributes_dict() + attrs.pop("freq", None) # don't preserve freq + target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs) + else: + target = ensure_index(target) + target, indexer, _ = self._join_level( + target, level, how="right", keep_order=False + ) + else: + target = ensure_index(target) + if self.equals(target): + indexer = None + else: + if self.is_unique: + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + else: + raise ValueError("cannot handle a non-unique multi-index!") + if not isinstance(target, MultiIndex): if indexer is None: target = self @@ -2488,12 +2533,7 @@ def _wrap_reindex_result(self, target, indexer, preserve_names: bool): target = MultiIndex.from_tuples(target) except TypeError: # not all tuples, see test_constructor_dict_multiindex_reindex_flat - return target - - target = self._maybe_preserve_names(target, preserve_names) - return target - - def _maybe_preserve_names(self, target: Index, preserve_names: bool): + return target, indexer if ( preserve_names and target.nlevels == self.nlevels @@ -2501,7 +2541,8 @@ def _maybe_preserve_names(self, target: Index, preserve_names: bool): ): target = target.copy(deep=False) target.names = self.names - return target + + return target, indexer # -------------------------------------------------------------------- # Indexing Methods @@ -2513,13 +2554,12 @@ def _check_indexing_error(self, key): # We have to explicitly exclude generators, as these are hashable. raise InvalidIndexError(key) - @cache_readonly def _should_fallback_to_positional(self) -> bool: """ Should integer key(s) be treated as positional? """ # GH#33355 - return self.levels[0]._should_fallback_to_positional + return self.levels[0]._should_fallback_to_positional() def _get_values_for_loc(self, series: Series, loc, key): """ @@ -2541,49 +2581,109 @@ def _get_values_for_loc(self, series: Series, loc, key): new_ser = series._constructor(new_values, index=new_index, name=series.name) return new_ser.__finalize__(series) - def _get_indexer_strict(self, key, axis_name: str) -> tuple[Index, np.ndarray]: - - keyarr = key - if not isinstance(keyarr, Index): - keyarr = com.asarray_tuplesafe(keyarr) - - if len(keyarr) and not isinstance(keyarr[0], tuple): - indexer = self._get_indexer_level_0(keyarr) - - self._raise_if_missing(key, indexer, axis_name) - return self[indexer], indexer + def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None: + """ + Analogous to get_indexer when we are partial-indexing on our first level. - return super()._get_indexer_strict(key, axis_name) + Parameters + ---------- + keyarr : Index, np.ndarray, or ExtensionArray + Indexer to convert. - def _raise_if_missing(self, key, indexer, axis_name: str): - keyarr = key - if not isinstance(key, Index): - keyarr = com.asarray_tuplesafe(key) + Returns + ------- + np.ndarray[intp] or None + """ + indexer = None + # are we indexing a specific level if len(keyarr) and not isinstance(keyarr[0], tuple): - # i.e. 
same condition for special case in MultiIndex._get_indexer_strict + _, indexer = self.reindex(keyarr, level=0) - mask = indexer == -1 + # take all + if indexer is None: + indexer = np.arange(len(self), dtype=np.intp) + return indexer + + check = self.levels[0].get_indexer(keyarr) + mask = check == -1 if mask.any(): - check = self.levels[0].get_indexer(keyarr) - cmask = check == -1 - if cmask.any(): - raise KeyError(f"{keyarr[cmask]} not in index") + raise KeyError(f"{keyarr[mask]} not in index") + elif is_empty_indexer(indexer, keyarr): # We get here when levels still contain values which are not # actually in Index anymore raise KeyError(f"{keyarr} not in index") - else: - return super()._raise_if_missing(key, indexer, axis_name) - def _get_indexer_level_0(self, target) -> np.ndarray: + return indexer + + def _get_partial_string_timestamp_match_key(self, key): """ - Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`. + Translate any partial string timestamp matches in key, returning the + new key. + + Only relevant for MultiIndex. """ - lev = self.levels[0] - codes = self._codes[0] - cat = Categorical.from_codes(codes=codes, categories=lev) - ci = Index(cat) - return ci.get_indexer_for(target) + # GH#10331 + if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing: + # Convert key '2016-01-01' to + # ('2016-01-01'[, slice(None, None, None)]+) + key = (key,) + (slice(None),) * (len(self.levels) - 1) + + if isinstance(key, tuple): + # Convert (..., '2016-01-01', ...) in tuple to + # (..., slice('2016-01-01', '2016-01-01', None), ...) + new_key = [] + for i, component in enumerate(key): + if ( + isinstance(component, str) + and self.levels[i]._supports_partial_string_indexing + ): + new_key.append(slice(component, component, None)) + else: + new_key.append(component) + key = tuple(new_key) + + return key + + def _get_indexer( + self, + target: Index, + method: str | None = None, + limit: int | None = None, + tolerance=None, + ) -> np.ndarray: + # returned ndarray is np.intp + + # empty indexer + if not len(target): + return ensure_platform_int(np.array([])) + + if not isinstance(target, MultiIndex): + try: + target = MultiIndex.from_tuples(target) + except (TypeError, ValueError): + + # let's instead try with a straight Index + if method is None: + return Index(self._values).get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + + # TODO: explicitly raise here? 
we only have one test that + # gets here, and it is checking that we raise with method="nearest" + + if method == "pad" or method == "backfill": + # TODO: get_indexer_with_fill docstring says values must be _sorted_ + # but that doesn't appear to be enforced + indexer = self._engine.get_indexer_with_fill( + target=target._values, values=self._values, method=method, limit=limit + ) + else: + indexer = self._engine.get_indexer(target._values) + + # Note: we only get here (in extant tests at least) with + # target.nlevels == self.nlevels + return ensure_platform_int(indexer) def get_slice_bound( self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None @@ -2702,19 +2802,15 @@ def _partial_tup_index(self, tup: tuple, side="left"): n = len(tup) start, end = 0, len(self) zipped = zip(tup, self.levels, self.codes) - for k, (lab, lev, level_codes) in enumerate(zipped): - section = level_codes[start:end] + for k, (lab, lev, labs) in enumerate(zipped): + section = labs[start:end] if lab not in lev and not isna(lab): - # short circuit - try: - loc = lev.searchsorted(lab, side=side) - except TypeError as err: - # non-comparable e.g. test_slice_locs_with_type_mismatch - raise TypeError(f"Level type mismatch: {lab}") from err - if not is_integer(loc): - # non-comparable level, e.g. test_groupby_example + if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): raise TypeError(f"Level type mismatch: {lab}") + + # short circuit + loc = lev.searchsorted(lab, side=side) if side == "right" and loc >= 0: loc -= 1 return start + section.searchsorted(loc, side=side) @@ -2833,12 +2929,7 @@ def _maybe_to_slice(loc): ) if keylen == self.nlevels and self.is_unique: - try: - return self._engine.get_loc(key) - except TypeError: - # e.g. partial string slicing - loc, _ = self.get_loc_level(key, list(range(self.nlevels))) - return loc + return self._engine.get_loc(key) # -- partial selection or non-unique index # break the key into 2 parts based on the lexsort_depth of the index; @@ -2917,27 +3008,27 @@ def get_loc_level(self, key, level=0, drop_level: bool = True): level = self._get_level_number(level) else: level = [self._get_level_number(lev) for lev in level] + return self._get_loc_level(key, level=level, drop_level=drop_level) - loc, mi = self._get_loc_level(key, level=level) - if not drop_level: - if lib.is_integer(loc): - mi = self[loc : loc + 1] - else: - mi = self[loc] - return loc, mi - - def _get_loc_level(self, key, level: int | list[int] = 0): + def _get_loc_level(self, key, level: int | list[int] = 0, drop_level: bool = True): """ get_loc_level but with `level` known to be positional, not name-based. 
""" # different name to distinguish from maybe_droplevels - def maybe_mi_droplevels(indexer, levels): - new_index = self[indexer] + def maybe_mi_droplevels(indexer, levels, drop_level: bool): + if not drop_level: + return self[indexer] + # kludge around + orig_index = new_index = self[indexer] for i in sorted(levels, reverse=True): - new_index = new_index._drop_level_numbers([i]) + try: + new_index = new_index._drop_level_numbers([i]) + except ValueError: + # no dropping here + return orig_index return new_index if isinstance(level, (tuple, list)): @@ -2952,18 +3043,10 @@ def maybe_mi_droplevels(indexer, levels): mask = np.zeros(len(self), dtype=bool) mask[loc] = True loc = mask - result = loc if result is None else result & loc - try: - # FIXME: we should be only dropping levels on which we are - # scalar-indexing - mi = maybe_mi_droplevels(result, level) - except ValueError: - # droplevel failed because we tried to drop all levels, - # i.e. len(level) == self.nlevels - mi = self[result] + result = loc if result is None else result & loc - return result, mi + return result, maybe_mi_droplevels(result, level, drop_level) # kludge for #1796 if isinstance(key, list): @@ -2972,105 +3055,64 @@ def maybe_mi_droplevels(indexer, levels): if isinstance(key, tuple) and level == 0: try: - # Check if this tuple is a single key in our first level if key in self.levels[0]: indexer = self._get_level_indexer(key, level=level) - new_index = maybe_mi_droplevels(indexer, [0]) + new_index = maybe_mi_droplevels(indexer, [0], drop_level) return indexer, new_index except (TypeError, InvalidIndexError): pass if not any(isinstance(k, slice) for k in key): - if len(key) == self.nlevels and self.is_unique: - # Complete key in unique index -> standard get_loc - try: - return (self._engine.get_loc(key), None) - except KeyError as err: - raise KeyError(key) from err - except TypeError: - # e.g. partial string indexing - # test_partial_string_timestamp_multiindex - pass - # partial selection - indexer = self.get_loc(key) - ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] - if len(ilevels) == self.nlevels: - if is_integer(indexer): - # we are dropping all levels - return indexer, None - - # TODO: in some cases we still need to drop some levels, - # e.g. test_multiindex_perf_warn - # test_partial_string_timestamp_multiindex + # optionally get indexer to avoid re-calculation + def partial_selection(key, indexer=None): + if indexer is None: + indexer = self.get_loc(key) ilevels = [ - i - for i in range(len(key)) - if ( - not isinstance(key[i], str) - or not self.levels[i]._supports_partial_string_indexing - ) - and key[i] != slice(None, None) + i for i in range(len(key)) if key[i] != slice(None, None) ] - if len(ilevels) == self.nlevels: - # TODO: why? - ilevels = [] - return indexer, maybe_mi_droplevels(indexer, ilevels) + return indexer, maybe_mi_droplevels(indexer, ilevels, drop_level) + if len(key) == self.nlevels and self.is_unique: + # Complete key in unique index -> standard get_loc + try: + return (self._engine.get_loc(key), None) + except KeyError as e: + raise KeyError(key) from e + else: + return partial_selection(key) else: indexer = None for i, k in enumerate(key): if not isinstance(k, slice): - loc_level = self._get_level_indexer(k, level=i) - if isinstance(loc_level, slice): - if com.is_null_slice(loc_level) or com.is_full_slice( - loc_level, len(self) - ): - # everything - continue - else: - # e.g. 
test_xs_IndexSlice_argument_not_implemented - k_index = np.zeros(len(self), dtype=bool) - k_index[loc_level] = True - + k = self._get_level_indexer(k, level=i) + if isinstance(k, slice): + # everything + if k.start == 0 and k.stop == len(self): + k = slice(None, None) else: - k_index = loc_level - - elif com.is_null_slice(k): - # taking everything, does not affect `indexer` below - continue + k_index = k - else: - # FIXME: this message can be inaccurate, e.g. - # test_series_varied_multiindex_alignment - raise TypeError(f"Expected label or tuple of labels, got {key}") + if isinstance(k, slice): + if k == slice(None, None): + continue + else: + raise TypeError(key) if indexer is None: indexer = k_index - else: + else: # pragma: no cover indexer &= k_index if indexer is None: indexer = slice(None, None) ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] - return indexer, maybe_mi_droplevels(indexer, ilevels) + return indexer, maybe_mi_droplevels(indexer, ilevels, drop_level) else: indexer = self._get_level_indexer(key, level=level) - if ( - isinstance(key, str) - and self.levels[level]._supports_partial_string_indexing - ): - # check to see if we did an exact lookup vs sliced - check = self.levels[level].get_loc(key) - if not is_integer(check): - # e.g. test_partial_string_timestamp_multiindex - return indexer, self[indexer] - - return indexer, maybe_mi_droplevels(indexer, [level]) + return indexer, maybe_mi_droplevels(indexer, [level], drop_level) - def _get_level_indexer( - self, key, level: int = 0, indexer: Int64Index | None = None - ): + def _get_level_indexer(self, key, level: int = 0, indexer=None): # `level` kwarg is _always_ positional, never name # return an indexer, boolean array or a slice showing where the key is # in the totality of values @@ -3163,23 +3205,15 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): if level > 0 or self._lexsort_depth == 0: # Desired level is not sorted - if isinstance(idx, slice): - # test_get_loc_partial_timestamp_multiindex - locs = (level_codes >= idx.start) & (level_codes < idx.stop) - return locs - locs = np.array(level_codes == idx, dtype=bool, copy=False) - if not locs.any(): # The label is present in self.levels[level] but unused: raise KeyError(key) return locs if isinstance(idx, slice): - # e.g. 
test_partial_string_timestamp_multiindex - start = level_codes.searchsorted(idx.start, side="left") - # NB: "left" here bc of slice semantics - end = level_codes.searchsorted(idx.stop, side="left") + start = idx.start + end = idx.stop else: start = level_codes.searchsorted(idx, side="left") end = level_codes.searchsorted(idx, side="right") @@ -3231,12 +3265,10 @@ def get_locs(self, seq): "MultiIndex slicing requires the index to be lexsorted: slicing " f"on levels {true_slices}, lexsort depth {self._lexsort_depth}" ) - + # indexer + # this is the list of all values that we want to select n = len(self) - # indexer is the list of all positions that we want to take; we - # start with it being everything and narrow it down as we look at each - # entry in `seq` - indexer = Index(np.arange(n)) + indexer = None def _convert_to_indexer(r) -> Int64Index: # return an indexer @@ -3254,10 +3286,14 @@ def _convert_to_indexer(r) -> Int64Index: r = r.nonzero()[0] return Int64Index(r) - def _update_indexer(idxr: Index, indexer: Index) -> Index: + def _update_indexer(idxr: Index | None, indexer: Index | None, key) -> Index: + if indexer is None: + indexer = Index(np.arange(n)) + if idxr is None: + return indexer indexer_intersection = indexer.intersection(idxr) if indexer_intersection.empty and not idxr.empty and not indexer.empty: - raise KeyError(seq) + raise KeyError(key) return indexer_intersection for i, k in enumerate(seq): @@ -3265,85 +3301,65 @@ def _update_indexer(idxr: Index, indexer: Index) -> Index: if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - lvl_indexer = _convert_to_indexer(k) - indexer = _update_indexer(lvl_indexer, indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer(k), indexer=indexer, key=seq + ) elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) - indexers: Int64Index | None = None for x in k: try: - # Argument "indexer" to "_get_level_indexer" of "MultiIndex" - # has incompatible type "Index"; expected "Optional[Int64Index]" - item_lvl_indexer = self._get_level_indexer( - x, level=i, indexer=indexer # type: ignore[arg-type] + idxrs = _convert_to_indexer( + self._get_level_indexer(x, level=i, indexer=indexer) ) - except KeyError: - # ignore not founds; see discussion in GH#39424 - warnings.warn( - "The behavior of indexing on a MultiIndex with a nested " - "sequence of labels is deprecated and will change in a " - "future version. `series.loc[label, sequence]` will " - "raise if any members of 'sequence' or not present in " - "the index's second level. To retain the old behavior, " - "use `series.index.isin(sequence, level=1)`", - # TODO: how to opt in to the future behavior? - # TODO: how to handle IntervalIndex level? 
(no test cases) - FutureWarning, - stacklevel=7, + indexers = (idxrs if indexers is None else indexers).union( + idxrs, sort=False ) - continue - else: - idxrs = _convert_to_indexer(item_lvl_indexer) + except KeyError: - if indexers is None: - indexers = idxrs - else: - indexers = indexers.union(idxrs, sort=False) + # ignore not founds + continue if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer) + indexer = _update_indexer(indexers, indexer=indexer, key=seq) else: # no matches we are done - # test_loc_getitem_duplicates_multiindex_empty_indexer - return np.array([], dtype=np.intp) + return np.array([], dtype=np.int64) elif com.is_null_slice(k): # empty slice - pass + indexer = _update_indexer(None, indexer=indexer, key=seq) elif isinstance(k, slice): # a slice, include BOTH of the labels - # Argument "indexer" to "_get_level_indexer" of "MultiIndex" has - # incompatible type "Index"; expected "Optional[Int64Index]" - lvl_indexer = self._get_level_indexer( - k, - level=i, - indexer=indexer, # type: ignore[arg-type] - ) indexer = _update_indexer( - _convert_to_indexer(lvl_indexer), + _convert_to_indexer( + self._get_level_indexer(k, level=i, indexer=indexer) + ), indexer=indexer, + key=seq, ) else: # a single label - lvl_indexer = self._get_loc_level(k, level=i)[0] indexer = _update_indexer( - _convert_to_indexer(lvl_indexer), + _convert_to_indexer( + self.get_loc_level(k, level=i, drop_level=False)[0] + ), indexer=indexer, + key=seq, ) # empty indexer if indexer is None: - return np.array([], dtype=np.intp) + return np.array([], dtype=np.int64) assert isinstance(indexer, Int64Index), type(indexer) indexer = self._reorder_indexer(seq, indexer) - return indexer._values.astype(np.intp, copy=False) + return indexer._values # -------------------------------------------------------------------- @@ -3572,10 +3588,27 @@ def _maybe_match_names(self, other): names.append(None) return names - def _wrap_intersection_result(self, other, result): - _, result_names = self._convert_can_do_setop(other) + def _intersection(self, other, sort=False) -> MultiIndex: + other, result_names = self._convert_can_do_setop(other) + other = other.astype(object, copy=False) + + uniq_tuples = None # flag whether _inner_indexer was successful + if self.is_monotonic and other.is_monotonic: + try: + inner_tuples = self._inner_indexer(other)[0] + sort = False # inner_tuples is already sorted + except TypeError: + pass + else: + uniq_tuples = algos.unique(inner_tuples) + + if uniq_tuples is None: + uniq_tuples = self._intersection_via_get_indexer(other, sort) + + if sort is None: + uniq_tuples = sorted(uniq_tuples) - if len(result) == 0: + if len(uniq_tuples) == 0: return MultiIndex( levels=self.levels, codes=[[]] * self.nlevels, @@ -3583,12 +3616,24 @@ def _wrap_intersection_result(self, other, result): verify_integrity=False, ) else: - return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) + + def _difference(self, other, sort) -> MultiIndex: + other, result_names = self._convert_can_do_setop(other) + + this = self._get_unique_index() - def _wrap_difference_result(self, other, result): - _, result_names = self._convert_can_do_setop(other) + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) - if len(result) == 0: + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) + difference = this._values.take(label_diff) + if sort is 
None: + difference = sorted(difference) + + if len(difference) == 0: return MultiIndex( levels=[[]] * self.nlevels, codes=[[]] * self.nlevels, @@ -3596,7 +3641,7 @@ def _wrap_difference_result(self, other, result): verify_integrity=False, ) else: - return MultiIndex.from_tuples(result, sortorder=0, names=result_names) + return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) def _convert_can_do_setop(self, other): result_names = self.names @@ -3618,6 +3663,18 @@ def _convert_can_do_setop(self, other): return other, result_names + def symmetric_difference(self, other, result_name=None, sort=None): + # On equal symmetric_difference MultiIndexes the difference is empty. + # Therefore, an empty MultiIndex is returned GH13490 + tups = Index.symmetric_difference(self, other, result_name, sort) + if len(tups) == 0: + return type(self)( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + names=tups.names, + ) + return tups + # -------------------------------------------------------------------- @doc(Index.astype) @@ -3820,7 +3877,7 @@ def maybe_droplevels(index: Index, key) -> Index: def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: """ - Coerce the array-like indexer to the smallest integer dtype that can encode all + Coerce the array_like indexer to the smallest integer dtype that can encode all of the given categories. Parameters diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 24f3df684ab10..ea2d5d9eec6ac 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -153,12 +153,7 @@ def _ensure_array(cls, data, dtype, copy: bool): if not isinstance(data, (ABCSeries, list, tuple)): data = list(data) - orig = data data = np.asarray(data, dtype=dtype) - if dtype is None and data.dtype.kind == "f": - if cls is UInt64Index and (data >= 0).all(): - # https://github.com/numpy/numpy/issues/19146 - data = np.asarray(orig, dtype=np.uint64) if issubclass(data.dtype.type, str): cls._string_data_error(data) @@ -233,7 +228,6 @@ def astype(self, dtype, copy=True): # ---------------------------------------------------------------- # Indexing Methods - @cache_readonly @doc(Index._should_fallback_to_positional) def _should_fallback_to_positional(self) -> bool: return False @@ -376,16 +370,6 @@ class UInt64Index(IntegerIndex): _default_dtype = np.dtype(np.uint64) _dtype_validation_metadata = (is_unsigned_integer_dtype, "unsigned integer") - def _validate_fill_value(self, value): - # e.g. 
np.array([1]) we want np.array([1], dtype=np.uint64) - # see test_where_uin64 - super()._validate_fill_value(value) - if hasattr(value, "dtype") and is_signed_integer_dtype(value.dtype): - if (value >= 0).all(): - return value.astype(self.dtype) - raise TypeError - return value - class Float64Index(NumericIndex): _index_descr_args = { diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index df3862553a70c..c1104b80a0a7a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -15,24 +15,29 @@ ) from pandas._libs.tslibs import ( BaseOffset, - NaT, Period, Resolution, Tick, ) +from pandas._libs.tslibs.parsing import ( + DateParseError, + parse_time_string, +) from pandas._typing import ( Dtype, DtypeObj, ) +from pandas.errors import InvalidIndexError from pandas.util._decorators import doc from pandas.core.dtypes.common import ( is_datetime64_any_dtype, + is_float, is_integer, + is_scalar, pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.arrays.period import ( PeriodArray, @@ -406,59 +411,55 @@ def get_loc(self, key, method=None, tolerance=None): """ orig_key = key - self._check_indexing_error(key) + if not is_scalar(key): + raise InvalidIndexError(key) - if is_valid_na_for_dtype(key, self.dtype): - key = NaT + if isinstance(key, str): - elif isinstance(key, str): + try: + loc = self._get_string_slice(key) + return loc + except (TypeError, ValueError): + pass try: - parsed, reso = self._parse_with_reso(key) - except ValueError as err: + asdt, reso_str = parse_time_string(key, self.freq) + except (ValueError, DateParseError) as err: # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") from err - if self._can_partial_date_slice(reso): - try: - return self._partial_date_slice(reso, parsed) - except KeyError as err: - # TODO: pass if method is not None, like DTI does? - raise KeyError(key) from err + reso = Resolution.from_attrname(reso_str) + grp = reso.freq_group.value + freqn = self.dtype.freq_group_code + + # _get_string_slice will handle cases where grp < freqn + assert grp >= freqn - if reso == self.dtype.resolution: - # the reso < self.dtype.resolution case goes through _get_string_slice - key = Period(parsed, freq=self.freq) + # BusinessDay is a bit strange. It has a *lower* code, but we never parse + # a string as "BusinessDay" resolution, just Day. 
+            if grp == freqn or (
+                reso == Resolution.RESO_DAY and self.dtype.freq.name == "B"
+            ):
+                key = Period(asdt, freq=self.freq)
                 loc = self.get_loc(key, method=method, tolerance=tolerance)
-                # Recursing instead of falling through matters for the exception
-                # message in test_get_loc3 (though not clear if that really matters)
                 return loc
             elif method is None:
                 raise KeyError(key)
             else:
-                key = Period(parsed, freq=self.freq)
-
-        elif isinstance(key, Period):
-            sfreq = self.freq
-            kfreq = key.freq
-            if not (
-                sfreq.n == kfreq.n
-                and sfreq._period_dtype_code == kfreq._period_dtype_code
-            ):
-                # GH#42247 For the subset of DateOffsets that can be Period freqs,
-                # checking these two attributes is sufficient to check equality,
-                # and much more performant than `self.freq == key.freq`
-                raise KeyError(key)
-        elif isinstance(key, datetime):
-            try:
-                key = Period(key, freq=self.freq)
-            except ValueError as err:
-                # we cannot construct the Period
-                raise KeyError(orig_key) from err
-        else:
-            # in particular integer, which Period constructor would cast to string
+                key = asdt
+
+        elif is_integer(key):
+            # Period constructor will cast to string, which we don't want
+            raise KeyError(key)
+        elif isinstance(key, Period) and key.freq != self.freq:
             raise KeyError(key)

+        try:
+            key = Period(key, freq=self.freq)
+        except ValueError as err:
+            # we cannot construct the Period
+            raise KeyError(orig_key) from err
+
         try:
             return Index.get_loc(self, key, method, tolerance)
         except KeyError as err:
@@ -491,14 +492,14 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
             return Period(label, freq=self.freq)
         elif isinstance(label, str):
             try:
-                parsed, reso = self._parse_with_reso(label)
+                parsed, reso_str = parse_time_string(label, self.freq)
+                reso = Resolution.from_attrname(reso_str)
+                bounds = self._parsed_string_to_bounds(reso, parsed)
+                return bounds[0 if side == "left" else 1]
             except ValueError as err:
                 # string cannot be parsed as datetime-like
                 raise self._invalid_indexer("slice", label) from err
-
-            lower, upper = self._parsed_string_to_bounds(reso, parsed)
-            return lower if side == "left" else upper
-        elif not isinstance(label, self._data._recognized_scalars):
+        elif is_integer(label) or is_float(label):
             raise self._invalid_indexer("slice", label)

         return label
@@ -508,10 +509,24 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
         iv = Period(parsed, freq=grp.value)
         return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end"))

-    def _can_partial_date_slice(self, reso: Resolution) -> bool:
+    def _validate_partial_date_slice(self, reso: Resolution):
         assert isinstance(reso, Resolution), (type(reso), reso)
-        # e.g. test_getitem_setitem_periodindex
-        return reso > self.dtype.resolution
+        grp = reso.freq_group
+        freqn = self.dtype.freq_group_code
+
+        if not grp.value < freqn:
+            # TODO: we used to also check for
+            # reso in ["day", "hour", "minute", "second"]
+            # why is that check not needed?
+ raise ValueError + + def _get_string_slice(self, key: str): + parsed, reso_str = parse_time_string(key, self.freq) + reso = Resolution.from_attrname(reso_str) + try: + return self._partial_date_slice(reso, parsed) + except KeyError as err: + raise KeyError(key) from err def period_range( diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0ce99df44a5f9..746246172b967 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -17,10 +17,7 @@ from pandas._libs import index as libindex from pandas._libs.lib import no_default -from pandas._typing import ( - Dtype, - npt, -) +from pandas._typing import Dtype from pandas.compat.numpy import function as nv from pandas.util._decorators import ( cache_readonly, @@ -388,7 +385,6 @@ def get_loc(self, key, method=None, tolerance=None): return self._range.index(new_key) except ValueError as err: raise KeyError(key) from err - self._check_indexing_error(key) raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) @@ -398,7 +394,8 @@ def _get_indexer( method: str | None = None, limit: int | None = None, tolerance=None, - ) -> npt.NDArray[np.intp]: + ) -> np.ndarray: + # -> np.ndarray[np.intp] if com.any_not_none(method, tolerance, limit): return super()._get_indexer( target, method=method, tolerance=tolerance, limit=limit @@ -411,6 +408,10 @@ def _get_indexer( reverse = self._range[::-1] start, stop, step = reverse.start, reverse.stop, reverse.step + if not is_signed_integer_dtype(target): + # checks/conversions/roundings are delegated to general method + return super()._get_indexer(target, method=method, tolerance=tolerance) + target_array = np.asarray(target) locs = target_array - start valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) @@ -504,7 +505,7 @@ def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: nv.validate_max(args, kwargs) return self._minmax("max") - def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: + def argsort(self, *args, **kwargs) -> np.ndarray: """ Returns the indices that would sort the index and its underlying data. 
@@ -531,7 +532,7 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: def factorize( self, sort: bool = False, na_sentinel: int | None = -1 - ) -> tuple[npt.NDArray[np.intp], RangeIndex]: + ) -> tuple[np.ndarray, RangeIndex]: codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: @@ -551,12 +552,14 @@ def equals(self, other: object) -> bool: # Set Operations def _intersection(self, other: Index, sort=False): - # caller is responsible for checking self and other are both non-empty if not isinstance(other, RangeIndex): # Int64Index return super()._intersection(other, sort=sort) + if not len(self) or not len(other): + return self._simple_new(_empty_range) + first = self._range[::-1] if self.step < 0 else self._range second = other._range[::-1] if other.step < 0 else other._range @@ -727,18 +730,6 @@ def _difference(self, other, sort=None): new_index = new_index[::-1] return new_index - def symmetric_difference(self, other, result_name: Hashable = None, sort=None): - if not isinstance(other, RangeIndex) or sort is not None: - return super().symmetric_difference(other, result_name, sort) - - left = self.difference(other) - right = other.difference(self) - result = left.union(right) - - if result_name is not None: - result = result.rename(result_name) - return result - # -------------------------------------------------------------------- def _concat(self, indexes: list[Index], name: Hashable) -> Index: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 023cb651c9632..c60ab06dd08f3 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -9,7 +9,11 @@ Timedelta, to_offset, ) -from pandas._typing import DtypeObj +from pandas._typing import ( + DtypeObj, + Optional, +) +from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import ( TD64NS_DTYPE, @@ -105,9 +109,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _data: TimedeltaArray - # Use base class method instead of DatetimeTimedeltaMixin._get_string_slice - _get_string_slice = Index._get_string_slice - # ------------------------------------------------------------------- # Constructors @@ -169,7 +170,8 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int, slice, or ndarray[int] """ - self._check_indexing_error(key) + if not is_scalar(key): + raise InvalidIndexError(key) try: key = self._data._validate_scalar(key, unbox=False) @@ -196,30 +198,17 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound") if isinstance(label, str): - try: - parsed, reso = self._parse_with_reso(label) - except ValueError as err: - # e.g. 
'unit abbreviation w/o a number' - raise self._invalid_indexer("slice", label) from err - - lower, upper = self._parsed_string_to_bounds(reso, parsed) - return lower if side == "left" else upper + parsed = Timedelta(label) + lbound = parsed.round(parsed.resolution_string) + if side == "left": + return lbound + else: + return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") elif not isinstance(label, self._data._recognized_scalars): raise self._invalid_indexer("slice", label) return label - def _parse_with_reso(self, label: str): - # the "with_reso" is a no-op for TimedeltaIndex - parsed = Timedelta(label) - return parsed, None - - def _parsed_string_to_bounds(self, reso, parsed: Timedelta): - # reso is unused, included to match signature of DTI/PI - lbound = parsed.round(parsed.resolution_string) - rbound = lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") - return lbound, rbound - # ------------------------------------------------------------------- @property @@ -230,7 +219,7 @@ def inferred_type(self) -> str: def timedelta_range( start=None, end=None, - periods: int | None = None, + periods: Optional[int] = None, freq=None, name=None, closed=None, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7d92f7ff11ed3..3707e141bc447 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -30,6 +30,7 @@ is_object_dtype, is_scalar, is_sequence, + needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -41,12 +42,8 @@ isna, ) -from pandas.core import algorithms as algos import pandas.core.common as com -from pandas.core.construction import ( - array as pd_array, - extract_array, -) +from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, is_empty_indexer, @@ -55,8 +52,11 @@ length_of_indexer, ) from pandas.core.indexes.api import ( + CategoricalIndex, Index, + IntervalIndex, MultiIndex, + ensure_index, ) if TYPE_CHECKING: @@ -821,15 +821,7 @@ def _getitem_lowerdim(self, tup: tuple): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if ( - isinstance(ax0, MultiIndex) - and self.name != "iloc" - and not any(isinstance(x, slice) for x in tup) - ): - # Note: in all extant test cases, replacing the slice condition with - # `all(is_hashable(x) or com.is_null_slice(x) for x in tup)` - # is equivalent. - # (see the other place where we call _handle_lowerdim_multi_index_axis0) + if isinstance(ax0, MultiIndex) and self.name != "iloc": with suppress(IndexingError): return self._handle_lowerdim_multi_index_axis0(tup) @@ -878,21 +870,17 @@ def _getitem_nested_tuple(self, tup: tuple): if self.name != "loc": # This should never be reached, but lets be explicit about it raise ValueError("Too many indices") - if all(is_hashable(x) or com.is_null_slice(x) for x in tup): + if isinstance(self.obj, ABCSeries) and any( + isinstance(k, tuple) for k in tup + ): + # GH#35349 Raise if tuple in tuple for series + raise ValueError("Too many indices") + if self.ndim == 1 or not any(isinstance(x, slice) for x in tup): # GH#10521 Series should reduce MultiIndex dimensions instead of # DataFrame, IndexingError is not raised when slice(None,None,None) # with one row. 
with suppress(IndexingError): return self._handle_lowerdim_multi_index_axis0(tup) - elif isinstance(self.obj, ABCSeries) and any( - isinstance(k, tuple) for k in tup - ): - # GH#35349 Raise if tuple in tuple for series - # Do this after the all-hashable-or-null-slice check so that - # we are only getting non-hashable tuples, in particular ones - # that themselves contain a slice entry - # See test_loc_series_getitem_too_many_dimensions - raise ValueError("Too many indices") # this is a series with a multi-index specified a tuple of # selectors @@ -928,7 +916,9 @@ def __getitem__(self, key): key = tuple(list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - return self.obj._get_value(*key, takeable=self._takeable) + with suppress(KeyError, IndexError, AttributeError): + # AttributeError for IntervalTree get_value + return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: # we by definition only have the 0th axis @@ -1014,7 +1004,7 @@ def _is_scalar_access(self, key: tuple) -> bool: # should not be considered scalar return False - if not ax._index_as_unique: + if not ax.is_unique: return False return True @@ -1125,13 +1115,16 @@ def _handle_lowerdim_multi_index_axis0(self, tup: tuple): try: # fast path for series or for tup devoid of slices return self._get_label(tup, axis=axis) - + except (TypeError, InvalidIndexError): + # slices are unhashable + pass except KeyError as ek: # raise KeyError if number of indexers match # else IndexingError will be raised if self.ndim < len(tup) <= self.obj.index.nlevels: raise ek - raise IndexingError("No label returned") from ek + + raise IndexingError("No label returned") def _getitem_axis(self, key, axis: int): key = item_from_zerodim(key) @@ -1139,6 +1132,7 @@ def _getitem_axis(self, key, axis: int): key = list(key) labels = self.obj._get_axis(axis) + key = labels._get_partial_string_timestamp_match_key(key) if isinstance(key, slice): self._validate_key(key, axis) @@ -1240,7 +1234,9 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): return {"key": key} if is_nested_tuple(key, labels): - if self.ndim == 1 and any(isinstance(k, tuple) for k in key): + if isinstance(self.obj, ABCSeries) and any( + isinstance(k, tuple) for k in key + ): # GH#35349 Raise if tuple in tuple for series raise ValueError("Too many indices") return labels.get_locs(key) @@ -1289,12 +1285,94 @@ def _get_listlike_indexer(self, key, axis: int): Indexer for the return object, -1 denotes keys not found. 
""" ax = self.obj._get_axis(axis) - axis_name = self.obj._get_axis_name(axis) - keyarr, indexer = ax._get_indexer_strict(key, axis_name) + keyarr = key + if not isinstance(keyarr, Index): + keyarr = com.asarray_tuplesafe(keyarr) + + if isinstance(ax, MultiIndex): + # get_indexer expects a MultiIndex or sequence of tuples, but + # we may be doing partial-indexing, so need an extra check + + # Have the index compute an indexer or return None + # if it cannot handle: + indexer = ax._convert_listlike_indexer(keyarr) + # We only act on all found values: + if indexer is not None and (indexer != -1).all(): + # _validate_read_indexer is a no-op if no -1s, so skip + return ax[indexer], indexer + + if ax._index_as_unique: + indexer = ax.get_indexer_for(keyarr) + keyarr = ax.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) + + self._validate_read_indexer(keyarr, indexer, axis) + + if needs_i8_conversion(ax.dtype) or isinstance( + ax, (IntervalIndex, CategoricalIndex) + ): + # For CategoricalIndex take instead of reindex to preserve dtype. + # For IntervalIndex this is to map integers to the Intervals they match to. + keyarr = ax.take(indexer) + if keyarr.dtype.kind in ["m", "M"]: + # DTI/TDI.take can infer a freq in some cases when we dont want one + if isinstance(key, list) or ( + isinstance(key, type(ax)) and key.freq is None + ): + keyarr = keyarr._with_freq(None) return keyarr, indexer + def _validate_read_indexer(self, key, indexer, axis: int): + """ + Check that indexer can be used to return a result. + + e.g. at least one element was found, + unless the list of keys was actually empty. + + Parameters + ---------- + key : list-like + Targeted labels (only used to show correct error message). + indexer: array-like of booleans + Indices corresponding to the key, + (with -1 indicating not found). + axis : int + Dimension on which the indexing is being made. + + Raises + ------ + KeyError + If at least one key was requested but none was found. + """ + if len(key) == 0: + return + + # Count missing values: + missing_mask = indexer < 0 + missing = (missing_mask).sum() + + if missing: + ax = self.obj._get_axis(axis) + + # TODO: remove special-case; this is just to keep exception + # message tests from raising while debugging + use_interval_msg = isinstance(ax, IntervalIndex) or ( + isinstance(ax, CategoricalIndex) + and isinstance(ax.categories, IntervalIndex) + ) + + if missing == len(indexer): + axis_name = self.obj._get_axis_name(axis) + if use_interval_msg: + key = list(key) + raise KeyError(f"None of [{key}] are in the [{axis_name}]") + + not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) + raise KeyError(f"{not_found} not in index") + @doc(IndexingMixin.iloc) class _iLocIndexer(_LocationIndexer): @@ -1584,21 +1662,6 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): if com.is_null_slice(indexer[0]): # We are setting an entire column self.obj[key] = value - return - elif is_array_like(value): - # GH#42099 - arr = extract_array(value, extract_numpy=True) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - if not isinstance(value, ABCSeries): - # if not Series (in which case we need to align), - # we can short-circuit - empty_value[indexer[0]] = arr - self.obj[key] = empty_value - return - - self.obj[key] = empty_value - else: self.obj[key] = infer_fill_value(value) @@ -2125,7 +2188,7 @@ class _ScalarAccessIndexer(NDFrameIndexerBase): Access scalars quickly. 
""" - def _convert_key(self, key): + def _convert_key(self, key, is_setter: bool = False): raise AbstractMethodError(self) def __getitem__(self, key): @@ -2149,7 +2212,7 @@ def __setitem__(self, key, value): if not isinstance(key, tuple): key = _tuplify(self.ndim, key) - key = list(self._convert_key(key)) + key = list(self._convert_key(key, is_setter=True)) if len(key) != self.ndim: raise ValueError("Not enough indexers for scalar access (setting)!") @@ -2160,7 +2223,7 @@ def __setitem__(self, key, value): class _AtIndexer(_ScalarAccessIndexer): _takeable = False - def _convert_key(self, key): + def _convert_key(self, key, is_setter: bool = False): """ Require they keys to be the same type as the index. (so we don't fallback) @@ -2171,6 +2234,10 @@ def _convert_key(self, key): if self.ndim == 1 and len(key) > 1: key = (key,) + # allow arbitrary setting + if is_setter: + return list(key) + return key @property @@ -2205,7 +2272,7 @@ def __setitem__(self, key, value): class _iAtIndexer(_ScalarAccessIndexer): _takeable = True - def _convert_key(self, key): + def _convert_key(self, key, is_setter: bool = False): """ Require integer args. (and convert to label arguments) """ diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 51ea45ac18ce0..76967cdc9b52e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -7,7 +7,6 @@ TYPE_CHECKING, Any, Callable, - Hashable, TypeVar, ) @@ -20,6 +19,7 @@ from pandas._typing import ( ArrayLike, DtypeObj, + Hashable, ) from pandas.util._validators import validate_bool_kwarg @@ -820,7 +820,9 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): assert isinstance(value, (np.ndarray, ExtensionArray)) assert value.ndim == 1 assert len(value) == len(self._axes[0]) - self.arrays[loc] = value + # error: Invalid index type "Union[int, slice, ndarray]" for + # "List[Union[ndarray, ExtensionArray]]"; expected type "int" + self.arrays[loc] = value # type: ignore[index] return # multiple columns -> convert slice or array to integer indices diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 0ee22200ed495..2bb14efad1ce7 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -4,14 +4,12 @@ """ from __future__ import annotations -from typing import ( - TypeVar, - final, -) +from typing import TypeVar from pandas._typing import ( DtypeObj, Shape, + final, ) from pandas.errors import AbstractMethodError diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 314ab5243b6c8..2e7e6c7f7a100 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -9,7 +9,6 @@ Iterable, Sequence, cast, - final, ) import warnings @@ -29,6 +28,7 @@ DtypeObj, F, Shape, + final, ) from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg @@ -281,7 +281,7 @@ def __repr__(self) -> str: result = f"{name}: {len(self)} dtype: {self.dtype}" else: - shape = " x ".join([str(s) for s in self.shape]) + shape = " x ".join(str(s) for s in self.shape) result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}" return result @@ -312,6 +312,17 @@ def getitem_block(self, slicer) -> Block: return type(self)(new_values, new_mgr_locs, self.ndim) + def getitem_block_index(self, slicer: slice) -> Block: + """ + Perform __getitem__-like specialized to slicing along index. 
+ + Assumes self.ndim == 2 + """ + # error: Invalid index type "Tuple[ellipsis, slice]" for + # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]" + new_values = self.values[..., slicer] # type: ignore[index] + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + @final def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: """ @@ -886,7 +897,7 @@ def setitem(self, indexer, value): Parameters ---------- - indexer : tuple, list-like, array-like, slice, int + indexer : tuple, list-like, array-like, slice The subset of self.values to set value : object The value being set @@ -1156,17 +1167,13 @@ def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Blo # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also - # error: Value of type variable "NumpyArrayT" of "maybe_upcast" cannot be - # "Union[ndarray[Any, Any], ExtensionArray]" + # error: Argument 1 to "maybe_upcast" has incompatible type "Union[ndarray, + # ExtensionArray]"; expected "ndarray" new_values, fill_value = maybe_upcast( - self.values, fill_value # type: ignore[type-var] + self.values, fill_value # type: ignore[arg-type] ) - # error: Argument 1 to "shift" has incompatible type "Union[ndarray[Any, Any], - # ExtensionArray]"; expected "ndarray[Any, Any]" - new_values = shift( - new_values, periods, axis, fill_value # type: ignore[arg-type] - ) + new_values = shift(new_values, periods, axis, fill_value) return [self.make_block(new_values)] @@ -1269,7 +1276,7 @@ def _unstack(self, unstacker, fill_value, new_placement): ------- blocks : list of Block New blocks of unstacked values. - mask : array-like of bool + mask : array_like of bool The mask of columns of `blocks` we should keep. """ new_values, mask = unstacker.get_new_values( @@ -1450,7 +1457,7 @@ def setitem(self, indexer, value): Parameters ---------- - indexer : tuple, list-like, array-like, slice, int + indexer : tuple, list-like, array-like, slice The subset of self.values to set value : object The value being set @@ -1546,18 +1553,6 @@ def _slice(self, slicer): return self.values[slicer] - @final - def getitem_block_index(self, slicer: slice) -> ExtensionBlock: - """ - Perform __getitem__-like specialized to slicing along index. 
- - Assumes self.ndim == 2 - """ - # error: Invalid index type "Tuple[ellipsis, slice]" for - # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]" - new_values = self.values[..., slicer] # type: ignore[index] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) - def fillna( self, value, limit=None, inplace: bool = False, downcast=None ) -> list[Block]: @@ -1628,10 +1623,6 @@ def where(self, other, cond, errors="raise") -> list[Block]: # NotImplementedError for class not implementing `__setitem__` # TypeError for SparseArray, which implements just to raise # a TypeError - if isinstance(result, Categorical): - # TODO: don't special-case - raise - result = type(self.values)._from_sequence( np.where(cond, self.values, other), dtype=dtype ) @@ -1666,6 +1657,8 @@ def _unstack(self, unstacker, fill_value, new_placement): class NumpyBlock(libinternals.NumpyBlock, Block): values: np.ndarray + getitem_block_index = libinternals.NumpyBlock.getitem_block_index + class NumericBlock(NumpyBlock): __slots__ = () @@ -1678,6 +1671,7 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock """ values: NDArrayBackedExtensionArray + getitem_block_index = libinternals.NDArrayBackedBlock.getitem_block_index @property def is_view(self) -> bool: @@ -1900,7 +1894,9 @@ def get_block_type(values, dtype: Dtype | None = None): cls = ExtensionBlock elif isinstance(dtype, CategoricalDtype): cls = CategoricalBlock - elif vtype is Timestamp: + # error: Non-overlapping identity check (left operand type: "Type[generic]", + # right operand type: "Type[Timestamp]") + elif vtype is Timestamp: # type: ignore[comparison-overlap] cls = DatetimeTZBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 203e48ae48b58..9642b30ab91ca 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -592,9 +592,6 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: # e.g. 
DatetimeLikeBlock can be dt64 or td64, but these are not uniform all( is_dtype_equal(ju.block.dtype, join_units[0].block.dtype) - # GH#42092 we only want the dtype_equal check for non-numeric blocks - # (for now, may change but that would need a deprecation) - or ju.block.dtype.kind in ["b", "i", "u"] for ju in join_units ) and diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 22cce5c614d5a..81bf3ca4ba07a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -10,7 +10,6 @@ Any, Hashable, Sequence, - cast, ) import warnings @@ -26,7 +25,6 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -60,6 +58,7 @@ TimedeltaArray, ) from pandas.core.construction import ( + create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, range_to_ndarray, @@ -67,9 +66,7 @@ ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( - DatetimeIndex, Index, - TimedeltaIndex, ensure_index, get_objs_combined_axis, union_indexes, @@ -168,9 +165,6 @@ def rec_array_to_mgr( # fill if needed if isinstance(data, np.ma.MaskedArray): - # GH#42200 we only get here with MaskedRecords, but check for the - # parent class MaskedArray to avoid the need to import MaskedRecords - data = cast("MaskedRecords", data) new_arrays = fill_masked_arrays(data, arr_columns) else: # error: Incompatible types in assignment (expression has type @@ -348,17 +342,22 @@ def ndarray_to_mgr( # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): - obj_columns = list(values) - maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] - # don't convert (and copy) the objects if no type inference occurs - if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): - dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] + + if values.ndim == 2 and values.shape[0] != 1: + # transpose and separate blocks + + dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] + dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] + + # TODO: What about re-joining object columns? 
block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] + else: - nb = new_block(values, placement=slice(len(columns)), ndim=2) + datelike_vals = maybe_infer_to_datetimelike(values) + nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) @@ -553,7 +552,6 @@ def convert(v): def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: - oindex = None homogenized = [] for val in data: @@ -568,18 +566,9 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = val._values else: if isinstance(val, dict): - # GH#41785 this _should_ be equivalent to (but faster than) - # val = create_series_with_explicit_dtype(val, index=index)._values - if oindex is None: - oindex = index.astype("O") - - if isinstance(index, (DatetimeIndex, TimedeltaIndex)): - # see test_constructor_dict_datetime64_index - val = dict_compat(val) - else: - # see test_constructor_subclass_dict - val = dict(val) - val = lib.fast_multiget(val, oindex._values, default=np.nan) + # see test_constructor_subclass_dict + # test_constructor_dict_datetime64_index + val = create_series_with_explicit_dtype(val, index=index)._values val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dca6ddf703446..48f0b7f7f964b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -22,9 +22,9 @@ from pandas._libs.internals import BlockPlacement from pandas._typing import ( ArrayLike, + Dtype, DtypeObj, Shape, - npt, type_t, ) from pandas.errors import PerformanceWarning @@ -381,25 +381,6 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T: if fill_value is lib.no_default: fill_value = None - if axis == 0 and self.ndim == 2 and self.nblocks > 1: - # GH#35488 we need to watch out for multi-block cases - # We only get here with fill_value not-lib.no_default - ncols = self.shape[0] - if periods > 0: - indexer = [-1] * periods + list(range(ncols - periods)) - else: - nper = abs(periods) - indexer = list(range(nper, ncols)) + [-1] * nper - result = self.reindex_indexer( - self.items, - indexer, - axis=0, - fill_value=fill_value, - allow_dups=True, - consolidate=False, - ) - return result - return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) def fillna(self: T, value, limit, inplace: bool, downcast) -> T: @@ -498,10 +479,6 @@ def is_view(self) -> bool: return False - def _get_data_subset(self: T, predicate: Callable) -> T: - blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks, copy=False) - def get_bool_data(self: T, copy: bool = False) -> T: """ Select blocks that are bool-dtype and columns from object-dtype blocks @@ -1408,7 +1385,7 @@ def to_dict(self, copy: bool = True): def as_array( self, transpose: bool = False, - dtype: npt.DTypeLike | None = None, + dtype: Dtype | None = None, copy: bool = False, na_value=lib.no_default, ) -> np.ndarray: @@ -1448,21 +1425,17 @@ def as_array( # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no # attribute "to_numpy" arr = blk.values.to_numpy( # type: ignore[union-attr] - # pandas/core/internals/managers.py:1428: error: Argument "dtype" to - # "to_numpy" of "ExtensionArray" has incompatible type - # "Optional[Union[dtype[Any], None, type, _SupportsDType, str, - # Union[Tuple[Any, int], 
Tuple[Any, Union[SupportsIndex, - # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, - # Any]]]]"; expected "Optional[Union[ExtensionDtype, Union[str, - # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], - # Type[bool], Type[object]]]" - dtype=dtype, # type: ignore[arg-type] - na_value=na_value, + dtype=dtype, na_value=na_value ).reshape(blk.shape) else: arr = np.asarray(blk.get_values()) if dtype: - arr = arr.astype(dtype, copy=False) + # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has + # incompatible type "Union[ExtensionDtype, str, dtype[Any], + # Type[object]]"; expected "Union[dtype[Any], None, type, + # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, + # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + arr = arr.astype(dtype, copy=False) # type: ignore[arg-type] else: arr = self._interleave(dtype=dtype, na_value=na_value) # The underlying data was copied within _interleave @@ -1477,9 +1450,7 @@ def as_array( return arr.transpose() if transpose else arr def _interleave( - self, - dtype: npt.DTypeLike | ExtensionDtype | None = None, - na_value=lib.no_default, + self, dtype: Dtype | None = None, na_value=lib.no_default ) -> np.ndarray: """ Return ndarray from blocks with specified item order @@ -1514,16 +1485,7 @@ def _interleave( # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no # attribute "to_numpy" arr = blk.values.to_numpy( # type: ignore[union-attr] - # pandas/core/internals/managers.py:1485: error: Argument "dtype" to - # "to_numpy" of "ExtensionArray" has incompatible type - # "Union[dtype[Any], None, type, _SupportsDType, str, Tuple[Any, - # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], - # _DTypeDict, Tuple[Any, Any], ExtensionDtype]"; expected - # "Optional[Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object]]]" - # [arg-type] - dtype=dtype, # type: ignore[arg-type] - na_value=na_value, + dtype=dtype, na_value=na_value ) else: # error: Argument 1 to "get_values" of "Block" has incompatible type diff --git a/pandas/core/missing.py b/pandas/core/missing.py index f144821220e4b..8849eb0670faa 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -398,7 +398,7 @@ def interpolate_1d( # preserve NaNs on the inside preserve_nans |= mid_nans - # sort preserve_nans and convert to list + # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) result = yvalues.copy() @@ -524,11 +524,11 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): Parameters ---------- - xi : array-like + xi : array_like sorted 1D array of x-coordinates - yi : array-like or list of array-likes + yi : array_like or list of array-likes yi[i][j] is the j-th derivative known at xi[i] - order: None or int or array-like of ints. Default: None. + order: None or int or array_like of ints. Default: None. Specifies the degree of local polynomials. If not None, some derivatives are ignored. der : int or list @@ -546,7 +546,7 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): Returns ------- - y : scalar or array-like + y : scalar or array_like The result, of length R or length M or M by R. """ from scipy import interpolate @@ -568,13 +568,13 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): Parameters ---------- - xi : array-like + xi : array_like A sorted list of x-coordinates, of length N. - yi : array-like + yi : array_like A 1-D array of real values. 
`yi`'s length along the interpolation axis must be equal to the length of `xi`. If N-D array, use axis parameter to select correct axis. - x : scalar or array-like + x : scalar or array_like Of length M. der : int, optional How many derivatives to extract; None for all potentially @@ -590,7 +590,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): Returns ------- - y : scalar or array-like + y : scalar or array_like The result, of length R or length M or M by R, """ @@ -609,14 +609,14 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat Parameters ---------- - xi : array-like, shape (n,) + xi : array_like, shape (n,) 1-d array containing values of the independent variable. Values must be real, finite and in strictly increasing order. - yi : array-like + yi : array_like Array containing values of the dependent variable. It can have arbitrary number of dimensions, but the length along ``axis`` (see below) must match the length of ``x``. Values must be finite. - x : scalar or array-like, shape (m,) + x : scalar or array_like, shape (m,) axis : int, optional Axis along which `y` is assumed to be varying. Meaning that for ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``. @@ -644,7 +644,7 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat tuple `(order, deriv_values)` allowing to specify arbitrary derivatives at curve ends: * `order`: the derivative order, 1 or 2. - * `deriv_value`: array-like containing derivative values, shape must + * `deriv_value`: array_like containing derivative values, shape must be the same as `y`, excluding ``axis`` dimension. For example, if `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D @@ -661,7 +661,7 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat Returns ------- - y : scalar or array-like + y : scalar or array_like The result, of shape (m,) References diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index db7289f7c3547..ecdf2624c8ec1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -177,8 +177,10 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: def _has_infs(result) -> bool: if isinstance(result, np.ndarray): - if result.dtype == "f8" or result.dtype == "f4": - return lib.has_infs(result.ravel("K")) + if result.dtype == "f8": + return lib.has_infs_f8(result.ravel("K")) + elif result.dtype == "f4": + return lib.has_infs_f4(result.ravel("K")) try: return np.isinf(result).any() except (TypeError, NotImplementedError): @@ -203,7 +205,7 @@ def _get_fill_value( else: if fill_value_typ == "+inf": # need the max int here - return lib.i8max + return np.iinfo(np.int64).max else: return iNaT @@ -374,7 +376,7 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None): result = np.nan # raise if we have a timedelta64[ns] which is too large - if np.fabs(result) > lib.i8max: + if np.fabs(result) > np.iinfo(np.int64).max: raise ValueError("overflow in timedelta operation") result = Timedelta(result, unit="ns") @@ -581,7 +583,9 @@ def nansum( if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): - dtype_sum = np.dtype(np.float64) + # error: Incompatible types in assignment (expression has type + # "Type[float64]", variable has type "dtype") + dtype_sum = np.float64 # type: ignore[assignment] the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) @@ 
-599,9 +603,7 @@ def _mask_datetimelike_result( # we need to apply the mask result = result.astype("i8").view(orig_values.dtype) axis_mask = mask.any(axis=axis) - # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any], - # datetime64, timedelta64]") - result[axis_mask] = iNaT # type: ignore[index] + result[axis_mask] = iNaT else: if mask.any(): return NaT @@ -753,10 +755,7 @@ def get_median(x): def get_empty_reduction_result( - shape: tuple[int, ...], - axis: int, - dtype: np.dtype | type[np.floating], - fill_value: Any, + shape: tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any ) -> np.ndarray: """ The result from a reduction on an empty ndarray. @@ -785,7 +784,7 @@ def _get_counts_nanvar( axis: int | None, ddof: int, dtype: Dtype = float, -) -> tuple[int | float | np.ndarray, int | float | np.ndarray]: +) -> tuple[int | np.ndarray, int | np.ndarray]: """ Get the count of non-null values along an axis, accounting for degrees of freedom. @@ -805,12 +804,14 @@ def _get_counts_nanvar( Returns ------- - count : int, np.nan or np.ndarray - d : int, np.nan or np.ndarray + count : scalar or array + d : scalar or array """ dtype = get_dtype(dtype) count = _get_counts(values_shape, mask, axis, dtype=dtype) - d = count - dtype.type(ddof) + # error: Unsupported operand types for - ("int" and "generic") + # error: Unsupported operand types for - ("float" and "generic") + d = count - dtype.type(ddof) # type: ignore[operator] # always return NaN, never inf if is_scalar(count): @@ -818,13 +819,16 @@ def _get_counts_nanvar( count = np.nan d = np.nan else: - # count is not narrowed by is_scalar check - count = cast(np.ndarray, count) - mask = count <= ddof - if mask.any(): - np.putmask(d, mask, np.nan) - np.putmask(count, mask, np.nan) - return count, d + # error: Incompatible types in assignment (expression has type + # "Union[bool, Any]", variable has type "ndarray") + mask2: np.ndarray = count <= ddof # type: ignore[assignment] + if mask2.any(): + np.putmask(d, mask2, np.nan) + np.putmask(count, mask2, np.nan) + # error: Incompatible return value type (got "Tuple[Union[int, float, + # ndarray], Any]", expected "Tuple[Union[int, ndarray], Union[int, + # ndarray]]") + return count, d # type: ignore[return-value] @bottleneck_switch(ddof=1) @@ -1396,7 +1400,9 @@ def _get_counts( n = mask.size - mask.sum() else: n = np.prod(values_shape) - return dtype.type(n) + # error: Incompatible return value type (got "Union[Any, generic]", + # expected "Union[int, float, ndarray]") + return dtype.type(n) # type: ignore[return-value] if mask is not None: count = mask.shape[axis] - mask.sum(axis) @@ -1404,7 +1410,9 @@ def _get_counts( count = values_shape[axis] if is_scalar(count): - return dtype.type(count) + # error: Incompatible return value type (got "Union[Any, generic]", + # expected "Union[int, float, ndarray]") + return dtype.type(count) # type: ignore[return-value] try: return count.astype(dtype) except AttributeError: @@ -1750,7 +1758,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: if accum_func == np.minimum.accumulate: # Note: the accum_func comparison fails as an "is" comparison y = values.view("i8") - y[mask] = lib.i8max + y[mask] = np.iinfo(np.int64).max changed = True else: y = values @@ -1777,9 +1785,8 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: # TODO: have this case go through a DTA method? 
# For DatetimeTZDtype, view result as M8[ns] npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]" - # Item "type" of "Union[Type[ExtensionArray], Type[ndarray[Any, Any]]]" - # has no attribute "_simple_new" - result = type(values)._simple_new( # type: ignore[union-attr] + # error: "Type[ExtensionArray]" has no attribute "_simple_new" + result = type(values)._simple_new( # type: ignore[attr-defined] result.view(npdtype), dtype=orig_dtype ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b9a75a6917140..76e23f1bf77e0 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -4,10 +4,9 @@ from datetime import timedelta from textwrap import dedent from typing import ( + TYPE_CHECKING, Callable, Hashable, - Literal, - final, no_type_check, ) @@ -28,7 +27,7 @@ T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, - npt, + final, ) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -89,6 +88,9 @@ Tick, ) +if TYPE_CHECKING: + from typing import Literal + _shared_docs_kwargs: dict[str, str] = {} @@ -1769,8 +1771,9 @@ def _get_period_bins(self, ax: PeriodIndex): def _take_new_index( - obj: FrameOrSeries, indexer: npt.NDArray[np.intp], new_index: Index, axis: int = 0 + obj: FrameOrSeries, indexer: np.ndarray, new_index: Index, axis: int = 0 ) -> FrameOrSeries: + # indexer: np.ndarray[np.intp] if isinstance(obj, ABCSeries): new_values = algos.take_nd(obj._values, indexer) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d908638c4706b..ea34bc75b4e31 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -15,6 +15,7 @@ import numpy as np +from pandas._typing import FrameOrSeriesUnion from pandas.util._decorators import ( cache_readonly, deprecate_nonkeyword_arguments, @@ -82,7 +83,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -) -> DataFrame | Series: +) -> FrameOrSeriesUnion: ... @@ -98,7 +99,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -) -> DataFrame | Series: +) -> FrameOrSeriesUnion: """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -361,13 +362,8 @@ def __init__( clean_keys.append(k) clean_objs.append(v) objs = clean_objs - - if isinstance(keys, MultiIndex): - # TODO: retain levels? 
- keys = type(keys).from_tuples(clean_keys, names=keys.names) - else: - name = getattr(keys, "name", None) - keys = Index(clean_keys, name=name) + name = getattr(keys, "name", None) + keys = Index(clean_keys, name=name) if len(objs) == 0: raise ValueError("All objects passed were None") @@ -458,7 +454,7 @@ def __init__( if self._is_frame and axis == 1: name = 0 # mypy needs to know sample is not an NDFrame - sample = cast("DataFrame | Series", sample) + sample = cast("FrameOrSeriesUnion", sample) obj = sample._constructor({name: obj}) self.objs.append(obj) @@ -478,8 +474,8 @@ def __init__( self.new_axes = self._get_new_axes() def get_result(self): - cons: type[DataFrame | Series] - sample: DataFrame | Series + cons: type[FrameOrSeriesUnion] + sample: FrameOrSeriesUnion # series only if self._is_series: diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index acd6e540aaae3..6a0fad9ee729b 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -21,7 +21,6 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import notna -import pandas.core.algorithms as algos from pandas.core.arrays import Categorical import pandas.core.common as com from pandas.core.indexes.api import ( @@ -107,7 +106,7 @@ def melt( id_vars + value_vars ) else: - idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars)) + idx = frame.columns.get_indexer(id_vars + value_vars) frame = frame.iloc[:, idx] else: frame = frame.copy() @@ -227,7 +226,7 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr else: keys, values = zip(*groups) - all_cols = list(set.union(*(set(x) for x in values))) + all_cols = list(set.union(*[set(x) for x in values])) id_cols = list(data.columns.difference(all_cols)) K = len(values[0]) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f82fcfcf172a9..143999a4677b3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -29,7 +29,6 @@ FrameOrSeries, IndexLabel, Suffixes, - npt, ) from pandas.errors import MergeError from pandas.util._decorators import ( @@ -1004,7 +1003,7 @@ def _create_join_index( self, index: Index, other_index: Index, - indexer: npt.NDArray[np.intp], + indexer: np.ndarray, how: str = "left", ) -> Index: """ @@ -1449,7 +1448,7 @@ def _validate(self, validate: str) -> None: def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: +) -> tuple[np.ndarray, np.ndarray]: """ Parameters @@ -1508,9 +1507,9 @@ def restore_dropped_levels_multijoin( right: MultiIndex, dropped_level_names, join_index: Index, - lindexer: npt.NDArray[np.intp], - rindexer: npt.NDArray[np.intp], -) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]: + lindexer: np.ndarray, + rindexer: np.ndarray, +) -> tuple[list[Index], np.ndarray, list[Hashable]]: """ *this is an internal non-public method* @@ -1540,7 +1539,7 @@ def restore_dropped_levels_multijoin( ------- levels : list of Index levels of combined multiindexes - labels : np.ndarray[np.intp] + labels : intp array labels of combined multiindexes names : List[Hashable] names of combined multiindex levels @@ -2056,7 +2055,7 @@ def _left_join_on_index( def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: +) -> tuple[np.ndarray, np.ndarray, int]: """ Encode left and right keys as enumerated types. 
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index fcf00276aa8af..51556fda6da04 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -14,6 +14,7 @@ AggFuncType, AggFuncTypeBase, AggFuncTypeDict, + FrameOrSeriesUnion, IndexLabel, ) from pandas.util._decorators import ( @@ -253,7 +254,7 @@ def __internal_pivot_table( def _add_margins( - table: DataFrame | Series, + table: FrameOrSeriesUnion, data, values, rows, @@ -481,7 +482,7 @@ def pivot( if columns is None: raise TypeError("pivot() missing 1 required argument: 'columns'") - columns_listlike = com.convert_to_list_like(columns) + columns = com.convert_to_list_like(columns) if values is None: if index is not None: @@ -493,27 +494,28 @@ def pivot( # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray") # error: Unsupported left operand type for + ("ExtensionArray") indexed = data.set_index( - cols + columns_listlike, append=append # type: ignore[operator] + cols + columns, append=append # type: ignore[operator] ) else: if index is None: - index_list = [Series(data.index, name=data.index.name)] + index = [Series(data.index, name=data.index.name)] else: - index_list = [data[idx] for idx in com.convert_to_list_like(index)] + index = com.convert_to_list_like(index) + index = [data[idx] for idx in index] - data_columns = [data[col] for col in columns_listlike] - index_list.extend(data_columns) - multiindex = MultiIndex.from_arrays(index_list) + data_columns = [data[col] for col in columns] + index.extend(data_columns) + index = MultiIndex.from_arrays(index) if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name values = cast(Sequence[Hashable], values) indexed = data._constructor( - data[values]._values, index=multiindex, columns=values + data[values]._values, index=index, columns=values ) else: - indexed = data._constructor_sliced(data[values]._values, index=multiindex) - return indexed.unstack(columns_listlike) + indexed = data._constructor_sliced(data[values]._values, index=index) + return indexed.unstack(columns) def crosstab( diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 12ab08c4e30a1..93859eb11dd44 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -10,10 +10,7 @@ import pandas._libs.reshape as libreshape from pandas._libs.sparse import IntIndex -from pandas._typing import ( - Dtype, - npt, -) +from pandas._typing import Dtype from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -28,13 +25,11 @@ is_object_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable -from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, @@ -139,7 +134,7 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None): def _indexer_and_to_sort( self, ) -> tuple[ - npt.NDArray[np.intp], + np.ndarray, # np.ndarray[np.intp] list[np.ndarray], # each has _some_ signed integer dtype ]: v = self.level @@ -238,22 +233,15 @@ def get_new_values(self, values, fill_value=None): if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) - name = np.dtype(dtype).name else: dtype, 
fill_value = maybe_promote(values.dtype, fill_value) - if isinstance(dtype, ExtensionDtype): - # GH#41875 - cls = dtype.construct_array_type() - new_values = cls._empty(result_shape, dtype=dtype) - new_values[:] = fill_value - name = dtype.name - else: - new_values = np.empty(result_shape, dtype=dtype) - new_values.fill(fill_value) - name = np.dtype(dtype).name + new_values = np.empty(result_shape, dtype=dtype) + new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) + name = np.dtype(dtype).name + # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats @@ -279,10 +267,6 @@ def get_new_values(self, values, fill_value=None): # reconstruct dtype if needed if needs_i8_conversion(values.dtype): - # view as datetime64 so we can wrap in DatetimeArray and use - # DTA's view method - new_values = new_values.view("M8[ns]") - new_values = ensure_wrapped_if_datetimelike(new_values) new_values = new_values.view(values.dtype) return new_values, new_mask @@ -1020,9 +1004,7 @@ def get_empty_frame(data) -> DataFrame: fill_value: bool | float | int if is_integer_dtype(dtype): fill_value = 0 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[bool]") - elif dtype == bool: # type: ignore[comparison-overlap] + elif dtype == bool: fill_value = False else: fill_value = 0.0 diff --git a/pandas/core/sample.py b/pandas/core/sample.py deleted file mode 100644 index e4bad22e8e43c..0000000000000 --- a/pandas/core/sample.py +++ /dev/null @@ -1,144 +0,0 @@ -""" -Module containing utilities for NDFrame.sample() and .GroupBy.sample() -""" -from __future__ import annotations - -import numpy as np - -from pandas._libs import lib -from pandas._typing import FrameOrSeries - -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCSeries, -) - - -def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: - """ - Process and validate the `weights` argument to `NDFrame.sample` and - `.GroupBy.sample`. - - Returns `weights` as an ndarray[np.float64], validated except for normalizing - weights (because that must be done groupwise in groupby sampling). - """ - # If a series, align with frame - if isinstance(weights, ABCSeries): - weights = weights.reindex(obj.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, str): - if isinstance(obj, ABCDataFrame): - if axis == 0: - try: - weights = obj[weights] - except KeyError as err: - raise KeyError( - "String passed to weights not a valid column" - ) from err - else: - raise ValueError( - "Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame" - ) - else: - raise ValueError( - "Strings cannot be passed as weights when sampling from a Series." - ) - - if isinstance(obj, ABCSeries): - func = obj._constructor - else: - func = obj._constructor_sliced - - weights = func(weights, dtype="float64")._values - - if len(weights) != obj.shape[axis]: - raise ValueError("Weights and axis to be sampled must be of same length") - - if lib.has_infs(weights): - raise ValueError("weight vector may not include `inf` values") - - if (weights < 0).any(): - raise ValueError("weight vector many not include negative values") - - weights[np.isnan(weights)] = 0 - return weights - - -def process_sampling_size( - n: int | None, frac: float | None, replace: bool -) -> int | None: - """ - Process and validate the `n` and `frac` arguments to `NDFrame.sample` and - `.GroupBy.sample`. 
- - Returns None if `frac` should be used (variable sampling sizes), otherwise returns - the constant sampling size. - """ - # If no frac or n, default to n=1. - if n is None and frac is None: - n = 1 - elif n is not None and frac is not None: - raise ValueError("Please enter a value for `frac` OR `n`, not both") - elif n is not None: - if n < 0: - raise ValueError( - "A negative number of rows requested. Please provide `n` >= 0." - ) - if n % 1 != 0: - raise ValueError("Only integers accepted as `n` values") - else: - assert frac is not None # for mypy - if frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." - ) - if frac < 0: - raise ValueError( - "A negative number of rows requested. Please provide `frac` >= 0." - ) - - return n - - -def sample( - obj_len: int, - size: int, - replace: bool, - weights: np.ndarray | None, - random_state: np.random.RandomState | np.random.Generator, -) -> np.ndarray: - """ - Randomly sample `size` indices in `np.arange(obj_len)` - - Parameters - ---------- - obj_len : int - The length of the indices being considered - size : int - The number of values to choose - replace : bool - Allow or disallow sampling of the same row more than once. - weights : np.ndarray[np.float64] or None - If None, equal probability weighting, otherwise weights according - to the vector normalized - random_state: np.random.RandomState or np.random.Generator - State used for the random sampling - - Returns - ------- - np.ndarray[np.intp] - """ - if weights is not None: - weight_sum = weights.sum() - if weight_sum != 0: - weights = weights / weight_sum - else: - raise ValueError("Invalid weights: weights sum to zero") - - return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( - np.intp, copy=False - ) diff --git a/pandas/core/series.py b/pandas/core/series.py index e61ce8e74629b..59ea6710ea6cd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,7 +13,6 @@ Callable, Hashable, Iterable, - Literal, Sequence, Union, cast, @@ -40,13 +39,12 @@ Dtype, DtypeObj, FillnaOptions, + FrameOrSeriesUnion, IndexKeyFunc, + NpDtype, SingleManager, StorageOptions, - TimedeltaConvertibleTypes, - TimestampConvertibleTypes, ValueKeyFunc, - npt, ) from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError @@ -143,6 +141,12 @@ import pandas.plotting if TYPE_CHECKING: + from typing import Literal + + from pandas._typing import ( + TimedeltaConvertibleTypes, + TimestampConvertibleTypes, + ) from pandas.core.frame import DataFrame from pandas.core.groupby.generic import SeriesGroupBy @@ -198,7 +202,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): methods from ndarray have been overridden to automatically exclude missing data (currently represented as NaN). - Operations between Series (+, -, /, \\*, \\*\\*) align values based on their + Operations between Series (+, -, /, *, **) align values based on their associated index values-- they need not be the same length. The result index will be the sorted union of the two indexes. 
@@ -301,6 +305,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): hasnans = property( # type: ignore[assignment] base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) + __hash__ = generic.NDFrame.__hash__ _mgr: SingleManager div: Callable[[Series, Any], Series] rdiv: Callable[[Series, Any], Series] @@ -383,7 +388,9 @@ def __init__( copy = False elif isinstance(data, np.ndarray): - if len(data.dtype): + # error: Argument 1 to "len" has incompatible type "dtype"; expected + # "Sized" + if len(data.dtype): # type: ignore[arg-type] # GH#13296 we are dealing with a compound dtype, which # should be treated as 2D raise ValueError( @@ -803,7 +810,7 @@ def view(self, dtype: Dtype | None = None) -> Series: # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """ Return the values as a NumPy array. @@ -931,7 +938,7 @@ def __getitem__(self, key): if isinstance(key, (list, tuple)): key = unpack_1tuple(key) - if is_integer(key) and self.index._should_fallback_to_positional: + if is_integer(key) and self.index._should_fallback_to_positional(): return self._values[key] elif key_is_scalar: @@ -993,7 +1000,7 @@ def _get_with(self, key): if key_type == "integer": # We need to decide whether to treat this as a positional indexer # (i.e. self.iloc) or label-based (i.e. self.loc) - if not self.index._should_fallback_to_positional: + if not self.index._should_fallback_to_positional(): return self.loc[key] else: return self.iloc[key] @@ -1054,35 +1061,19 @@ def __setitem__(self, key, value) -> None: if key is Ellipsis: key = slice(None) - if isinstance(key, slice): - indexer = self.index._convert_slice_indexer(key, kind="getitem") - return self._set_values(indexer, value) - try: self._set_with_engine(key, value) except (KeyError, ValueError): values = self._values if is_integer(key) and self.index.inferred_type != "integer": # positional setter - if not self.index._should_fallback_to_positional: - # GH#33469 - warnings.warn( - "Treating integers as positional in Series.__setitem__ " - "with a Float64Index is deprecated. In a future version, " - "`series[an_int] = val` will insert a new key into the " - "Series. 
Use `series.iloc[an_int] = val` to treat the " - "key as positional.", - FutureWarning, - stacklevel=2, - ) values[key] = value else: # GH#12862 adding a new key to the Series self.loc[key] = value - except (InvalidIndexError, TypeError) as err: + except TypeError as err: if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): - # cases with MultiIndex don't get here bc they raise KeyError raise KeyError( "key of type tuple not found and not a MultiIndex" ) from err @@ -1103,7 +1094,8 @@ def __setitem__(self, key, value) -> None: self._maybe_update_cacher() def _set_with_engine(self, key, value) -> None: - loc = self.index.get_loc(key) + # fails with AttributeError for IntervalIndex + loc = self.index._engine.get_loc(key) # error: Argument 1 to "validate_numeric_casting" has incompatible type # "Union[dtype, ExtensionDtype]"; expected "dtype" validate_numeric_casting(self.dtype, value) # type: ignore[arg-type] @@ -1111,25 +1103,31 @@ def _set_with_engine(self, key, value) -> None: def _set_with(self, key, value): # other: fancy integer or otherwise - assert not isinstance(key, tuple) + if isinstance(key, slice): + indexer = self.index._convert_slice_indexer(key, kind="getitem") + return self._set_values(indexer, value) - if is_scalar(key): - key = [key] - elif is_iterator(key): - # Without this, the call to infer_dtype will consume the generator - key = list(key) + else: + assert not isinstance(key, tuple) - key_type = lib.infer_dtype(key, skipna=False) + if is_scalar(key): + key = [key] - # Note: key_type == "boolean" should not occur because that - # should be caught by the is_bool_indexer check in __setitem__ - if key_type == "integer": - if not self.index._should_fallback_to_positional: - self._set_labels(key, value) + if isinstance(key, Index): + key_type = key.inferred_type + key = key._values else: - self._set_values(key, value) - else: - self.loc[key] = value + key_type = lib.infer_dtype(key, skipna=False) + + # Note: key_type == "boolean" should not occur because that + # should be caught by the is_bool_indexer check in __setitem__ + if key_type == "integer": + if not self.index._should_fallback_to_positional(): + self._set_labels(key, value) + else: + self._set_values(key, value) + else: + self.loc[key] = value def _set_labels(self, key, value) -> None: key = com.asarray_tuplesafe(key) @@ -1140,7 +1138,7 @@ def _set_labels(self, key, value) -> None: self._set_values(indexer, value) def _set_values(self, key, value) -> None: - if isinstance(key, (Index, Series)): + if isinstance(key, Series): key = key._values self._mgr = self._mgr.setitem(indexer=key, value=value) @@ -1557,7 +1555,8 @@ def to_string( klass=_shared_doc_kwargs["klass"], storage_options=generic._shared_docs["storage_options"], examples=dedent( - """Examples + """ + Examples -------- >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) @@ -1567,21 +1566,7 @@ def to_string( | 1 | pig | | 2 | dog | | 3 | quetzal | - - Output markdown with a tabulate option. - - >>> print(s.to_markdown(tablefmt="grid")) - +----+----------+ - | | animal | - +====+==========+ - | 0 | elk | - +----+----------+ - | 1 | pig | - +----+----------+ - | 2 | dog | - +----+----------+ - | 3 | quetzal | - +----+----------+""" + """ ), ) def to_markdown( @@ -1624,7 +1609,31 @@ def to_markdown( ----- Requires the `tabulate `_ package. 
- {examples} + Examples + -------- + >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") + >>> print(s.to_markdown()) + | | animal | + |---:|:---------| + | 0 | elk | + | 1 | pig | + | 2 | dog | + | 3 | quetzal | + + Output markdown with a tabulate option. + + >>> print(s.to_markdown(tablefmt="grid")) + +----+----------+ + | | animal | + +====+==========+ + | 0 | elk | + +----+----------+ + | 1 | pig | + +----+----------+ + | 2 | dog | + +----+----------+ + | 3 | quetzal | + +----+----------+ """ return self.to_frame().to_markdown( buf, mode, index, storage_options=storage_options, **kwargs @@ -3013,7 +3022,7 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: return super().compare( other=other, align_axis=align_axis, @@ -3856,65 +3865,6 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: """ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() - @doc( - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """copy : bool, default True - Whether to copy underlying data.""" - ), - examples=dedent( - """Examples - -------- - >>> s = pd.Series( - ... ["A", "B", "A", "C"], - ... index=[ - ... ["Final exam", "Final exam", "Coursework", "Coursework"], - ... ["History", "Geography", "History", "Geography"], - ... ["January", "February", "March", "April"], - ... ], - ... ) - >>> s - Final exam History January A - Geography February B - Coursework History March A - Geography April C - dtype: object - - In the following example, we will swap the levels of the indices. - Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. - By not supplying any arguments for i and j, we swap the last and second to - last indices. - - >>> s.swaplevel() - Final exam January History A - February Geography B - Coursework March History A - April Geography C - dtype: object - - By supplying one argument, we can choose which index to swap the last - index with. We can for example swap the first index with the last one as - follows. - - >>> s.swaplevel(0) - January History Final exam A - February Geography Final exam B - March History Coursework A - April Geography Coursework C - dtype: object - - We can also define explicitly which indices we want to swap by supplying values - for both i and j. Here, we for example swap the first and second indices. - - >>> s.swaplevel(0, 1) - History Final exam January A - Geography Final exam February B - History Coursework March A - Geography Coursework April C - dtype: object""" - ), - ) def swaplevel(self, i=-2, j=-1, copy=True) -> Series: """ Swap levels i and j in a :class:`MultiIndex`. @@ -3923,16 +3873,15 @@ def swaplevel(self, i=-2, j=-1, copy=True) -> Series: Parameters ---------- - i, j : int or str - Levels of the indices to be swapped. Can pass level name as string. - {extra_params} + i, j : int, str + Level of the indices to be swapped. Can pass level name as string. + copy : bool, default True + Whether to copy underlying data. Returns ------- - {klass} - {klass} with levels swapped in MultiIndex. - - {examples} + Series + Series with levels swapped in MultiIndex. 
""" assert isinstance(self.index, MultiIndex) new_index = self.index.swaplevel(i, j) @@ -4229,7 +4178,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): ) def transform( self, func: AggFuncType, axis: Axis = 0, *args, **kwargs - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: # Validate axis argument self._get_axis_number(axis) result = SeriesApply( @@ -4243,7 +4192,7 @@ def apply( convert_dtype: bool = True, args: tuple[Any, ...] = (), **kwargs, - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: """ Invoke function on values of Series. @@ -4257,7 +4206,7 @@ def apply( convert_dtype : bool, default True Try to find better dtype for elementwise function results. If False, leave as dtype=object. Note that the dtype is always - preserved for some extension array dtypes, such as Categorical. + preserved for extension array dtypes, such as Categorical. args : tuple Positional arguments passed to func after the series value. **kwargs @@ -5020,7 +4969,7 @@ def isin(self, values) -> Series: self, method="isin" ) - def between(self, left, right, inclusive="both") -> Series: + def between(self, left, right, inclusive=True) -> Series: """ Return boolean Series equivalent to left <= series <= right. @@ -5034,10 +4983,8 @@ def between(self, left, right, inclusive="both") -> Series: Left boundary. right : scalar or list-like Right boundary. - inclusive : {"both", "neither", "left", "right"} - Include boundaries. Whether to set each bound as closed or open. - - .. versionchanged:: 1.3.0 + inclusive : bool, default True + Include boundaries. Returns ------- @@ -5068,9 +5015,9 @@ def between(self, left, right, inclusive="both") -> Series: 4 False dtype: bool - With `inclusive` set to ``"neither"`` boundary values are excluded: + With `inclusive` set to ``False`` boundary values are excluded: - >>> s.between(1, 4, inclusive="neither") + >>> s.between(1, 4, inclusive=False) 0 True 1 False 2 False @@ -5088,34 +5035,12 @@ def between(self, left, right, inclusive="both") -> Series: 3 False dtype: bool """ - if inclusive is True or inclusive is False: - warnings.warn( - "Boolean inputs to the `inclusive` argument are deprecated in" - "favour of `both` or `neither`.", - FutureWarning, - stacklevel=2, - ) - if inclusive: - inclusive = "both" - else: - inclusive = "neither" - if inclusive == "both": + if inclusive: lmask = self >= left rmask = self <= right - elif inclusive == "left": - lmask = self >= left - rmask = self < right - elif inclusive == "right": - lmask = self > left - rmask = self <= right - elif inclusive == "neither": + else: lmask = self > left rmask = self < right - else: - raise ValueError( - "Inclusive has to be either string of 'both'," - "'left', 'right', or 'neither'." 
- ) return lmask & rmask diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index befa67350e182..8531f93fba321 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -21,7 +21,6 @@ from pandas._typing import ( IndexKeyFunc, Shape, - npt, ) from pandas.core.dtypes.common import ( @@ -41,6 +40,8 @@ from pandas import MultiIndex from pandas.core.indexes.base import Index +_INT64_MAX = np.iinfo(np.int64).max + def get_indexer_indexer( target: Index, @@ -132,7 +133,7 @@ def _int64_cut_off(shape) -> int: acc = 1 for i, mul in enumerate(shape): acc *= int(mul) - if not acc < lib.i8max: + if not acc < _INT64_MAX: return i return len(shape) @@ -152,7 +153,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]: labels = list(labels) # Iteratively process all the labels in chunks sized so less - # than lib.i8max unique int ids will be required for each chunk + # than _INT64_MAX unique int ids will be required for each chunk while True: # how many levels can be done without overflow: nlev = _int64_cut_off(lshape) @@ -187,9 +188,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]: return out -def get_compressed_ids( - labels, sizes: Shape -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: +def get_compressed_ids(labels, sizes: Shape) -> tuple[np.ndarray, np.ndarray]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -216,7 +215,7 @@ def is_int64_overflow_possible(shape) -> bool: for x in shape: the_prod *= int(x) - return the_prod >= lib.i8max + return the_prod >= _INT64_MAX def decons_group_index(comp_labels, shape): @@ -239,9 +238,7 @@ def decons_group_index(comp_labels, shape): return label_list[::-1] -def decons_obs_group_ids( - comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool -): +def decons_obs_group_ids(comp_ids: np.ndarray, obs_ids, shape, labels, xnull: bool): """ Reconstruct labels from observed group ids. @@ -265,9 +262,8 @@ def decons_obs_group_ids( return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] -def indexer_from_factorized( - labels, shape: Shape, compress: bool = True -) -> npt.NDArray[np.intp]: +def indexer_from_factorized(labels, shape: Shape, compress: bool = True) -> np.ndarray: + # returned ndarray is np.intp ids = get_group_index(labels, shape, sort=True, xnull=False) if not compress: @@ -281,7 +277,7 @@ def indexer_from_factorized( def lexsort_indexer( keys, orders=None, na_position: str = "last", key: Callable | None = None -) -> npt.NDArray[np.intp]: +) -> np.ndarray: """ Performs lexical sorting on a set of keys @@ -353,7 +349,7 @@ def nargsort( na_position: str = "last", key: Callable | None = None, mask: np.ndarray | None = None, -) -> npt.NDArray[np.intp]: +): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. 
@@ -558,7 +554,7 @@ def ensure_key_mapped(values, key: Callable | None, levels=None): def get_flattened_list( - comp_ids: npt.NDArray[np.intp], + comp_ids: np.ndarray, # np.ndarray[np.intp] ngroups: int, levels: Iterable[Index], labels: Iterable[np.ndarray], @@ -608,8 +604,8 @@ def get_indexer_dict( def get_group_index_sorter( - group_index: npt.NDArray[np.intp], ngroups: int | None = None -) -> npt.NDArray[np.intp]: + group_index: np.ndarray, ngroups: int | None = None +) -> np.ndarray: """ algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where @@ -652,7 +648,7 @@ def get_group_index_sorter( def compress_group_index( group_index: np.ndarray, sort: bool = True -) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: +) -> tuple[np.ndarray, np.ndarray]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -673,8 +669,8 @@ def compress_group_index( def _reorder_by_uniques( - uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp] -) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]: + uniques: np.ndarray, labels: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: """ Parameters ---------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 717287360df8f..323cb6bd9fedd 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,7 +13,10 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import DtypeObj +from pandas._typing import ( + DtypeObj, + FrameOrSeriesUnion, +) from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -36,11 +39,7 @@ from pandas.core.base import NoNewAttributesMixin if TYPE_CHECKING: - from pandas import ( - DataFrame, - Index, - Series, - ) + from pandas import Index _shared_docs: dict[str, str] = {} _cpython_optimized_encoders = ( @@ -2315,7 +2314,7 @@ def findall(self, pat, flags=0): @forbid_nonstring_types(["bytes"]) def extract( self, pat: str, flags: int = 0, expand: bool = True - ) -> DataFrame | Series | Index: + ) -> FrameOrSeriesUnion | Index: r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -3006,7 +3005,7 @@ def casefold(self): "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] ) islower = _map_and_wrap( "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 26349a3b2c6c1..014a702618bda 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -194,9 +194,9 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) - # GH#39882 and GH#35888 in case of None and NaT we get duplicates - if not cache_array.index.is_unique: - cache_array = cache_array[~cache_array.index.duplicated()] + if not cache_array.is_unique: + # GH#39882 in case of None and NaT we get duplicates + cache_array = cache_array.drop_duplicates() return cache_array @@ -762,9 +762,7 @@ def to_datetime( If parsing succeeded. 
Return type depends on input: - - list-like: - - DatetimeIndex, if timezone naive or aware with the same timezone - - Index of object dtype, if timezone aware with mixed time offsets + - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 7d2bb75934c33..6dfd67f5dc5ec 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -190,7 +190,7 @@ def to_numeric(arg, errors="raise", downcast=None): # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): - typecodes: str | None = None + typecodes = None if downcast in ("integer", "signed"): typecodes = np.typecodes["Integer"] @@ -208,8 +208,8 @@ def to_numeric(arg, errors="raise", downcast=None): if typecodes is not None: # from smallest to largest - for typecode in typecodes: - dtype = np.dtype(typecode) + for dtype in typecodes: + dtype = np.dtype(dtype) if dtype.itemsize <= values.dtype.itemsize: values = maybe_downcast_numeric(values, dtype) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index aa8ec157265ce..fb5002648b6a5 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -14,9 +14,11 @@ import numpy as np -from pandas._libs import lib from pandas._libs.hashing import hash_object_array -from pandas._typing import ArrayLike +from pandas._typing import ( + ArrayLike, + FrameOrSeriesUnion, +) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -32,7 +34,6 @@ if TYPE_CHECKING: from pandas import ( Categorical, - DataFrame, Index, MultiIndex, Series, @@ -76,7 +77,7 @@ def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndar def hash_pandas_object( - obj: Index | DataFrame | Series, + obj: Index | FrameOrSeriesUnion, index: bool = True, encoding: str = "utf8", hash_key: str | None = _default_hash_key, @@ -137,10 +138,7 @@ def hash_pandas_object( ser = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): - hashes = ( - hash_array(series._values, encoding, hash_key, categorize) - for _, series in obj.items() - ) + hashes = (hash_array(series._values) for _, series in obj.items()) num_items = len(obj.columns) if index: index_hash_generator = ( @@ -246,7 +244,7 @@ def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndar result = np.zeros(len(mask), dtype="uint64") if mask.any(): - result[mask] = lib.u8max + result[mask] = np.iinfo(np.uint64).max return result diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index b80a73a930818..df69553a74683 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -94,8 +94,8 @@ def create_section_header(header: str) -> str: ).replace("\n", "", 1) numba_notes = ( - "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " - "extended documentation and performance considerations for the Numba engine.\n\n" + "See :ref:`window.numba_engine` for extended documentation " + "and performance considerations for the Numba engine.\n\n" ) window_agg_numba_parameters = dedent( diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index ee99692b85432..4187c56079060 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -3,7 +3,6 @@ import datetime from functools import partial from textwrap import dedent -from typing import TYPE_CHECKING import 
warnings import numpy as np @@ -13,12 +12,9 @@ from pandas._typing import ( Axis, FrameOrSeries, + FrameOrSeriesUnion, TimedeltaConvertibleTypes, ) - -if TYPE_CHECKING: - from pandas import DataFrame, Series - from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -44,14 +40,7 @@ ExponentialMovingWindowIndexer, GroupbyIndexer, ) -from pandas.core.window.numba_ import ( - generate_ewma_numba_table_func, - generate_numba_ewma_func, -) -from pandas.core.window.online import ( - EWMMeanState, - generate_online_numba_ewma_func, -) +from pandas.core.window.numba_ import generate_numba_ewma_func from pandas.core.window.rolling import ( BaseWindow, BaseWindowGroupby, @@ -207,16 +196,6 @@ class ExponentialMovingWindow(BaseWindow): If 1-D array like, a sequence with the same shape as the observations. Only applicable to ``mean()``. - method : str {'single', 'table'}, default 'single' - Execute the rolling operation per single column or row (``'single'``) - or over the entire object (``'table'``). - - This argument is only implemented when specifying ``engine='numba'`` - in the method call. - - Only applicable to ``mean()`` - - .. versionadded:: 1.4.0 Returns ------- @@ -275,7 +254,6 @@ class ExponentialMovingWindow(BaseWindow): "ignore_na", "axis", "times", - "method", ] def __init__( @@ -285,22 +263,21 @@ def __init__( span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, - min_periods: int | None = 0, + min_periods: int = 0, adjust: bool = True, ignore_na: bool = False, axis: Axis = 0, times: str | np.ndarray | FrameOrSeries | None = None, - method: str = "single", *, selection=None, ): super().__init__( obj=obj, - min_periods=1 if min_periods is None else max(int(min_periods), 1), + min_periods=max(int(min_periods), 1), on=None, center=False, closed=None, - method=method, + method="single", axis=axis, selection=selection, ) @@ -361,48 +338,6 @@ def _get_window_indexer(self) -> BaseIndexer: """ return ExponentialMovingWindowIndexer() - def online(self, engine="numba", engine_kwargs=None): - """ - Return an ``OnlineExponentialMovingWindow`` object to calculate - exponentially moving window aggregations in an online method. - - .. versionadded:: 1.3.0 - - Parameters - ---------- - engine: str, default ``'numba'`` - Execution engine to calculate online aggregations. - Applies to all supported aggregation methods. - - engine_kwargs : dict, default None - Applies to all supported aggregation methods. - - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - - Returns - ------- - OnlineExponentialMovingWindow - """ - return OnlineExponentialMovingWindow( - obj=self.obj, - com=self.com, - span=self.span, - halflife=self.halflife, - alpha=self.alpha, - min_periods=self.min_periods, - adjust=self.adjust, - ignore_na=self.ignore_na, - axis=self.axis, - times=self.times, - engine=engine, - engine_kwargs=engine_kwargs, - selection=self._selection, - ) - @doc( _shared_docs["aggregate"], see_also=dedent( @@ -456,19 +391,12 @@ def aggregate(self, func, *args, **kwargs): ) def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - if self.method == "single": - ewma_func = generate_numba_ewma_func( - engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas - ) - numba_cache_key = (lambda x: x, "ewma") - else: - ewma_func = generate_ewma_numba_table_func( - engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas - ) - numba_cache_key = (lambda x: x, "ewma_table") + ewma_func = generate_numba_ewma_func( + engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas + ) return self._apply( ewma_func, - numba_cache_key=numba_cache_key, + numba_cache_key=(lambda x: x, "ewma"), ) elif engine in ("cython", None): if engine_kwargs is not None: @@ -584,7 +512,7 @@ def var_func(values, begin, end, min_periods): ) def cov( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, bias: bool = False, **kwargs, @@ -651,7 +579,7 @@ def cov_func(x, y): ) def corr( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, **kwargs, ): @@ -727,167 +655,3 @@ def _get_window_indexer(self) -> GroupbyIndexer: window_indexer=ExponentialMovingWindowIndexer, ) return window_indexer - - -class OnlineExponentialMovingWindow(ExponentialMovingWindow): - def __init__( - self, - obj: FrameOrSeries, - com: float | None = None, - span: float | None = None, - halflife: float | TimedeltaConvertibleTypes | None = None, - alpha: float | None = None, - min_periods: int | None = 0, - adjust: bool = True, - ignore_na: bool = False, - axis: Axis = 0, - times: str | np.ndarray | FrameOrSeries | None = None, - engine: str = "numba", - engine_kwargs: dict[str, bool] | None = None, - *, - selection=None, - ): - if times is not None: - raise NotImplementedError( - "times is not implemented with online operations." - ) - super().__init__( - obj=obj, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - selection=selection, - ) - self._mean = EWMMeanState( - self._com, self.adjust, self.ignore_na, self.axis, obj.shape - ) - if maybe_use_numba(engine): - self.engine = engine - self.engine_kwargs = engine_kwargs - else: - raise ValueError("'numba' is the only supported engine") - - def reset(self): - """ - Reset the state captured by `update` calls. 
- """ - self._mean.reset() - - def aggregate(self, func, *args, **kwargs): - return NotImplementedError - - def std(self, bias: bool = False, *args, **kwargs): - return NotImplementedError - - def corr( - self, - other: DataFrame | Series | None = None, - pairwise: bool | None = None, - **kwargs, - ): - return NotImplementedError - - def cov( - self, - other: DataFrame | Series | None = None, - pairwise: bool | None = None, - bias: bool = False, - **kwargs, - ): - return NotImplementedError - - def var(self, bias: bool = False, *args, **kwargs): - return NotImplementedError - - def mean(self, *args, update=None, update_times=None, **kwargs): - """ - Calculate an online exponentially weighted mean. - - Parameters - ---------- - update: DataFrame or Series, default None - New values to continue calculating the - exponentially weighted mean from the last values and weights. - Values should be float64 dtype. - - ``update`` needs to be ``None`` the first time the - exponentially weighted mean is calculated. - - update_times: Series or 1-D np.ndarray, default None - New times to continue calculating the - exponentially weighted mean from the last values and weights. - If ``None``, values are assumed to be evenly spaced - in time. - This feature is currently unsupported. - - Returns - ------- - DataFrame or Series - - Examples - -------- - >>> df = pd.DataFrame({"a": range(5), "b": range(5, 10)}) - >>> online_ewm = df.head(2).ewm(0.5).online() - >>> online_ewm.mean() - a b - 0 0.00 5.00 - 1 0.75 5.75 - >>> online_ewm.mean(update=df.tail(3)) - a b - 2 1.615385 6.615385 - 3 2.550000 7.550000 - 4 3.520661 8.520661 - >>> online_ewm.reset() - >>> online_ewm.mean() - a b - 0 0.00 5.00 - 1 0.75 5.75 - """ - result_kwargs = {} - is_frame = True if self._selected_obj.ndim == 2 else False - if update_times is not None: - raise NotImplementedError("update_times is not implemented.") - else: - update_deltas = np.ones( - max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64 - ) - if update is not None: - if self._mean.last_ewm is None: - raise ValueError( - "Must call mean with update=None first before passing update" - ) - result_from = 1 - result_kwargs["index"] = update.index - if is_frame: - last_value = self._mean.last_ewm[np.newaxis, :] - result_kwargs["columns"] = update.columns - else: - last_value = self._mean.last_ewm - result_kwargs["name"] = update.name - np_array = np.concatenate((last_value, update.to_numpy())) - else: - result_from = 0 - result_kwargs["index"] = self._selected_obj.index - if is_frame: - result_kwargs["columns"] = self._selected_obj.columns - else: - result_kwargs["name"] = self._selected_obj.name - np_array = self._selected_obj.astype(np.float64).to_numpy() - ewma_func = generate_online_numba_ewma_func(self.engine_kwargs) - result = self._mean.run_ewm( - np_array if is_frame else np_array[:, np.newaxis], - update_deltas, - self.min_periods, - ewma_func, - ) - if not is_frame: - result = result.squeeze() - result = result[result_from:] - result = self._selected_obj._constructor(result, **result_kwargs) - return result diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index eedb6930bad66..02cf31cad7b8d 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -2,7 +2,6 @@ from textwrap import dedent from typing import ( - TYPE_CHECKING, Any, Callable, ) @@ -10,11 +9,8 @@ from pandas._typing import ( Axis, FrameOrSeries, + FrameOrSeriesUnion, ) - -if TYPE_CHECKING: - from pandas import DataFrame, Series - 
from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -595,7 +591,7 @@ def quantile( ) def cov( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, ddof: int = 1, **kwargs, @@ -660,7 +656,7 @@ def cov( ) def corr( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, ddof: int = 1, **kwargs, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index ab1eb9d3a2688..d00be0ea840a8 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -19,6 +19,7 @@ def generate_numba_apply_func( + args: tuple, kwargs: dict[str, Any], func: Callable[..., Scalar], engine_kwargs: dict[str, bool] | None, @@ -35,6 +36,8 @@ def generate_numba_apply_func( Parameters ---------- + args : tuple + *args to be passed into the function kwargs : dict **kwargs to be passed into the function func : function @@ -59,11 +62,7 @@ def generate_numba_apply_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - *args: Any, + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int ) -> np.ndarray: result = np.empty(len(begin)) for i in numba.prange(len(result)): @@ -170,6 +169,7 @@ def ewma( def generate_numba_table_func( + args: tuple, kwargs: dict[str, Any], func: Callable[..., np.ndarray], engine_kwargs: dict[str, bool] | None, @@ -187,6 +187,8 @@ def generate_numba_table_func( Parameters ---------- + args : tuple + *args to be passed into the function kwargs : dict **kwargs to be passed into the function func : function @@ -211,11 +213,7 @@ def generate_numba_table_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_table( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - *args: Any, + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int ): result = np.empty(values.shape) min_periods_mask = np.empty(values.shape) @@ -250,82 +248,3 @@ def nan_agg_with_axis(table): return result return nan_agg_with_axis - - -def generate_ewma_numba_table_func( - engine_kwargs: dict[str, bool] | None, - com: float, - adjust: bool, - ignore_na: bool, - deltas: np.ndarray, -): - """ - Generate a numba jitted ewma function applied table wise specified - by values from engine_kwargs. 
- - Parameters - ---------- - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - com : float - adjust : bool - ignore_na : bool - deltas : numpy.ndarray - - Returns - ------- - Numba function - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - - cache_key = (lambda x: x, "ewma_table") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] - - numba = import_optional_dependency("numba") - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def ewma_table( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - ) -> np.ndarray: - alpha = 1.0 / (1.0 + com) - old_wt_factor = 1.0 - alpha - new_wt = 1.0 if adjust else alpha - old_wt = np.ones(values.shape[1]) - - result = np.empty(values.shape) - weighted_avg = values[0].copy() - nobs = (~np.isnan(weighted_avg)).astype(np.int64) - result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) - for i in range(1, len(values)): - cur = values[i] - is_observations = ~np.isnan(cur) - nobs += is_observations.astype(np.int64) - for j in numba.prange(len(cur)): - if not np.isnan(weighted_avg[j]): - if is_observations[j] or not ignore_na: - - # note that len(deltas) = len(vals) - 1 and deltas[i] is to be - # used in conjunction with vals[i+1] - old_wt[j] *= old_wt_factor ** deltas[i - 1] - if is_observations[j]: - # avoid numerical errors on constant series - if weighted_avg[j] != cur[j]: - weighted_avg[j] = ( - (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j]) - ) / (old_wt[j] + new_wt) - if adjust: - old_wt[j] += new_wt - else: - old_wt[j] = 1.0 - elif is_observations[j]: - weighted_avg[j] = cur[j] - - result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) - - return result - - return ewma_table diff --git a/pandas/core/window/online.py b/pandas/core/window/online.py deleted file mode 100644 index 5a9e8d65255ae..0000000000000 --- a/pandas/core/window/online.py +++ /dev/null @@ -1,118 +0,0 @@ -from typing import ( - Dict, - Optional, -) - -import numpy as np - -from pandas.compat._optional import import_optional_dependency - -from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, - get_jit_arguments, -) - - -def generate_online_numba_ewma_func(engine_kwargs: Optional[Dict[str, bool]]): - """ - Generate a numba jitted groupby ewma function specified by values - from engine_kwargs. - Parameters - ---------- - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - Returns - ------- - Numba function - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - - cache_key = (lambda x: x, "online_ewma") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] - - numba = import_optional_dependency("numba") - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def online_ewma( - values: np.ndarray, - deltas: np.ndarray, - minimum_periods: int, - old_wt_factor: float, - new_wt: float, - old_wt: np.ndarray, - adjust: bool, - ignore_na: bool, - ): - """ - Compute online exponentially weighted mean per column over 2D values. - - Takes the first observation as is, then computes the subsequent - exponentially weighted mean accounting minimum periods. 
- """ - result = np.empty(values.shape) - weighted_avg = values[0] - nobs = (~np.isnan(weighted_avg)).astype(np.int64) - result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) - - for i in range(1, len(values)): - cur = values[i] - is_observations = ~np.isnan(cur) - nobs += is_observations.astype(np.int64) - for j in numba.prange(len(cur)): - if not np.isnan(weighted_avg[j]): - if is_observations[j] or not ignore_na: - - # note that len(deltas) = len(vals) - 1 and deltas[i] is to be - # used in conjunction with vals[i+1] - old_wt[j] *= old_wt_factor ** deltas[j - 1] - if is_observations[j]: - # avoid numerical errors on constant series - if weighted_avg[j] != cur[j]: - weighted_avg[j] = ( - (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j]) - ) / (old_wt[j] + new_wt) - if adjust: - old_wt[j] += new_wt - else: - old_wt[j] = 1.0 - elif is_observations[j]: - weighted_avg[j] = cur[j] - - result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) - - return result, old_wt - - return online_ewma - - -class EWMMeanState: - def __init__(self, com, adjust, ignore_na, axis, shape): - alpha = 1.0 / (1.0 + com) - self.axis = axis - self.shape = shape - self.adjust = adjust - self.ignore_na = ignore_na - self.new_wt = 1.0 if adjust else alpha - self.old_wt_factor = 1.0 - alpha - self.old_wt = np.ones(self.shape[self.axis - 1]) - self.last_ewm = None - - def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func): - result, old_wt = ewm_func( - weighted_avg, - deltas, - min_periods, - self.old_wt_factor, - self.new_wt, - self.old_wt, - self.adjust, - self.ignore_na, - ) - self.old_wt = old_wt - self.last_ewm = result[-1] - return result - - def reset(self): - self.old_wt = np.ones(self.shape[self.axis - 1]) - self.last_ewm = None diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8a253726ab0b6..2d5f148a6437a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -28,6 +28,7 @@ ArrayLike, Axis, FrameOrSeries, + FrameOrSeriesUnion, ) from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -407,7 +408,7 @@ def _apply_series( def _apply_blockwise( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: """ Apply the given function to the DataFrame broken down into homogeneous sub-frames. @@ -442,7 +443,7 @@ def hfunc2d(values: ArrayLike) -> ArrayLike: def _apply_tablewise( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None - ) -> DataFrame | Series: + ) -> FrameOrSeriesUnion: """ Apply the given function to the DataFrame across the entire object """ @@ -459,11 +460,11 @@ def _apply_tablewise( def _apply_pairwise( self, - target: DataFrame | Series, - other: DataFrame | Series | None, + target: FrameOrSeriesUnion, + other: FrameOrSeriesUnion | None, pairwise: bool | None, - func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series], - ) -> DataFrame | Series: + func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion], + ) -> FrameOrSeriesUnion: """ Apply the given pairwise function given 2 pandas objects (DataFrame/Series) """ @@ -481,7 +482,6 @@ def _apply( func: Callable[..., Any], name: str | None = None, numba_cache_key: tuple[Callable, str] | None = None, - numba_args: tuple[Any, ...] 
= (), **kwargs, ): """ @@ -495,8 +495,6 @@ def _apply( name : str, numba_cache_key : tuple caching key to be used to store a compiled numba func - numba_args : tuple - args to be passed when func is a numba func **kwargs additional arguments for rolling function and window function @@ -524,7 +522,7 @@ def calc(x): center=self.center, closed=self.closed, ) - return func(x, start, end, min_periods, *numba_args) + return func(x, start, end, min_periods) with np.errstate(all="ignore"): if values.ndim > 1 and self.method == "single": @@ -585,14 +583,12 @@ def _apply( func: Callable[..., Any], name: str | None = None, numba_cache_key: tuple[Callable, str] | None = None, - numba_args: tuple[Any, ...] = (), **kwargs, ) -> FrameOrSeries: result = super()._apply( func, name, numba_cache_key, - numba_args, **kwargs, ) # Reconstruct the resulting MultiIndex @@ -643,11 +639,11 @@ def _apply( def _apply_pairwise( self, - target: DataFrame | Series, - other: DataFrame | Series | None, + target: FrameOrSeriesUnion, + other: FrameOrSeriesUnion | None, pairwise: bool | None, - func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series], - ) -> DataFrame | Series: + func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion], + ) -> FrameOrSeriesUnion: """ Apply the given pairwise function given 2 pandas objects (DataFrame/Series) """ @@ -973,7 +969,6 @@ def _apply( func: Callable[[np.ndarray, int, int], np.ndarray], name: str | None = None, numba_cache_key: tuple[Callable, str] | None = None, - numba_args: tuple[Any, ...] = (), **kwargs, ): """ @@ -987,8 +982,6 @@ def _apply( name : str, use_numba_cache : tuple unused - numba_args : tuple - unused **kwargs additional arguments for scipy windows if necessary @@ -1166,20 +1159,18 @@ def apply( raise ValueError("raw parameter must be `True` or `False`") numba_cache_key = None - numba_args: tuple[Any, ...] 
= () if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") caller_name = type(self).__name__ - numba_args = args if self.method == "single": apply_func = generate_numba_apply_func( - kwargs, func, engine_kwargs, caller_name + args, kwargs, func, engine_kwargs, caller_name ) numba_cache_key = (func, f"{caller_name}_apply_single") else: apply_func = generate_numba_table_func( - kwargs, func, engine_kwargs, f"{caller_name}_apply" + args, kwargs, func, engine_kwargs, f"{caller_name}_apply" ) numba_cache_key = (func, f"{caller_name}_apply_table") elif engine in ("cython", None): @@ -1192,7 +1183,6 @@ def apply( return self._apply( apply_func, numba_cache_key=numba_cache_key, - numba_args=numba_args, ) def _generate_cython_apply_func( @@ -1389,7 +1379,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): def cov( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, ddof: int = 1, **kwargs, @@ -1427,7 +1417,7 @@ def cov_func(x, y): def corr( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, ddof: int = 1, **kwargs, @@ -2169,7 +2159,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): ) def cov( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, ddof: int = 1, **kwargs, @@ -2294,7 +2284,7 @@ def cov( ) def corr( self, - other: DataFrame | Series | None = None, + other: FrameOrSeriesUnion | None = None, pairwise: bool | None = None, ddof: int = 1, **kwargs, diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4d6a766ad6cfa..719a4472fb9e3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -82,9 +82,8 @@ or ``StringIO``. sheet_name : str, int, list, or None, default 0 Strings are used for sheet names. Integers are used in zero-indexed - sheet positions (chart sheets do not count as a sheet position). - Lists of strings/integers are used to request multiple sheets. - Specify None to get all worksheets. + sheet positions. Lists of strings/integers are used to request + multiple sheets. Specify None to get all sheets. Available cases: @@ -93,7 +92,7 @@ * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" as a dict of `DataFrame` - * None: All worksheets. + * None: All sheets. 
header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index fa2779b01d681..efef86329314b 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -29,7 +29,6 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, - **kwargs, ): from odf.opendocument import OpenDocumentSpreadsheet diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index d499f1a5ea89f..bc067e216760c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -19,10 +19,7 @@ BaseExcelReader, ExcelWriter, ) -from pandas.io.excel._util import ( - combine_kwargs, - validate_freeze_panes, -) +from pandas.io.excel._util import validate_freeze_panes if TYPE_CHECKING: from openpyxl.descriptors.serialisable import Serialisable @@ -42,13 +39,10 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, - **kwargs, ): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - engine_kwargs = combine_kwargs(engine_kwargs, kwargs) - super().__init__( path, mode=mode, @@ -536,7 +530,7 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): @property def sheet_names(self) -> list[str]: - return [sheet.title for sheet in self.book.worksheets] + return self.book.sheetnames def get_sheet_by_name(self, name: str): self.raise_if_bad_sheet_by_name(name) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 66a66fbbcd78a..7d8028de23257 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,9 +1,6 @@ from __future__ import annotations -from typing import ( - Any, - MutableMapping, -) +from typing import MutableMapping from pandas.compat._optional import import_optional_dependency @@ -249,30 +246,3 @@ def pop_header_name(row, index_col): header_name = None if header_name == "" else header_name return header_name, row[:i] + [""] + row[i + 1 :] - - -def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict: - """ - Used to combine two sources of kwargs for the backend engine. - - Use of kwargs is deprecated, this function is solely for use in 1.3 and should - be removed in 1.4/2.0. Also _base.ExcelWriter.__new__ ensures either engine_kwargs - or kwargs must be None or empty respectively. - - Parameters - ---------- - engine_kwargs: dict - kwargs to be passed through to the engine. - kwargs: dict - kwargs to be psased through to the engine (deprecated) - - Returns - ------- - engine_kwargs combined with kwargs - """ - if engine_kwargs is None: - result = {} - else: - result = engine_kwargs.copy() - result.update(kwargs) - return result diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 06c73f2c6199e..7500a33b1f097 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -6,10 +6,7 @@ from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import ( - combine_kwargs, - validate_freeze_panes, -) +from pandas.io.excel._util import validate_freeze_panes class _XlsxStyler: @@ -178,12 +175,11 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, - **kwargs, ): # Use the xlsxwriter module as the Excel writer. 
from xlsxwriter import Workbook - engine_kwargs = combine_kwargs(engine_kwargs, kwargs) + engine_kwargs = engine_kwargs or {} if mode == "a": raise ValueError("Append mode is not supported with xlsxwriter!") diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 4dadf64b44515..8a7605b80f6b4 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -9,10 +9,7 @@ from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import ( - combine_kwargs, - validate_freeze_panes, -) +from pandas.io.excel._util import validate_freeze_panes if TYPE_CHECKING: from xlwt import XFStyle @@ -33,13 +30,10 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, - **kwargs, ): # Use the xlwt module as the Excel writer. import xlwt - engine_kwargs = combine_kwargs(engine_kwargs, kwargs) - if mode == "a": raise ValueError("Append mode is not supported with xlwt!") diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 0c625e8a68db0..b285fa5f315ed 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -769,7 +769,7 @@ def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: series = self.df.iloc[:, colidx] for i, val in enumerate(series): if styles is not None: - css = ";".join([a + ":" + str(v) for (a, v) in styles[i, colidx]]) + css = ";".join(a + ":" + str(v) for (a, v) in styles[i, colidx]) xlstyle = self.style_converter(css) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 83e0086958b9a..d1c19f348f901 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -861,7 +861,7 @@ def space_format(x, y): return y str_columns = list( - zip(*([space_format(x, y) for y in x] for x in fmt_columns)) + zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) ) if self.sparsify and len(str_columns): str_columns = sparsify_labels(str_columns) @@ -1635,10 +1635,24 @@ def format_percentiles( percentiles = 100 * percentiles - int_idx = np.isclose(percentiles.astype(int), percentiles) + # error: Item "List[Union[int, float]]" of "Union[ndarray, List[Union[int, float]], + # List[float], List[Union[str, float]]]" has no attribute "astype" + # error: Item "List[float]" of "Union[ndarray, List[Union[int, float]], List[float], + # List[Union[str, float]]]" has no attribute "astype" + # error: Item "List[Union[str, float]]" of "Union[ndarray, List[Union[int, float]], + # List[float], List[Union[str, float]]]" has no attribute "astype" + int_idx = np.isclose( + percentiles.astype(int), percentiles # type: ignore[union-attr] + ) if np.all(int_idx): - out = percentiles.astype(int).astype(str) + # error: Item "List[Union[int, float]]" of "Union[ndarray, List[Union[int, + # float]], List[float], List[Union[str, float]]]" has no attribute "astype" + # error: Item "List[float]" of "Union[ndarray, List[Union[int, float]], + # List[float], List[Union[str, float]]]" has no attribute "astype" + # error: Item "List[Union[str, float]]" of "Union[ndarray, List[Union[int, + # float]], List[float], List[Union[str, float]]]" has no attribute "astype" + out = percentiles.astype(int).astype(str) # type: ignore[union-attr] return [i + "%" for i in out] unique_pcts = np.unique(percentiles) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 64a59778a54f3..e014d7d63a35f 100644 --- 
a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -16,7 +16,10 @@ from pandas._config import get_option -from pandas._typing import Dtype +from pandas._typing import ( + Dtype, + FrameOrSeriesUnion, +) from pandas.core.indexes.api import Index @@ -24,10 +27,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.frame import ( - DataFrame, - Series, - ) + from pandas.core.frame import DataFrame def _put_str(s: str | Dtype, space: int) -> str: @@ -110,7 +110,7 @@ class BaseInfo(ABC): values. """ - data: DataFrame | Series + data: FrameOrSeriesUnion memory_usage: bool | str @property @@ -413,7 +413,7 @@ def get_lines(self) -> list[str]: """Product in a form of list of lines (strings).""" @property - def data(self) -> DataFrame | Series: + def data(self) -> FrameOrSeriesUnion: return self.info.data @property diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 93069a1e2955d..e9e2b830e32cb 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -358,7 +358,7 @@ def get_result(self) -> str: self.bottom_separator, self.env_end, ] - result = "\n".join([item for item in elements if item]) + result = "\n".join(item for item in elements if item) trailing_newline = "\n" result += trailing_newline return result @@ -527,13 +527,13 @@ def env_begin(self) -> str: f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" ) elements = [first_row, f"{self._caption_and_label()}"] - return "\n".join([item for item in elements if item]) + return "\n".join(item for item in elements if item) def _caption_and_label(self) -> str: if self.caption or self.label: double_backslash = "\\\\" elements = [f"{self._caption_macro}", f"{self._label_macro}"] - caption_and_label = "\n".join([item for item in elements if item]) + caption_and_label = "\n".join(item for item in elements if item) caption_and_label += double_backslash return caption_and_label else: @@ -611,7 +611,7 @@ def env_begin(self) -> str: f"{self._label_macro}", f"\\begin{{tabular}}{{{self.column_format}}}", ] - return "\n".join([item for item in elements if item]) + return "\n".join(item for item in elements if item) @property def bottom_separator(self) -> str: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 90a4800c805b6..2610b7777207f 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -119,7 +119,13 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str: if self.fmt.index: idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + # error: Argument 1 to "__call__" of "_NumberOp" has incompatible type + # "None"; expected "Union[int, float, complex, number, bool_]" + # error: Incompatible types in assignment (expression has type "number", + # variable has type "Optional[int]") + lwidth -= ( # type: ignore[assignment,arg-type] + np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + ) col_widths = [ np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 @@ -127,7 +133,9 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str: ] assert lwidth is not None - col_bins = _binify(col_widths, lwidth) + # error: Argument 1 to "_binify" has incompatible type "List[object]"; expected + # "List[int]" + col_bins = _binify(col_widths, lwidth) # type: ignore[arg-type] nbins = len(col_bins) if self.fmt.is_truncated_vertically: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 
cb56ea33acad8..93c3843b36846 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -23,6 +23,7 @@ Axis, FilePathOrBuffer, FrameOrSeries, + FrameOrSeriesUnion, IndexLabel, Scalar, ) @@ -30,10 +31,7 @@ from pandas.util._decorators import doc import pandas as pd -from pandas import ( - IndexSlice, - RangeIndex, -) +from pandas import RangeIndex from pandas.api.types import is_list_like from pandas.core import generic import pandas.core.common as com @@ -173,7 +171,7 @@ class Styler(StylerRenderer): def __init__( self, - data: DataFrame | Series, + data: FrameOrSeriesUnion, precision: int | None = None, table_styles: CSSStyles | None = None, uuid: str | None = None, @@ -428,7 +426,6 @@ def to_latex( multicol_align: str = "r", siunitx: bool = False, encoding: str | None = None, - convert_css: bool = False, ): r""" Write Styler to a file, buffer or string in LaTeX format. @@ -485,10 +482,6 @@ def to_latex( Set to ``True`` to structure LaTeX compatible with the {siunitx} package. encoding : str, default "utf-8" Character encoding setting. - convert_css : bool, default False - Convert simple cell-styles from CSS to LaTeX format. Any CSS not found in - conversion table is dropped. A style can be forced by adding option - `--latex`. See notes. Returns ------- @@ -668,48 +661,7 @@ def to_latex( & ix2 & \$3 & 4.400 & CATS \\ L1 & ix3 & \$2 & 6.600 & COWS \\ \end{tabular} - - **CSS Conversion** - - This method can convert a Styler constructured with HTML-CSS to LaTeX using - the following limited conversions. - - ================== ==================== ============= ========================== - CSS Attribute CSS value LaTeX Command LaTeX Options - ================== ==================== ============= ========================== - font-weight | bold | bfseries - | bolder | bfseries - font-style | italic | itshape - | oblique | slshape - background-color | red cellcolor | {red}--lwrap - | #fe01ea | [HTML]{FE01EA}--lwrap - | #f0e | [HTML]{FF00EE}--lwrap - | rgb(128,255,0) | [rgb]{0.5,1,0}--lwrap - | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}--lwrap - | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}--lwrap - color | red color | {red} - | #fe01ea | [HTML]{FE01EA} - | #f0e | [HTML]{FF00EE} - | rgb(128,255,0) | [rgb]{0.5,1,0} - | rgba(128,0,0,0.5) | [rgb]{0.5,0,0} - | rgb(25%,255,50%) | [rgb]{0.25,1,0.5} - ================== ==================== ============= ========================== - - It is also possible to add user-defined LaTeX only styles to a HTML-CSS Styler - using the ``--latex`` flag, and to add LaTeX parsing options that the - converter will detect within a CSS-comment. - - >>> df = pd.DataFrame([[1]]) - >>> df.style.set_properties( - ... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"} - ... 
).to_latex(convert_css=True) - \begin{tabular}{lr} - {} & {0} \\ - 0 & {\bfseries}{\Huge{1}} \\ - \end{tabular} """ - obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self - table_selectors = ( [style["selector"] for style in self.table_styles] if self.table_styles is not None @@ -718,7 +670,7 @@ def to_latex( if column_format is not None: # add more recent setting to table_styles - obj.set_table_styles( + self.set_table_styles( [{"selector": "column_format", "props": f":{column_format}"}], overwrite=False, ) @@ -730,19 +682,19 @@ def to_latex( self.data.columns = RangeIndex(stop=len(self.data.columns)) numeric_cols = self.data._get_numeric_data().columns.to_list() self.data.columns = _original_columns - column_format = "" if self.hide_index_ else "l" * self.data.index.nlevels + column_format = "" if self.hidden_index else "l" * self.data.index.nlevels for ci, _ in enumerate(self.data.columns): if ci not in self.hidden_columns: column_format += ( ("r" if not siunitx else "S") if ci in numeric_cols else "l" ) - obj.set_table_styles( + self.set_table_styles( [{"selector": "column_format", "props": f":{column_format}"}], overwrite=False, ) if position: - obj.set_table_styles( + self.set_table_styles( [{"selector": "position", "props": f":{position}"}], overwrite=False, ) @@ -754,13 +706,13 @@ def to_latex( f"'raggedright', 'raggedleft', 'centering', " f"got: '{position_float}'" ) - obj.set_table_styles( + self.set_table_styles( [{"selector": "position_float", "props": f":{position_float}"}], overwrite=False, ) if hrules: - obj.set_table_styles( + self.set_table_styles( [ {"selector": "toprule", "props": ":toprule"}, {"selector": "midrule", "props": ":midrule"}, @@ -770,25 +722,24 @@ def to_latex( ) if label: - obj.set_table_styles( + self.set_table_styles( [{"selector": "label", "props": f":{{{label.replace(':', '§')}}}"}], overwrite=False, ) if caption: - obj.set_caption(caption) + self.set_caption(caption) if sparse_index is None: sparse_index = get_option("styler.sparse.index") if sparse_columns is None: sparse_columns = get_option("styler.sparse.columns") - latex = obj._render_latex( + latex = self._render_latex( sparse_index=sparse_index, sparse_columns=sparse_columns, multirow_align=multirow_align, multicol_align=multicol_align, - convert_css=convert_css, ) return save_to_buffer(latex, buf=buf, encoding=encoding) @@ -965,60 +916,39 @@ def _update_ctx(self, attrs: DataFrame) -> None: self.ctx[(i, j)].extend(css_list) def _copy(self, deepcopy: bool = False) -> Styler: - """ - Copies a Styler, allowing for deepcopy or shallow copy - - Copying a Styler aims to recreate a new Styler object which contains the same - data and styles as the original. 
- - Data dependent attributes [copied and NOT exported]: - - formatting (._display_funcs) - - hidden index values or column values (.hidden_rows, .hidden_columns) - - tooltips - - cell_context (cell css classes) - - ctx (cell css styles) - - caption - - Non-data dependent attributes [copied and exported]: - - hidden index state and hidden columns state (.hide_index_, .hide_columns_) - - table_attributes - - table_styles - - applied styles (_todo) - - """ - # GH 40675 styler = Styler( - self.data, # populates attributes 'data', 'columns', 'index' as shallow - uuid_len=self.uuid_len, + self.data, + precision=self.precision, + caption=self.caption, + table_attributes=self.table_attributes, + cell_ids=self.cell_ids, + na_rep=self.na_rep, ) - shallow = [ # simple string or boolean immutables - "hide_index_", - "hide_columns_", - "table_attributes", - "cell_ids", - "caption", - ] - deep = [ # nested lists or dicts - "_display_funcs", - "hidden_rows", - "hidden_columns", - "ctx", - "cell_context", - "_todo", - "table_styles", - "tooltips", - ] - - for attr in shallow: - setattr(styler, attr, getattr(self, attr)) - - for attr in deep: - val = getattr(self, attr) - setattr(styler, attr, copy.deepcopy(val) if deepcopy else val) + + styler.uuid = self.uuid + styler.hidden_index = self.hidden_index + + if deepcopy: + styler.ctx = copy.deepcopy(self.ctx) + styler._todo = copy.deepcopy(self._todo) + styler.table_styles = copy.deepcopy(self.table_styles) + styler.hidden_columns = copy.copy(self.hidden_columns) + styler.cell_context = copy.deepcopy(self.cell_context) + styler.tooltips = copy.deepcopy(self.tooltips) + else: + styler.ctx = self.ctx + styler._todo = self._todo + styler.table_styles = self.table_styles + styler.hidden_columns = self.hidden_columns + styler.cell_context = self.cell_context + styler.tooltips = self.tooltips return styler def __copy__(self) -> Styler: + """ + Deep copy by default. + """ return self._copy(deepcopy=False) def __deepcopy__(self, memo) -> Styler: @@ -1030,14 +960,15 @@ def clear(self) -> None: Returns None. """ - # create default GH 40675 - clean_copy = Styler(self.data, uuid=self.uuid) - clean_attrs = [a for a in clean_copy.__dict__ if not callable(a)] - self_attrs = [a for a in self.__dict__ if not callable(a)] # maybe more attrs - for attr in clean_attrs: - setattr(self, attr, getattr(clean_copy, attr)) - for attr in set(self_attrs).difference(clean_attrs): - delattr(self, attr) + self.ctx.clear() + self.tooltips = None + self.cell_context.clear() + self._todo.clear() + + self.hidden_index = False + self.hidden_columns = [] + # self.format and self.table_styles may be dependent on user + # input in self.__init__() def _apply( self, @@ -1165,7 +1096,7 @@ def _applymap( ) -> Styler: func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: - subset = IndexSlice[:] + subset = pd.IndexSlice[:] subset = non_reducing_slice(subset) result = self.data.loc[subset].applymap(func) self._update_ctx(result) @@ -1275,14 +1206,12 @@ def where( recommend using instead. The example: - >>> df = pd.DataFrame([[1, 2], [3, 4]]) >>> def cond(v, limit=4): ... return v > 1 and v != limit >>> df.style.where(cond, value='color:green;', other='color:red;') should be refactored to: - >>> def style_func(v, value, other, limit=4): ... cond = v > 1 and v != limit ... 
return value if cond else other @@ -1435,71 +1364,6 @@ def set_caption(self, caption: str | tuple) -> Styler: self.caption = caption return self - def set_sticky( - self, - axis: Axis = 0, - pixel_size: int | None = None, - levels: list[int] | None = None, - ) -> Styler: - """ - Add CSS to permanently display the index or column headers in a scrolling frame. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Whether to make the index or column headers sticky. - pixel_size : int, optional - Required to configure the width of index cells or the height of column - header cells when sticking a MultiIndex. Defaults to 75 and 25 respectively. - levels : list of int - If ``axis`` is a MultiIndex the specific levels to stick. If ``None`` will - stick all levels. - - Returns - ------- - self : Styler - """ - if axis in [0, "index"]: - axis, obj, tag, pos = 0, self.data.index, "tbody", "left" - pixel_size = 75 if not pixel_size else pixel_size - elif axis in [1, "columns"]: - axis, obj, tag, pos = 1, self.data.columns, "thead", "top" - pixel_size = 25 if not pixel_size else pixel_size - else: - raise ValueError("`axis` must be one of {0, 1, 'index', 'columns'}") - - if not isinstance(obj, pd.MultiIndex): - return self.set_table_styles( - [ - { - "selector": f"{tag} th", - "props": f"position:sticky; {pos}:0px; background-color:white;", - } - ], - overwrite=False, - ) - else: - range_idx = list(range(obj.nlevels)) - - levels = sorted(levels) if levels else range_idx - for i, level in enumerate(levels): - self.set_table_styles( - [ - { - "selector": f"{tag} th.level{level}", - "props": f"position: sticky; " - f"{pos}: {i * pixel_size}px; " - f"{f'height: {pixel_size}px; ' if axis == 1 else ''}" - f"{f'min-width: {pixel_size}px; ' if axis == 0 else ''}" - f"{f'max-width: {pixel_size}px; ' if axis == 0 else ''}" - f"background-color: white;", - } - ], - overwrite=False, - ) - - return self - def set_table_styles( self, table_styles: dict[Any, CSSStyles] | CSSStyles, @@ -1645,169 +1509,37 @@ def set_na_rep(self, na_rep: str) -> StylerRenderer: self.na_rep = na_rep return self.format(na_rep=na_rep, precision=self.precision) - def hide_index(self, subset: Subset | None = None) -> Styler: + def hide_index(self) -> Styler: """ - Hide the entire index, or specific keys in the index from rendering. - - This method has dual functionality: - - - if ``subset`` is ``None`` then the entire index will be hidden whilst - displaying all data-rows. - - if a ``subset`` is given then those specific rows will be hidden whilst the - index itself remains visible. - - .. versionchanged:: 1.3.0 - - Parameters - ---------- - subset : label, array-like, IndexSlice, optional - A valid 1d input or single key along the index axis within - `DataFrame.loc[, :]`, to limit ``data`` to *before* applying - the function. + Hide any indices from rendering. Returns ------- self : Styler - - See Also - -------- - Styler.hide_columns: Hide the entire column headers row, or specific columns. 
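The deprecation note in the `where` docstring above recommends passing a single conditional function to `applymap` instead; a sketch of that refactor with the same toy condition:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2], [3, 4]])
>>> def style_func(v, limit=4):
...     # one CSS string when the condition holds, the other when it does not
...     return "color:green;" if (v > 1 and v != limit) else "color:red;"
>>> df.style.applymap(style_func)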
- - Examples - -------- - Simple application hiding specific rows: - - >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"]) - >>> df.style.hide_index(["a", "b"]) - 0 1 - c 5 6 - - Hide the index and retain the data values: - - >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) - >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) - >>> df.style.format("{:.1f}").hide_index() - x y - a b c a b c - 0.1 0.0 0.4 1.3 0.6 -1.4 - 0.7 1.0 1.3 1.5 -0.0 -0.2 - 1.4 -0.8 1.6 -0.2 -0.4 -0.3 - 0.4 1.0 -0.2 -0.8 -1.2 1.1 - -0.6 1.2 1.8 1.9 0.3 0.3 - 0.8 0.5 -0.3 1.2 2.2 -0.8 - - Hide specific rows but retain the index: - - >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"])) - x y - a b c a b c - x b 0.7 1.0 1.3 1.5 -0.0 -0.2 - y b -0.6 1.2 1.8 1.9 0.3 0.3 - - Hide specific rows and the index: - - >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"])) - ... .hide_index() - x y - a b c a b c - 0.7 1.0 1.3 1.5 -0.0 -0.2 - -0.6 1.2 1.8 1.9 0.3 0.3 """ - if subset is None: - self.hide_index_ = True - else: - subset_ = IndexSlice[subset, :] # new var so mypy reads not Optional - subset = non_reducing_slice(subset_) - hide = self.data.loc[subset] - hrows = self.index.get_indexer_for(hide.index) - # error: Incompatible types in assignment (expression has type - # "ndarray", variable has type "Sequence[int]") - self.hidden_rows = hrows # type: ignore[assignment] + self.hidden_index = True return self - def hide_columns(self, subset: Subset | None = None) -> Styler: + def hide_columns(self, subset: Subset) -> Styler: """ - Hide the column headers or specific keys in the columns from rendering. - - This method has dual functionality: - - - if ``subset`` is ``None`` then the entire column headers row will be hidden - whilst the data-values remain visible. - - if a ``subset`` is given then those specific columns, including the - data-values will be hidden, whilst the column headers row remains visible. - - .. versionchanged:: 1.3.0 + Hide columns from rendering. Parameters ---------- - subset : label, array-like, IndexSlice, optional - A valid 1d input or single key along the columns axis within - `DataFrame.loc[:, ]`, to limit ``data`` to *before* applying - the function. + subset : label, array-like, IndexSlice + A valid 1d input or single key along the appropriate axis within + `DataFrame.loc[]`, to limit ``data`` to *before* applying the function. Returns ------- self : Styler - - See Also - -------- - Styler.hide_index: Hide the entire index, or specific keys in the index. 
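For reference, the two behaviours discussed in the `hide_index` docstring above, as a short sketch; the subset form needs the newer, subset-aware signature:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=["a", "b", "c"])
>>> df.style.hide_index()            # hide the whole index, keep all data rows
>>> df.style.hide_index(["a", "b"])  # hide only rows "a" and "b", keep the index itself visible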
- - Examples - -------- - Simple application hiding specific columns: - - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) - >>> df.style.hide_columns(["a", "b"]) - c - 0 3 - 1 6 - - Hide column headers and retain the data values: - - >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) - >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) - >>> df.style.format("{:.1f}").hide_columns() - x d 0.1 0.0 0.4 1.3 0.6 -1.4 - e 0.7 1.0 1.3 1.5 -0.0 -0.2 - f 1.4 -0.8 1.6 -0.2 -0.4 -0.3 - y d 0.4 1.0 -0.2 -0.8 -1.2 1.1 - e -0.6 1.2 1.8 1.9 0.3 0.3 - f 0.8 0.5 -0.3 1.2 2.2 -0.8 - - Hide specific columns but retain the column headers: - - >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"])) - x y - b b - x a 0.0 0.6 - b 1.0 -0.0 - c -0.8 -0.4 - y a 1.0 -1.2 - b 1.2 0.3 - c 0.5 2.2 - - Hide specific columns and the column headers: - - >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"])) - ... .hide_columns() - x a 0.0 0.6 - b 1.0 -0.0 - c -0.8 -0.4 - y a 1.0 -1.2 - b 1.2 0.3 - c 0.5 2.2 """ - if subset is None: - self.hide_columns_ = True - else: - subset_ = IndexSlice[:, subset] # new var so mypy reads not Optional - subset = non_reducing_slice(subset_) - hide = self.data.loc[subset] - hcols = self.columns.get_indexer_for(hide.columns) - # error: Incompatible types in assignment (expression has type - # "ndarray", variable has type "Sequence[int]") - self.hidden_columns = hcols # type: ignore[assignment] + subset = non_reducing_slice(subset) + hidden_df = self.data.loc[subset] + hcols = self.columns.get_indexer_for(hidden_df.columns) + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Sequence[int]") + self.hidden_columns = hcols # type: ignore[assignment] return self # ----------------------------------------------------------------------- @@ -2039,27 +1771,82 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: >>> df.style.set_properties(color="white", align="right") >>> df.style.set_properties(**{'background-color': 'yellow'}) """ - values = "".join([f"{p}: {v};" for p, v in kwargs.items()]) + values = "".join(f"{p}: {v};" for p, v in kwargs.items()) return self.applymap(lambda x: values, subset=subset) + @staticmethod + def _bar( + s, + align: str, + colors: list[str], + width: float = 100, + vmin: float | None = None, + vmax: float | None = None, + ): + """ + Draw bar chart in dataframe cells. + """ + # Get input value range. + smin = np.nanmin(s.to_numpy()) if vmin is None else vmin + smax = np.nanmax(s.to_numpy()) if vmax is None else vmax + if align == "mid": + smin = min(0, smin) + smax = max(0, smax) + elif align == "zero": + # For "zero" mode, we want the range to be symmetrical around zero. + smax = max(abs(smin), abs(smax)) + smin = -smax + # Transform to percent-range of linear-gradient + normed = width * (s.to_numpy(dtype=float) - smin) / (smax - smin + 1e-12) + zero = -width * smin / (smax - smin + 1e-12) + + def css_bar(start: float, end: float, color: str) -> str: + """ + Generate CSS code to draw a bar from start to end. 
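Likewise for `hide_columns` and `set_properties`, both touched above; a usage sketch in which the column names and CSS values are arbitrary:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
>>> df.style.hide_columns(["a", "b"])     # only column "c" and its data are rendered
>>> df.style.set_properties(subset=["c"], **{"background-color": "yellow", "color": "black"})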
+ """ + css = "width: 10em; height: 80%;" + if end > start: + css += "background: linear-gradient(90deg," + if start > 0: + css += f" transparent {start:.1f}%, {color} {start:.1f}%, " + e = min(end, width) + css += f"{color} {e:.1f}%, transparent {e:.1f}%)" + return css + + def css(x): + if pd.isna(x): + return "" + + # avoid deprecated indexing `colors[x > zero]` + color = colors[1] if x > zero else colors[0] + + if align == "left": + return css_bar(0, x, color) + else: + return css_bar(min(x, zero), max(x, zero), color) + + if s.ndim == 1: + return [css(x) for x in normed] + else: + return DataFrame( + [[css(x) for x in row] for row in normed], + index=s.index, + columns=s.columns, + ) + def bar( self, subset: Subset | None = None, axis: Axis | None = 0, - *, color="#d65f5f", width: float = 100, - height: float = 100, - align: str | float | int | Callable = "mid", + align: str = "left", vmin: float | None = None, vmax: float | None = None, - props: str = "width: 10em;", ) -> Styler: """ Draw bar chart in the cell backgrounds. - .. versionchanged:: 1.4.0 - Parameters ---------- subset : label, array-like, IndexSlice, optional @@ -2076,30 +1863,16 @@ def bar( first element is the color_negative and the second is the color_positive (eg: ['#d65f5f', '#5fba7d']). width : float, default 100 - The percentage of the cell, measured from the left, in which to draw the - bars, in [0, 100]. - height : float, default 100 - The percentage height of the bar in the cell, centrally aligned, in [0,100]. - - .. versionadded:: 1.4.0 - align : str, int, float, callable, default 'mid' - How to align the bars within the cells relative to a width adjusted center. - If string must be one of: - - - 'left' : bars are drawn rightwards from the minimum data value. - - 'right' : bars are drawn leftwards from the maximum data value. - - 'zero' : a value of zero is located at the center of the cell. - - 'mid' : a value of (max-min)/2 is located at the center of the cell, - or if all values are negative (positive) the zero is - aligned at the right (left) of the cell. - - 'mean' : the mean value of the data is located at the center of the cell. - - If a float or integer is given this will indicate the center of the cell. - - If a callable should take a 1d or 2d array and return a scalar. - - .. versionchanged:: 1.4.0 + A number between 0 or 100. The largest value will cover `width` + percent of the cell's width. + align : {'left', 'zero',' mid'}, default 'left' + How to align the bars with the cells. + - 'left' : the min value starts at the left of the cell. + - 'zero' : a value of zero is located at the center of the cell. + - 'mid' : the center of the cell is at (max-min)/2, or + if values are all negative (positive) the zero is aligned + at the right (left) of the cell. vmin : float, optional Minimum bar value, defining the left hand limit of the bar drawing range, lower values are clipped to `vmin`. @@ -2108,16 +1881,14 @@ def bar( Maximum bar value, defining the right hand limit of the bar drawing range, higher values are clipped to `vmax`. When None (default): the maximum value of the data will be used. - props : str, optional - The base CSS of the cell that is extended to add the bar chart. Defaults to - `"width: 10em;"` - - .. 
versionadded:: 1.4.0 Returns ------- self : Styler """ + if align not in ("left", "zero", "mid"): + raise ValueError("`align` must be one of {'left', 'zero',' mid'}") + if not (is_list_like(color)): color = [color, color] elif len(color) == 1: @@ -2129,25 +1900,18 @@ def bar( "(eg: color=['#d65f5f', '#5fba7d'])" ) - if not (0 <= width <= 100): - raise ValueError(f"`width` must be a value in [0, 100], got {width}") - elif not (0 <= height <= 100): - raise ValueError(f"`height` must be a value in [0, 100], got {height}") - if subset is None: subset = self.data.select_dtypes(include=np.number).columns self.apply( - _bar, + self._bar, subset=subset, axis=axis, align=align, colors=color, - width=width / 100, - height=height / 100, + width=width, vmin=vmin, vmax=vmax, - base_css=props, ) return self @@ -2518,35 +2282,23 @@ def highlight_quantile( ) @classmethod - def from_custom_template( - cls, searchpath, html_table: str | None = None, html_style: str | None = None - ): + def from_custom_template(cls, searchpath, name): """ Factory function for creating a subclass of ``Styler``. - Uses custom templates and Jinja environment. - - .. versionchanged:: 1.3.0 + Uses a custom template and Jinja environment. Parameters ---------- searchpath : str or list Path or paths of directories containing the templates. - html_table : str - Name of your custom template to replace the html_table template. - - .. versionadded:: 1.3.0 - - html_style : str - Name of your custom template to replace the html_style template. - - .. versionadded:: 1.3.0 + name : str + Name of your custom template to use for rendering. Returns ------- MyStyler : subclass of Styler - Has the correct ``env``,``template_html``, ``template_html_table`` and - ``template_html_style`` class attributes set. + Has the correct ``env`` and ``template`` class attributes set. """ loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader]) @@ -2555,10 +2307,7 @@ def from_custom_template( # error: Invalid base class "cls" class MyStyler(cls): # type:ignore[valid-type,misc] env = jinja2.Environment(loader=loader) - if html_table: - template_html_table = env.get_template(html_table) - if html_style: - template_html_style = env.get_template(html_style) + template_html = env.get_template(name) return MyStyler @@ -2798,176 +2547,3 @@ def _highlight_between( else np.full(data.shape, True, dtype=bool) ) return np.where(g_left & l_right, props, "") - - -def _bar( - data: FrameOrSeries, - align: str | float | int | Callable, - colors: list[str], - width: float, - height: float, - vmin: float | None, - vmax: float | None, - base_css: str, -): - """ - Draw bar chart in data cells using HTML CSS linear gradient. - - Parameters - ---------- - data : Series or DataFrame - Underling subset of Styler data on which operations are performed. - align : str in {"left", "right", "mid", "zero", "mean"}, int, float, callable - Method for how bars are structured or scalar value of centre point. - colors : list-like of str - Two listed colors as string in valid CSS. - width : float in [0,1] - The percentage of the cell, measured from left, where drawn bars will reside. - height : float in [0,1] - The percentage of the cell's height where drawn bars will reside, centrally - aligned. - vmin : float, optional - Overwrite the minimum value of the window. - vmax : float, optional - Overwrite the maximum value of the window. - base_css : str - Additional CSS that is included in the cell before bars are drawn. 
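The `bar` hunks above trade the newer signature (align='mid' default, height, props, callable centres) for the older one (align='left' default, string alignments only). A sketch that stays within the options both variants accept; the data and colours are made up:

>>> import pandas as pd
>>> df = pd.DataFrame({"x": [-3, -1, 2, 4]})
>>> df.style.bar(color=["#d65f5f", "#5fba7d"], align="mid", vmin=-4, vmax=4)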
- """ - - def css_bar(start: float, end: float, color: str) -> str: - """ - Generate CSS code to draw a bar from start to end in a table cell. - - Uses linear-gradient. - - Parameters - ---------- - start : float - Relative positional start of bar coloring in [0,1] - end : float - Relative positional end of the bar coloring in [0,1] - color : str - CSS valid color to apply. - - Returns - ------- - str : The CSS applicable to the cell. - - Notes - ----- - Uses ``base_css`` from outer scope. - """ - cell_css = base_css - if end > start: - cell_css += "background: linear-gradient(90deg," - if start > 0: - cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%," - cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)" - return cell_css - - def css_calc(x, left: float, right: float, align: str): - """ - Return the correct CSS for bar placement based on calculated values. - - Parameters - ---------- - x : float - Value which determines the bar placement. - left : float - Value marking the left side of calculation, usually minimum of data. - right : float - Value marking the right side of the calculation, usually maximum of data - (left < right). - align : {"left", "right", "zero", "mid"} - How the bars will be positioned. - "left", "right", "zero" can be used with any values for ``left``, ``right``. - "mid" can only be used where ``left <= 0`` and ``right >= 0``. - "zero" is used to specify a center when all values ``x``, ``left``, - ``right`` are translated, e.g. by say a mean or median. - - Returns - ------- - str : Resultant CSS with linear gradient. - - Notes - ----- - Uses ``colors``, ``width`` and ``height`` from outer scope. - """ - if pd.isna(x): - return base_css - - color = colors[0] if x < 0 else colors[1] - x = left if x < left else x - x = right if x > right else x # trim data if outside of the window - - start: float = 0 - end: float = 1 - - if align == "left": - # all proportions are measured from the left side between left and right - end = (x - left) / (right - left) - - elif align == "right": - # all proportions are measured from the right side between left and right - start = (x - left) / (right - left) - - else: - z_frac: float = 0.5 # location of zero based on the left-right range - if align == "zero": - # all proportions are measured from the center at zero - limit: float = max(abs(left), abs(right)) - left, right = -limit, limit - elif align == "mid": - # bars drawn from zero either leftwards or rightwards with center at mid - mid: float = (left + right) / 2 - z_frac = ( - -mid / (right - left) + 0.5 if mid < 0 else -left / (right - left) - ) - - if x < 0: - start, end = (x - left) / (right - left), z_frac - else: - start, end = z_frac, (x - left) / (right - left) - - ret = css_bar(start * width, end * width, color) - if height < 1 and "background: linear-gradient(" in ret: - return ( - ret + f" no-repeat center; background-size: 100% {height * 100:.1f}%;" - ) - else: - return ret - - values = data.to_numpy() - left = np.nanmin(values) if vmin is None else vmin - right = np.nanmax(values) if vmax is None else vmax - z: float = 0 # adjustment to translate data - - if align == "mid": - if left >= 0: # "mid" is documented to act as "left" if all values positive - align, left = "left", 0 if vmin is None else vmin - elif right <= 0: # "mid" is documented to act as "right" if all values negative - align, right = "right", 0 if vmax is None else vmax - elif align == "mean": - z, align = np.nanmean(values), "zero" - elif callable(align): - z, align = 
align(values), "zero" - elif isinstance(align, (float, int)): - z, align = float(align), "zero" - elif not (align == "left" or align == "right" or align == "zero"): - raise ValueError( - "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or be a " - "value defining the center line or a callable that returns a float" - ) - - assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero] - if data.ndim == 1: - return [css_calc(x - z, left - z, right - z, align) for x in values] - else: - return DataFrame( - [ - [css_calc(x - z, left - z, right - z, align) for x in row] - for row in values - ], - index=data.index, - columns=data.columns, - ) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index e240c04f97ed1..7686d8a340c37 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -2,7 +2,6 @@ from collections import defaultdict from functools import partial -import re from typing import ( Any, Callable, @@ -21,7 +20,10 @@ from pandas._config import get_option from pandas._libs import lib -from pandas._typing import TypedDict +from pandas._typing import ( + FrameOrSeriesUnion, + TypedDict, +) from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.generic import ABCSeries @@ -64,13 +66,11 @@ class StylerRenderer: loader = jinja2.PackageLoader("pandas", "io/formats/templates") env = jinja2.Environment(loader=loader, trim_blocks=True) template_html = env.get_template("html.tpl") - template_html_table = env.get_template("html_table.tpl") - template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") def __init__( self, - data: DataFrame | Series, + data: FrameOrSeriesUnion, uuid: str | None = None, uuid_len: int = 5, table_styles: CSSStyles | None = None, @@ -97,9 +97,7 @@ def __init__( self.cell_ids = cell_ids # add rendering variables - self.hide_index_: bool = False # bools for hiding col/row headers - self.hide_columns_: bool = False - self.hidden_rows: Sequence[int] = [] # sequence for specific hidden rows/cols + self.hidden_index: bool = False self.hidden_columns: Sequence[int] = [] self.ctx: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) self.cell_context: DefaultDict[tuple[int, int], str] = defaultdict(str) @@ -119,11 +117,7 @@ def _render_html(self, sparse_index: bool, sparse_columns: bool, **kwargs) -> st # TODO: namespace all the pandas keys d = self._translate(sparse_index, sparse_columns) d.update(kwargs) - return self.template_html.render( - **d, - html_table_tpl=self.template_html_table, - html_style_tpl=self.template_html_style, - ) + return self.template_html.render(**d) def _render_latex(self, sparse_index: bool, sparse_columns: bool, **kwargs) -> str: """ @@ -303,56 +297,55 @@ def _translate_header( head = [] # 1) column headers - if not self.hide_columns_: - for r in range(self.data.columns.nlevels): - index_blanks = [ - _element("th", blank_class, blank_value, not self.hide_index_) - ] * (self.data.index.nlevels - 1) - - name = self.data.columns.names[r] - column_name = [ + for r in range(self.data.columns.nlevels): + index_blanks = [ + _element("th", blank_class, blank_value, not self.hidden_index) + ] * (self.data.index.nlevels - 1) + + name = self.data.columns.names[r] + column_name = [ + _element( + "th", + f"{blank_class if name is None else index_name_class} level{r}", + name if name is not None else blank_value, + not self.hidden_index, + ) + ] + + if clabels: + column_headers = [ _element( 
"th", - f"{blank_class if name is None else index_name_class} level{r}", - name if name is not None else blank_value, - not self.hide_index_, + f"{col_heading_class} level{r} col{c}", + value, + _is_visible(c, r, col_lengths), + attributes=( + f'colspan="{col_lengths.get((r, c), 0)}"' + if col_lengths.get((r, c), 0) > 1 + else "" + ), ) + for c, value in enumerate(clabels[r]) ] - if clabels: - column_headers = [ + if len(self.data.columns) > max_cols: + # add an extra column with `...` value to indicate trimming + column_headers.append( _element( "th", - f"{col_heading_class} level{r} col{c}", - value, - _is_visible(c, r, col_lengths), - attributes=( - f'colspan="{col_lengths.get((r, c), 0)}"' - if col_lengths.get((r, c), 0) > 1 - else "" - ), - ) - for c, value in enumerate(clabels[r]) - ] - - if len(self.data.columns) > max_cols: - # add an extra column with `...` value to indicate trimming - column_headers.append( - _element( - "th", - f"{col_heading_class} level{r} {trimmed_col_class}", - "...", - True, - attributes="", - ) + f"{col_heading_class} level{r} {trimmed_col_class}", + "...", + True, + attributes="", ) - head.append(index_blanks + column_name + column_headers) + ) + head.append(index_blanks + column_name + column_headers) # 2) index names if ( self.data.index.names and com.any_not_none(*self.data.index.names) - and not self.hide_index_ + and not self.hidden_index ): index_names = [ _element( @@ -418,9 +411,7 @@ def _translate_body( The associated HTML elements needed for template rendering. """ # for sparsifying a MultiIndex - idx_lengths = _get_level_lengths( - self.index, sparsify_index, max_rows, self.hidden_rows - ) + idx_lengths = _get_level_lengths(self.index, sparsify_index, max_rows) rlabels = self.data.index.tolist()[:max_rows] # slice to allow trimming if self.data.index.nlevels == 1: @@ -434,7 +425,7 @@ def _translate_body( "th", f"{row_heading_class} level{c} {trimmed_row_class}", "...", - not self.hide_index_, + not self.hidden_index, attributes="", ) for c in range(self.data.index.nlevels) @@ -471,7 +462,7 @@ def _translate_body( "th", f"{row_heading_class} level{c} row{r}", value, - (_is_visible(r, c, idx_lengths) and not self.hide_index_), + (_is_visible(r, c, idx_lengths) and not self.hidden_index), id=f"level{c}_row{r}", attributes=( f'rowspan="{idx_lengths.get((c, r), 0)}"' @@ -505,7 +496,7 @@ def _translate_body( "td", f"{data_class} row{r} col{c}{cls}", value, - (c not in self.hidden_columns and r not in self.hidden_rows), + (c not in self.hidden_columns), attributes="", display_value=self._display_funcs[(r, c)](value), ) @@ -536,7 +527,7 @@ def _translate_latex(self, d: dict) -> None: d["head"] = [[col for col in row if col["is_visible"]] for row in d["head"]] body = [] for r, row in enumerate(d["body"]): - if self.hide_index_: + if self.hidden_index: row_body_headers = [] else: row_body_headers = [ @@ -851,13 +842,7 @@ def _get_level_lengths( last_label = j lengths[(i, last_label)] = 0 elif j not in hidden_elements: - if lengths[(i, last_label)] == 0: - # if the previous iteration was first-of-kind but hidden then offset - last_label = j - lengths[(i, last_label)] = 1 - else: - # else add to previous iteration - lengths[(i, last_label)] += 1 + lengths[(i, last_label)] += 1 non_zero_lengths = { element: length for element, length in lengths.items() if length >= 1 @@ -1174,7 +1159,7 @@ def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str): }, ] - def _translate(self, styler_data: DataFrame | Series, uuid: str, d: dict): + def 
_translate(self, styler_data: FrameOrSeriesUnion, uuid: str, d: dict): """ Mutate the render dictionary to allow for tooltips: @@ -1268,9 +1253,7 @@ def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | N return None -def _parse_latex_cell_styles( - latex_styles: CSSList, display_value: str, convert_css: bool = False -) -> str: +def _parse_latex_cell_styles(latex_styles: CSSList, display_value: str) -> str: r""" Mutate the ``display_value`` string including LaTeX commands from ``latex_styles``. @@ -1296,8 +1279,6 @@ def _parse_latex_cell_styles( For example for styles: `[('c1', 'o1--wrap'), ('c2', 'o2')]` this returns: `{\c1o1 \c2o2{display_value}} """ - if convert_css: - latex_styles = _parse_latex_css_conversion(latex_styles) for (command, options) in latex_styles[::-1]: # in reverse for most recent style formatter = { "--wrap": f"{{\\{command}--to_parse {display_value}}}", @@ -1370,82 +1351,6 @@ def _parse_latex_options_strip(value: str | int | float, arg: str) -> str: return str(value).replace(arg, "").replace("/*", "").replace("*/", "").strip() -def _parse_latex_css_conversion(styles: CSSList) -> CSSList: - """ - Convert CSS (attribute,value) pairs to equivalent LaTeX (command,options) pairs. - - Ignore conversion if tagged with `--latex` option, skipped if no conversion found. - """ - - def font_weight(value, arg): - if value == "bold" or value == "bolder": - return "bfseries", f"{arg}" - return None - - def font_style(value, arg): - if value == "italic": - return "itshape", f"{arg}" - elif value == "oblique": - return "slshape", f"{arg}" - return None - - def color(value, user_arg, command, comm_arg): - """ - CSS colors have 5 formats to process: - - - 6 digit hex code: "#ff23ee" --> [HTML]{FF23EE} - - 3 digit hex code: "#f0e" --> [HTML]{FF00EE} - - rgba: rgba(128, 255, 0, 0.5) --> [rgb]{0.502, 1.000, 0.000} - - rgb: rgb(128, 255, 0,) --> [rbg]{0.502, 1.000, 0.000} - - string: red --> {red} - - Additionally rgb or rgba can be expressed in % which is also parsed. 
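The tooltip `_translate` hook patched above is driven by `Styler.set_tooltips`; a hedged usage sketch for versions that include the tooltip machinery, with made-up labels and CSS:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2]})
>>> ttips = pd.DataFrame({"a": ["first value", "second value"]})   # same row/column labels as df
>>> df.style.set_tooltips(ttips, css_class="pd-tt",
...                       props="visibility: hidden; position: absolute; z-index: 1;")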
- """ - arg = user_arg if user_arg != "" else comm_arg - - if value[0] == "#" and len(value) == 7: # color is hex code - return command, f"[HTML]{{{value[1:].upper()}}}{arg}" - if value[0] == "#" and len(value) == 4: # color is short hex code - val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" - return command, f"[HTML]{{{val}}}{arg}" - elif value[:3] == "rgb": # color is rgb or rgba - r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() - r = float(r[:-1]) / 100 if "%" in r else int(r) / 255 - g = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[0].strip() - g = float(g[:-1]) / 100 if "%" in g else int(g) / 255 - if value[3] == "a": # color is rgba - b = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[1].strip() - else: # color is rgb - b = re.findall("(?<=,)[0-9\\s%]+(?=\\))", value)[0].strip() - b = float(b[:-1]) / 100 if "%" in b else int(b) / 255 - return command, f"[rgb]{{{r:.3f}, {g:.3f}, {b:.3f}}}{arg}" - else: - return command, f"{{{value}}}{arg}" # color is likely string-named - - CONVERTED_ATTRIBUTES: dict[str, Callable] = { - "font-weight": font_weight, - "background-color": partial(color, command="cellcolor", comm_arg="--lwrap"), - "color": partial(color, command="color", comm_arg=""), - "font-style": font_style, - } - - latex_styles: CSSList = [] - for (attribute, value) in styles: - if isinstance(value, str) and "--latex" in value: - # return the style without conversion but drop '--latex' - latex_styles.append((attribute, value.replace("--latex", ""))) - if attribute in CONVERTED_ATTRIBUTES.keys(): - arg = "" - for x in ["--wrap", "--nowrap", "--lwrap", "--dwrap", "--rwrap"]: - if x in str(value): - arg, value = x, _parse_latex_options_strip(value, x) - break - latex_style = CONVERTED_ATTRIBUTES[attribute](value, arg) - if latex_style is not None: - latex_styles.extend([latex_style]) - return latex_styles - - def _escape_latex(s): r""" Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, diff --git a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl index 8c63be3ad788a..880c78c8d6b05 100644 --- a/pandas/io/formats/templates/html.tpl +++ b/pandas/io/formats/templates/html.tpl @@ -1,16 +1,16 @@ -{# Update the html_style/table_structure.html documentation too #} +{# Update the template_structure.html documentation too #} {% if doctype_html %} -{% if not exclude_styles %}{% include html_style_tpl %}{% endif %} +{% if not exclude_styles %}{% include "html_style.tpl" %}{% endif %} -{% include html_table_tpl %} +{% include "html_table.tpl" %} {% elif not doctype_html %} -{% if not exclude_styles %}{% include html_style_tpl %}{% endif %} -{% include html_table_tpl %} +{% if not exclude_styles %}{% include "html_style.tpl" %}{% endif %} +{% include "html_table.tpl" %} {% endif %} diff --git a/pandas/io/formats/templates/latex.tpl b/pandas/io/formats/templates/latex.tpl index fe081676d87af..66fe99642850f 100644 --- a/pandas/io/formats/templates/latex.tpl +++ b/pandas/io/formats/templates/latex.tpl @@ -39,7 +39,7 @@ {% endif %} {% for row in body %} {% for c in row %}{% if not loop.first %} & {% endif %} - {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %} + {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value)}}{% endif %} {%- endfor %} \\ {% endfor %} {% set bottomrule = parse_table(table_styles, 'bottomrule') %} diff --git 
a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index f5ba8c6b53335..5be6ae0382d87 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -195,18 +195,14 @@ def handle_indexes(self) -> None: This method will add indexes into attr_cols or elem_cols. """ - if not self.index: - return - - first_key = next(iter(self.frame_dicts)) indexes: list[str] = [ - x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols + x for x in self.frame_dicts[0].keys() if x not in self.orig_cols ] - if self.attr_cols: + if self.attr_cols and self.index: self.attr_cols = indexes + self.attr_cols - if self.elem_cols: + if self.elem_cols and self.index: self.elem_cols = indexes + self.elem_cols def get_prefix_uri(self) -> str: @@ -311,7 +307,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.d.keys()) + self.elem_cols = list(self.frame_dicts[0].keys()) self.build_elems() else: @@ -361,9 +357,9 @@ def build_attribs(self) -> None: flat_col = col if isinstance(col, tuple): flat_col = ( - "".join([str(c) for c in col]).strip() + "".join(str(c) for c in col).strip() if "" in col - else "_".join([str(c) for c in col]).strip() + else "_".join(str(c) for c in col).strip() ) attr_name = f"{self.prefix_uri}{flat_col}" @@ -388,9 +384,9 @@ def build_elems(self) -> None: flat_col = col if isinstance(col, tuple): flat_col = ( - "".join([str(c) for c in col]).strip() + "".join(str(c) for c in col).strip() if "" in col - else "_".join([str(c) for c in col]).strip() + else "_".join(str(c) for c in col).strip() ) elem_name = f"{self.prefix_uri}{flat_col}" @@ -481,7 +477,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.d.keys()) + self.elem_cols = list(self.frame_dicts[0].keys()) self.build_elems() else: @@ -533,9 +529,9 @@ def build_attribs(self) -> None: flat_col = col if isinstance(col, tuple): flat_col = ( - "".join([str(c) for c in col]).strip() + "".join(str(c) for c in col).strip() if "" in col - else "_".join([str(c) for c in col]).strip() + else "_".join(str(c) for c in col).strip() ) attr_name = f"{self.prefix_uri}{flat_col}" @@ -560,9 +556,9 @@ def build_elems(self) -> None: flat_col = col if isinstance(col, tuple): flat_col = ( - "".join([str(c) for c in col]).strip() + "".join(str(c) for c in col).strip() if "" in col - else "_".join([str(c) for c in col]).strip() + else "_".join(str(c) for c in col).strip() ) elem_name = f"{self.prefix_uri}{flat_col}" diff --git a/pandas/io/html.py b/pandas/io/html.py index 2947b22f85d61..0a91d065379cb 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -627,7 +627,7 @@ def _build_xpath_expr(attrs) -> str: if "class_" in attrs: attrs["class"] = attrs.pop("class_") - s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()]) + s = " and ".join(f"@{k}={repr(v)}" for k, v in attrs.items()) return f"[{s}]" @@ -861,7 +861,7 @@ def _parser_dispatch(flavor): def _print_as_set(s) -> str: - arg = ", ".join([pprint_thing(el) for el in s]) + arg = ", ".join(pprint_thing(el) for el in s) return f"{{{arg}}}" diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index fdeda868fdb5e..77582c46977c1 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -21,6 +21,7 @@ from pandas._typing import ( CompressionOptions, DtypeArg, + FrameOrSeriesUnion, IndexLabel, 
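The xml.py hunks above change how `handle_indexes` injects index columns into attr_cols/elem_cols; from the caller's side the relevant surface is `DataFrame.to_xml` (pandas >= 1.3, lxml or the etree fallback required). A sketch with made-up data:

>>> import pandas as pd
>>> df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4.0, float("nan")]})
>>> print(df.to_xml())                                           # index written alongside the columns by default
>>> print(df.to_xml(index=False, attr_cols=["shape", "sides"]))  # columns as attributes, no index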
JSONSerializable, StorageOptions, @@ -862,7 +863,7 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj: DataFrame | Series | None = None + self.obj: FrameOrSeriesUnion | None = None def check_keys_split(self, decoded): """ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 729d60ca78944..5927d6482d3b0 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -380,31 +380,14 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. """ - def _pull_field( - js: dict[str, Any], spec: list | str, extract_record: bool = False - ) -> Scalar | Iterable: + def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable: """Internal function to pull field""" result = js - try: - if isinstance(spec, list): - for field in spec: - result = result[field] - else: - result = result[spec] - except KeyError as e: - if extract_record: - raise KeyError( - f"Key {e} not found. If specifying a record_path, all elements of " - f"data should have the path." - ) from e - elif errors == "ignore": - return np.nan - else: - raise KeyError( - f"Key {e} not found. To replace missing values of {e} with " - f"np.nan, pass in errors='ignore'" - ) from e - + if isinstance(spec, list): + for field in spec: + result = result[field] + else: + result = result[spec] return result def _pull_records(js: dict[str, Any], spec: list | str) -> list: @@ -413,7 +396,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: _pull_field, but require to return list. And will raise error if has non iterable value. """ - result = _pull_field(js, spec, extract_record=True) + result = _pull_field(js, spec) # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not # null, otherwise return an empty list @@ -505,7 +488,16 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 > len(val): meta_val = seen_meta[key] else: - meta_val = _pull_field(obj, val[level:]) + try: + meta_val = _pull_field(obj, val[level:]) + except KeyError as e: + if errors == "ignore": + meta_val = np.nan + else: + raise KeyError( + "Try running with errors='ignore' as key " + f"{e} is not always present" + ) from e meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5714bbab016c8..2a86ff13a2edc 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -12,7 +12,6 @@ Iterable, Sequence, cast, - final, ) import warnings @@ -24,9 +23,9 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import ( - ArrayLike, DtypeArg, FilePathOrBuffer, + final, ) from pandas.errors import ( ParserError, @@ -351,7 +350,7 @@ def extract(r): # level, then our header was too long. for n in range(len(columns[0])): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): - header = ",".join([str(x) for x in self.header]) + header = ",".join(str(x) for x in self.header) raise ParserError( f"Passed header=[{header}] are too many rows " "for this multi_index of columns" @@ -804,29 +803,6 @@ def _do_date_conversions(self, names, data): return names, data - def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: - """Checks if length of data is equal to length of column names. - - One set of trailing commas is allowed. 
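The `_json_normalize` hunk above moves missing-meta handling back into `_recursive_extract`; from the public API the relevant switch is errors='ignore'. A sketch with made-up records, the second of which lacks the nested "info" key:

>>> import pandas as pd
>>> data = [
...     {"state": "FL", "info": {"governor": "Rick Scott"},
...      "counties": [{"name": "Dade", "population": 12345}]},
...     {"state": "OH",
...      "counties": [{"name": "Summit", "population": 1234}]},
... ]
>>> pd.json_normalize(data, record_path="counties",
...                   meta=["state", ["info", "governor"]],
...                   errors="ignore")   # missing meta values become NaN instead of raising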
self.index_col not False - results in a ParserError previously when lengths do not match. - - Parameters - ---------- - columns: list of column names - data: list of array-likes containing the data column-wise. - """ - if not self.index_col and len(columns) != len(data) and columns: - if len(columns) == len(data) - 1 and np.all( - (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1]) - ): - return - warnings.warn( - "Length of header or names does not match length of data. This leads " - "to a loss of data with index_col=False.", - ParserWarning, - stacklevel=6, - ) - def _evaluate_usecols(self, usecols, names): """ Check whether or not the 'usecols' parameter @@ -1138,7 +1114,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns): else: colnames.append(c) - new_name = "_".join([str(x) for x in colnames]) + new_name = "_".join(str(x) for x in colnames) to_parse = [data_dict[c] for c in colnames if c in data_dict] new_col = parser(*to_parse) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index ae62cc3b45578..5c1f8f94a72da 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -300,8 +300,6 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data_tups] - if self.usecols is None: - self._check_data_length(names, alldata) data = {k: v for k, (i, v) in zip(names, data_tups)} @@ -365,9 +363,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: numpy_dtypes, # type: ignore[arg-type] [], ) - # error: Non-overlapping equality check (left operand type: "dtype[Any]", - # right operand type: "Type[object]") - if common_type == object: # type: ignore[comparison-overlap] + if common_type == object: warning_columns.append(str(name)) dtype = dtypes.pop() diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 7c9fcde08bf24..670868c6f4261 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -25,7 +25,6 @@ ) from pandas.core.dtypes.common import is_integer -from pandas.core.dtypes.inference import is_dict_like from pandas.io.parsers.base_parser import ( ParserBase, @@ -293,8 +292,6 @@ def _exclude_implicit_index(self, alldata): offset = len(self.index_col) # type: ignore[has-type] len_alldata = len(alldata) - self._check_data_length(names, alldata) - return { name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata }, names @@ -427,7 +424,6 @@ def _infer_columns(self): cur_count = counts[col] if ( self.dtype is not None - and is_dict_like(self.dtype) and self.dtype.get(old_col) is not None and self.dtype.get(col) is None ): @@ -1159,7 +1155,7 @@ def get_rows(self, infer_nrows, skiprows=None): def detect_colspecs(self, infer_nrows=100, skiprows=None): # Regex escape the delimiters - delimiters = "".join([fr"\{x}" for x in self.delimiter]) + delimiters = "".join(fr"\{x}" for x in self.delimiter) pattern = re.compile(f"([^{delimiters}]+)") rows = self.get_rows(infer_nrows, skiprows) if not rows: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f1e97ab90793e..208b8a008ffe6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -40,6 +40,7 @@ ArrayLike, DtypeArg, FrameOrSeries, + FrameOrSeriesUnion, Shape, ) from pandas.compat._optional import import_optional_dependency @@ -2592,7 +2593,7 @@ class Fixed: pandas_kind: str format_type: str = "fixed" # GH#30962 needed by dask - obj_type: type[DataFrame | Series] + obj_type: 
type[FrameOrSeriesUnion] ndim: int encoding: str parent: HDFStore @@ -2641,7 +2642,7 @@ def __repr__(self) -> str: s = self.shape if s is not None: if isinstance(s, (list, tuple)): - jshape = ",".join([pprint_thing(x) for x in s]) + jshape = ",".join(pprint_thing(x) for x in s) s = f"[{jshape}]" return f"{self.pandas_type:12.12} (shape->{s})" return self.pandas_type @@ -3308,10 +3309,10 @@ def __repr__(self) -> str: ver = "" if self.is_old_version: - jver = ".".join([str(x) for x in self.version]) + jver = ".".join(str(x) for x in self.version) ver = f"[{jver}]" - jindex_axes = ",".join([a.name for a in self.index_axes]) + jindex_axes = ",".join(a.name for a in self.index_axes) return ( f"{self.pandas_type:12.12}{ver} " f"(typ->{self.table_type_short},nrows->{self.nrows}," @@ -3362,7 +3363,7 @@ def is_multi_index(self) -> bool: return isinstance(self.levels, list) def validate_multiindex( - self, obj: DataFrame | Series + self, obj: FrameOrSeriesUnion ) -> tuple[DataFrame, list[Hashable]]: """ validate that we can store the multi-index; reset and return the @@ -3518,7 +3519,7 @@ def validate_version(self, where=None): """are we trying to operate on an old version?""" if where is not None: if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: - ws = incompatibility_doc % ".".join([str(x) for x in self.version]) + ws = incompatibility_doc % ".".join(str(x) for x in self.version) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): @@ -4065,7 +4066,7 @@ def get_blk_items(mgr): new_blocks.append(b) new_blk_items.append(b_items) except (IndexError, KeyError) as err: - jitems = ",".join([pprint_thing(item) for item in items]) + jitems = ",".join(pprint_thing(item) for item in items) raise ValueError( f"cannot match existing table structure for [{jitems}] " "on appending data" @@ -4499,7 +4500,7 @@ class AppendableFrameTable(AppendableTable): pandas_kind = "frame_table" table_type = "appendable_frame" ndim = 2 - obj_type: type[DataFrame | Series] = DataFrame + obj_type: type[FrameOrSeriesUnion] = DataFrame @property def is_transposed(self) -> bool: @@ -4999,7 +5000,7 @@ def _maybe_convert_for_string_atom( # check for column in the values conflicts if existing_col is not None: eci = existing_col.validate_col(itemsize) - if eci is not None and eci > itemsize: + if eci > itemsize: itemsize = eci data_converted = data_converted.astype(f"|S{itemsize}", copy=False) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index df9c7e28bff69..b9d5b18b85e02 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -182,7 +182,7 @@ def _wrap_result( return frame -def execute(sql, con, params=None): +def execute(sql, con, cur=None, params=None): """ Execute the given SQL query using the provided connection object. @@ -194,6 +194,7 @@ def execute(sql, con, params=None): Using SQLAlchemy makes it possible to use any DB supported by the library. If a DBAPI2 object, only sqlite3 is supported. + cur : deprecated, cursor is obtained from connection, default: None params : list or tuple, optional, default: None List of parameters to pass to execute method. 
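The io/sql.py hunks above restore the deprecated `cur` argument and the is_cursor plumbing; the public entry points are otherwise unchanged. A hedged round-trip sketch against an in-memory SQLite database:

>>> import sqlite3
>>> import pandas as pd
>>> from pandas.io.sql import execute
>>> con = sqlite3.connect(":memory:")
>>> pd.DataFrame({"a": [1, 2, 3]}).to_sql("t", con, index=False)
>>> execute("SELECT COUNT(*) FROM t", con).fetchall()                 # cursor with the count
>>> pd.read_sql_query("SELECT * FROM t WHERE a > ?", con, params=(1,))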
@@ -201,7 +202,10 @@ def execute(sql, con, params=None): ------- Results Iterable """ - pandas_sql = pandasSQL_builder(con) + if cur is None: + pandas_sql = pandasSQL_builder(con) + else: + pandas_sql = pandasSQL_builder(cur, is_cursor=True) args = _convert_params(sql, params) return pandas_sql.execute(*args) @@ -770,18 +774,22 @@ def _engine_builder(con): return con -def pandasSQL_builder(con, schema: str | None = None, meta=None): +def pandasSQL_builder( + con, schema: str | None = None, meta=None, is_cursor: bool = False +): """ Convenience function to return the correct PandasSQL subclass based on the provided parameters. """ + # When support for DBAPI connections is removed, + # is_cursor should not be necessary. con = _engine_builder(con) if _is_sqlalchemy_connectable(con): return SQLDatabase(con, schema=schema, meta=meta) elif isinstance(con, str): raise ImportError("Using URI string without sqlalchemy installed.") else: - return SQLiteDatabase(con) + return SQLiteDatabase(con, is_cursor=is_cursor) class SQLTable(PandasObject): @@ -955,7 +963,7 @@ def insert(self, chunksize: int | None = None, method: str | None = None): if start_i >= end_i: break - chunk_iter = zip(*(arr[start_i:end_i] for arr in data_list)) + chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) exec_insert(conn, keys, chunk_iter) def _query_iterator( @@ -1905,7 +1913,7 @@ def insert_statement(self, *, num_rows: int): col_names = ",".join(bracketed_names) row_wildcards = ",".join([wld] * len(names)) - wildcards = ",".join([f"({row_wildcards})" for _ in range(num_rows)]) + wildcards = ",".join(f"({row_wildcards})" for _ in range(num_rows)) insert_statement = ( f"INSERT INTO {escape(self.name)} ({col_names}) VALUES {wildcards}" ) @@ -1944,7 +1952,7 @@ def _create_table_setup(self): keys = [self.keys] else: keys = self.keys - cnames_br = ", ".join([escape(c) for c in keys]) + cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})" ) @@ -1964,7 +1972,7 @@ def _create_table_setup(self): ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] if len(ix_cols): cnames = "_".join(ix_cols) - cnames_br = ",".join([escape(c) for c in ix_cols]) + cnames_br = ",".join(escape(c) for c in ix_cols) create_stmts.append( "CREATE INDEX " + escape("ix_" + self.name + "_" + cnames) @@ -2023,7 +2031,8 @@ class SQLiteDatabase(PandasSQL): """ - def __init__(self, con): + def __init__(self, con, is_cursor: bool = False): + self.is_cursor = is_cursor self.con = con @contextmanager @@ -2039,7 +2048,10 @@ def run_transaction(self): cur.close() def execute(self, *args, **kwargs): - cur = self.con.cursor() + if self.is_cursor: + cur = self.con + else: + cur = self.con.cursor() try: cur.execute(*args, **kwargs) return cur diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7f37f0293e417..ffaebb3c10ae2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -853,25 +853,15 @@ def __eq__(self, other: Any) -> bool: @classmethod def get_base_missing_value(cls, dtype: np.dtype) -> int | float: - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - if dtype == np.int8: # type: ignore[comparison-overlap] + if dtype == np.int8: value = cls.BASE_MISSING_VALUES["int8"] - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int16: # type: ignore[comparison-overlap] + elif dtype == 
np.int16: value = cls.BASE_MISSING_VALUES["int16"] - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int32: # type: ignore[comparison-overlap] + elif dtype == np.int32: value = cls.BASE_MISSING_VALUES["int32"] - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[floating[Any]]") - elif dtype == np.float32: # type: ignore[comparison-overlap] + elif dtype == np.float32: value = cls.BASE_MISSING_VALUES["float32"] - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[floating[Any]]") - elif dtype == np.float64: # type: ignore[comparison-overlap] + elif dtype == np.float64: value = cls.BASE_MISSING_VALUES["float64"] else: raise ValueError("Unsupported dtype") @@ -1367,12 +1357,12 @@ def _read_old_header(self, first_char: bytes) -> None: try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError as err: - invalid_types = ",".join([str(x) for x in typlist]) + invalid_types = ",".join(str(x) for x in typlist) raise ValueError(f"cannot convert stata types [{invalid_types}]") from err try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError as err: - invalid_dtypes = ",".join([str(x) for x in typlist]) + invalid_dtypes = ",".join(str(x) for x in typlist) raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err if self.format_version > 108: @@ -2043,25 +2033,15 @@ def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: # do? itemsize = max_len_string_array(ensure_object(column._values)) return max(itemsize, 1) - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[floating[Any]]") - elif dtype == np.float64: # type: ignore[comparison-overlap] + elif dtype == np.float64: return 255 - # Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[floating[Any]]") - elif dtype == np.float32: # type: ignore[comparison-overlap] + elif dtype == np.float32: return 254 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int32: # type: ignore[comparison-overlap] + elif dtype == np.int32: return 253 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int16: # type: ignore[comparison-overlap] + elif dtype == np.int16: return 252 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int8: # type: ignore[comparison-overlap] + elif dtype == np.int8: return 251 else: # pragma : no cover raise NotImplementedError(f"Data type {dtype} not supported.") @@ -2781,25 +2761,15 @@ def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) if itemsize <= 2045: return itemsize return 32768 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[floating[Any]]") - elif dtype == np.float64: # type: ignore[comparison-overlap] + elif dtype == np.float64: return 65526 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[floating[Any]]") - elif dtype == np.float32: # type: ignore[comparison-overlap] + elif dtype == np.float32: return 65527 - # error: Non-overlapping equality check 
(left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") [comparison-overlap] - elif dtype == np.int32: # type: ignore[comparison-overlap] + elif dtype == np.int32: return 65528 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int16: # type: ignore[comparison-overlap] + elif dtype == np.int16: return 65529 - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - elif dtype == np.int8: # type: ignore[comparison-overlap] + elif dtype == np.int8: return 65530 else: # pragma : no cover raise NotImplementedError(f"Data type {dtype} not supported.") diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 990ccbc2a015b..5d3db13610845 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -7,6 +7,8 @@ Sequence, ) +import pkg_resources + from pandas._config import get_option from pandas._typing import IndexLabel @@ -428,7 +430,7 @@ def hist_frame( y : label or position, optional Allows plotting of one column versus another. If not specified, all numerical columns are used. - color : str, array-like, or dict, optional + color : str, array_like, or dict, optional The color for each of the DataFrame's columns. Possible values are: - A single color string referred to by name, RGB or RGBA code, @@ -866,7 +868,7 @@ def _get_call_args(backend_name, data, args, kwargs): if args and isinstance(data, ABCSeries): positional_args = str(args)[1:-1] keyword_args = ", ".join( - [f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args)] + f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args) ) msg = ( "`Series.plot()` should not be called with positional " @@ -1237,11 +1239,6 @@ def box(self, by=None, **kwargs): ---------- by : str or sequence Column in the DataFrame to group by. - - .. versionchanged:: 1.4.0 - - Previously, `by` is silently ignore and makes no groupings - **kwargs Additional keywords are documented in :meth:`DataFrame.plot`. @@ -1283,11 +1280,6 @@ def hist(self, by=None, bins=10, **kwargs): ---------- by : str or sequence, optional Column in the DataFrame to group by. - - .. versionchanged:: 1.4.0 - - Previously, `by` is silently ignore and makes no groupings - bins : int, default 10 Number of histogram bins to be used. **kwargs @@ -1319,16 +1311,6 @@ def hist(self, by=None, bins=10, **kwargs): ... columns = ['one']) >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) - - A grouped histogram can be generated by providing the parameter `by` (which - can be a column name, or a list of column names): - - .. plot:: - :context: close-figs - - >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] - >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) - >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8)) """ return self(kind="hist", by=by, bins=bins, **kwargs) @@ -1589,7 +1571,7 @@ def scatter(self, x, y, s=None, c=None, **kwargs): y : int or str The column name or column position to be used as vertical coordinates for each point. - s : str, scalar or array-like, optional + s : str, scalar or array_like, optional The size of each point. Possible values are: - A string with the name of the column to be used for marker's size. @@ -1602,7 +1584,7 @@ def scatter(self, x, y, s=None, c=None, **kwargs): .. 
versionchanged:: 1.1.0 - c : str, int or array-like, optional + c : str, int or array_like, optional The color of each point. Possible values are: - A single color string referred to by name, RGB or RGBA code, @@ -1763,8 +1745,6 @@ def _load_backend(backend: str) -> types.ModuleType: types.ModuleType The imported backend. """ - from importlib.metadata import entry_points - if backend == "matplotlib": # Because matplotlib is an optional dependency and first-party backend, # we need to attempt an import here to raise an ImportError if needed. @@ -1779,13 +1759,11 @@ def _load_backend(backend: str) -> types.ModuleType: found_backend = False - eps = entry_points() - if "pandas_plotting_backends" in eps: - for entry_point in eps["pandas_plotting_backends"]: - found_backend = entry_point.name == backend - if found_backend: - module = entry_point.load() - break + for entry_point in pkg_resources.iter_entry_points("pandas_plotting_backends"): + found_backend = entry_point.name == backend + if found_backend: + module = entry_point.load() + break if not found_backend: # Fall back to unregistered, module name approach. diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 8b4cf158ac827..21f30c1311e17 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -18,7 +18,6 @@ LinePlot, MPLPlot, ) -from pandas.plotting._matplotlib.groupby import create_iter_data_given_by from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.tools import ( create_subplots, @@ -136,37 +135,18 @@ def _make_plot(self): if self.subplots: self._return_obj = pd.Series(dtype=object) - # Re-create iterated data if `by` is assigned by users - data = ( - create_iter_data_given_by(self.data, self._kind) - if self.by is not None - else self.data - ) - - for i, (label, y) in enumerate(self._iter_data(data=data)): + for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) kwds = self.kwds.copy() - # When by is applied, show title for subplots to know which group it is - # just like df.boxplot, and need to apply T on y to provide right input - if self.by is not None: - y = y.T - ax.set_title(pprint_thing(label)) - - # When `by` is assigned, the ticklabels will become unique grouped - # values, instead of label which is used as subtitle in this case. 
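The `_load_backend` hunk above swaps importlib.metadata back to pkg_resources for discovering third-party plotting backends; selection from the user's side is unchanged. A sketch, with matplotlib assumed installed:

>>> import pandas as pd
>>> pd.set_option("plotting.backend", "matplotlib")   # third-party backends resolve via the pandas_plotting_backends entry point
>>> df = pd.DataFrame({"x": range(5), "y": [v ** 2 for v in range(5)]})
>>> ax = df.plot(x="x", y="y", kind="line")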
- ticklabels = [ - pprint_thing(col) for col in self.data.columns.levels[0] - ] - else: - ticklabels = [pprint_thing(label)] - ret, bp = self._plot( ax, y, column_num=i, return_type=self.return_type, **kwds ) self.maybe_color_bp(bp) self._return_obj[label] = ret - self._set_ticklabels(ax, ticklabels) + + label = [pprint_thing(label)] + self._set_ticklabels(ax, label) else: y = self.data.values.T ax = self._get_ax(0) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index ff76bd771d1c0..7ddab91a24ec0 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -9,7 +9,6 @@ from matplotlib.artist import Artist import numpy as np -from pandas._typing import IndexLabel from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -39,12 +38,10 @@ ) import pandas.core.common as com -from pandas.core.frame import DataFrame from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters -from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( decorate_axes, @@ -102,7 +99,7 @@ def __init__( self, data, kind=None, - by: IndexLabel | None = None, + by=None, subplots=False, sharex=None, sharey=False, @@ -127,42 +124,13 @@ def __init__( table=False, layout=None, include_bool=False, - column: IndexLabel | None = None, **kwds, ): import matplotlib.pyplot as plt self.data = data - - # if users assign an empty list or tuple, raise `ValueError` - # similar to current `df.box` and `df.hist` APIs. 
- if by in ([], ()): - raise ValueError("No group keys passed!") - self.by = com.maybe_make_list(by) - - # Assign the rest of columns into self.columns if by is explicitly defined - # while column is not, only need `columns` in hist/box plot when it's DF - # TODO: Might deprecate `column` argument in future PR (#28373) - if isinstance(data, DataFrame): - if column: - self.columns = com.maybe_make_list(column) - else: - if self.by is None: - self.columns = [ - col for col in data.columns if is_numeric_dtype(data[col]) - ] - else: - self.columns = [ - col - for col in data.columns - if col not in self.by and is_numeric_dtype(data[col]) - ] - - # For `hist` plot, need to get grouped original data before `self.data` is - # updated later - if self.by is not None and self._kind == "hist": - self._grouped = data.groupby(self.by) + self.by = by self.kind = kind @@ -171,9 +139,7 @@ def __init__( self.subplots = subplots if sharex is None: - - # if by is defined, subplots are used and sharex should be False - if ax is None and by is None: + if ax is None: self.sharex = True else: # if we get an axis, the users should do the visibility @@ -307,15 +273,8 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): @property def nseries(self) -> int: - - # When `by` is explicitly assigned, grouped data size will be defined, and - # this will determine number of subplots to have, aka `self.nseries` if self.data.ndim == 1: return 1 - elif self.by is not None and self._kind == "hist": - return len(self._grouped) - elif self.by is not None and self._kind == "box": - return len(self.columns) else: return self.data.shape[1] @@ -461,14 +420,6 @@ def _compute_plot_data(self): if label is None and data.name is None: label = "None" data = data.to_frame(name=label) - elif self._kind in ("hist", "box"): - cols = self.columns if self.by is None else self.columns + self.by - data = data.loc[:, cols] - - # GH15079 reconstruct data if by is defined - if self.by is not None: - self.subplots = True - data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` @@ -763,7 +714,7 @@ def _get_index_name(self) -> str | None: if isinstance(self.data.index, ABCMultiIndex): name = self.data.index.names if com.any_not_none(*name): - name = ",".join([pprint_thing(x) for x in name]) + name = ",".join(pprint_thing(x) for x in name) else: name = None else: diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py deleted file mode 100644 index 37cc3186fe097..0000000000000 --- a/pandas/plotting/_matplotlib/groupby.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import annotations - -import numpy as np - -from pandas._typing import ( - Dict, - IndexLabel, -) - -from pandas.core.dtypes.missing import remove_na_arraylike - -from pandas import ( - DataFrame, - MultiIndex, - Series, - concat, -) - - -def create_iter_data_given_by( - data: DataFrame, kind: str = "hist" -) -> Dict[str, DataFrame | Series]: - """ - Create data for iteration given `by` is assigned or not, and it is only - used in both hist and boxplot. - - If `by` is assigned, return a dictionary of DataFrames in which the key of - dictionary is the values in groups. - If `by` is not assigned, return input as is, and this preserves current - status of iter_data. - - Parameters - ---------- - data : reformatted grouped data from `_compute_plot_data` method. - kind : str, plot kind. This function is only used for `hist` and `box` plots. 
- - Returns - ------- - iter_data : DataFrame or Dictionary of DataFrames - - Examples - -------- - If `by` is assigned: - - >>> import numpy as np - >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] - >>> mi = MultiIndex.from_tuples(tuples) - >>> value = [[1, 3, np.nan, np.nan], - ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] - >>> data = DataFrame(value, columns=mi) - >>> create_iter_data_given_by(data) - {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), - 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} - """ - - # For `hist` plot, before transformation, the values in level 0 are values - # in groups and subplot titles, and later used for column subselection and - # iteration; For `box` plot, values in level 1 are column names to show, - # and are used for iteration and as subplots titles. - if kind == "hist": - level = 0 - else: - level = 1 - - # Select sub-columns based on the value of level of MI, and if `by` is - # assigned, data must be a MI DataFrame - assert isinstance(data.columns, MultiIndex) - return { - col: data.loc[:, data.columns.get_level_values(level) == col] - for col in data.columns.levels[level] - } - - -def reconstruct_data_with_by( - data: DataFrame, by: IndexLabel, cols: IndexLabel -) -> DataFrame: - """ - Internal function to group data, and reassign multiindex column names onto the - result in order to let grouped data be used in _compute_plot_data method. - - Parameters - ---------- - data : Original DataFrame to plot - by : grouped `by` parameter selected by users - cols : columns of data set (excluding columns used in `by`) - - Returns - ------- - Output is the reconstructed DataFrame with MultiIndex columns. The first level - of MI is unique values of groups, and second level of MI is the columns - selected by users. - - Examples - -------- - >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} - >>> df = DataFrame(d) - >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) - h1 h2 - a b a b - 0 1 3 NaN NaN - 1 3 4 NaN NaN - 2 NaN NaN 5 6 - """ - grouped = data.groupby(by) - - data_list = [] - for key, group in grouped: - columns = MultiIndex.from_product([[key], cols]) - sub_group = group[cols] - sub_group.columns = columns - data_list.append(sub_group) - - data = concat(data_list, axis=1) - return data - - -def reformat_hist_y_given_by( - y: Series | np.ndarray, by: IndexLabel | None -) -> Series | np.ndarray: - """Internal function to reformat y given `by` is applied or not for hist plot. - - If by is None, input y is 1-d with NaN removed; and if by is not None, groupby - will take place and input y is multi-dimensional array. 
- """ - if by is not None and len(y.shape) > 1: - return np.array([remove_na_arraylike(col) for col in y.T]).T - return remove_na_arraylike(y) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 08cffbf475db0..a02d9a2b9dc8d 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -17,17 +17,11 @@ remove_na_arraylike, ) -from pandas.core.frame import DataFrame - from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( LinePlot, MPLPlot, ) -from pandas.plotting._matplotlib.groupby import ( - create_iter_data_given_by, - reformat_hist_y_given_by, -) from pandas.plotting._matplotlib.tools import ( create_subplots, flatten_axes, @@ -49,30 +43,19 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - - # calculate bin number separately in different subplots - # where subplots are created based on by argument if is_integer(self.bins): - if self.by is not None: - grouped = self.data.groupby(self.by)[self.columns] - self.bins = [self._calculate_bins(group) for key, group in grouped] - else: - self.bins = self._calculate_bins(self.data) + # create common bin edge + values = self.data._convert(datetime=True)._get_numeric_data() + values = np.ravel(values) + values = values[~isna(values)] + + _, self.bins = np.histogram( + values, bins=self.bins, range=self.kwds.get("range", None) + ) if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - def _calculate_bins(self, data: DataFrame) -> np.ndarray: - """Calculate bins given data""" - values = data._convert(datetime=True)._get_numeric_data() - values = np.ravel(values) - values = values[~isna(values)] - - hist, bins = np.histogram( - values, bins=self.bins, range=self.kwds.get("range", None) - ) - return bins - @classmethod def _plot( cls, @@ -87,6 +70,7 @@ def _plot( ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) + y = y[~isna(y)] base = np.zeros(len(bins) - 1) bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) @@ -99,14 +83,7 @@ def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() - # Re-create iterated data if `by` is assigned by users - data = ( - create_iter_data_given_by(self.data, self._kind) - if self.by is not None - else self.data - ) - - for i, (label, y) in enumerate(self._iter_data(data=data)): + for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) kwds = self.kwds.copy() @@ -121,15 +98,6 @@ def _make_plot(self): kwds = self._make_plot_keywords(kwds, y) - # the bins is multi-dimension array now and each plot need only 1-d and - # when by is applied, label should be columns that are grouped - if self.by is not None: - kwds["bins"] = kwds["bins"][i] - kwds["label"] = self.columns - kwds.pop("color") - - y = reformat_hist_y_given_by(y, self.by) - # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, # and each sub-array (10,) will be called in each iteration. 
If users only # provide 1D array, we assume the same weights is used for all iterations @@ -138,11 +106,6 @@ def _make_plot(self): kwds["weights"] = weights[:, i] artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) - - # when by is applied, show title for subplots to know which group it is - if self.by is not None: - ax.set_title(pprint_thing(label)) - self._append_legend_handles_labels(artists[0], label) def _make_plot_keywords(self, kwds, y): diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 3cd312b06020d..3b9c5eae70b42 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -16,6 +16,7 @@ to_offset, ) from pandas._libs.tslibs.dtypes import FreqGroup +from pandas._typing import FrameOrSeriesUnion from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -39,7 +40,6 @@ from matplotlib.axes import Axes from pandas import ( - DataFrame, DatetimeIndex, Index, Series, @@ -210,7 +210,7 @@ def _get_freq(ax: Axes, series: Series): return freq, ax_freq -def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: +def use_dynamic_x(ax: Axes, data: FrameOrSeriesUnion) -> bool: freq = _get_index_freq(data.index) ax_freq = _get_ax_freq(ax) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 9d509d02c2e4f..9bfa24b6371ab 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -13,6 +13,8 @@ import matplotlib.ticker as ticker import numpy as np +from pandas._typing import FrameOrSeriesUnion + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -29,11 +31,6 @@ from matplotlib.lines import Line2D from matplotlib.table import Table - from pandas import ( - DataFrame, - Series, - ) - def do_adjust_figure(fig: Figure): """Whether fig has constrained_layout enabled.""" @@ -58,7 +55,7 @@ def format_date_labels(ax: Axes, rot): def table( - ax, data: DataFrame | Series, rowLabels=None, colLabels=None, **kwargs + ax, data: FrameOrSeriesUnion, rowLabels=None, colLabels=None, **kwargs ) -> Table: if isinstance(data, ABCSeries): data = data.to_frame() diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 95dc1d82cb286..38984238ecf65 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -193,6 +193,7 @@ class TestPDApi(Base): "_hashtable", "_lib", "_libs", + "_np_version_under1p18", "_is_numpy_dev", "_testing", "_tslib", diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 801cbdf3d0a87..2511f6fc2563c 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1,4 +1,5 @@ from datetime import datetime +from itertools import chain import warnings import numpy as np @@ -52,17 +53,6 @@ def test_apply_axis1_with_ea(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "data, dtype", - [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)], -) -def test_agg_axis1_duplicate_index(data, dtype): - # GH 42380 - expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype) - result = expected.agg(lambda x: x, axis=1) - tm.assert_frame_equal(result, expected) - - def test_apply_mixed_datetimelike(): # mixed datetimelike # GH 7778 @@ -158,6 +148,32 @@ def test_apply_standard_nonunique(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("func", ["sum", 
"mean", "min", "max", "std"]) +@pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), + ], +) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): + if len(args) > 1 and how == "agg": + request.node.add_marker( + pytest.mark.xfail( + raises=TypeError, + reason="agg/apply signature mismatch - agg passes 2nd " + "argument to func", + ) + ) + result = getattr(float_frame, how)(func, *args, **kwds) + expected = getattr(float_frame, func)(*args, **kwds) + tm.assert_series_equal(result, expected) + + def test_apply_broadcast(float_frame, int_frame_const_col): # scalars @@ -1263,9 +1279,9 @@ def test_size_as_str(how, axis): # on the columns result = getattr(df, how)("size", axis=axis) if axis == 0 or axis == "index": - expected = Series(df.shape[0], index=df.columns) + expected = Series(df.shape[0], index=df.columns, name="size") else: - expected = Series(df.shape[1], index=df.index) + expected = Series(df.shape[1], index=df.index, name="size") tm.assert_series_equal(result, expected) @@ -1285,6 +1301,76 @@ def func(group_col): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), + [ + ("sum", Series(dtype="float64")), + ("max", Series(dtype="float64")), + ("min", Series(dtype="float64")), + ("all", Series(dtype=bool)), + ("any", Series(dtype=bool)), + ("mean", Series(dtype="float64")), + ("prod", Series(dtype="float64")), + ("std", Series(dtype="float64")), + ("var", Series(dtype="float64")), + ("median", Series(dtype="float64")), + ], + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("sum", Series([1.0, 3])), + ("max", Series([1.0, 2])), + ("min", Series([1.0, 1])), + ("all", Series([True, True])), + ("any", Series([True, True])), + ("mean", Series([1, 1.5])), + ("prod", Series([1.0, 2])), + ("std", Series([np.nan, 0.707107])), + ("var", Series([np.nan, 0.5])), + ("median", Series([1, 1.5])), + ], + ), + ), +) +def test_agg_cython_table(df, func, expected, axis): + # GH 21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = df.agg(func, axis=axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), + ], + ), + ), +) +def test_agg_cython_table_transform(df, func, expected, axis): + # GH 21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + + result = df.agg(func, axis=axis) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "args, kwargs", @@ -1413,6 +1499,31 @@ def test_apply_raw_returns_string(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", 
"square"] +) +@pytest.mark.parametrize("how", ["transform", "apply"]) +def test_apply_np_transformer(float_frame, op, how): + # GH 39116 + result = getattr(float_frame, how)(op) + expected = getattr(np, op)(float_frame) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", ["mean", "median", "std", "var"]) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_apply_np_reducer(float_frame, op, how): + # GH 39116 + float_frame = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = getattr(float_frame, how)(op) + # pandas ddof defaults to 1, numpy to 0 + kwargs = {"ddof": 1} if op in ("std", "var") else {} + expected = Series( + getattr(np, op)(float_frame, axis=0, **kwargs), index=float_frame.columns + ) + tm.assert_series_equal(result, expected) + + def test_aggregation_func_column_order(): # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 47173d14c543d..9050fab702881 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -1,3 +1,5 @@ +import operator + import numpy as np import pytest @@ -36,6 +38,33 @@ def test_transform_ufunc(axis, float_frame, frame_or_series): tm.assert_equal(result, expected) +@pytest.mark.parametrize("op", frame_transform_kernels) +def test_transform_groupby_kernel(axis, float_frame, op, request): + # GH 35964 + + args = [0.0] if op == "fillna" else [] + if axis == 0 or axis == "index": + ones = np.ones(float_frame.shape[0]) + else: + ones = np.ones(float_frame.shape[1]) + expected = float_frame.groupby(ones, axis=axis).transform(op, *args) + result = float_frame.transform(op, axis, *args) + tm.assert_frame_equal(result, expected) + + # same thing, but ensuring we have multiple blocks + assert "E" not in float_frame.columns + float_frame["E"] = float_frame["A"].copy() + assert len(float_frame._mgr.arrays) > 1 + + if axis == 0 or axis == "index": + ones = np.ones(float_frame.shape[0]) + else: + ones = np.ones(float_frame.shape[1]) + expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args) + result2 = float_frame.transform(op, axis, *args) + tm.assert_frame_equal(result2, expected2) + + @pytest.mark.parametrize( "ops, names", [ @@ -126,6 +155,15 @@ def func(x): tm.assert_equal(result, expected) +@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) +def test_transform_method_name(method): + # GH 19760 + df = DataFrame({"A": [-1, 2]}) + result = df.transform(method) + expected = operator.methodcaller(method)(df) + tm.assert_frame_equal(result, expected) + + wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"] frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail] diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 2af340f0c1bb9..34d00e653b52d 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -2,10 +2,13 @@ Counter, defaultdict, ) +from itertools import chain import numpy as np import pytest +from pandas.core.dtypes.common import is_number + import pandas as pd from pandas import ( DataFrame, @@ -84,6 +87,14 @@ def f(x): assert result.dtype == object +def test_with_string_args(datetime_series): + + for arg in ["sum", "mean", "min", "max", "std"]: + result = datetime_series.apply(arg) + expected = getattr(datetime_series, arg)() + assert result 
== expected + + def test_apply_args(): s = Series(["foo,bar"]) @@ -407,6 +418,92 @@ def test_non_callable_aggregates(how): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + ("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", True), + ("any", True), + ], + ), + ), +) +def test_agg_cython_table(series, func, expected): + # GH21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = series.agg(func) + if is_number(expected): + assert np.isclose(result, expected, equal_nan=True) + else: + assert result == expected + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("cumprod", Series([], Index([]), dtype=np.float64)), + ("cumsum", Series([], Index([]), dtype=np.float64)), + ], + ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], + ), + tm.get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), +) +def test_agg_cython_table_transform(series, func, expected): + # GH21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + result = series.agg(func) + tm.assert_series_equal(result, expected) + + def test_series_apply_no_suffix_index(): # GH36189 s = Series([4] * 3) diff --git a/pandas/tests/apply/test_series_transform.py b/pandas/tests/apply/test_series_transform.py index b10af13eae20c..90065d20e1a59 100644 --- a/pandas/tests/apply/test_series_transform.py +++ b/pandas/tests/apply/test_series_transform.py @@ -8,6 +8,24 @@ concat, ) import pandas._testing as tm +from pandas.core.groupby.base import transformation_kernels + +# tshift only works on time index and is deprecated +# There is no Series.cumcount +series_kernels = [ + x for x in sorted(transformation_kernels) if x not in ["tshift", "cumcount"] +] + + +@pytest.mark.parametrize("op", series_kernels) +def test_transform_groupby_kernel(string_series, op): + # GH 35964 + + args = [0.0] if op == "fillna" else [] + ones = np.ones(string_series.shape[0]) + expected = string_series.groupby(ones).transform(op, *args) + result = string_series.transform(op, 0, *args) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py deleted file mode 100644 index 67e8dd520dc3b..0000000000000 --- a/pandas/tests/apply/test_str.py +++ /dev/null @@ -1,289 +0,0 @@ -from itertools import chain -import operator - -import numpy as np -import pytest - -from pandas.core.dtypes.common import is_number - -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm -from pandas.tests.apply.common import ( - frame_transform_kernels, - series_transform_kernels, -) - - -@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) 
-@pytest.mark.parametrize( - "args,kwds", - [ - pytest.param([], {}, id="no_args_or_kwds"), - pytest.param([1], {}, id="axis_from_args"), - pytest.param([], {"axis": 1}, id="axis_from_kwds"), - pytest.param([], {"numeric_only": True}, id="optional_kwds"), - pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), - ], -) -@pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): - if len(args) > 1 and how == "agg": - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason="agg/apply signature mismatch - agg passes 2nd " - "argument to func", - ) - ) - result = getattr(float_frame, how)(func, *args, **kwds) - expected = getattr(float_frame, func)(*args, **kwds) - tm.assert_series_equal(result, expected) - - -def test_with_string_args(datetime_series): - - for arg in ["sum", "mean", "min", "max", "std"]: - result = datetime_series.apply(arg) - expected = getattr(datetime_series, arg)() - assert result == expected - - -@pytest.mark.parametrize("op", ["mean", "median", "std", "var"]) -@pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_np_reducer(float_frame, op, how): - # GH 39116 - float_frame = DataFrame({"a": [1, 2], "b": [3, 4]}) - result = getattr(float_frame, how)(op) - # pandas ddof defaults to 1, numpy to 0 - kwargs = {"ddof": 1} if op in ("std", "var") else {} - expected = Series( - getattr(np, op)(float_frame, axis=0, **kwargs), index=float_frame.columns - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"] -) -@pytest.mark.parametrize("how", ["transform", "apply"]) -def test_apply_np_transformer(float_frame, op, how): - # GH 39116 - result = getattr(float_frame, how)(op) - expected = getattr(np, op)(float_frame) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("sum", 0), - ("max", np.nan), - ("min", np.nan), - ("all", True), - ("any", False), - ("mean", np.nan), - ("prod", 1), - ("std", np.nan), - ("var", np.nan), - ("median", np.nan), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("sum", 6), - ("max", 3), - ("min", 1), - ("all", True), - ("any", True), - ("mean", 2), - ("prod", 6), - ("std", 1), - ("var", 1), - ("median", 2), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), - [ - ("sum", "abc"), - ("max", "c"), - ("min", "a"), - ("all", True), - ("any", True), - ], - ), - ), -) -def test_agg_cython_table_series(series, func, expected): - # GH21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = series.agg(func) - if is_number(expected): - assert np.isclose(result, expected, equal_nan=True) - else: - assert result == expected - - -@pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("cumprod", Series([], Index([]), dtype=np.float64)), - ("cumsum", Series([], Index([]), dtype=np.float64)), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("cumprod", Series([np.nan, 1, 2, 6])), - ("cumsum", Series([np.nan, 1, 3, 6])), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] - ), - ), -) -def test_agg_cython_table_transform_series(series, func, expected): - # GH21224 - # test transforming functions in - # 
pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - result = series.agg(func) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), - [ - ("sum", Series(dtype="float64")), - ("max", Series(dtype="float64")), - ("min", Series(dtype="float64")), - ("all", Series(dtype=bool)), - ("any", Series(dtype=bool)), - ("mean", Series(dtype="float64")), - ("prod", Series(dtype="float64")), - ("std", Series(dtype="float64")), - ("var", Series(dtype="float64")), - ("median", Series(dtype="float64")), - ], - ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), - [ - ("sum", Series([1.0, 3])), - ("max", Series([1.0, 2])), - ("min", Series([1.0, 1])), - ("all", Series([True, True])), - ("any", Series([True, True])), - ("mean", Series([1, 1.5])), - ("prod", Series([1.0, 2])), - ("std", Series([np.nan, 0.707107])), - ("var", Series([np.nan, 0.5])), - ("median", Series([1, 1.5])), - ], - ), - ), -) -def test_agg_cython_table_frame(df, func, expected, axis): - # GH 21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = df.agg(func, axis=axis) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] - ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), - [ - ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), - ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), - ], - ), - ), -) -def test_agg_cython_table_transform_frame(df, func, expected, axis): - # GH 21224 - # test transforming functions in - # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if axis == "columns" or axis == 1: - # operating blockwise doesn't let us preserve dtypes - expected = expected.astype("float64") - - result = df.agg(func, axis=axis) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("op", series_transform_kernels) -def test_transform_groupby_kernel_series(string_series, op): - # GH 35964 - - args = [0.0] if op == "fillna" else [] - ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones).transform(op, *args) - result = string_series.transform(op, 0, *args) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("op", frame_transform_kernels) -def test_transform_groupby_kernel_frame( - axis, float_frame, op, using_array_manager, request -): - # GH 35964 - if using_array_manager and op == "pct_change" and axis in (1, "columns"): - # TODO(ArrayManager) shift with axis=1 - request.node.add_marker( - pytest.mark.xfail( - reason="shift axis=1 not yet implemented for ArrayManager" - ) - ) - - args = [0.0] if op == "fillna" else [] - if axis == 0 or axis == "index": - ones = np.ones(float_frame.shape[0]) - else: - ones = np.ones(float_frame.shape[1]) - expected = float_frame.groupby(ones, axis=axis).transform(op, *args) - result = float_frame.transform(op, axis, *args) - tm.assert_frame_equal(result, expected) - - # same thing, but ensuring we have multiple blocks - assert "E" not in float_frame.columns - float_frame["E"] = float_frame["A"].copy() - assert len(float_frame._mgr.arrays) > 1 - - if axis == 0 or axis == "index": - ones = np.ones(float_frame.shape[0]) - else: - ones = np.ones(float_frame.shape[1]) - expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args) - result2 = float_frame.transform(op, axis, *args) - 
tm.assert_frame_equal(result2, expected2) - - -@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) -def test_transform_method_name(method): - # GH 19760 - df = DataFrame({"A": [-1, 2]}) - result = df.transform(method) - expected = operator.methodcaller(method)(df) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 89f2241fc6993..c0287df1694e9 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -186,19 +186,15 @@ def test_searchsorted(self, ordered): tm.assert_numpy_array_equal(res_ser, exp) # Searching for a single value that is not from the Categorical - with pytest.raises(TypeError, match="cucumber"): + with pytest.raises(KeyError, match="cucumber"): cat.searchsorted("cucumber") - with pytest.raises(TypeError, match="cucumber"): + with pytest.raises(KeyError, match="cucumber"): ser.searchsorted("cucumber") # Searching for multiple values one of each is not from the Categorical - msg = ( - "Cannot setitem on a Categorical with a new category, " - "set the categories first" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(KeyError, match="cucumber"): cat.searchsorted(["bread", "cucumber"]) - with pytest.raises(TypeError, match=msg): + with pytest.raises(KeyError, match="cucumber"): ser.searchsorted(["bread", "cucumber"]) def test_unique(self, ordered): diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 807a046cfbf13..5b31776301f7b 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -73,7 +73,7 @@ def test_setitem_different_unordered_raises(self, other): target = Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) msg = "Cannot set a Categorical with another, without identical categories" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): target[mask] = other[mask] @pytest.mark.parametrize( @@ -89,7 +89,7 @@ def test_setitem_same_ordered_raises(self, other): target = Categorical(["a", "b"], categories=["a", "b"], ordered=True) mask = np.array([True, False]) msg = "Cannot set a Categorical with another, without identical categories" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): target[mask] = other[mask] def test_setitem_tuple(self): @@ -260,7 +260,7 @@ def test_where_other_categorical(self): def test_where_new_category_raises(self): ser = Series(Categorical(["a", "b", "c"])) msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ser.where([True, False, True], "d") def test_where_ordered_differs_rasies(self): @@ -270,7 +270,7 @@ def test_where_ordered_differs_rasies(self): other = Categorical( ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True ) - with pytest.raises(TypeError, match="without identical categories"): + with pytest.raises(ValueError, match="without identical categories"): ser.where([True, False, True], other) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index f419aa6f181f2..930d890ee91d4 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -84,12 +84,7 @@ def 
test_fillna_raises(self, fillna_kwargs, msg): # https://github.com/pandas-dev/pandas/issues/13628 cat = Categorical([1, 2, 3, None, None]) - if len(fillna_kwargs) == 1 and "value" in fillna_kwargs: - err = TypeError - else: - err = ValueError - - with pytest.raises(err, match=msg): + with pytest.raises(ValueError, match=msg): cat.fillna(**fillna_kwargs) @pytest.mark.parametrize("named", [True, False]) @@ -109,7 +104,7 @@ def test_fillna_iterable_category(self, named): # not NotImplementedError GH#41914 cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object)) msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): cat.fillna(Point(0, 0)) def test_fillna_array(self): diff --git a/pandas/tests/arrays/categorical/test_take.py b/pandas/tests/arrays/categorical/test_take.py index fbdbea1dae3b2..6cb54908724c9 100644 --- a/pandas/tests/arrays/categorical/test_take.py +++ b/pandas/tests/arrays/categorical/test_take.py @@ -81,7 +81,7 @@ def test_take_fill_value(self): def test_take_fill_value_new_raises(self): # https://github.com/pandas-dev/pandas/issues/23296 cat = Categorical(["a", "b", "c"]) - xpr = r"Cannot setitem on a Categorical with a new category \(d\)" + xpr = r"'fill_value=d' is not present in this Categorical's categories" with pytest.raises(TypeError, match=xpr): cat.take([0, 1, -1], fill_value="d", allow_fill=True) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c6240600d3a05..5731f02430a9d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,6 +2,9 @@ This module tests the functionality of StringArray and ArrowStringArray. Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ + +import re + import numpy as np import pytest @@ -311,7 +314,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" + msg = re.escape("int() argument must be a string, a bytes-like object or a number") with pytest.raises(TypeError, match=msg): arr.astype("int64") diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 958ccec930f0e..61d56df485ab1 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -309,14 +309,6 @@ def test_scalar_raises(): pd.array(1) -def test_bounds_check(): - # GH21796 - with pytest.raises( - TypeError, match=r"cannot safely cast non-equivalent int(32|64) to uint16" - ): - pd.array([-1, 2, 3], dtype="UInt16") - - # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. 
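The Categorical test changes above (test_analytics.py, test_indexing.py, test_missing.py, test_take.py) all exercise the same rule: a value that is not among a Categorical's existing categories cannot be written into it; only the exception class differs between the two sides of this diff (TypeError on one, ValueError/KeyError on the other). A minimal sketch of that rule (illustrative only, not part of the patch):

    import pandas as pd

    cat = pd.Categorical(["a", "b", "c"])
    try:
        cat[0] = "d"  # "d" is not an existing category
    except (TypeError, ValueError) as err:
        # "Cannot setitem on a Categorical with a new category, set the categories first"
        print(err)

    # Registering the category first makes the assignment legal.
    cat = cat.add_categories(["d"])
    cat[0] = "d"
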
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 1e150f1b431c7..3f3f3a5ee8d18 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -10,6 +10,7 @@ OutOfBoundsDatetime, Timestamp, ) +from pandas.compat import np_version_under1p18 import pandas.util._test_decorators as td import pandas as pd @@ -287,7 +288,12 @@ def test_searchsorted(self): # GH#29884 match numpy convention on whether NaT goes # at the end or the beginning result = arr.searchsorted(NaT) - assert result == 10 + if np_version_under1p18: + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + assert result == 0 + else: + assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage): @@ -1238,11 +1244,17 @@ def test_invalid_nat_setitem_array(arr, non_casting_nats): ], ) def test_to_numpy_extra(arr): + if np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan + arr[0] = NaT original = arr.copy() result = arr.to_numpy() - assert np.isnan(result[0]) + assert isnan(result[0]) result = arr.to_numpy(dtype="int64") assert result[0] == -9223372036854775808 diff --git a/pandas/tests/base/test_transpose.py b/pandas/tests/base/test_transpose.py index 246f33d27476c..5ba278368834c 100644 --- a/pandas/tests/base/test_transpose.py +++ b/pandas/tests/base/test_transpose.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas import ( - CategoricalDtype, - DataFrame, -) import pandas._testing as tm @@ -29,28 +25,3 @@ def test_numpy_transpose(index_or_series_obj): with pytest.raises(ValueError, match=msg): np.transpose(obj, axes=1) - - -@pytest.mark.parametrize( - "data, transposed_data, index, columns, dtype", - [ - ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int), - ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])), - ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int), - ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])), - ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int), - ( - [[1, 2], [3, 4]], - [[1, 3], [2, 4]], - ["a", "a"], - ["b", "b"], - CategoricalDtype([1, 2, 3, 4]), - ), - ], -) -def test_duplicate_labels(data, transposed_data, index, columns, dtype): - # GH 42380 - df = DataFrame(data, index=index, columns=columns, dtype=dtype) - result = df.T - expected = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index b6c6baf6cc7e4..7cf319e1d134c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -21,6 +21,7 @@ from pandas import ( DataFrame, Series, + compat, date_range, ) import pandas._testing as tm @@ -1282,8 +1283,10 @@ def test_assignment_column(self): msg = "left hand side of an assignment must be a single name" with pytest.raises(SyntaxError, match=msg): df.eval("d,c = a + b") - - msg = "cannot assign to function call" + if compat.PY38: + msg = "cannot assign to function call" + else: + msg = "can't assign to function call" with pytest.raises(SyntaxError, match=msg): df.eval('Timestamp("20131001") = a + b') @@ -1968,7 +1971,9 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): "other", [ "'x'", - "...", + pytest.param( + "...", marks=pytest.mark.xfail(not compat.PY38, 
reason="GH-28116") + ), ], ) def test_equals_various(other): diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py deleted file mode 100644 index 13dc82d779f95..0000000000000 --- a/pandas/tests/dtypes/cast/test_dict_compat.py +++ /dev/null @@ -1,14 +0,0 @@ -import numpy as np - -from pandas.core.dtypes.cast import dict_compat - -from pandas import Timestamp - - -def test_dict_compat(): - data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} - data_unchanged = {1: 2, 3: 4, 5: 6} - expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} - assert dict_compat(data_datetime64) == expected - assert dict_compat(expected) == expected - assert dict_compat(data_unchanged) == data_unchanged diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 8f241679d5108..3e6b1cbfb311c 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -4,7 +4,6 @@ import pytest from pandas.core.dtypes.common import ( - is_datetime64tz_dtype, is_interval_dtype, is_period_dtype, ) @@ -329,9 +328,6 @@ def test_unstack(self, data, index, obj): ) if obj == "series": # TODO: special cases belong in dtype-specific tests - if is_datetime64tz_dtype(data.dtype): - assert expected.dtypes.apply(is_datetime64tz_dtype).all() - expected = expected.astype(object) if is_period_dtype(data.dtype): assert expected.dtypes.apply(is_period_dtype).all() expected = expected.astype(object) diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 54e31e05e8b0e..bb8347f0a0122 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -193,6 +193,40 @@ def test_concat_mixed_dtypes(self, data): # drops the tz. super().test_concat_mixed_dtypes(data) + @pytest.mark.parametrize("obj", ["series", "frame"]) + def test_unstack(self, obj): + # GH-13287: can't use base test, since building the expected fails. + dtype = DatetimeTZDtype(tz="US/Central") + data = DatetimeArray._from_sequence( + ["2000", "2001", "2002", "2003"], + dtype=dtype, + ) + index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]) + + if obj == "series": + ser = pd.Series(data, index=index) + expected = pd.DataFrame( + {"A": data.take([0, 1]), "B": data.take([2, 3])}, + index=pd.Index(["a", "b"], name="b"), + ) + expected.columns.name = "a" + + else: + ser = pd.DataFrame({"A": data, "B": data}, index=index) + expected = pd.DataFrame( + { + ("A", "A"): data.take([0, 1]), + ("A", "B"): data.take([2, 3]), + ("B", "A"): data.take([0, 1]), + ("B", "B"): data.take([2, 3]), + }, + index=pd.Index(["a", "b"], name="b"), + ) + expected.columns.names = [None, "a"] + + result = ser.unstack(0) + self.assert_equal(result, expected) + class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 9c21f717573c1..f0d3fb7ff9e1b 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -13,7 +13,6 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" - import numpy as np import pytest diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 71e8f84b4ad01..073e7b0357124 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -299,7 +299,7 @@ def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_col # boolean with the duplicate raises df = df_dup_cols - msg = "cannot reindex on an axis with duplicate labels" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df[df.A > 6] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 077301613eb8b..e2121fa2318eb 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1283,7 +1283,7 @@ def test_object_casting_indexing_wraps_datetimelike(using_array_manager): assert isinstance(val, pd.Timedelta) -msg1 = r"Cannot setitem on a Categorical with a new category( \(.*\))?, set the" +msg1 = "Cannot setitem on a Categorical with a new category, set the categories first" msg2 = "Cannot set a Categorical with another, without identical categories" @@ -1348,7 +1348,7 @@ def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer): tm.assert_frame_equal(df, exp_multi_row) df = orig.copy() - with pytest.raises(TypeError, match=msg1): + with pytest.raises(ValueError, match=msg1): indexer(df)[key, :] = [["c", 2], ["c", 2]] @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat]) @@ -1367,7 +1367,7 @@ def test_loc_iloc_at_iat_setitem_single_value_in_categories( tm.assert_frame_equal(df, exp_single_cats_value) # "c" is not among the categories for df["cat"] - with pytest.raises(TypeError, match=msg1): + with pytest.raises(ValueError, match=msg1): indexer(df)[key] = "c" @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) @@ -1401,7 +1401,7 @@ def test_loc_iloc_setitem_full_row_non_categorical_rhs( tm.assert_frame_equal(df, exp_single_row) # "c" is not among the categories for df["cat"] - with pytest.raises(TypeError, match=msg1): + with pytest.raises(ValueError, match=msg1): indexer(df)[key, :] = ["c", 2] @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) @@ -1423,14 +1423,14 @@ def test_loc_iloc_setitem_partial_col_categorical_rhs( # categories do not match df["cat"]'s, but "b" is among them semi_compat = Categorical(list("bb"), categories=list("abc")) - with pytest.raises(TypeError, match=msg2): + with pytest.raises(ValueError, match=msg2): # different categories but holdable values # -> not sure if this should fail or pass indexer(df)[key] = semi_compat # categories do not match df["cat"]'s, and "c" is not among them incompat = Categorical(list("cc"), categories=list("abc")) - with pytest.raises(TypeError, match=msg2): + with pytest.raises(ValueError, match=msg2): # different values indexer(df)[key] = incompat @@ -1450,5 +1450,5 @@ def test_loc_iloc_setitem_non_categorical_rhs( tm.assert_frame_equal(df, exp_parts_cats_col) # "c" not part of the categories - with pytest.raises(TypeError, match=msg1): + with pytest.raises(ValueError, match=msg1): indexer(df)[key] = ["c", "c"] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 25682330fe19a..62d7535159f13 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -68,7 +68,7 @@ def test_setitem_error_msmgs(self): index=Index(["a", "b", "c", 
"a"], name="foo"), name="fiz", ) - msg = "cannot reindex on an axis with duplicate labels" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df["newcol"] = ser diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index d2704876c31c5..ccd989e2de411 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -129,23 +129,6 @@ def test_xs_view(self, using_array_manager): class TestXSWithMultiIndex: - def test_xs_doc_example(self): - # TODO: more descriptive name - # based on example in advanced.rst - arrays = [ - ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - tuples = list(zip(*arrays)) - - index = MultiIndex.from_tuples(tuples, names=["first", "second"]) - df = DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index) - - result = df.xs(("one", "bar"), level=("second", "first"), axis=1) - - expected = df.iloc[:, [0]] - tm.assert_frame_equal(result, expected) - def test_xs_integer_key(self): # see GH#2107 dates = range(20111201, 20111205) @@ -318,13 +301,12 @@ def test_xs_IndexSlice_argument_not_implemented(self, klass): if klass is Series: obj = obj[0] - expected = obj.iloc[-2:].droplevel(0) - - result = obj.xs(IndexSlice[("foo", "qux", 0), :]) - tm.assert_equal(result, expected) - - result = obj.loc[IndexSlice[("foo", "qux", 0), :]] - tm.assert_equal(result, expected) + msg = ( + "Expected label or tuple of labels, got " + r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)" + ) + with pytest.raises(TypeError, match=msg): + obj.xs(IndexSlice[("foo", "qux", 0), :]) @pytest.mark.parametrize("klass", [DataFrame, Series]) def test_xs_levels_raises(self, klass): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1f1991214aad0..881f8db305240 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -632,9 +632,13 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): - # GH#41409 + def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request): tz = tz_naive_fixture + if tz is None: + mark = pytest.mark.xfail( + reason="GH#36153 uses ndarray formatting instead of DTA formatting" + ) + request.node.add_marker(mark) dti = date_range("2016-01-01", periods=3, tz=tz) dta = dti._data @@ -656,40 +660,11 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): alt = obj.astype(str) assert np.all(alt.iloc[1:] == result.iloc[1:]) - def test_astype_td64_to_string(self, frame_or_series): - # GH#41409 - tdi = pd.timedelta_range("1 Day", periods=3) - obj = frame_or_series(tdi) - - expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string") - result = obj.astype("string") - tm.assert_equal(result, expected) - def test_astype_bytes(self): # GH#39474 result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3") - @pytest.mark.parametrize( - "index_slice", - [ - np.s_[:2, :2], - np.s_[:1, :2], - np.s_[:2, :1], - np.s_[::2, ::2], - np.s_[::1, ::2], - np.s_[::2, ::1], - ], - ) - def test_astype_noncontiguous(self, index_slice): - # GH#42396 - data = np.arange(16).reshape(4, 4) - df = DataFrame(data) - - result = df.iloc[index_slice].astype("int16") - 
expected = df.iloc[index_slice] - tm.assert_frame_equal(result, expected, check_dtype=False) - class TestAstypeCategorical: def test_astype_from_categorical3(self): @@ -718,11 +693,3 @@ def test_categorical_astype_to_int(self, any_int_or_nullable_int_dtype): {"col1": pd.array([2, 1, 3], dtype=any_int_or_nullable_int_dtype)} ) tm.assert_frame_equal(df, expected) - - def test_astype_categorical_to_string_missing(self): - # https://github.com/pandas-dev/pandas/issues/41797 - df = DataFrame(["a", "b", np.nan]) - expected = df.astype(str) - cat = df.astype("category") - result = cat.astype(str) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 3a1228ee5c4a5..fa91eb928e35c 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -346,7 +346,7 @@ def test_describe_percentiles_integer_idx(self): result = df.describe(percentiles=pct) expected = DataFrame( - {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]}, + {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, index=[ "count", "mean", diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 6fdf5d806ac6b..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -9,12 +9,7 @@ def test_error(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) - with pytest.raises( - ValueError, match="column must be a scalar, tuple, or list thereof" - ): - df.explode([list("AA")]) - - with pytest.raises(ValueError, match="column must be unique"): + with pytest.raises(ValueError, match="column must be a scalar"): df.explode(list("AA")) df.columns = list("AA") @@ -22,37 +17,6 @@ def test_error(): df.explode("A") -@pytest.mark.parametrize( - "input_subset, error_message", - [ - ( - list("AC"), - "columns must have matching element counts", - ), - ( - [], - "column must be nonempty", - ), - ( - list("AC"), - "columns must have matching element counts", - ), - ], -) -def test_error_multi_columns(input_subset, error_message): - # GH 39240 - df = pd.DataFrame( - { - "A": [[0, 1, 2], np.nan, [], (3, 4)], - "B": 1, - "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]], - }, - index=list("abcd"), - ) - with pytest.raises(ValueError, match=error_message): - df.explode(input_subset) - - def test_basic(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} @@ -216,58 +180,3 @@ def test_explode_sets(): result = df.explode(column="a").sort_values(by="a") expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "input_subset, expected_dict, expected_index", - [ - ( - list("AC"), - { - "A": pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], - index=list("aaabcdde"), - dtype=object, - ), - "B": 1, - "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan], - }, - list("aaabcdde"), - ), - ( - list("A"), - { - "A": pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], - index=list("aaabcdde"), - dtype=object, - ), - "B": 1, - "C": [ - ["a", "b", "c"], - ["a", "b", "c"], - ["a", "b", "c"], - "foo", - [], - ["d", "e"], - ["d", "e"], - np.nan, - ], - }, - list("aaabcdde"), - ), - ], -) -def test_multi_columns(input_subset, expected_dict, expected_index): - # GH 39240 - df = pd.DataFrame( - { - "A": [[0, 1, 2], np.nan, [], (3, 
4), np.nan], - "B": 1, - "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan], - }, - index=list("abcde"), - ) - result = df.explode(input_subset) - expected = pd.DataFrame(expected_dict, expected_index) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b1ce511fc3e4c..065d074eef6e8 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -173,7 +173,7 @@ def test_na_actions_categorical(self): tm.assert_frame_equal(res, df_exp_fill) msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): df.fillna(value={"cats": 4, "vals": "c"}) res = df.fillna(method="pad") diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 6c5831ad897d1..5ba4ab4408f11 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -246,11 +246,13 @@ def test_rank_methods_frame(self): expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_rank_descending(self, method, dtype): + if "i" in dtype: - df = self.df.dropna().astype(dtype) + df = self.df.dropna() else: df = self.df.astype(dtype) @@ -258,6 +260,9 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) + if method == "first" and dtype == "O": + return + expected = (df.max() - df).rank(method=method) if dtype != "O": @@ -282,6 +287,9 @@ def _check2d(df, expected, method="average", axis=0): result = df.rank(method=method, axis=axis) tm.assert_frame_equal(result, exp_df) + disabled = {(object, "first")} + if (dtype, method) in disabled: + return frame = df if dtype is None else df.astype(dtype) _check2d(frame, self.results[method], method=method, axis=axis) @@ -448,38 +456,6 @@ def test_rank_both_inf(self): result = df.rank() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "na_option,ascending,expected", - [ - ("top", True, [3.0, 1.0, 2.0]), - ("top", False, [2.0, 1.0, 3.0]), - ("bottom", True, [2.0, 3.0, 1.0]), - ("bottom", False, [1.0, 3.0, 2.0]), - ], - ) - def test_rank_inf_nans_na_option( - self, frame_or_series, method, na_option, ascending, expected - ): - obj = frame_or_series([np.inf, np.nan, -np.inf]) - result = obj.rank(method=method, na_option=na_option, ascending=ascending) - expected = frame_or_series(expected) - tm.assert_equal(result, expected) - - @pytest.mark.parametrize( - "na_option,ascending,expected", - [ - ("bottom", True, [1.0, 2.0, 4.0, 3.0]), - ("bottom", False, [1.0, 2.0, 4.0, 3.0]), - ("top", True, [2.0, 3.0, 1.0, 4.0]), - ("top", False, [2.0, 3.0, 1.0, 4.0]), - ], - ) - def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): - obj = frame_or_series(["foo", "foo", None, "foo"]) - result = obj.rank(method="first", na_option=na_option, ascending=ascending) - expected = frame_or_series(expected) - tm.assert_equal(result, expected) - @pytest.mark.parametrize( "data,expected", [ diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d0765084adfa9..84992982a104a 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ 
b/pandas/tests/frame/methods/test_reindex.py @@ -658,7 +658,7 @@ def test_reindex_dups(self): tm.assert_frame_equal(result, expected) # reindex fails - msg = "cannot reindex on an axis with duplicate labels" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df.reindex(index=list(range(len(df)))) @@ -668,7 +668,7 @@ def test_reindex_with_duplicate_columns(self): df = DataFrame( [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] ) - msg = "cannot reindex on an axis with duplicate labels" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df.reindex(columns=["bar"]) with pytest.raises(ValueError, match=msg): @@ -942,7 +942,7 @@ def test_reindex_with_categoricalindex(self): index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), ) # passed duplicate indexers are not allowed - msg = "cannot reindex on an axis with duplicate labels" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df2.reindex(["a", "b"]) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 366722531329a..55ef665c55241 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -1,9 +1,10 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p18 + from pandas import ( DataFrame, - Index, Series, ) import pandas._testing as tm @@ -69,8 +70,8 @@ def test_sample_lengths(self, obj): def test_sample_invalid_random_state(self, obj): # Check for error when random_state argument invalid. msg = ( - "random_state must be an integer, array-like, a BitGenerator, Generator, " - "a numpy RandomState, or None" + "random_state must be an integer, array-like, a BitGenerator, a numpy " + "RandomState, or None" ) with pytest.raises(ValueError, match=msg): obj.sample(random_state="a_string") @@ -82,15 +83,10 @@ def test_sample_wont_accept_n_and_frac(self, obj): obj.sample(n=3, frac=0.3) def test_sample_requires_positive_n_frac(self, obj): - with pytest.raises( - ValueError, - match="A negative number of rows requested. Please provide `n` >= 0", - ): + msg = "A negative number of rows requested. Please provide positive value." + with pytest.raises(ValueError, match=msg): obj.sample(n=-3) - with pytest.raises( - ValueError, - match="A negative number of rows requested. 
Please provide `frac` >= 0", - ): + with pytest.raises(ValueError, match=msg): obj.sample(frac=-0.3) def test_sample_requires_integer_n(self, obj): @@ -159,8 +155,16 @@ def test_sample_none_weights(self, obj): "func_str,arg", [ ("np.array", [2, 3, 1, 0]), - ("np.random.MT19937", 3), - ("np.random.PCG64", 11), + pytest.param( + "np.random.MT19937", + 3, + marks=pytest.mark.skipif(np_version_under1p18, reason="NumPy<1.18"), + ), + pytest.param( + "np.random.PCG64", + 11, + marks=pytest.mark.skipif(np_version_under1p18, reason="NumPy<1.18"), + ), ], ) def test_sample_random_state(self, func_str, arg, frame_or_series): @@ -172,22 +176,6 @@ def test_sample_random_state(self, func_str, arg, frame_or_series): expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) tm.assert_equal(result, expected) - def test_sample_generator(self, frame_or_series): - # GH#38100 - obj = frame_or_series(np.arange(100)) - rng = np.random.default_rng() - - # Consecutive calls should advance the seed - result1 = obj.sample(n=50, random_state=rng) - result2 = obj.sample(n=50, random_state=rng) - assert not (result1.index.values == result2.index.values).all() - - # Matching generator initialization must give same result - # Consecutive calls should advance the seed - result1 = obj.sample(n=50, random_state=np.random.default_rng(11)) - result2 = obj.sample(n=50, random_state=np.random.default_rng(11)) - tm.assert_equal(result1, result2) - def test_sample_upsampling_without_replacement(self, frame_or_series): # GH#27451 @@ -338,12 +326,3 @@ def test_sample_is_copy(self): with tm.assert_produces_warning(None): df2["d"] = 1 - - def test_sample_ignore_index(self): - # GH 38581 - df = DataFrame( - {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} - ) - result = df.sample(3, ignore_index=True) - expected_index = Index([0, 1, 2]) - tm.assert_index_equal(result.index, expected_index) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 2c96cf291c154..ba8fe25401e8c 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import is_numpy_dev + from pandas import ( CategoricalDtype, DataFrame, @@ -171,20 +173,28 @@ def test_to_records_with_categorical(self): ), ), # Pass in a type instance. 
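# Illustrative sketch of the DataFrame.sample behaviour exercised by the
# tests above: reproducible draws via an integer `random_state`, and a
# ValueError on a negative `n`. Long-standing public API only; no newer
# keywords such as `ignore_index` are assumed here.
import pandas as pd

df = pd.DataFrame({"col1": range(10, 20), "col2": range(20, 30)})

# The same integer seed yields the same sample on repeated calls.
assert df.sample(n=3, random_state=42).equals(df.sample(n=3, random_state=42))

# Negative row counts are rejected up front.
try:
    df.sample(n=-3)
except ValueError:
    pass  # "A negative number of rows requested ..."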
- ( + pytest.param( {"column_dtypes": str}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[("index", " return Index - for accessor in DatetimeArray._field_ops: + for accessor in DatetimeIndex._field_ops: if accessor in ["week", "weekofyear"]: # GH#33595 Deprecate week and weekofyear continue @@ -234,7 +233,7 @@ def test_datetimeindex_accessors(self): assert res.name == "name" # boolean accessors -> return array - for accessor in DatetimeArray._bool_ops: + for accessor in DatetimeIndex._bool_ops: res = getattr(dti, accessor) assert len(res) == 365 assert isinstance(res, np.ndarray) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index c5b47053471eb..882515799f943 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -16,58 +16,10 @@ date_range, ) import pandas._testing as tm +from pandas.core.indexing import IndexingError class TestSlicing: - def test_return_type_doesnt_depend_on_monotonicity(self): - # GH#24892 we get Series back regardless of whether our DTI is monotonic - dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3) - ser = Series(range(3), index=dti) - - # non-monotonic index - ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]]) - - # key with resolution strictly lower than "min" - key = "2015-5-14 00" - - # monotonic increasing index - result = ser.loc[key] - expected = ser.iloc[1:] - tm.assert_series_equal(result, expected) - - # monotonic decreasing index - result = ser.iloc[::-1].loc[key] - expected = ser.iloc[::-1][:-1] - tm.assert_series_equal(result, expected) - - # non-monotonic index - result2 = ser2.loc[key] - expected2 = ser2.iloc[::2] - tm.assert_series_equal(result2, expected2) - - def test_return_type_doesnt_depend_on_monotonicity_higher_reso(self): - # GH#24892 we get Series back regardless of whether our DTI is monotonic - dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3) - ser = Series(range(3), index=dti) - - # non-monotonic index - ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]]) - - # key with resolution strictly *higher) than "min" - key = "2015-5-14 00:00:00" - - # monotonic increasing index - result = ser.loc[key] - assert result == 1 - - # monotonic decreasing index - result = ser.iloc[::-1].loc[key] - assert result == 1 - - # non-monotonic index - result2 = ser2.loc[key] - assert result2 == 0 - def test_monotone_DTI_indexing_bug(self): # GH 19362 # Testing accessing the first element in a monotonic descending @@ -86,19 +38,9 @@ def test_monotone_DTI_indexing_bug(self): expected = DataFrame({0: list(range(5)), "date": date_index}) tm.assert_frame_equal(df, expected) - # We get a slice because df.index's resolution is hourly and we - # are slicing with a daily-resolution string. 
If both were daily, - # we would get a single item back - dti = date_range("20170101 01:00:00", periods=3) - df = DataFrame({"A": [1, 2, 3]}, index=dti[::-1]) - - expected = DataFrame({"A": 1}, index=dti[-1:][::-1]) - result = df.loc["2017-01-03"] - tm.assert_frame_equal(result, expected) - - result2 = df.iloc[::-1].loc["2017-01-03"] - expected2 = expected.iloc[::-1] - tm.assert_frame_equal(result2, expected2) + df = DataFrame({"A": [1, 2, 3]}, index=date_range("20170101", periods=3)[::-1]) + expected = DataFrame({"A": 1}, index=date_range("20170103", periods=1)[::-1]) + tm.assert_frame_equal(df.loc["2017-01-03"], expected) def test_slice_year(self): dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500) @@ -336,28 +278,28 @@ def test_partial_slicing_with_multiindex(self): result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] tm.assert_series_equal(result, expected) - # partial string indexing on first level, scalar indexing on the other two - result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")] - expected = df_multi.iloc[:1].droplevel([1, 2]) - tm.assert_frame_equal(result, expected) + # this is an IndexingError as we don't do partial string selection on + # multi-levels. + msg = "Too many indexers" + with pytest.raises(IndexingError, match=msg): + df_multi.loc[("2013-06-19", "ACCT1", "ABC")] - def test_partial_slicing_with_multiindex_series(self): # GH 4294 # partial slice on a series mi - ser = DataFrame( + s = DataFrame( np.random.rand(1000, 1000), index=date_range("2000-1-1", periods=1000) ).stack() - s2 = ser[:-1].copy() + s2 = s[:-1].copy() expected = s2["2000-1-4"] result = s2[Timestamp("2000-1-4")] tm.assert_series_equal(result, expected) - result = ser[Timestamp("2000-1-4")] - expected = ser["2000-1-4"] + result = s[Timestamp("2000-1-4")] + expected = s["2000-1-4"] tm.assert_series_equal(result, expected) - df2 = DataFrame(ser) + df2 = DataFrame(s) expected = df2.xs("2000-1-4") result = df2.loc[Timestamp("2000-1-4")] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 62663c8c6b810..513a47d6be7ab 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -391,23 +391,6 @@ def test_setops_preserve_freq(self, tz): assert result.freq == rng.freq assert result.tz == rng.tz - def test_intersection_non_tick_no_fastpath(self): - # GH#42104 - dti = DatetimeIndex( - [ - "2018-12-31", - "2019-03-31", - "2019-06-30", - "2019-09-30", - "2019-12-31", - "2020-03-31", - ], - freq="Q-DEC", - ) - result = dti[::2].intersection(dti[1::2]) - expected = dti[:0] - tm.assert_index_equal(result, expected) - class TestBusinessDatetimeIndex: def setup_method(self, method): diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index aa3359d775c5a..a5a921f42c3ef 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -275,26 +275,6 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) - def test_get_indexer_categorical_with_nans(self): - # GH#41934 nans in both index and in target - ii = IntervalIndex.from_breaks(range(5)) - ii2 = ii.append(IntervalIndex([np.nan])) - ci2 = CategoricalIndex(ii2) - - result = ii2.get_indexer(ci2) - expected = np.arange(5, dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - # 
not-all-matches - result = ii2[1:].get_indexer(ci2[::-1]) - expected = np.array([3, 2, 1, 0, -1], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - # non-unique target, non-unique nans - result = ii2.get_indexer(ci2.append(ci2)) - expected = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize( "tuples, closed", [ diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 1fd8b0f8b837a..c2b3647379234 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -74,6 +74,15 @@ def test_unique_level(idx, level): tm.assert_index_equal(result, expected) +def test_get_unique_index(idx): + mi = idx[[0, 1, 0, 1, 1, 0, 0]] + expected = mi._shallow_copy(mi[[0, 1]]) + + result = mi._get_unique_index() + assert result.unique + tm.assert_index_equal(result, expected) + + def test_duplicate_multiindex_codes(): # GH 17464 # Make sure that a MultiIndex with duplicate levels throws a ValueError diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index ec7ddf8b4d67a..9e1097ce5951f 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -457,15 +457,6 @@ def test_get_indexer_kwarg_validation(self): with pytest.raises(ValueError, match=msg): mi.get_indexer(mi[:-1], tolerance="piano") - def test_get_indexer_mismatched_nlevels(self): - mi = MultiIndex.from_product([range(3), ["A", "B"]]) - - other = MultiIndex.from_product([range(3), ["A", "B"], range(2)]) - - msg = "tuples of different lengths" - with pytest.raises(TypeError, match=msg): - mi.get_indexer(other, method="pad") - def test_getitem(idx): # scalar diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 695458273d16e..97eb34e28764b 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -1,11 +1,14 @@ import numpy as np import pytest +from pandas.compat import PYPY + from pandas import MultiIndex import pandas._testing as tm -def test_isin_nan(): +@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") +def test_isin_nan_pypy(): idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True])) tm.assert_numpy_array_equal( @@ -28,6 +31,15 @@ def test_isin(): assert result.dtype == np.bool_ +@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") +def test_isin_nan_not_pypy(): + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, False]) + ) + + def test_isin_level_kwarg(): idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 47efc43d5eae0..286522f6b946d 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -1,4 +1,3 @@ -import numpy as np import pytest from pandas import ( @@ -46,42 +45,6 @@ def test_partial_string_matching_single_index(df): tm.assert_frame_equal(result, expected) -def test_get_loc_partial_timestamp_multiindex(df): - mi = df.index - key = 
("2016-01-01", "a") - loc = mi.get_loc(key) - - expected = np.zeros(len(mi), dtype=bool) - expected[[0, 3]] = True - tm.assert_numpy_array_equal(loc, expected) - - key2 = ("2016-01-02", "a") - loc2 = mi.get_loc(key2) - expected2 = np.zeros(len(mi), dtype=bool) - expected2[[6, 9]] = True - tm.assert_numpy_array_equal(loc2, expected2) - - key3 = ("2016-01", "a") - loc3 = mi.get_loc(key3) - expected3 = np.zeros(len(mi), dtype=bool) - expected3[mi.get_level_values(1).get_loc("a")] = True - tm.assert_numpy_array_equal(loc3, expected3) - - key4 = ("2016", "a") - loc4 = mi.get_loc(key4) - expected4 = expected3 - tm.assert_numpy_array_equal(loc4, expected4) - - # non-monotonic - taker = np.arange(len(mi), dtype=np.intp) - taker[::2] = taker[::-2] - mi2 = mi.take(taker) - loc5 = mi2.get_loc(key) - expected5 = np.zeros(len(mi2), dtype=bool) - expected5[[3, 14]] = True - tm.assert_numpy_array_equal(loc5, expected5) - - def test_partial_string_timestamp_multiindex(df): # GH10331 df_swap = df.swaplevel(0, 1).sort_index() @@ -109,9 +72,7 @@ def test_partial_string_timestamp_multiindex(df): # partial string match on date and hour, from middle result = df.loc["2016-01-02 12"] - # hourly resolution, same as index.levels[0], so we are _not_ slicing on - # that level, so that level gets dropped - expected = df.iloc[9:12].droplevel(0) + expected = df.iloc[9:12] tm.assert_frame_equal(result, expected) # partial string match on secondary index @@ -120,14 +81,11 @@ def test_partial_string_timestamp_multiindex(df): tm.assert_frame_equal(result, expected) # tuple selector with partial string match on date - # "2016-01-01" has daily resolution, so _is_ a slice on the first level. result = df.loc[("2016-01-01", "a"), :] expected = df.iloc[[0, 3]] - expected = df.iloc[[0, 3]].droplevel(1) tm.assert_frame_equal(result, expected) - # Slicing date on first level should break (of course) bc the DTI is the - # second level on df_swap + # Slicing date on first level should break (of course) with pytest.raises(KeyError, match="'2016-01-01'"): df_swap.loc["2016-01-01"] diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 340b546125d8d..38ff6efec40c9 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -84,13 +84,6 @@ def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array(): assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64 assert idx.reindex([], level=1)[0].levels[1].dtype.type == np.object_ - # case with EA levels - cat = pd.Categorical(["foo", "bar"]) - dti = pd.date_range("2016-01-01", periods=2, tz="US/Pacific") - mi = MultiIndex.from_product([cat, dti]) - assert mi.reindex([], level=0)[0].levels[0].dtype == cat.dtype - assert mi.reindex([], level=1)[0].levels[1].dtype == dti.dtype - def test_reindex_base(idx): idx = idx @@ -133,31 +126,3 @@ def test_reindex_not_all_tuples(): tm.assert_index_equal(res, idx) expected = np.array([0, 1, 2, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - - -def test_reindex_limit_arg_with_multiindex(): - # GH21247 - - idx = MultiIndex.from_tuples([(3, "A"), (4, "A"), (4, "B")]) - - df = pd.Series([0.02, 0.01, 0.012], index=idx) - - new_idx = MultiIndex.from_tuples( - [ - (3, "A"), - (3, "B"), - (4, "A"), - (4, "B"), - (4, "C"), - (5, "B"), - (5, "C"), - (6, "B"), - (6, "C"), - ] - ) - - with pytest.raises( - ValueError, - match="limit argument only valid if doing pad, backfill or nearest reindexing", - ): - df.reindex(new_idx, 
fill_value=0, limit=1) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index f43e3104c64d7..eb456bee39dbf 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -216,10 +216,11 @@ def test_difference_sort_incomparable(): other = MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) # sort=None, the default - msg = "sort order is undefined for incomparable objects" - with tm.assert_produces_warning(RuntimeWarning, match=msg): + # MultiIndex.difference deviates here from other difference + # implementations in not catching the TypeError + msg = "'<' not supported between instances of 'Timestamp' and 'int'" + with pytest.raises(TypeError, match=msg): result = idx.difference(other) - tm.assert_index_equal(result, idx) # sort=False result = idx.difference(other, sort=False) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index e6b418868dbeb..5f2f8f75045bb 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -24,17 +24,12 @@ class TestGetLoc: @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc(self, method): index = Index([0, 1, 2]) - warn = None if method is None else FutureWarning - - with tm.assert_produces_warning(warn, match="deprecated"): - assert index.get_loc(1, method=method) == 1 + assert index.get_loc(1, method=method) == 1 if method: - with tm.assert_produces_warning(warn, match="deprecated"): - assert index.get_loc(1, method=method, tolerance=0) == 1 + assert index.get_loc(1, method=method, tolerance=0) == 1 @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) - @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning") def test_get_loc_raises_bad_label(self, method): index = Index([0, 1, 2]) if method: @@ -48,7 +43,6 @@ def test_get_loc_raises_bad_label(self, method): @pytest.mark.parametrize( "method,loc", [("pad", 1), ("backfill", 2), ("nearest", 1)] ) - @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning") def test_get_loc_tolerance(self, method, loc): index = Index([0, 1, 2]) assert index.get_loc(1.1, method) == loc @@ -58,14 +52,12 @@ def test_get_loc_tolerance(self, method, loc): def test_get_loc_outside_tolerance_raises(self, method): index = Index([0, 1, 2]) with pytest.raises(KeyError, match="1.1"): - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - index.get_loc(1.1, method, tolerance=0.05) + index.get_loc(1.1, method, tolerance=0.05) def test_get_loc_bad_tolerance_raises(self): index = Index([0, 1, 2]) with pytest.raises(ValueError, match="must be numeric"): - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - index.get_loc(1.1, "nearest", tolerance="invalid") + index.get_loc(1.1, "nearest", tolerance="invalid") def test_get_loc_tolerance_no_method_raises(self): index = Index([0, 1, 2]) @@ -75,10 +67,8 @@ def test_get_loc_tolerance_no_method_raises(self): def test_get_loc_raises_missized_tolerance(self): index = Index([0, 1, 2]) with pytest.raises(ValueError, match="tolerance size must match"): - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - index.get_loc(1.1, "nearest", tolerance=[1, 1]) + index.get_loc(1.1, "nearest", tolerance=[1, 1]) - @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning") def test_get_loc_float64(self): idx = Float64Index([0.0, 1.0, 2.0]) for method in [None, "pad", 
"backfill", "nearest"]: @@ -149,8 +139,7 @@ def test_get_loc_float_index_nan_with_method(self, vals, method): # GH#39382 idx = Index(vals) with pytest.raises(KeyError, match="nan"): - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - idx.get_loc(np.nan, method=method) + idx.get_loc(np.nan, method=method) class TestGetIndexer: @@ -387,19 +376,6 @@ def test_where(self, klass, index): result = index.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_uin64(self): - idx = UInt64Index([0, 6, 2]) - mask = np.array([False, True, False]) - other = np.array([1], dtype=np.int64) - - expected = UInt64Index([1, 6, 1]) - - result = idx.where(mask, other) - tm.assert_index_equal(result, expected) - - result = idx.putmask(~mask, other) - tm.assert_index_equal(result, expected) - class TestTake: @pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index]) diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 8cbca0ba8eb65..9572aeaf41c91 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -531,6 +531,7 @@ def test_constructor(self, dtype): res = Index([1, 2 ** 63 + 1], dtype=dtype) tm.assert_index_equal(res, idx) + @pytest.mark.xfail(reason="https://github.com/numpy/numpy/issues/19146") def test_constructor_does_not_cast_to_float(self): # https://github.com/numpy/numpy/issues/19146 values = [0, np.iinfo(np.uint64).max] diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index b26676a0d83cf..a683e9faed1f2 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -10,14 +10,12 @@ class TestGetLoc: def test_get_loc_raises_object_nearest(self): index = Index(["a", "c"]) with pytest.raises(TypeError, match="unsupported operand type"): - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - index.get_loc("a", method="nearest") + index.get_loc("a", method="nearest") def test_get_loc_raises_object_tolerance(self): index = Index(["a", "c"]) with pytest.raises(TypeError, match="unsupported operand type"): - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - index.get_loc("a", method="pad", tolerance="invalid") + index.get_loc("a", method="pad", tolerance="invalid") class TestGetIndexer: diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 3b7b738bec410..a41d02cfbd394 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -339,7 +339,6 @@ def test_get_loc_integer(self): # TODO: This method came from test_period; de-dup with version above @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) - @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning") def test_get_loc_method(self, method): idx = period_range("2000-01-01", periods=3) @@ -353,7 +352,6 @@ def test_get_loc_method(self, method): idx.get_loc(key, method=method) # TODO: This method came from test_period; de-dup with version above - @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning") def test_get_loc3(self): idx = period_range("2000-01-01", periods=5)[::2] @@ -515,13 +513,11 @@ def test_get_indexer_mismatched_dtype_with_method(self, non_comparable_idx, meth continue # Two different error message patterns depending on dtypes msg = "|".join( - [ - re.escape(msg) - for msg in ( 
- f"Cannot compare dtypes {pi.dtype} and {other.dtype}", - " not supported between instances of ", - ) - ] + re.escape(msg) + for msg in ( + f"Cannot compare dtypes {pi.dtype} and {other.dtype}", + " not supported between instances of ", + ) ) with pytest.raises(TypeError, match=msg): pi.get_indexer(other2, method=method) diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py index 27e998284c189..af243eeccc7a4 100644 --- a/pandas/tests/indexes/period/test_searchsorted.py +++ b/pandas/tests/indexes/period/test_searchsorted.py @@ -2,6 +2,7 @@ import pytest from pandas._libs.tslibs import IncompatibleFrequency +from pandas.compat import np_version_under1p18 from pandas import ( NaT, @@ -27,7 +28,13 @@ def test_searchsorted(self, freq): p2 = Period("2014-01-04", freq=freq) assert pidx.searchsorted(p2) == 3 - assert pidx.searchsorted(NaT) == 5 + if np_version_under1p18: + # GH#36254 + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + assert pidx.searchsorted(NaT) == 0 + else: + assert pidx.searchsorted(NaT) == 5 msg = "Input has different freq=H from PeriodArray" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index f7dcaa628228b..60fa8f1a0c083 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -66,13 +66,6 @@ def test_ravel_deprecation(index): index.ravel() -def test_is_type_compatible_deprecation(index): - # GH#42113 - msg = "is_type_compatible is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - index.is_type_compatible(index.inferred_type) - - class TestConversion: def test_to_series(self, index): # assert that we are creating a copy of the index diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 826649358e663..d7abaf0b5dfbe 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -992,7 +992,7 @@ def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) - def test_isin_nan_common_object(self, request, nulls_fixture, nulls_fixture2): + def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): # Test cartesian product of null fixtures and ensure that we don't # mangle the various types (save a corner case with PyPy) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 882e708a357c8..ec01e35673647 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -126,7 +126,7 @@ def test_copy_and_deepcopy(self, index_flat): new_copy = index.copy(deep=True, name="banana") assert new_copy.name == "banana" - def test_unique_level(self, index_flat): + def test_unique(self, index_flat): # don't test a MultiIndex here (as its tested separated) index = index_flat @@ -147,7 +147,7 @@ def test_unique_level(self, index_flat): with pytest.raises(KeyError, match=msg): index.unique(level="wrong") - def test_unique(self, index_flat): + def test_get_unique_index(self, index_flat): # MultiIndex tested separately index = index_flat if not len(index): @@ -164,7 +164,7 @@ def test_unique(self, index_flat): except NotImplementedError: pass - result = idx.unique() + result = idx._get_unique_index() tm.assert_index_equal(result, idx_unique) # nans: @@ -195,7 +195,7 @@ def test_unique(self, index_flat): expected = idx_unique_nan for 
i in [idx_nan, idx_unique_nan]: - result = i.unique() + result = i._get_unique_index() tm.assert_index_equal(result, expected) def test_searchsorted_monotonic(self, index_flat): diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 5f6d0155ae6cf..379c766b94d6c 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -26,7 +26,6 @@ IntervalIndex, MultiIndex, PeriodIndex, - RangeIndex, Series, TimedeltaIndex, UInt64Index, @@ -182,27 +181,6 @@ def test_get_value(self, index): tm.assert_almost_equal(result, values[67]) -class TestGetLoc: - def test_get_loc_non_hashable(self, index): - # MultiIndex and Index raise TypeError, others InvalidIndexError - - with pytest.raises((TypeError, InvalidIndexError), match="slice"): - index.get_loc(slice(0, 1)) - - def test_get_loc_generator(self, index): - - exc = KeyError - if isinstance( - index, - (DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, IntervalIndex), - ): - # TODO: make these more consistent? - exc = InvalidIndexError - with pytest.raises(exc, match="generator object"): - # MultiIndex specifically checks for generator; others for scalar - index.get_loc(x for x in range(5)) - - class TestGetIndexer: def test_get_indexer_base(self, index): diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 92adc0570dee1..f2ed96d0b65b8 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p18 + from pandas import ( DatetimeIndex, Float64Index, @@ -80,12 +82,22 @@ def test_numpy_ufuncs_other(index, func, request): isinstance(index, DatetimeIndex) and index.tz is not None and func in [np.isfinite, np.isnan, np.isinf] + and ( + not np_version_under1p18 + or (np_version_under1p18 and func is np.isfinite) + ) ): mark = pytest.mark.xfail(reason="__array_ufunc__ is not defined") request.node.add_marker(mark) - if func in (np.isfinite, np.isinf, np.isnan): - # numpy 1.18 changed isinf and isnan to not raise on dt64/tfd64 + if not np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: + # numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64 + result = func(index) + assert isinstance(result, np.ndarray) + + elif func is np.isfinite: + # ok under numpy >= 1.17 + # Results in bool array result = func(index) assert isinstance(result, np.ndarray) else: diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 20174beacf1d3..087ccbef7b778 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -178,7 +178,7 @@ def test_intersection_base(self, index): return # GH#10149 - cases = [second.to_numpy(), second.to_series(), second.to_list()] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.intersection(case) assert tm.equalContents(result, second) @@ -201,10 +201,15 @@ def test_union_base(self, index): return # GH#10149 - cases = [second.to_numpy(), second.to_series(), second.to_list()] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - result = first.union(case) - assert tm.equalContents(result, everything) + if not isinstance(index, CategoricalIndex): + result = first.union(case) + assert tm.equalContents(result, everything), ( + result, + everything, + type(case), + ) if isinstance(index, MultiIndex): msg 
= "other must be a MultiIndex or a list of tuples" @@ -222,10 +227,16 @@ def test_difference_base(self, sort, index): assert tm.equalContents(result, answer) # GH#10149 - cases = [second.to_numpy(), second.to_series(), second.to_list()] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - result = first.difference(case, sort) - assert tm.equalContents(result, answer) + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + assert type(result) == type(answer) + tm.assert_numpy_array_equal( + result.sort_values().asi8, answer.sort_values().asi8 + ) + else: + result = first.difference(case, sort) + assert tm.equalContents(result, answer) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -249,9 +260,16 @@ def test_symmetric_difference(self, index): assert tm.equalContents(result, answer) # GH#10149 - cases = [second.to_numpy(), second.to_series(), second.to_list()] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.symmetric_difference(case) + + if is_datetime64tz_dtype(first): + # second.values casts to tznaive + expected = first.union(case) + tm.assert_index_equal(result, expected) + continue + assert tm.equalContents(result, answer) if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 669bbe23af559..5f0101eb4478c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -82,7 +82,6 @@ def test_timestamp_invalid_key(self, key): class TestGetLoc: - @pytest.mark.filterwarnings("ignore:Passing method:FutureWarning") def test_get_loc(self): idx = to_timedelta(["0 days", "1 days", "2 days"]) @@ -292,52 +291,3 @@ def test_take_fill_value(self): msg = "index -5 is out of bounds for (axis 0 with )?size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) - - -class TestMaybeCastSliceBound: - @pytest.fixture(params=["increasing", "decreasing", None]) - def monotonic(self, request): - return request.param - - @pytest.fixture - def tdi(self, monotonic): - tdi = timedelta_range("1 Day", periods=10) - if monotonic == "decreasing": - tdi = tdi[::-1] - elif monotonic is None: - taker = np.arange(10, dtype=np.intp) - np.random.shuffle(taker) - tdi = tdi.take(taker) - return tdi - - def test_maybe_cast_slice_bound_invalid_str(self, tdi): - # test the low-level _maybe_cast_slice_bound and that we get the - # expected exception+message all the way up the stack - msg = ( - "cannot do slice indexing on TimedeltaIndex with these " - r"indexers \[foo\] of type str" - ) - with pytest.raises(TypeError, match=msg): - tdi._maybe_cast_slice_bound("foo", side="left") - with pytest.raises(TypeError, match=msg): - tdi.get_slice_bound("foo", side="left") - with pytest.raises(TypeError, match=msg): - tdi.slice_locs("foo", None, None) - - def test_slice_invalid_str_with_timedeltaindex( - self, tdi, frame_or_series, indexer_sl - ): - obj = frame_or_series(range(10), index=tdi) - - msg = ( - "cannot do slice indexing on TimedeltaIndex with these " - r"indexers \[foo\] of type str" - ) - with pytest.raises(TypeError, match=msg): - indexer_sl(obj)["foo":] - with pytest.raises(TypeError, match=msg): - indexer_sl(obj)["foo":-1] - with pytest.raises(TypeError, match=msg): - indexer_sl(obj)[:"foo"] - with pytest.raises(TypeError, match=msg): - indexer_sl(obj)[tdi[0] : "foo"] diff --git 
a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 3790a6e9a5319..f1fbe0c5a6b9c 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -28,11 +28,9 @@ def test_series_getitem_multiindex(access_method, level1_value, expected): # GH 6018 # series regression getitem with a multi-index - mi = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)], names=["A", "B"]) - ser = Series([1, 2, 3], index=mi) - expected.index.name = "A" - - result = access_method(ser, level1_value) + s = Series([1, 2, 3]) + s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)]) + result = access_method(s, level1_value) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index e8c766d489813..a38b5f6cc449a 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -1,7 +1,3 @@ -from typing import ( - Any, - List, -) import warnings import numpy as np @@ -18,7 +14,7 @@ n = 1000 cols = ["jim", "joe", "jolie", "joline", "jolia"] -vals: List[Any] = [ +vals = [ np.random.randint(0, 10, n), np.random.choice(list("abcdefghij"), n), np.random.choice(pd.date_range("20141009", periods=10).tolist(), n), @@ -28,7 +24,7 @@ vals = list(map(tuple, zip(*vals))) # bunch of keys for testing -keys: List[Any] = [ +keys = [ np.random.randint(0, 11, m), np.random.choice(list("abcdefghijk"), m), np.random.choice(pd.date_range("20141009", periods=11).tolist(), m), diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 104fa2da7a67e..afcff6db5e3dd 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -398,21 +398,14 @@ def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, pos): idx = MultiIndex.from_product( [["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"] ) - ser = Series(np.arange(9, dtype="int64"), index=idx).sort_index() - expected = ser.iloc[pos] + s = Series(np.arange(9, dtype="int64"), index=idx).sort_index() + expected = s.iloc[pos] if expected.size == 0 and indexer != []: with pytest.raises(KeyError, match=str(indexer)): - ser.loc[indexer] + s.loc[indexer] else: - warn = None - msg = "MultiIndex with a nested sequence" - if indexer == (slice(None), ["foo", "bah"]): - # "bah" is not in idx.levels[1], so is ignored, will raise KeyError - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=msg): - result = ser.loc[indexer] + result = s.loc[indexer] tm.assert_series_equal(result, expected) @@ -554,17 +547,15 @@ def test_loc_period_string_indexing(): ), ) result = df.loc[("2013Q1", 1111), "OMS"] - - alt = df.loc[(a[0], 1111), "OMS"] - assert np.isnan(alt) - - # Because the resolution of the string matches, it is an exact lookup, - # not a slice - assert np.isnan(result) - - # TODO: should it figure this out? 
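# Illustrative sketch of the MultiIndex .loc behaviour touched on around
# here: a scalar key selects into one level and that level is dropped from
# the result. Plain public API; the index values below are made up.
import pandas as pd

mi = pd.MultiIndex.from_product([["x", "y"], [1, 2]], names=["letter", "num"])
ser = pd.Series(range(4), index=mi)

# Scalar selection on the first level drops "letter" from the result index.
sub = ser.loc["x"]
assert list(sub.index) == [1, 2]
assert sub.index.name == "num"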
- # alt = df.loc["2013Q1", 1111, "OMS"] - # assert np.isnan(alt) + expected = Series( + [np.nan], + dtype=object, + name="OMS", + index=MultiIndex.from_tuples( + [(pd.Period("2013Q1"), 1111)], names=["Period", "CVR"] + ), + ) + tm.assert_series_equal(result, expected) def test_loc_datetime_mask_slicing(): @@ -745,19 +736,6 @@ def test_get_loc_datetime_index(): assert mi.get_loc("2001-01") == slice(0, 31, None) assert index.get_loc("2001-01") == slice(0, 31, None) - loc = mi[::2].get_loc("2001-01") - expected = index[::2].get_loc("2001-01") - assert loc == expected - - loc = mi.repeat(2).get_loc("2001-01") - expected = index.repeat(2).get_loc("2001-01") - assert loc == expected - - loc = mi.append(mi).get_loc("2001-01") - expected = index.append(index).get_loc("2001-01") - # TODO: standardize return type for MultiIndex.get_loc - tm.assert_numpy_array_equal(loc.nonzero()[0], expected) - def test_loc_setitem_indexer_differently_ordered(): # GH#34603 @@ -809,10 +787,10 @@ def test_loc_getitem_index_differently_ordered_slice_none_duplicates(indexer): def test_loc_getitem_drops_levels_for_one_row_dataframe(): - # GH#10521 "x" and "z" are both scalar indexing, so those levels are dropped + # GH#10521 mi = MultiIndex.from_arrays([["x"], ["y"], ["z"]], names=["a", "b", "c"]) df = DataFrame({"d": [0]}, index=mi) - expected = df.droplevel([0, 2]) + expected = df.copy() result = df.loc["x", :, "z"] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 50a31f2fd22c6..a99f09143e282 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -158,8 +158,8 @@ def test_getitem_intkey_leading_level( assert isinstance(mi.levels[0], Float64Index) assert 14 not in mi.levels[0] - assert not mi.levels[0]._should_fallback_to_positional - assert not mi._should_fallback_to_positional + assert not mi.levels[0]._should_fallback_to_positional() + assert not mi._should_fallback_to_positional() with pytest.raises(KeyError, match="14"): ser[14] diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 23d2bee612243..77cfb94bf4629 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -8,7 +8,6 @@ from pandas import ( CategoricalDtype, - CategoricalIndex, DataFrame, Series, Timestamp, @@ -142,16 +141,3 @@ def test_at_getitem_mixed_index_no_fallback(self): ser.at[0] with pytest.raises(KeyError, match="^4$"): ser.at[4] - - def test_at_categorical_integers(self): - # CategoricalIndex with integer categories that don't happen to match - # the Categorical's codes - ci = CategoricalIndex([3, 4]) - - arr = np.arange(4).reshape(2, 2) - frame = DataFrame(arr, index=ci) - - for df in [frame, frame.T]: - for key in [0, 1]: - with pytest.raises(KeyError, match=str(key)): - df.at[key, key] diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9908f79208088..cd49620f45fae 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -485,9 +485,9 @@ def test_loc_and_at_with_categorical_index(self): [1.5, 2.5, 3.5], [-1.5, -2.5, -3.5], # numpy int/uint - *(np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_DTYPES), + *[np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_DTYPES], # numpy floats - *(np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_DTYPES), + *[np.array([1.5, 2.5, 3.5], dtype=dtyp) for 
dtyp in tm.FLOAT_DTYPES], # numpy object np.array([1, "b", 3.5], dtype=object), # pandas scalars @@ -495,7 +495,7 @@ def test_loc_and_at_with_categorical_index(self): [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], # pandas Integer arrays - *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES), + *[pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES], # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, @@ -540,16 +540,3 @@ def test_loc_getitem_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - - def test_getitem_categorical_with_nan(self): - # GH#41933 - ci = CategoricalIndex(["A", "B", np.nan]) - - ser = Series(range(3), index=ci) - - assert ser[np.nan] == 2 - assert ser.loc[np.nan] == 2 - - df = DataFrame(ser) - assert df.loc[np.nan, 0] == 2 - assert df.loc[np.nan][0] == 2 diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 761e67bedbf8c..7911cd7f12e0c 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -273,12 +273,7 @@ def _assert_setitem_index_conversion( ): """test index's coercion triggered by assign key""" temp = original_series.copy() - warn = None - if isinstance(loc_key, int) and temp.index.dtype == np.float64: - # GH#33469 - warn = FutureWarning - with tm.assert_produces_warning(warn): - temp[loc_key] = 5 + temp[loc_key] = 5 exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) tm.assert_series_equal(temp, exp) # check dtype explicitly for sure @@ -329,10 +324,7 @@ def test_setitem_index_float64(self, val, exp_dtype, request): temp = obj.copy() msg = "index 5 is out of bounds for axis 0 with size 4" with pytest.raises(exp_dtype, match=msg): - # GH#33469 - depr_msg = "Treating integers as positional" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - temp[5] = 5 + temp[5] = 5 mark = pytest.mark.xfail(reason="TODO_GH12747 The result must be float") request.node.add_marker(mark) exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, val]) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b04a2c86a79d7..fc07c14f1e179 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -596,7 +596,7 @@ def test_iloc_getitem_labelled_frame(self): assert result == exp # out-of-bounds exception - msg = "index 5 is out of bounds for axis 0 with size 4" + msg = "single positional indexer is out-of-bounds" with pytest.raises(IndexError, match=msg): df.iloc[10, 5] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 7243f2cddfec6..c945bd6b95ee1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -113,6 +113,15 @@ def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli): if indexer_sli is tm.iloc: err = ValueError msg = f"Cannot set values with ndim > {obj.ndim}" + elif ( + isinstance(index, pd.IntervalIndex) + and indexer_sli is tm.setitem + and obj.ndim == 1 + ): + err = AttributeError + msg = ( + "'pandas._libs.interval.IntervalTree' object has no attribute 'get_loc'" + ) else: err = ValueError msg = "|".join( @@ -518,7 +527,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, 
match="'2011'"): df["2011"] - with pytest.raises(KeyError, match="^0$"): + with pytest.raises(KeyError, match="'2011'"): df.loc["2011", 0] def test_astype_assignment(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6de83e34122c2..a8a2055ffb093 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1663,30 +1663,6 @@ def test_loc_multiindex_levels_contain_values_not_in_index_anymore(self, lt_valu with pytest.raises(KeyError, match=r"\['b'\] not in index"): df.loc[df["a"] < lt_value, :].loc[["b"], :] - def test_loc_multiindex_null_slice_na_level(self): - # GH#42055 - lev1 = np.array([np.nan, np.nan]) - lev2 = ["bar", "baz"] - mi = MultiIndex.from_arrays([lev1, lev2]) - ser = Series([0, 1], index=mi) - result = ser.loc[:, "bar"] - - # TODO: should we have name="bar"? - expected = Series([0], index=[np.nan]) - tm.assert_series_equal(result, expected) - - def test_loc_drops_level(self): - # Based on test_series_varied_multiindex_alignment, where - # this used to fail to drop the first level - mi = MultiIndex.from_product( - [list("ab"), list("xy"), [1, 2]], names=["ab", "xy", "num"] - ) - ser = Series(range(8), index=mi) - - loc_result = ser.loc["a", :, :] - expected = ser.index.droplevel(0)[:4] - tm.assert_index_equal(loc_result.index, expected) - class TestLocSetitemWithExpansion: @pytest.mark.slow @@ -1854,23 +1830,6 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index, request): ) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "dtype", ["Int32", "Int64", "UInt32", "UInt64", "Float32", "Float64"] - ) - def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype): - # GH#42099 - ser = Series([0, 1, 2, 3], dtype=dtype) - df = DataFrame({"data": ser}) - - result = DataFrame(index=df.index) - result.loc[df.index, "data"] = ser - - tm.assert_frame_equal(result, df) - - result = DataFrame(index=df.index) - result.loc[df.index, "data"] = ser._values - tm.assert_frame_equal(result, df) - class TestLocCallable: def test_frame_loc_getitem_callable(self): diff --git a/pandas/tests/io/data/excel/chartsheet.xls b/pandas/tests/io/data/excel/chartsheet.xls deleted file mode 100644 index 7d027400fbd52..0000000000000 Binary files a/pandas/tests/io/data/excel/chartsheet.xls and /dev/null differ diff --git a/pandas/tests/io/data/excel/chartsheet.xlsb b/pandas/tests/io/data/excel/chartsheet.xlsb deleted file mode 100644 index 805087280f851..0000000000000 Binary files a/pandas/tests/io/data/excel/chartsheet.xlsb and /dev/null differ diff --git a/pandas/tests/io/data/excel/chartsheet.xlsm b/pandas/tests/io/data/excel/chartsheet.xlsm deleted file mode 100644 index aadb48d6f4824..0000000000000 Binary files a/pandas/tests/io/data/excel/chartsheet.xlsm and /dev/null differ diff --git a/pandas/tests/io/data/excel/chartsheet.xlsx b/pandas/tests/io/data/excel/chartsheet.xlsx deleted file mode 100644 index c8d5e7afb3d07..0000000000000 Binary files a/pandas/tests/io/data/excel/chartsheet.xlsx and /dev/null differ diff --git a/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl b/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl deleted file mode 100644 index 255a745dd9021..0000000000000 Binary files a/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl and /dev/null differ diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 4bf6051fd36ef..b50c641ebf0c0 100644 --- 
a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -1,5 +1,3 @@ -import re - import pytest import pandas._testing as tm @@ -17,25 +15,3 @@ def test_write_append_mode_raises(ext): with tm.ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): ExcelWriter(f, engine="odf", mode="a") - - -@pytest.mark.parametrize("nan_inf_to_errors", [True, False]) -def test_kwargs(ext, nan_inf_to_errors): - # GH 42286 - # odswriter doesn't utilize kwargs, nothing to check except that it works - kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}} - with tm.ensure_clean(ext) as f: - msg = re.escape("Use of **kwargs is deprecated") - with tm.assert_produces_warning(FutureWarning, match=msg): - with ExcelWriter(f, engine="odf", **kwargs) as _: - pass - - -@pytest.mark.parametrize("nan_inf_to_errors", [True, False]) -def test_engine_kwargs(ext, nan_inf_to_errors): - # GH 42286 - # odswriter doesn't utilize engine_kwargs, nothing to check except that it works - engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}} - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) as _: - pass diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index cd773957c9043..62f567457c3ab 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -85,30 +85,6 @@ def test_write_cells_merge_styled(ext): assert xcell_a2.font == openpyxl_sty_merged -@pytest.mark.parametrize("write_only", [True, False]) -def test_kwargs(ext, write_only): - # GH 42286 - # openpyxl doesn't utilize kwargs, only test that supplying a kwarg works - kwargs = {"write_only": write_only} - with tm.ensure_clean(ext) as f: - msg = re.escape("Use of **kwargs is deprecated") - with tm.assert_produces_warning(FutureWarning, match=msg): - with ExcelWriter(f, engine="openpyxl", **kwargs) as writer: - # ExcelWriter won't allow us to close without writing something - DataFrame().to_excel(writer) - - -@pytest.mark.parametrize("write_only", [True, False]) -def test_engine_kwargs(ext, write_only): - # GH 42286 - # openpyxl doesn't utilize kwargs, only test that supplying a engine_kwarg works - engine_kwargs = {"write_only": write_only} - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer: - # ExcelWriter won't allow us to close without writing something - DataFrame().to_excel(writer) - - @pytest.mark.parametrize( "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] ) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index cbd241ceda0b1..d40fb3ce4a135 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1250,34 +1250,6 @@ def test_trailing_blanks(self, read_ext): result = pd.read_excel(file_name) assert result.shape == (3, 3) - def test_ignore_chartsheets_by_str(self, request, read_ext): - # GH 41448 - if pd.read_excel.keywords["engine"] == "odf": - pytest.skip("chartsheets do not exist in the ODF format") - if pd.read_excel.keywords["engine"] == "pyxlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" - ) - ) - with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"): - pd.read_excel("chartsheet" + read_ext, sheet_name="Chart1") - - def test_ignore_chartsheets_by_int(self, request, read_ext): - # GH 41448 - if 
pd.read_excel.keywords["engine"] == "odf": - pytest.skip("chartsheets do not exist in the ODF format") - if pd.read_excel.keywords["engine"] == "pyxlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" - ) - ) - with pytest.raises( - ValueError, match="Worksheet index 1 is invalid, 1 worksheets found" - ): - pd.read_excel("chartsheet" + read_ext, sheet_name=1) - class TestExcelFileRead: @pytest.fixture(autouse=True) @@ -1529,19 +1501,6 @@ def test_engine_invalid_option(self, read_ext): with pd.option_context(f"io.excel{read_ext}.reader", "abc"): pass - def test_ignore_chartsheets(self, request, engine, read_ext): - # GH 41448 - if engine == "odf": - pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" - ) - ) - with pd.ExcelFile("chartsheet" + read_ext) as excel: - assert excel.sheet_names == ["Sheet1"] - def test_corrupt_files_closed(self, request, engine, read_ext): # GH41778 errors = (BadZipFile,) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 508e767a47004..77837bea3e48a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1399,6 +1399,25 @@ def check_called(func): with tm.ensure_clean("something.xls") as filepath: check_called(lambda: df.to_excel(filepath, engine="dummy")) + @pytest.mark.parametrize( + "ext", + [ + pytest.param(".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param(".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param(".ods", marks=td.skip_if_no("odf")), + ], + ) + def test_kwargs_deprecated(self, ext): + # GH 40430 + msg = re.escape("Use of **kwargs is deprecated") + with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.ensure_clean(ext) as path: + try: + with ExcelWriter(path, kwarg=1): + pass + except TypeError: + pass + @pytest.mark.parametrize( "ext", [ diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 79d2f55a9b8ff..6de378f6a3d3e 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -1,4 +1,3 @@ -import re import warnings import pytest @@ -62,23 +61,3 @@ def test_write_append_mode_raises(ext): with tm.ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): ExcelWriter(f, engine="xlsxwriter", mode="a") - - -@pytest.mark.parametrize("nan_inf_to_errors", [True, False]) -def test_kwargs(ext, nan_inf_to_errors): - # GH 42286 - kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}} - with tm.ensure_clean(ext) as f: - msg = re.escape("Use of **kwargs is deprecated") - with tm.assert_produces_warning(FutureWarning, match=msg): - with ExcelWriter(f, engine="xlsxwriter", **kwargs) as writer: - assert writer.book.nan_inf_to_errors == nan_inf_to_errors - - -@pytest.mark.parametrize("nan_inf_to_errors", [True, False]) -def test_engine_kwargs(ext, nan_inf_to_errors): - # GH 42286 - engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}} - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="xlsxwriter", engine_kwargs=engine_kwargs) as writer: - assert writer.book.nan_inf_to_errors == nan_inf_to_errors diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index c58b9763f9618..7e1787d8c55d4 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -1,5 
+1,3 @@ -import re - import numpy as np import pytest @@ -99,27 +97,3 @@ def test_option_xls_writer_deprecated(ext): check_stacklevel=False, ): options.io.excel.xls.writer = "xlwt" - - -@pytest.mark.parametrize("write_only", [True, False]) -def test_kwargs(ext, write_only): - # GH 42286 - # xlwt doesn't utilize kwargs, only test that supplying a kwarg works - kwargs = {"write_only": write_only} - with tm.ensure_clean(ext) as f: - msg = re.escape("Use of **kwargs is deprecated") - with tm.assert_produces_warning(FutureWarning, match=msg): - with ExcelWriter(f, engine="openpyxl", **kwargs) as writer: - # xlwt won't allow us to close without writing something - DataFrame().to_excel(writer) - - -@pytest.mark.parametrize("write_only", [True, False]) -def test_engine_kwargs(ext, write_only): - # GH 42286 - # xlwt doesn't utilize kwargs, only test that supplying a engine_kwarg works - engine_kwargs = {"write_only": write_only} - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer: - # xlwt won't allow us to close without writing something - DataFrame().to_excel(writer) diff --git a/pandas/tests/io/formats/style/test_align.py b/pandas/tests/io/formats/style/test_align.py new file mode 100644 index 0000000000000..f81c1fbd6d85e --- /dev/null +++ b/pandas/tests/io/formats/style/test_align.py @@ -0,0 +1,406 @@ +import pytest + +from pandas import DataFrame + +pytest.importorskip("jinja2") + + +def bar_grad(a=None, b=None, c=None, d=None): + """Used in multiple tests to simplify formatting of expected result""" + ret = [("width", "10em"), ("height", "80%")] + if all(x is None for x in [a, b, c, d]): + return ret + return ret + [ + ( + "background", + f"linear-gradient(90deg,{','.join(x for x in [a, b, c, d] if x)})", + ) + ] + + +class TestStylerBarAlign: + def test_bar_align_left(self): + df = DataFrame({"A": [0, 1, 2]}) + result = df.style.bar()._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (2, 0): bar_grad("#d65f5f 100.0%", " transparent 100.0%"), + } + assert result == expected + + result = df.style.bar(color="red", width=50)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad("red 25.0%", " transparent 25.0%"), + (2, 0): bar_grad("red 50.0%", " transparent 50.0%"), + } + assert result == expected + + df["C"] = ["a"] * len(df) + result = df.style.bar(color="red", width=50)._compute().ctx + assert result == expected + df["C"] = df["C"].astype("category") + result = df.style.bar(color="red", width=50)._compute().ctx + assert result == expected + + def test_bar_align_left_0points(self): + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.style.bar()._compute().ctx + expected = { + (0, 0): bar_grad(), + (0, 1): bar_grad(), + (0, 2): bar_grad(), + (1, 0): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (1, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (1, 2): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (2, 0): bar_grad("#d65f5f 100.0%", " transparent 100.0%"), + (2, 1): bar_grad("#d65f5f 100.0%", " transparent 100.0%"), + (2, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"), + } + assert result == expected + + result = df.style.bar(axis=1)._compute().ctx + expected = { + (0, 0): bar_grad(), + (0, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (0, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"), + (1, 0): bar_grad(), + (1, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (1, 2): bar_grad("#d65f5f 
100.0%", " transparent 100.0%"), + (2, 0): bar_grad(), + (2, 1): bar_grad("#d65f5f 50.0%", " transparent 50.0%"), + (2, 2): bar_grad("#d65f5f 100.0%", " transparent 100.0%"), + } + assert result == expected + + def test_bar_align_mid_pos_and_neg(self): + df = DataFrame({"A": [-10, 0, 20, 90]}) + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + expected = { + (0, 0): bar_grad( + "#d65f5f 10.0%", + " transparent 10.0%", + ), + (1, 0): bar_grad(), + (2, 0): bar_grad( + " transparent 10.0%", + " #5fba7d 10.0%", + " #5fba7d 30.0%", + " transparent 30.0%", + ), + (3, 0): bar_grad( + " transparent 10.0%", + " #5fba7d 10.0%", + " #5fba7d 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_mid_all_pos(self): + df = DataFrame({"A": [10, 20, 50, 100]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): bar_grad( + "#5fba7d 10.0%", + " transparent 10.0%", + ), + (1, 0): bar_grad( + "#5fba7d 20.0%", + " transparent 20.0%", + ), + (2, 0): bar_grad( + "#5fba7d 50.0%", + " transparent 50.0%", + ), + (3, 0): bar_grad( + "#5fba7d 100.0%", + " transparent 100.0%", + ), + } + + assert result == expected + + def test_bar_align_mid_all_neg(self): + df = DataFrame({"A": [-100, -60, -30, -20]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): bar_grad( + "#d65f5f 100.0%", + " transparent 100.0%", + ), + (1, 0): bar_grad( + " transparent 40.0%", + " #d65f5f 40.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + (2, 0): bar_grad( + " transparent 70.0%", + " #d65f5f 70.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + (3, 0): bar_grad( + " transparent 80.0%", + " #d65f5f 80.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_zero_pos_and_neg(self): + # See https://github.com/pandas-dev/pandas/pull/14757 + df = DataFrame({"A": [-10, 0, 20, 90]}) + + result = ( + df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"], width=90) + ._compute() + .ctx + ) + expected = { + (0, 0): bar_grad( + " transparent 40.0%", + " #d65f5f 40.0%", + " #d65f5f 45.0%", + " transparent 45.0%", + ), + (1, 0): bar_grad(), + (2, 0): bar_grad( + " transparent 45.0%", + " #5fba7d 45.0%", + " #5fba7d 55.0%", + " transparent 55.0%", + ), + (3, 0): bar_grad( + " transparent 45.0%", + " #5fba7d 45.0%", + " #5fba7d 90.0%", + " transparent 90.0%", + ), + } + assert result == expected + + def test_bar_align_left_axis_none(self): + df = DataFrame({"A": [0, 1], "B": [2, 4]}) + result = df.style.bar(axis=None)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + "#d65f5f 25.0%", + " transparent 25.0%", + ), + (0, 1): bar_grad( + "#d65f5f 50.0%", + " transparent 50.0%", + ), + (1, 1): bar_grad( + "#d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_zero_axis_none(self): + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + " transparent 50.0%", + " #d65f5f 50.0%", + " #d65f5f 62.5%", + " transparent 62.5%", + ), + (0, 1): bar_grad( + " transparent 25.0%", + " #d65f5f 25.0%", + " #d65f5f 50.0%", + " transparent 50.0%", + ), + (1, 1): bar_grad( + " transparent 50.0%", + " #d65f5f 50.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def 
test_bar_align_mid_axis_none(self): + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + " transparent 33.3%", + " #d65f5f 33.3%", + " #d65f5f 50.0%", + " transparent 50.0%", + ), + (0, 1): bar_grad( + "#d65f5f 33.3%", + " transparent 33.3%", + ), + (1, 1): bar_grad( + " transparent 33.3%", + " #d65f5f 33.3%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_mid_vmin(self): + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-6)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + " transparent 60.0%", + " #d65f5f 60.0%", + " #d65f5f 70.0%", + " transparent 70.0%", + ), + (0, 1): bar_grad( + " transparent 40.0%", + " #d65f5f 40.0%", + " #d65f5f 60.0%", + " transparent 60.0%", + ), + (1, 1): bar_grad( + " transparent 60.0%", + " #d65f5f 60.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_mid_vmax(self): + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmax=8)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + " transparent 20.0%", + " #d65f5f 20.0%", + " #d65f5f 30.0%", + " transparent 30.0%", + ), + (0, 1): bar_grad( + "#d65f5f 20.0%", + " transparent 20.0%", + ), + (1, 1): bar_grad( + " transparent 20.0%", + " #d65f5f 20.0%", + " #d65f5f 60.0%", + " transparent 60.0%", + ), + } + assert result == expected + + def test_bar_align_mid_vmin_vmax_wide(self): + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-3, vmax=7)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + " transparent 30.0%", + " #d65f5f 30.0%", + " #d65f5f 40.0%", + " transparent 40.0%", + ), + (0, 1): bar_grad( + " transparent 10.0%", + " #d65f5f 10.0%", + " #d65f5f 30.0%", + " transparent 30.0%", + ), + (1, 1): bar_grad( + " transparent 30.0%", + " #d65f5f 30.0%", + " #d65f5f 70.0%", + " transparent 70.0%", + ), + } + assert result == expected + + def test_bar_align_mid_vmin_vmax_clipping(self): + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-1, vmax=3)._compute().ctx + expected = { + (0, 0): bar_grad(), + (1, 0): bar_grad( + " transparent 25.0%", + " #d65f5f 25.0%", + " #d65f5f 50.0%", + " transparent 50.0%", + ), + (0, 1): bar_grad("#d65f5f 25.0%", " transparent 25.0%"), + (1, 1): bar_grad( + " transparent 25.0%", + " #d65f5f 25.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_mid_nans(self): + df = DataFrame({"A": [1, None], "B": [-1, 3]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx + expected = { + (0, 0): bar_grad( + " transparent 25.0%", + " #d65f5f 25.0%", + " #d65f5f 50.0%", + " transparent 50.0%", + ), + (0, 1): bar_grad("#d65f5f 25.0%", " transparent 25.0%"), + (1, 1): bar_grad( + " transparent 25.0%", + " #d65f5f 25.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_align_zero_nans(self): + df = DataFrame({"A": [1, None], "B": [-1, 2]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx + expected = { + (0, 0): bar_grad( + " transparent 50.0%", + " #d65f5f 50.0%", + " #d65f5f 75.0%", + " transparent 75.0%", + ), + (0, 1): bar_grad( + " transparent 25.0%", + " #d65f5f 25.0%", + " 
#d65f5f 50.0%", + " transparent 50.0%", + ), + (1, 1): bar_grad( + " transparent 50.0%", + " #d65f5f 50.0%", + " #d65f5f 100.0%", + " transparent 100.0%", + ), + } + assert result == expected + + def test_bar_bad_align_raises(self): + df = DataFrame({"A": [-100, -60, -30, -20]}) + msg = "`align` must be one of {'left', 'zero',' mid'}" + with pytest.raises(ValueError, match=msg): + df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py deleted file mode 100644 index 19884aaac86a7..0000000000000 --- a/pandas/tests/io/formats/style/test_bar.py +++ /dev/null @@ -1,307 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame - -pytest.importorskip("jinja2") - - -def bar_grad(a=None, b=None, c=None, d=None): - """Used in multiple tests to simplify formatting of expected result""" - ret = [("width", "10em")] - if all(x is None for x in [a, b, c, d]): - return ret - return ret + [ - ( - "background", - f"linear-gradient(90deg,{','.join([x for x in [a, b, c, d] if x])})", - ) - ] - - -def no_bar(): - return bar_grad() - - -def bar_to(x, color="#d65f5f"): - return bar_grad(f" {color} {x:.1f}%", f" transparent {x:.1f}%") - - -def bar_from_to(x, y, color="#d65f5f"): - return bar_grad( - f" transparent {x:.1f}%", - f" {color} {x:.1f}%", - f" {color} {y:.1f}%", - f" transparent {y:.1f}%", - ) - - -@pytest.fixture -def df_pos(): - return DataFrame([[1], [2], [3]]) - - -@pytest.fixture -def df_neg(): - return DataFrame([[-1], [-2], [-3]]) - - -@pytest.fixture -def df_mix(): - return DataFrame([[-3], [1], [2]]) - - -@pytest.mark.parametrize( - "align, exp", - [ - ("left", [no_bar(), bar_to(50), bar_to(100)]), - ("right", [bar_to(100), bar_from_to(50, 100), no_bar()]), - ("mid", [bar_to(33.33), bar_to(66.66), bar_to(100)]), - ("zero", [bar_from_to(50, 66.7), bar_from_to(50, 83.3), bar_from_to(50, 100)]), - ("mean", [bar_to(50), no_bar(), bar_from_to(50, 100)]), - (2.0, [bar_to(50), no_bar(), bar_from_to(50, 100)]), - (np.median, [bar_to(50), no_bar(), bar_from_to(50, 100)]), - ], -) -def test_align_positive_cases(df_pos, align, exp): - # test different align cases for all positive values - result = df_pos.style.bar(align=align)._compute().ctx - expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]} - assert result == expected - - -@pytest.mark.parametrize( - "align, exp", - [ - ("left", [bar_to(100), bar_to(50), no_bar()]), - ("right", [no_bar(), bar_from_to(50, 100), bar_to(100)]), - ("mid", [bar_from_to(66.66, 100), bar_from_to(33.33, 100), bar_to(100)]), - ("zero", [bar_from_to(33.33, 50), bar_from_to(16.66, 50), bar_to(50)]), - ("mean", [bar_from_to(50, 100), no_bar(), bar_to(50)]), - (-2.0, [bar_from_to(50, 100), no_bar(), bar_to(50)]), - (np.median, [bar_from_to(50, 100), no_bar(), bar_to(50)]), - ], -) -def test_align_negative_cases(df_neg, align, exp): - # test different align cases for all negative values - result = df_neg.style.bar(align=align)._compute().ctx - expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]} - assert result == expected - - -@pytest.mark.parametrize( - "align, exp", - [ - ("left", [no_bar(), bar_to(80), bar_to(100)]), - ("right", [bar_to(100), bar_from_to(80, 100), no_bar()]), - ("mid", [bar_to(60), bar_from_to(60, 80), bar_from_to(60, 100)]), - ("zero", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]), - ("mean", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]), - (-0.0, [bar_to(50), bar_from_to(50, 66.66), 
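For reference, a minimal sketch of the Styler.bar options these alignment tests cover (assumes jinja2 is installed; the colours and bounds mirror the fixtures above, not a definitive rendering recipe).

import pandas as pd

df = pd.DataFrame({"A": [-10, 0, 20, 90]})
styler = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"], vmin=-10, vmax=90)
html = styler.render()  # each cell gets a linear-gradient(90deg, ...) background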
bar_from_to(50, 83.33)]), - (np.nanmedian, [bar_to(50), no_bar(), bar_from_to(50, 62.5)]), - ], -) -@pytest.mark.parametrize("nans", [True, False]) -def test_align_mixed_cases(df_mix, align, exp, nans): - # test different align cases for mixed positive and negative values - # also test no impact of NaNs and no_bar - expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]} - if nans: - df_mix.loc[3, :] = np.nan - expected.update({(3, 0): no_bar()}) - result = df_mix.style.bar(align=align)._compute().ctx - assert result == expected - - -@pytest.mark.parametrize( - "align, exp", - [ - ( - "left", - { - "index": [[no_bar(), no_bar()], [bar_to(100), bar_to(100)]], - "columns": [[no_bar(), bar_to(100)], [no_bar(), bar_to(100)]], - "none": [[no_bar(), bar_to(33.33)], [bar_to(66.66), bar_to(100)]], - }, - ), - ( - "mid", - { - "index": [[bar_to(33.33), bar_to(50)], [bar_to(100), bar_to(100)]], - "columns": [[bar_to(50), bar_to(100)], [bar_to(75), bar_to(100)]], - "none": [[bar_to(25), bar_to(50)], [bar_to(75), bar_to(100)]], - }, - ), - ( - "zero", - { - "index": [ - [bar_from_to(50, 66.66), bar_from_to(50, 75)], - [bar_from_to(50, 100), bar_from_to(50, 100)], - ], - "columns": [ - [bar_from_to(50, 75), bar_from_to(50, 100)], - [bar_from_to(50, 87.5), bar_from_to(50, 100)], - ], - "none": [ - [bar_from_to(50, 62.5), bar_from_to(50, 75)], - [bar_from_to(50, 87.5), bar_from_to(50, 100)], - ], - }, - ), - ( - 2, - { - "index": [ - [bar_to(50), no_bar()], - [bar_from_to(50, 100), bar_from_to(50, 100)], - ], - "columns": [ - [bar_to(50), no_bar()], - [bar_from_to(50, 75), bar_from_to(50, 100)], - ], - "none": [ - [bar_from_to(25, 50), no_bar()], - [bar_from_to(50, 75), bar_from_to(50, 100)], - ], - }, - ), - ], -) -@pytest.mark.parametrize("axis", ["index", "columns", "none"]) -def test_align_axis(align, exp, axis): - # test all axis combinations with positive values and different aligns - data = DataFrame([[1, 2], [3, 4]]) - result = ( - data.style.bar(align=align, axis=None if axis == "none" else axis) - ._compute() - .ctx - ) - expected = { - (0, 0): exp[axis][0][0], - (0, 1): exp[axis][0][1], - (1, 0): exp[axis][1][0], - (1, 1): exp[axis][1][1], - } - assert result == expected - - -@pytest.mark.parametrize( - "values, vmin, vmax", - [ - ("positive", 1.5, 2.5), - ("negative", -2.5, -1.5), - ("mixed", -2.5, 1.5), - ], -) -@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately -@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"]) -def test_vmin_vmax_clipping(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align): - # test that clipping occurs if any vmin > data_values or vmax < data_values - if align == "mid": # mid acts as left or right in each case - if values == "positive": - align = "left" - elif values == "negative": - align = "right" - df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values] - vmin = None if nullify == "vmin" else vmin - vmax = None if nullify == "vmax" else vmax - - clip_df = df.where(df <= (vmax if vmax else 999), other=vmax) - clip_df = clip_df.where(clip_df >= (vmin if vmin else -999), other=vmin) - - result = ( - df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"]) - ._compute() - .ctx - ) - expected = clip_df.style.bar(align=align, color=["red", "green"])._compute().ctx - assert result == expected - - -@pytest.mark.parametrize( - "values, vmin, vmax", - [ - ("positive", 0.5, 4.5), - ("negative", -4.5, -0.5), - ("mixed", -4.5, 4.5), - ], -) -@pytest.mark.parametrize("nullify", [None, 
"vmin", "vmax"]) # test min/max separately -@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"]) -def test_vmin_vmax_widening(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align): - # test that widening occurs if any vmax > data_values or vmin < data_values - if align == "mid": # mid acts as left or right in each case - if values == "positive": - align = "left" - elif values == "negative": - align = "right" - df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values] - vmin = None if nullify == "vmin" else vmin - vmax = None if nullify == "vmax" else vmax - - expand_df = df.copy() - expand_df.loc[3, :], expand_df.loc[4, :] = vmin, vmax - - result = ( - df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"]) - ._compute() - .ctx - ) - expected = expand_df.style.bar(align=align, color=["red", "green"])._compute().ctx - assert result.items() <= expected.items() - - -def test_numerics(): - # test data is pre-selected for numeric values - data = DataFrame([[1, "a"], [2, "b"]]) - result = data.style.bar()._compute().ctx - assert (0, 1) not in result - assert (1, 1) not in result - - -@pytest.mark.parametrize( - "align, exp", - [ - ("left", [no_bar(), bar_to(100, "green")]), - ("right", [bar_to(100, "red"), no_bar()]), - ("mid", [bar_to(25, "red"), bar_from_to(25, 100, "green")]), - ("zero", [bar_from_to(33.33, 50, "red"), bar_from_to(50, 100, "green")]), - ], -) -def test_colors_mixed(align, exp): - data = DataFrame([[-1], [3]]) - result = data.style.bar(align=align, color=["red", "green"])._compute().ctx - assert result == {(0, 0): exp[0], (1, 0): exp[1]} - - -def test_bar_align_height(): - # test when keyword height is used 'no-repeat center' and 'background-size' present - data = DataFrame([[1], [2]]) - result = data.style.bar(align="left", height=50)._compute().ctx - bg_s = "linear-gradient(90deg, #d65f5f 100.0%, transparent 100.0%) no-repeat center" - expected = { - (0, 0): [("width", "10em")], - (1, 0): [ - ("width", "10em"), - ("background", bg_s), - ("background-size", "100% 50.0%"), - ], - } - assert result == expected - - -def test_bar_value_error_raises(): - df = DataFrame({"A": [-100, -60, -30, -20]}) - - msg = "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or" - with pytest.raises(ValueError, match=msg): - df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]).to_html() - - msg = r"`width` must be a value in \[0, 100\]" - with pytest.raises(ValueError, match=msg): - df.style.bar(width=200).to_html() - - msg = r"`height` must be a value in \[0, 100\]" - with pytest.raises(ValueError, match=msg): - df.style.bar(height=200).to_html() diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py index 495dc82f0e7bd..74b4c7ea3977c 100644 --- a/pandas/tests/io/formats/style/test_html.py +++ b/pandas/tests/io/formats/style/test_html.py @@ -1,12 +1,8 @@ from textwrap import dedent -import numpy as np import pytest -from pandas import ( - DataFrame, - MultiIndex, -) +from pandas import DataFrame jinja2 = pytest.importorskip("jinja2") from pandas.io.formats.style import Styler @@ -20,12 +16,6 @@ def styler(): return Styler(DataFrame([[2.61], [2.69]], index=["a", "b"], columns=["A"])) -@pytest.fixture -def styler_mi(): - midx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) - return Styler(DataFrame(np.arange(16).reshape(4, 4), index=midx, columns=midx)) - - @pytest.fixture def tpl_style(): return env.get_template("html_style.tpl") @@ -41,8 +31,8 @@ def 
test_html_template_extends_options(): # to understand the dependency with open("pandas/io/formats/templates/html.tpl") as file: result = file.read() - assert "{% include html_style_tpl %}" in result - assert "{% include html_table_tpl %}" in result + assert '{% include "html_style.tpl" %}' in result + assert '{% include "html_table.tpl" %}' in result def test_exclude_styles(styler): @@ -223,191 +213,26 @@ def test_block_names(tpl_style, tpl_table): assert result2 == expected_table -def test_from_custom_template_table(tmpdir): - p = tmpdir.mkdir("tpl").join("myhtml_table.tpl") - p.write( - dedent( - """\ - {% extends "html_table.tpl" %} - {% block table %} -

<h1>{{custom_title}}</h1>

- {{ super() }} - {% endblock table %}""" - ) - ) - result = Styler.from_custom_template(str(tmpdir.join("tpl")), "myhtml_table.tpl") - assert issubclass(result, Styler) - assert result.env is not Styler.env - assert result.template_html_table is not Styler.template_html_table - styler = result(DataFrame({"A": [1, 2]})) - assert "

<h1>My Title</h1>

\n\n\n - {{ super() }} - {% endblock style %}""" + {% extends "html.tpl" %} + {% block table %} +

<h1>{{ table_title|default("My Table") }}</h1>

+ {{ super() }} + {% endblock table %}""" ) ) - result = Styler.from_custom_template( - str(tmpdir.join("tpl")), html_style="myhtml_style.tpl" - ) + result = Styler.from_custom_template(str(tmpdir.join("templates")), "myhtml.tpl") assert issubclass(result, Styler) assert result.env is not Styler.env - assert result.template_html_style is not Styler.template_html_style + assert result.template_html is not Styler.template_html styler = result(DataFrame({"A": [1, 2]})) - assert '\n\nfull cap" in styler.render() - - -@pytest.mark.parametrize("index", [False, True]) -@pytest.mark.parametrize("columns", [False, True]) -def test_sticky_basic(styler, index, columns): - if index: - styler.set_sticky(axis=0) - if columns: - styler.set_sticky(axis=1) - - res = styler.set_uuid("").to_html() - cs1 = "tbody th {\n position: sticky;\n left: 0px;\n background-color: white;\n}" - assert (cs1 in res) is index - cs2 = "thead th {\n position: sticky;\n top: 0px;\n background-color: white;\n}" - assert (cs2 in res) is columns - - -@pytest.mark.parametrize("index", [False, True]) -@pytest.mark.parametrize("columns", [False, True]) -def test_sticky_mi(styler_mi, index, columns): - if index: - styler_mi.set_sticky(axis=0) - if columns: - styler_mi.set_sticky(axis=1) - - res = styler_mi.set_uuid("").to_html() - assert ( - ( - dedent( - """\ - #T_ tbody th.level0 { - position: sticky; - left: 0px; - min-width: 75px; - max-width: 75px; - background-color: white; - } - """ - ) - in res - ) - is index - ) - assert ( - ( - dedent( - """\ - #T_ tbody th.level1 { - position: sticky; - left: 75px; - min-width: 75px; - max-width: 75px; - background-color: white; - } - """ - ) - in res - ) - is index - ) - assert ( - ( - dedent( - """\ - #T_ thead th.level0 { - position: sticky; - top: 0px; - height: 25px; - background-color: white; - } - """ - ) - in res - ) - is columns - ) - assert ( - ( - dedent( - """\ - #T_ thead th.level1 { - position: sticky; - top: 25px; - height: 25px; - background-color: white; - } - """ - ) - in res - ) - is columns - ) - - -@pytest.mark.parametrize("index", [False, True]) -@pytest.mark.parametrize("columns", [False, True]) -def test_sticky_levels(styler_mi, index, columns): - if index: - styler_mi.set_sticky(axis=0, levels=[1]) - if columns: - styler_mi.set_sticky(axis=1, levels=[1]) - - res = styler_mi.set_uuid("").to_html() - assert "#T_ tbody th.level0 {" not in res - assert "#T_ thead th.level0 {" not in res - assert ( - ( - dedent( - """\ - #T_ tbody th.level1 { - position: sticky; - left: 0px; - min-width: 75px; - max-width: 75px; - background-color: white; - } - """ - ) - in res - ) - is index - ) - assert ( - ( - dedent( - """\ - #T_ thead th.level1 { - position: sticky; - top: 0px; - height: 25px; - background-color: white; - } - """ - ) - in res - ) - is columns - ) - - -def test_sticky_raises(styler): - with pytest.raises(ValueError, match="`axis` must be"): - styler.set_sticky(axis="bad") diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 64c62a00ff29d..281170ab6c7cb 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -38,35 +38,6 @@ def mi_styler(mi_df): return Styler(mi_df, uuid_len=0) -@pytest.fixture -def mi_styler_comp(mi_styler): - # comprehensively add features to mi_styler - mi_styler.uuid_len = 5 - mi_styler.uuid = "abcde_" - mi_styler.set_caption("capt") - mi_styler.set_table_styles([{"selector": "a", "props": "a:v;"}]) - mi_styler.hide_columns() - 
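For reference, a minimal sketch of the Styler.set_sticky behaviour the removed test_sticky_* tests describe (assumes a pandas build that provides set_sticky and has jinja2 installed; not part of the patch).

import pandas as pd

df = pd.DataFrame({"A": [2.61], "B": [2.69]}, index=["a"])
# axis=0 pins the index cells (position: sticky; left: 0px),
# axis=1 pins the column headers (position: sticky; top: 0px).
html = df.style.set_sticky(axis=0).set_sticky(axis=1).to_html()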
mi_styler.hide_columns([("c0", "c1_a")]) - mi_styler.hide_index() - mi_styler.hide_index([("i0", "i1_a")]) - mi_styler.set_table_attributes('class="box"') - mi_styler.format(na_rep="MISSING", precision=3) - mi_styler.highlight_max(axis=None) - mi_styler.set_td_classes( - DataFrame( - [["a", "b"], ["a", "c"]], index=mi_styler.index, columns=mi_styler.columns - ) - ) - mi_styler.set_tooltips( - DataFrame( - [["a2", "b2"], ["a2", "c2"]], - index=mi_styler.index, - columns=mi_styler.columns, - ) - ) - return mi_styler - - @pytest.mark.parametrize( "sparse_columns, exp_cols", [ @@ -185,81 +156,6 @@ def test_render_trimming_mi(): assert {"attributes": 'colspan="2"'}.items() <= ctx["head"][0][2].items() -@pytest.mark.parametrize("comprehensive", [True, False]) -@pytest.mark.parametrize("render", [True, False]) -@pytest.mark.parametrize("deepcopy", [True, False]) -def test_copy(comprehensive, render, deepcopy, mi_styler, mi_styler_comp): - styler = mi_styler_comp if comprehensive else mi_styler - styler.uuid_len = 5 - - s2 = copy.deepcopy(styler) if deepcopy else copy.copy(styler) # make copy and check - assert s2 is not styler - - if render: - styler.to_html() - - excl = ["na_rep", "precision", "uuid", "cellstyle_map"] # deprecated or special var - if not deepcopy: # check memory locations are equal for all included attributes - for attr in [a for a in styler.__dict__ if (not callable(a) and a not in excl)]: - assert id(getattr(s2, attr)) == id(getattr(styler, attr)) - else: # check memory locations are different for nested or mutable vars - shallow = [ - "data", - "columns", - "index", - "uuid_len", - "caption", - "cell_ids", - "hide_index_", - "hide_columns_", - "table_attributes", - ] - for attr in shallow: - assert id(getattr(s2, attr)) == id(getattr(styler, attr)) - - for attr in [ - a - for a in styler.__dict__ - if (not callable(a) and a not in excl and a not in shallow) - ]: - if getattr(s2, attr) is None: - assert id(getattr(s2, attr)) == id(getattr(styler, attr)) - else: - assert id(getattr(s2, attr)) != id(getattr(styler, attr)) - - -def test_clear(mi_styler_comp): - # NOTE: if this test fails for new features then 'mi_styler_comp' should be updated - # to ensure proper testing of the 'copy', 'clear', 'export' methods with new feature - # GH 40675 - styler = mi_styler_comp - styler.to_html() # new attrs maybe created on render - - clean_copy = Styler(styler.data, uuid=styler.uuid) - - excl = [ - "data", - "index", - "columns", - "uuid", - "uuid_len", - "cell_ids", - "cellstyle_map", # execution time only - "precision", # deprecated - "na_rep", # deprecated - ] - # tests vars are not same vals on obj and clean copy before clear (except for excl) - for attr in [a for a in styler.__dict__ if not (callable(a) or a in excl)]: - res = getattr(styler, attr) == getattr(clean_copy, attr) - assert not (all(res) if (hasattr(res, "__iter__") and len(res) > 0) else res) - - # test vars have same vales on obj and clean copy after clearing - styler.clear() - for attr in [a for a in styler.__dict__ if not (callable(a))]: - res = getattr(styler, attr) == getattr(clean_copy, attr) - assert all(res) if hasattr(res, "__iter__") else res - - class TestStyler: def setup_method(self, method): np.random.seed(24) @@ -315,6 +211,129 @@ def test_update_ctx_flatten_multi_and_trailing_semi(self): } assert self.styler.ctx == expected + @pytest.mark.parametrize("do_changes", [True, False]) + @pytest.mark.parametrize("do_render", [True, False]) + def test_copy(self, do_changes, do_render): + # Updated in GH39708 + # 
Change some defaults (to check later if the new values are copied) + if do_changes: + self.styler.set_table_styles( + [{"selector": "th", "props": [("foo", "bar")]}] + ) + self.styler.set_table_attributes('class="foo" data-bar') + self.styler.hidden_index = not self.styler.hidden_index + self.styler.hide_columns("A") + classes = DataFrame( + [["favorite-val red", ""], [None, "blue my-val"]], + index=self.df.index, + columns=self.df.columns, + ) + self.styler.set_td_classes(classes) + ttips = DataFrame( + data=[["Favorite", ""], [np.nan, "my"]], + columns=self.df.columns, + index=self.df.index, + ) + self.styler.set_tooltips(ttips) + self.styler.cell_ids = not self.styler.cell_ids + + if do_render: + self.styler.render() + + s_copy = copy.copy(self.styler) + s_deepcopy = copy.deepcopy(self.styler) + + assert self.styler is not s_copy + assert self.styler is not s_deepcopy + + # Check for identity + assert self.styler.ctx is s_copy.ctx + assert self.styler._todo is s_copy._todo + assert self.styler.table_styles is s_copy.table_styles + assert self.styler.hidden_columns is s_copy.hidden_columns + assert self.styler.cell_context is s_copy.cell_context + assert self.styler.tooltips is s_copy.tooltips + if do_changes: # self.styler.tooltips is not None + assert self.styler.tooltips.tt_data is s_copy.tooltips.tt_data + assert ( + self.styler.tooltips.class_properties + is s_copy.tooltips.class_properties + ) + assert self.styler.tooltips.table_styles is s_copy.tooltips.table_styles + + # Check for non-identity + assert self.styler.ctx is not s_deepcopy.ctx + assert self.styler._todo is not s_deepcopy._todo + assert self.styler.hidden_columns is not s_deepcopy.hidden_columns + assert self.styler.cell_context is not s_deepcopy.cell_context + if do_changes: # self.styler.table_style is not None + assert self.styler.table_styles is not s_deepcopy.table_styles + if do_changes: # self.styler.tooltips is not None + assert self.styler.tooltips is not s_deepcopy.tooltips + assert self.styler.tooltips.tt_data is not s_deepcopy.tooltips.tt_data + assert ( + self.styler.tooltips.class_properties + is not s_deepcopy.tooltips.class_properties + ) + assert ( + self.styler.tooltips.table_styles + is not s_deepcopy.tooltips.table_styles + ) + + self.styler._update_ctx(self.attrs) + self.styler.highlight_max() + assert self.styler.ctx == s_copy.ctx + assert self.styler.ctx != s_deepcopy.ctx + assert self.styler._todo == s_copy._todo + assert self.styler._todo != s_deepcopy._todo + assert s_deepcopy._todo == [] + + equal_attributes = [ + "table_styles", + "table_attributes", + "cell_ids", + "hidden_index", + "hidden_columns", + "cell_context", + ] + for s2 in [s_copy, s_deepcopy]: + for att in equal_attributes: + assert self.styler.__dict__[att] == s2.__dict__[att] + if do_changes: # self.styler.tooltips is not None + tm.assert_frame_equal(self.styler.tooltips.tt_data, s2.tooltips.tt_data) + assert ( + self.styler.tooltips.class_properties + == s2.tooltips.class_properties + ) + assert self.styler.tooltips.table_styles == s2.tooltips.table_styles + + def test_clear(self): + # updated in GH 39396 + tt = DataFrame({"A": [None, "tt"]}) + css = DataFrame({"A": [None, "cls-a"]}) + s = self.df.style.highlight_max().set_tooltips(tt).set_td_classes(css) + s = s.hide_index().hide_columns("A") + # _todo, tooltips and cell_context items added to.. 
+ assert len(s._todo) > 0 + assert s.tooltips + assert len(s.cell_context) > 0 + assert s.hidden_index is True + assert len(s.hidden_columns) > 0 + + s = s._compute() + # ctx item affected when a render takes place. _todo is maintained + assert len(s.ctx) > 0 + assert len(s._todo) > 0 + + s.clear() + # ctx, _todo, tooltips and cell_context items all revert to null state. + assert len(s.ctx) == 0 + assert len(s._todo) == 0 + assert not s.tooltips + assert len(s.cell_context) == 0 + assert s.hidden_index is False + assert len(s.hidden_columns) == 0 + def test_render(self): df = DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) @@ -627,27 +646,10 @@ def test_applymap_subset(self, slice_): def test_applymap_subset_multiindex(self, slice_): # GH 19861 # edited for GH 33562 - warn = None - msg = "indexing on a MultiIndex with a nested sequence of labels" - if ( - isinstance(slice_[-1], tuple) - and isinstance(slice_[-1][-1], list) - and "C" in slice_[-1][-1] - ): - warn = FutureWarning - elif ( - isinstance(slice_[0], tuple) - and isinstance(slice_[0][1], list) - and 3 in slice_[0][1] - ): - warn = FutureWarning - idx = MultiIndex.from_product([["a", "b"], [1, 2]]) col = MultiIndex.from_product([["x", "y"], ["A", "B"]]) df = DataFrame(np.random.rand(4, 4), columns=col, index=idx) - - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - df.style.applymap(lambda x: "color: red;", subset=slice_).render() + df.style.applymap(lambda x: "color: red;", subset=slice_).render() def test_applymap_subset_multiindex_code(self): # https://github.com/pandas-dev/pandas/issues/25858 @@ -1125,14 +1127,6 @@ def test_mi_sparse_column_names(self): ] assert head == expected - def test_hide_column_headers(self): - ctx = self.styler.hide_columns()._translate(True, True) - assert len(ctx["head"]) == 0 # no header entries with an unnamed index - - self.df.index.name = "some_name" - ctx = self.df.style.hide_columns()._translate(True, True) - assert len(ctx["head"]) == 1 # only a single row for index names: no col heads - def test_hide_single_index(self): # GH 14194 # single unnamed index @@ -1201,7 +1195,7 @@ def test_hide_columns_single_level(self): assert not ctx["body"][0][1]["is_visible"] # col A, row 1 assert not ctx["body"][1][2]["is_visible"] # col B, row 1 - def test_hide_columns_index_mult_levels(self): + def test_hide_columns_mult_levels(self): # GH 14194 # setup dataframe with multiple column levels and indices i1 = MultiIndex.from_arrays( @@ -1233,8 +1227,7 @@ def test_hide_columns_index_mult_levels(self): # hide first column only ctx = df.style.hide_columns([("b", 0)])._translate(True, True) - assert not ctx["head"][0][2]["is_visible"] # b - assert ctx["head"][0][3]["is_visible"] # b + assert ctx["head"][0][2]["is_visible"] # b assert not ctx["head"][1][2]["is_visible"] # 0 assert not ctx["body"][1][2]["is_visible"] # 3 assert ctx["body"][1][3]["is_visible"] @@ -1250,18 +1243,6 @@ def test_hide_columns_index_mult_levels(self): assert ctx["body"][1][2]["is_visible"] assert ctx["body"][1][2]["display_value"] == 3 - # hide top row level, which hides both rows - ctx = df.style.hide_index("a")._translate(True, True) - for i in [0, 1, 2, 3]: - assert not ctx["body"][0][i]["is_visible"] - assert not ctx["body"][1][i]["is_visible"] - - # hide first row only - ctx = df.style.hide_index(("a", 0))._translate(True, True) - for i in [0, 1, 2, 3]: - assert not ctx["body"][0][i]["is_visible"] - assert ctx["body"][1][i]["is_visible"] - def 
test_pipe(self): def set_caption_from_template(styler, a, b): return styler.set_caption(f"Dataframe with a = {a} and b = {b}") @@ -1455,19 +1436,6 @@ def test_non_reducing_multi_slice_on_multiindex(self, slice_): idxs = MultiIndex.from_product([["U", "V"], ["W", "X"], ["Y", "Z"]]) df = DataFrame(np.arange(64).reshape(8, 8), columns=cols, index=idxs) - msg = "indexing on a MultiIndex with a nested sequence of labels" - warn = None - for lvl in [0, 1]: - key = slice_[lvl] - if isinstance(key, tuple): - for subkey in key: - if isinstance(subkey, list) and "-" in subkey: - # not present in the index level, ignored, will raise in future - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=msg): - expected = df.loc[slice_] - - with tm.assert_produces_warning(warn, match=msg): - result = df.loc[non_reducing_slice(slice_)] + expected = df.loc[slice_] + result = df.loc[non_reducing_slice(slice_)] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 55b17dc37adda..97347bddaa187 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -12,7 +12,6 @@ from pandas.io.formats.style import Styler from pandas.io.formats.style_render import ( _parse_latex_cell_styles, - _parse_latex_css_conversion, _parse_latex_header_span, _parse_latex_table_styles, _parse_latex_table_wrapping, @@ -444,64 +443,3 @@ def test_parse_latex_table_wrapping(styler): def test_short_caption(styler): result = styler.to_latex(caption=("full cap", "short cap")) assert "\\caption[short cap]{full cap}" in result - - -@pytest.mark.parametrize( - "css, expected", - [ - ([("color", "red")], [("color", "{red}")]), # test color and input format types - ( - [("color", "rgb(128, 128, 128 )")], - [("color", "[rgb]{0.502, 0.502, 0.502}")], - ), - ( - [("color", "rgb(128, 50%, 25% )")], - [("color", "[rgb]{0.502, 0.500, 0.250}")], - ), - ( - [("color", "rgba(128,128,128,1)")], - [("color", "[rgb]{0.502, 0.502, 0.502}")], - ), - ([("color", "#FF00FF")], [("color", "[HTML]{FF00FF}")]), - ([("color", "#F0F")], [("color", "[HTML]{FF00FF}")]), - ([("font-weight", "bold")], [("bfseries", "")]), # test font-weight and types - ([("font-weight", "bolder")], [("bfseries", "")]), - ([("font-weight", "normal")], []), - ([("background-color", "red")], [("cellcolor", "{red}--lwrap")]), - ( - [("background-color", "#FF00FF")], # test background-color command and wrap - [("cellcolor", "[HTML]{FF00FF}--lwrap")], - ), - ([("font-style", "italic")], [("itshape", "")]), # test font-style and types - ([("font-style", "oblique")], [("slshape", "")]), - ([("font-style", "normal")], []), - ([("color", "red /*--dwrap*/")], [("color", "{red}--dwrap")]), # css comments - ([("background-color", "red /* --dwrap */")], [("cellcolor", "{red}--dwrap")]), - ], -) -def test_parse_latex_css_conversion(css, expected): - result = _parse_latex_css_conversion(css) - assert result == expected - - -def test_parse_latex_css_conversion_option(): - css = [("command", "option--latex--wrap")] - expected = [("command", "option--wrap")] - result = _parse_latex_css_conversion(css) - assert result == expected - - -def test_styler_object_after_render(styler): - # GH 42320 - pre_render = styler._copy(deepcopy=True) - styler.to_latex( - column_format="rllr", - position="h", - position_float="centering", - hrules=True, - label="my lab", - caption="my cap", - ) - - assert pre_render.table_styles == styler.table_styles - 
assert pre_render.caption == styler.caption diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 500f8bf5ff159..c6155cac101e6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1369,7 +1369,7 @@ def test_to_string(self): ) lines = result.split("\n") header = lines[0].strip().split() - joined = "\n".join([re.sub(r"\s+", " ", x).strip() for x in lines[1:]]) + joined = "\n".join(re.sub(r"\s+", " ", x).strip() for x in lines[1:]) recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") tm.assert_series_equal(recons["B"], biggie["B"]) assert recons["A"].count() == biggie["A"].count() diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index faf9fc903d7b5..a428d8c71a793 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -105,7 +105,6 @@ def missing_metadata(): "zip": 44646, } ], - "previous_residences": {"cities": [{"city_name": "Foo York City"}]}, }, { "addresses": [ @@ -116,8 +115,7 @@ def missing_metadata(): "state": "TN", "zip": 37643, } - ], - "previous_residences": {"cities": [{"city_name": "Barmingham"}]}, + ] }, ] @@ -600,10 +598,7 @@ def test_json_normalize_errors(self, missing_metadata): # If meta keys are not always present a new option to set # errors='ignore' has been implemented - msg = ( - "Key 'name' not found. To replace missing values of " - "'name' with np.nan, pass in errors='ignore'" - ) + msg = "Try running with errors='ignore' as key 'name' is not always present" with pytest.raises(KeyError, match=msg): json_normalize( data=missing_metadata, @@ -623,44 +618,11 @@ def test_missing_meta(self, missing_metadata): [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"], [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan], ] + columns = ["city", "number", "state", "street", "zip", "name"] columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) - def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata): - # GH41876 - # Ensure errors='raise' works as intended even when a record_path of length - # greater than one is passed in - msg = ( - "Key 'name' not found. 
To replace missing values of " - "'name' with np.nan, pass in errors='ignore'" - ) - with pytest.raises(KeyError, match=msg): - json_normalize( - data=missing_metadata, - record_path=["previous_residences", "cities"], - meta="name", - errors="raise", - ) - - def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata): - # GH41876 - # Ensure errors='ignore' works as intended even when a record_path of length - # greater than one is passed in - result = json_normalize( - data=missing_metadata, - record_path=["previous_residences", "cities"], - meta="name", - errors="ignore", - ) - ex_data = [ - ["Foo York City", "Alice"], - ["Barmingham", np.nan], - ] - columns = ["city_name", "name"] - expected = DataFrame(ex_data, columns=columns) - tm.assert_frame_equal(result, expected) - def test_donot_drop_nonevalues(self): # GH21356 data = [ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d97ba8694818b..0ffc6044a5897 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -11,6 +11,7 @@ from pandas.compat import ( IS64, + PY38, PY310, is_platform_windows, ) @@ -27,6 +28,8 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.skipif(PY310, reason="timeout with coverage") + _seriesd = tm.getSeriesData() _frame = DataFrame(_seriesd) @@ -1178,7 +1181,6 @@ def test_sparse(self): expected = s.to_json() assert expected == ss.to_json() - @pytest.mark.skipif(PY310, reason="segfault GH 42130") @pytest.mark.parametrize( "ts", [ @@ -1196,7 +1198,6 @@ def test_tz_is_utc(self, ts): dt = ts.to_pydatetime() assert dumps(dt, iso_dates=True) == exp - @pytest.mark.skipif(PY310, reason="segfault GH 42130") @pytest.mark.parametrize( "tz_range", [ @@ -1714,7 +1715,7 @@ def test_json_multiindex(self, dataframe, expected): assert result == expected @pytest.mark.xfail( - is_platform_windows(), + is_platform_windows() and PY38, reason="localhost connection rejected", strict=False, ) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 57a6b214cec84..805f6b8dbe461 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,7 +16,6 @@ import pandas._libs.json as ujson from pandas.compat import ( IS64, - PY310, is_platform_windows, ) @@ -249,21 +248,7 @@ def test_double_precision(self): assert rounded_input == json.loads(output) assert rounded_input == ujson.decode(output) - @pytest.mark.parametrize( - "invalid_val", - [ - 20, - -1, - pytest.param( - "9", - marks=pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940"), - ), - pytest.param( - None, - marks=pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940"), - ), - ], - ) + @pytest.mark.parametrize("invalid_val", [20, -1, "9", None]) def test_invalid_double_precision(self, invalid_val): double_input = 30.12345678901234567890 expected_exception = ValueError if isinstance(invalid_val, int) else TypeError diff --git a/pandas/tests/io/parser/common/__init__.py b/pandas/tests/io/parser/common/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 86891367e9bd6..ceb770ce72b78 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -143,7 +143,10 @@ def test_read_chunksize_jagged_names(all_parsers): parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) - expected = 
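For reference, a minimal sketch of the json_normalize call pattern the removed GH41876 tests cover, using a simplified version of the fixture above (not part of the patch).

import pandas as pd

data = [
    {
        "name": "Alice",
        "previous_residences": {"cities": [{"city_name": "Foo York City"}]},
    },
    {"previous_residences": {"cities": [{"city_name": "Barmingham"}]}},
]
# errors="ignore" fills the missing 'name' meta key with NaN instead of raising.
result = pd.json_normalize(
    data,
    record_path=["previous_residences", "cities"],
    meta="name",
    errors="ignore",
)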
DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + # error: List item 0 has incompatible type "float"; expected "int" + expected = DataFrame( + [[0] + [np.nan] * 9] * 7 + [[0] * 10] # type: ignore[list-item] + ) with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: result = concat(reader) tm.assert_frame_equal(result, expected) @@ -177,17 +180,13 @@ def test_chunks_have_consistent_numerical_type(all_parsers): def test_warn_if_chunks_have_mismatched_type(all_parsers, request): warning_type = None parser = all_parsers - size = 10000 + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) # see gh-3866: if chunks are different types and can't # be coerced using numerical types, then issue warning. if parser.engine == "c" and parser.low_memory: warning_type = DtypeWarning - # Use larger size to hit warning path - size = 499999 - - integers = [str(i) for i in range(size)] - data = "a\n" + "\n".join(integers + ["a", "b"] + integers) buf = StringIO(data) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index a1c76e2740dbe..8fa2d7f7b8d65 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,7 +15,6 @@ from pandas.errors import ( EmptyDataError, ParserError, - ParserWarning, ) from pandas import ( @@ -686,8 +685,7 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - with tm.assert_produces_warning(ParserWarning): - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 6ed52ed86af2a..59fd3de60e0bf 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -248,38 +248,3 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes}) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) tm.assert_frame_equal(result, expected) - - -def test_dtype_mangle_dup_cols_single_dtype(all_parsers): - # GH#42022 - parser = all_parsers - data = """a,a\n1,1""" - result = parser.read_csv(StringIO(data), dtype=str) - expected = DataFrame({"a": ["1"], "a.1": ["1"]}) - tm.assert_frame_equal(result, expected) - - -def test_dtype_multi_index(all_parsers): - # GH 42446 - parser = all_parsers - data = "A,B,B\nX,Y,Z\n1,2,3" - - result = parser.read_csv( - StringIO(data), - header=list(range(2)), - dtype={ - ("A", "X"): np.int32, - ("B", "Y"): np.int32, - ("B", "Z"): np.float32, - }, - ) - - expected = DataFrame( - { - ("A", "X"): np.int32([1]), - ("B", "Y"): np.int32([2]), - ("B", "Z"): np.float32([3]), - } - ) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 5df4470635af5..160e00f5fb930 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -434,10 +434,10 @@ def test_internal_null_byte(c_parser_only): def test_read_nrows_large(c_parser_only): # gh-7626 - Read only nrows of data in for large inputs (>262144b) parser = c_parser_only - 
header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" - data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" - header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" - data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" + header_narrow = "\t".join("COL_HEADER_" + str(i) for i in range(10)) + "\n" + data_narrow = "\t".join("somedatasomedatasomedata1" for _ in range(10)) + "\n" + header_wide = "\t".join("COL_HEADER_" + str(i) for i in range(15)) + "\n" + data_wide = "\t".join("somedatasomedatasomedata2" for _ in range(15)) + "\n" test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) @@ -565,7 +565,7 @@ def test_bytes_exceed_2gb(c_parser_only): if parser.low_memory: pytest.skip("not a high_memory test") - csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) + csv = StringIO("strings\n" + "\n".join("x" * (1 << 20) for _ in range(2100))) df = parser.read_csv(csv) assert not df.empty diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 78b64baab4dc0..ffa6c8259a59e 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -161,29 +161,3 @@ def test_converter_index_col_bug(all_parsers): xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) tm.assert_frame_equal(rs, xp) - - -def test_converter_multi_index(all_parsers): - # GH 42446 - parser = all_parsers - data = "A,B,B\nX,Y,Z\n1,2,3" - - result = parser.read_csv( - StringIO(data), - header=list(range(2)), - converters={ - ("A", "X"): np.int32, - ("B", "Y"): np.int32, - ("B", "Z"): np.float32, - }, - ) - - expected = DataFrame( - { - ("A", "X"): np.int32([1]), - ("B", "Y"): np.int32([2]), - ("B", "Z"): np.float32([3]), - } - ) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 123dce2048a44..981d1d438c3b0 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -44,7 +44,7 @@ def test_multi_thread_string_io_read_csv(all_parsers): num_files = 100 bytes_to_df = [ - "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode() + "\n".join(f"{i:d},{i:d},{i:d}" for i in range(max_row_range)).encode() for _ in range(num_files) ] diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 2880bf8690b46..fecba8bd81404 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -123,7 +123,7 @@ def f(i, v): return buf - data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)])) + data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES))) expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) result = parser.read_csv(data, header=None) @@ -570,23 +570,3 @@ def test_str_nan_dropped(all_parsers): ) tm.assert_frame_equal(result, expected) - - -def test_nan_multi_index(all_parsers): - # GH 42446 - parser = all_parsers - data = "A,B,B\nX,Y,Z\n1,2,inf" - - result = parser.read_csv( - StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} - ) - - expected = DataFrame( - { - ("A", "X"): [1], - ("B", "Y"): [2], - ("B", "Z"): [np.nan], - } - ) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py 
b/pandas/tests/io/parser/test_skiprows.py index 0735f60fabbf6..62650b4ef42a3 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -49,10 +49,10 @@ def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10) ) condensed_data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] + ",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9] ) result = parser.read_csv(StringIO(data), skiprows=[6, 8]) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 16649be5b8a58..b86dc5ef85fc6 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -383,9 +383,7 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): a,b 1,2 """ - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) expected = DataFrame({"a": [1], "b": [None]}) if names is None and parser.engine == "python": diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index b5f9e6e74ece9..719b54a57a6c7 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -778,22 +778,6 @@ def test_append_raise(setup_path): with pytest.raises(ValueError, match=msg): store.append("df", df) - # incompatible type (GH 41897) - _maybe_remove(store, "df") - df["foo"] = Timestamp("20130101") - store.append("df", df) - df["foo"] = "bar" - msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64,kind->datetime64,shape->None]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - def test_append_with_timedelta(setup_path): # GH 3577 diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 12a79f68d71c8..d100c584b698a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -13,7 +13,10 @@ from pandas._config import get_option -from pandas.compat import is_platform_windows +from pandas.compat import ( + PY38, + is_platform_windows, +) from pandas.compat.pyarrow import ( pa_version_under1p0, pa_version_under2p0, @@ -648,7 +651,7 @@ def test_categorical(self, pa): check_round_trip(df, pa) @pytest.mark.xfail( - is_platform_windows(), + is_platform_windows() and PY38, reason="localhost connection rejected", strict=False, ) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 9253e5ae700c7..7cf9d7e9a1925 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -32,6 +32,7 @@ import pytest from pandas.compat import ( + PY38, get_lzma_file, import_lzma, is_platform_little_endian, @@ -162,9 +163,9 @@ def compare_index_period(result, expected, typ, version): tm.assert_index_equal(result.shift(2), expected.shift(2)) -here = os.path.dirname(__file__) -legacy_dirname = os.path.join(here, "data", "legacy_pickle") 
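For reference, a minimal sketch of the GH 42446 pattern the removed parser tests above exercise: per-column options keyed by (level0, level1) tuples when two header rows form a MultiIndex (assumes a pandas version that accepts tuple keys for dtype here).

from io import StringIO

import numpy as np
import pandas as pd

data = "A,B,B\nX,Y,Z\n1,2,3"
result = pd.read_csv(
    StringIO(data),
    header=[0, 1],  # two header rows -> MultiIndex columns
    dtype={("A", "X"): np.int32, ("B", "Y"): np.int32, ("B", "Z"): np.float32},
)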
-files = glob.glob(os.path.join(legacy_dirname, "*", "*.pickle")) +files = glob.glob( + os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle") +) @pytest.fixture(params=files) @@ -209,6 +210,7 @@ def python_unpickler(path): pytest.param( functools.partial(pd.to_pickle, protocol=5), id="pandas_proto_5", + marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"), ), ], ) @@ -633,13 +635,3 @@ def test_pickle_big_dataframe_compression(protocol, compression): partial(pd.read_pickle, compression=compression), ) tm.assert_frame_equal(df, result) - - -def test_pickle_frame_v124_unpickle_130(): - # GH#42345 DataFrame created in 1.2.x, unpickle in 1.3.x - path = os.path.join(legacy_dirname, "1.2.4", "empty_frame_v1_2_4-GH#42345.pkl") - with open(path, "rb") as fd: - df = pickle.load(fd) - - expected = pd.DataFrame() - tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9320bf385ce0a..290e063a59be7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2336,7 +2336,7 @@ def psql_insert_copy(table, conn, keys, data_iter): writer.writerows(data_iter) s_buf.seek(0) - columns = ", ".join([f'"{k}"' for k in keys]) + columns = ", ".join(f'"{k}"' for k in keys) if table.schema: table_name = f"{table.schema}.{table.name}" else: @@ -2615,9 +2615,9 @@ def format_query(sql, *args): return sql % tuple(processed_args) -def tquery(query, con=None): +def tquery(query, con=None, cur=None): """Replace removed sql.tquery function""" - res = sql.execute(query, con=con).fetchall() + res = sql.execute(query, con=con, cur=cur).fetchall() if res is None: return None else: @@ -2649,10 +2649,12 @@ def test_write_row_by_row(self): cur = self.conn.cursor() cur.execute(create_sql) + cur = self.conn.cursor() + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" - for _, row in frame.iterrows(): + for idx, row in frame.iterrows(): fmt_sql = format_query(ins, *row) - tquery(fmt_sql, con=self.conn) + tquery(fmt_sql, cur=cur) self.conn.commit() @@ -2910,9 +2912,9 @@ def test_write_row_by_row(self): cur.execute(drop_sql) cur.execute(create_sql) ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" - for _, row in frame.iterrows(): + for idx, row in frame.iterrows(): fmt_sql = format_query(ins, *row) - tquery(fmt_sql, con=self.conn) + tquery(fmt_sql, cur=cur) self.conn.commit() diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 4f4815b9008ad..1e2973075f98e 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -9,12 +9,10 @@ import numpy as np import pytest +from pandas.compat import PY38 import pandas.util._test_decorators as td -from pandas import ( - DataFrame, - Index, -) +from pandas import DataFrame import pandas._testing as tm from pandas.io.common import get_handle @@ -293,45 +291,6 @@ def test_index_false_rename_row_root(datapath, parser): assert output == expected -@pytest.mark.parametrize( - "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] -) -def test_index_false_with_offset_input_index(parser, offset_index): - """ - Tests that the output does not contain the `` field when the index of the - input Dataframe has an offset. - - This is a regression test for issue #42458. 
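For reference, a minimal sketch of the GH 42458 case described in the removed to_xml test: index=False should drop the row-label element even when the index does not start at 0 (the frame mirrors the geom_df fixture; parser="etree" avoids the lxml dependency; not part of the patch).

import pandas as pd

geom = pd.DataFrame(
    {
        "shape": ["square", "circle", "triangle"],
        "degrees": [360, 360, 180],
        "sides": [4.0, float("nan"), 3.0],
    },
    index=[10, 11, 12],  # offset index
)
xml = geom.to_xml(index=False, parser="etree")  # no index element in the output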
- """ - - expected = """\ - - - - square - 360 - 4.0 - - - circle - 360 - - - - triangle - 180 - 3.0 - -""" - - offset_geom_df = geom_df.copy() - offset_geom_df.index = Index(offset_index) - output = offset_geom_df.to_xml(index=False, parser=parser) - output = equalize_decl(output) - - assert output == expected - - # NA_REP na_expected = """\ @@ -405,6 +364,10 @@ def test_na_empty_elem_option(datapath, parser): # ATTR_COLS +@pytest.mark.skipif( + not PY38, + reason=("etree alpha ordered attributes < py 3.8"), +) def test_attrs_cols_nan_output(datapath, parser): expected = """\ @@ -420,6 +383,10 @@ def test_attrs_cols_nan_output(datapath, parser): assert output == expected +@pytest.mark.skipif( + not PY38, + reason=("etree alpha ordered attributes < py3.8"), +) def test_attrs_cols_prefix(datapath, parser): expected = """\ @@ -574,6 +541,10 @@ def test_hierarchical_columns(datapath, parser): assert output == expected +@pytest.mark.skipif( + not PY38, + reason=("etree alpha ordered attributes < py3.8"), +) def test_hierarchical_attrs_columns(datapath, parser): expected = """\ @@ -643,6 +614,10 @@ def test_multi_index(datapath, parser): assert output == expected +@pytest.mark.skipif( + not PY38, + reason=("etree alpha ordered attributes < py3.8"), +) def test_multi_index_attrs_cols(datapath, parser): expected = """\ diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 7e9a03c2a59a8..823d155360908 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -10,6 +10,7 @@ import numpy as np import pytest +from pandas.compat import PY38 import pandas.util._test_decorators as td from pandas import DataFrame @@ -254,6 +255,10 @@ def test_parser_consistency_file(datapath): @tm.network @pytest.mark.slow @td.skip_if_no("lxml") +@pytest.mark.skipif( + not PY38, + reason=("etree alpha ordered attributes < py3.8"), +) def test_parser_consistency_url(datapath): url = ( "https://data.cityofchicago.org/api/views/" @@ -418,7 +423,6 @@ def test_url(): tm.assert_frame_equal(df_url, df_expected) -@tm.network def test_wrong_url(parser): with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): url = "https://www.w3schools.com/xml/python.xml" diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 5ff20051da8c0..aeff591e3f0dc 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -8,7 +8,6 @@ import pandas as pd import pandas._testing as tm -from pandas.core.algorithms import isin @contextmanager @@ -179,82 +178,14 @@ def test_no_reallocation(self, table_type, dtype): assert n_buckets_start == clean_table.get_state()["n_buckets"] -class TestPyObjectHashTableWithNans: - def test_nan_float(self): - nan1 = float("nan") - nan2 = float("nan") - assert nan1 is not nan2 - table = ht.PyObjectHashTable() - table.set_item(nan1, 42) - assert table.get_item(nan2) == 42 - - def test_nan_complex_both(self): - nan1 = complex(float("nan"), float("nan")) - nan2 = complex(float("nan"), float("nan")) - assert nan1 is not nan2 - table = ht.PyObjectHashTable() - table.set_item(nan1, 42) - assert table.get_item(nan2) == 42 - - def test_nan_complex_real(self): - nan1 = complex(float("nan"), 1) - nan2 = complex(float("nan"), 1) - other = complex(float("nan"), 2) - assert nan1 is not nan2 - table = ht.PyObjectHashTable() - table.set_item(nan1, 42) - assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: - table.get_item(other) - assert str(error.value) == 
str(other) - - def test_nan_complex_imag(self): - nan1 = complex(1, float("nan")) - nan2 = complex(1, float("nan")) - other = complex(2, float("nan")) - assert nan1 is not nan2 - table = ht.PyObjectHashTable() - table.set_item(nan1, 42) - assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: - table.get_item(other) - assert str(error.value) == str(other) - - def test_nan_in_tuple(self): - nan1 = (float("nan"),) - nan2 = (float("nan"),) - assert nan1[0] is not nan2[0] - table = ht.PyObjectHashTable() - table.set_item(nan1, 42) - assert table.get_item(nan2) == 42 - - def test_nan_in_nested_tuple(self): - nan1 = (1, (2, (float("nan"),))) - nan2 = (1, (2, (float("nan"),))) - other = (1, 2) - table = ht.PyObjectHashTable() - table.set_item(nan1, 42) - assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: - table.get_item(other) - assert str(error.value) == str(other) - - -def test_hash_equal_tuple_with_nans(): - a = (float("nan"), (float("nan"), float("nan"))) - b = (float("nan"), (float("nan"), float("nan"))) - assert ht.object_hash(a) == ht.object_hash(b) - assert ht.objects_are_equal(a, b) - - def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) vals.flags.writeable = writable arr, unique = table.get_labels_groupby(vals) - expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.intp) + expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.int64) expected_unique = np.array([1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(arr, expected_arr) + tm.assert_numpy_array_equal(arr.astype(np.int64), expected_arr) tm.assert_numpy_array_equal(unique, expected_unique) @@ -346,29 +277,6 @@ def test_unique(self, table_type, dtype): assert np.all(np.isnan(unique)) and len(unique) == 1 -def test_unique_for_nan_objects_floats(): - table = ht.PyObjectHashTable() - keys = np.array([float("nan") for i in range(50)], dtype=np.object_) - unique = table.unique(keys) - assert len(unique) == 1 - - -def test_unique_for_nan_objects_complex(): - table = ht.PyObjectHashTable() - keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_) - unique = table.unique(keys) - assert len(unique) == 1 - - -def test_unique_for_nan_objects_tuple(): - table = ht.PyObjectHashTable() - keys = np.array( - [1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_ - ) - unique = table.unique(keys) - assert len(unique) == 2 - - def get_ht_function(fun_name, type_suffix): return getattr(ht, fun_name) @@ -518,20 +426,3 @@ def test_mode(self, dtype, type_suffix): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) assert mode(values, True) == 42 assert np.isnan(mode(values, False)) - - -def test_ismember_tuple_with_nans(): - # GH-41836 - values = [("a", float("nan")), ("b", 1)] - comps = [("a", float("nan"))] - result = isin(values, comps) - expected = np.array([True, False], dtype=np.bool_) - tm.assert_numpy_array_equal(result, expected) - - -def test_float_complex_int_are_equal_as_objects(): - values = ["a", 5, 5.0, 5.0 + 0j] - comps = list(range(129)) - result = isin(values, comps) - expected = np.array([False, True, True, True], dtype=np.bool_) - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py deleted file mode 100644 index ba6d232733762..0000000000000 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ /dev/null @@ -1,389 +0,0 @@ 
-import re - -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas import DataFrame -import pandas._testing as tm -from pandas.tests.plotting.common import ( - TestPlotBase, - _check_plot_works, -) - - -def _create_hist_box_with_by_df(): - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - return df - - -@td.skip_if_no_mpl -class TestHistWithBy(TestPlotBase): - def setup_method(self, method): - TestPlotBase.setup_method(self, method) - import matplotlib as mpl - - mpl.rcdefaults() - self.hist_df = _create_hist_box_with_by_df() - - @pytest.mark.parametrize( - "by, column, titles, legends", - [ - ("C", "A", ["a", "b", "c"], [["A"]] * 3), - ("C", ["A", "B"], ["a", "b", "c"], [["A", "B"]] * 3), - ("C", None, ["a", "b", "c"], [["A", "B"]] * 3), - ( - ["C", "D"], - "A", - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - ], - [["A"]] * 9, - ), - ( - ["C", "D"], - ["A", "B"], - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - ], - [["A", "B"]] * 9, - ), - ( - ["C", "D"], - None, - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - ], - [["A", "B"]] * 9, - ), - ], - ) - def test_hist_plot_by_argument(self, by, column, titles, legends): - # GH 15079 - axes = _check_plot_works(self.hist_df.plot.hist, column=column, by=by) - result_titles = [ax.get_title() for ax in axes] - result_legends = [ - [legend.get_text() for legend in ax.get_legend().texts] for ax in axes - ] - - assert result_legends == legends - assert result_titles == titles - - @pytest.mark.parametrize( - "by, column, titles, legends", - [ - (0, "A", ["a", "b", "c"], [["A"]] * 3), - (0, None, ["a", "b", "c"], [["A", "B"]] * 3), - ( - [0, "D"], - "A", - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - ], - [["A"]] * 9, - ), - ], - ) - def test_hist_plot_by_0(self, by, column, titles, legends): - # GH 15079 - df = self.hist_df.copy() - df = df.rename(columns={"C": 0}) - - axes = _check_plot_works(df.plot.hist, column=column, by=by) - result_titles = [ax.get_title() for ax in axes] - result_legends = [ - [legend.get_text() for legend in ax.get_legend().texts] for ax in axes - ] - - assert result_legends == legends - assert result_titles == titles - - @pytest.mark.parametrize( - "by, column", - [ - ([], ["A"]), - ([], ["A", "B"]), - ((), None), - ((), ["A", "B"]), - ], - ) - def test_hist_plot_empty_list_string_tuple_by(self, by, column): - # GH 15079 - msg = "No group keys passed" - with pytest.raises(ValueError, match=msg): - _check_plot_works(self.hist_df.plot.hist, column=column, by=by) - - @pytest.mark.slow - @pytest.mark.parametrize( - "by, column, layout, axes_num", - [ - (["C"], "A", (2, 2), 3), - ("C", "A", (2, 2), 3), - (["C"], ["A"], (1, 3), 3), - ("C", None, (3, 1), 3), - ("C", ["A", "B"], (3, 1), 3), - (["C", "D"], "A", (9, 1), 9), - (["C", "D"], "A", (3, 3), 9), - (["C", "D"], ["A"], (5, 2), 9), - (["C", "D"], ["A", "B"], (9, 1), 9), - (["C", "D"], None, (9, 1), 9), - (["C", "D"], ["A", "B"], (5, 2), 9), - ], - ) - def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): - # GH 15079 - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - self.hist_df.plot.hist, column=column, by=by, layout=layout - ) - self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - - @pytest.mark.parametrize( - "msg, by, layout", - [ - ("larger than required size", ["C", "D"], (1, 1)), - (re.escape("Layout must be a tuple of (rows, columns)"), "C", (1,)), - ("At least one dimension of layout must be positive", "C", (-1, -1)), - ], - ) - def test_hist_plot_invalid_layout_with_by_raises(self, msg, by, layout): - # GH 15079, test if error is raised when invalid layout is given - - with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by=by, layout=layout) - - @pytest.mark.slow - def test_axis_share_x_with_by(self): - # GH 15079 - ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharex=True) - - # share x - assert ax1._shared_x_axes.joined(ax1, ax2) - assert ax2._shared_x_axes.joined(ax1, ax2) - assert ax3._shared_x_axes.joined(ax1, ax3) - assert ax3._shared_x_axes.joined(ax2, ax3) - - # don't share y - assert not ax1._shared_y_axes.joined(ax1, ax2) - assert not ax2._shared_y_axes.joined(ax1, ax2) - assert not ax3._shared_y_axes.joined(ax1, ax3) - assert not ax3._shared_y_axes.joined(ax2, ax3) - - @pytest.mark.slow - def test_axis_share_y_with_by(self): - # GH 15079 - ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharey=True) - - # share y - assert ax1._shared_y_axes.joined(ax1, ax2) - assert ax2._shared_y_axes.joined(ax1, ax2) - assert ax3._shared_y_axes.joined(ax1, ax3) - assert ax3._shared_y_axes.joined(ax2, ax3) - - # don't share x - assert not ax1._shared_x_axes.joined(ax1, ax2) - assert not ax2._shared_x_axes.joined(ax1, ax2) - assert not ax3._shared_x_axes.joined(ax1, ax3) - assert not ax3._shared_x_axes.joined(ax2, ax3) - - @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) - def test_figure_shape_hist_with_by(self, figsize): - # GH 15079 - axes = self.hist_df.plot.hist(column="A", by="C", figsize=figsize) - self._check_axes_shape(axes, axes_num=3, figsize=figsize) - - -@td.skip_if_no_mpl -class TestBoxWithBy(TestPlotBase): - def setup_method(self, method): - TestPlotBase.setup_method(self, method) - import matplotlib as mpl - - mpl.rcdefaults() - self.box_df = _create_hist_box_with_by_df() - - @pytest.mark.parametrize( - "by, column, titles, xticklabels", - [ - ("C", "A", ["A"], [["a", "b", "c"]]), - ( - ["C", "D"], - "A", - ["A"], - [ - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - ] - ], - ), - ("C", ["A", "B"], ["A", "B"], [["a", "b", "c"]] * 2), - ( - ["C", "D"], - ["A", "B"], - ["A", "B"], - [ - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - ] - ] - * 2, - ), - (["C"], None, ["A", "B"], [["a", "b", "c"]] * 2), - ], - ) - def test_box_plot_by_argument(self, by, column, titles, xticklabels): - # GH 15079 - axes = _check_plot_works(self.box_df.plot.box, column=column, by=by) - result_titles = [ax.get_title() for ax in axes] - result_xticklabels = [ - [label.get_text() for label in ax.get_xticklabels()] for ax in axes - ] - - assert result_xticklabels == xticklabels - assert result_titles == titles - - @pytest.mark.parametrize( - "by, column, titles, xticklabels", - [ - (0, "A", ["A"], [["a", "b", "c"]]), - ( - [0, "D"], - "A", - ["A"], - [ - [ - "(a, a)", - "(a, b)", - "(a, c)", - "(b, a)", - "(b, b)", - "(b, c)", - "(c, a)", - "(c, b)", - "(c, c)", - 
] - ], - ), - (0, None, ["A", "B"], [["a", "b", "c"]] * 2), - ], - ) - def test_box_plot_by_0(self, by, column, titles, xticklabels): - # GH 15079 - df = self.box_df.copy() - df = df.rename(columns={"C": 0}) - - axes = _check_plot_works(df.plot.box, column=column, by=by) - result_titles = [ax.get_title() for ax in axes] - result_xticklabels = [ - [label.get_text() for label in ax.get_xticklabels()] for ax in axes - ] - - assert result_xticklabels == xticklabels - assert result_titles == titles - - @pytest.mark.parametrize( - "by, column", - [ - ([], ["A"]), - ((), "A"), - ([], None), - ((), ["A", "B"]), - ], - ) - def test_box_plot_with_none_empty_list_by(self, by, column): - # GH 15079 - msg = "No group keys passed" - with pytest.raises(ValueError, match=msg): - _check_plot_works(self.box_df.plot.box, column=column, by=by) - - @pytest.mark.slow - @pytest.mark.parametrize( - "by, column, layout, axes_num", - [ - (["C"], "A", (1, 1), 1), - ("C", "A", (1, 1), 1), - ("C", None, (2, 1), 2), - ("C", ["A", "B"], (1, 2), 2), - (["C", "D"], "A", (1, 1), 1), - (["C", "D"], None, (1, 2), 2), - ], - ) - def test_box_plot_layout_with_by(self, by, column, layout, axes_num): - # GH 15079 - axes = _check_plot_works( - self.box_df.plot.box, column=column, by=by, layout=layout - ) - self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - - @pytest.mark.parametrize( - "msg, by, layout", - [ - ("larger than required size", ["C", "D"], (1, 1)), - (re.escape("Layout must be a tuple of (rows, columns)"), "C", (1,)), - ("At least one dimension of layout must be positive", "C", (-1, -1)), - ], - ) - def test_box_plot_invalid_layout_with_by_raises(self, msg, by, layout): - # GH 15079, test if error is raised when invalid layout is given - - with pytest.raises(ValueError, match=msg): - self.box_df.plot.box(column=["A", "B"], by=by, layout=layout) - - @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) - def test_figure_shape_hist_with_by(self, figsize): - # GH 15079 - axes = self.box_df.plot.box(column="A", by="C", figsize=figsize) - self._check_axes_shape(axes, axes_num=1, figsize=figsize) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index c0c1c2f057c96..2f698a82bac49 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1480,10 +1480,3 @@ def test_mode_sortwarning(self): result = result.sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected) - - def test_mode_boolean_with_na(self): - # GH#42107 - ser = Series([True, False, True, pd.NA], dtype="boolean") - result = ser.mode() - expected = Series({0: True}, dtype="boolean") - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 318289a51f781..5594659fb4b03 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -692,7 +692,7 @@ def test_asfreq_non_unique(): rng2 = rng.repeat(2).values ts = Series(np.random.randn(len(rng2)), index=rng2) - msg = "cannot reindex on an axis with duplicate labels" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): ts.asfreq("B") diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index aba14fd2fcd77..d8b5f19c6a745 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -202,24 +202,3 
@@ def test_categorical_concat_gh7864(self): dfa = df1.append(df2) tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) - - def test_categorical_index_upcast(self): - # GH 17629 - # test upcasting to object when concatinating on categorical indexes - # with non-identical categories - - a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"])) - b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"])) - - res = pd.concat([a, b]) - exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"]) - - tm.assert_equal(res, exp) - - a = Series([1, 2], index=Categorical(["foo", "bar"])) - b = Series([4, 3], index=Categorical(["baz", "bar"])) - - res = pd.concat([a, b]) - exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"]) - - tm.assert_equal(res, exp) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 91c246fc9ee2d..3636139c19eef 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -170,13 +170,3 @@ def test_concat_dataframe_keys_bug(self, sort): # it works result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) assert list(result.columns) == [("t1", "value"), ("t2", "value")] - - def test_concat_bool_with_int(self): - # GH#42092 we may want to change this to return object, but that - # would need a deprecation - df1 = DataFrame(Series([True, False, True, True], dtype="bool")) - df2 = DataFrame(Series([1, 0, 1], dtype="int64")) - - result = concat([df1, df2]) - expected = concat([df1.astype("int64"), df2]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 4972cb34aac69..a950c648838ff 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -403,15 +403,6 @@ def test_ignore_index_name_and_type(self): tm.assert_frame_equal(result, expected) - def test_melt_with_duplicate_columns(self): - # GH#41951 - df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"]) - result = df.melt(id_vars=["a"], value_vars=["b"]) - expected = DataFrame( - [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"] - ) - tm.assert_frame_equal(result, expected) - class TestLreshape: def test_pairs(self): diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 21ed57813b60d..08c5ea706111a 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -9,6 +9,7 @@ import pytz from pandas._libs.tslibs import iNaT +import pandas.compat as compat from pandas.core.dtypes.common import is_datetime64_any_dtype @@ -37,7 +38,7 @@ @pytest.mark.parametrize( "nat,idx", [ - (Timestamp("NaT"), DatetimeArray), + (Timestamp("NaT"), DatetimeIndex), (Timedelta("NaT"), TimedeltaIndex), (Period("NaT", freq="M"), PeriodArray), ], @@ -83,7 +84,7 @@ def test_nat_vector_field_access(): ser = Series(idx) - for field in DatetimeArray._field_ops: + for field in DatetimeIndex._field_ops: # weekday is a property of DTI, but a method # on NaT/Timestamp for compat with datetime if field == "weekday": @@ -96,7 +97,7 @@ def test_nat_vector_field_access(): expected = [getattr(x, field) for x in idx] tm.assert_series_equal(result, Series(expected)) - for field in DatetimeArray._bool_ops: + for field in DatetimeIndex._bool_ops: result = getattr(ser.dt, field) expected = [getattr(x, field) for x in idx] tm.assert_series_equal(result, Series(expected)) @@ -137,7 +138,13 @@ def test_round_nat(klass, 
method, freq): "dst", "fromordinal", "fromtimestamp", - "fromisocalendar", + pytest.param( + "fromisocalendar", + marks=pytest.mark.skipif( + not compat.PY38, + reason="'fromisocalendar' was added in stdlib datetime in python 3.8", + ), + ), "isocalendar", "strftime", "strptime", @@ -308,6 +315,11 @@ def test_overlap_public_nat_methods(klass, expected): # NaT should have *most* of the Timestamp and Timedelta methods. # In case when Timestamp, Timedelta, and NaT are overlap, the overlap # is considered to be with Timestamp and NaT, not Timedelta. + + # "fromisocalendar" was introduced in 3.8 + if klass is Timestamp and not compat.PY38: + expected.remove("fromisocalendar") + assert _get_overlap_public_nat_methods(klass) == expected diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 7dfda0463ecaf..9f6cdbb81bd89 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -10,6 +10,7 @@ import numpy as np import pytest +from pandas.compat import is_numpy_dev from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -17,6 +18,7 @@ NaT, Timedelta, Timestamp, + compat, offsets, ) import pandas._testing as tm @@ -432,7 +434,15 @@ def test_td_div_numeric_scalar(self): "nan", [ np.nan, - np.float64("NaN"), + pytest.param( + np.float64("NaN"), + marks=pytest.mark.xfail( + # Works on numpy dev only in python 3.9 + is_numpy_dev and not compat.PY39, + raises=RuntimeWarning, + reason="https://github.com/pandas-dev/pandas/issues/31992", + ), + ), float("nan"), ], ) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 34b725eb9fe77..ea4a56be6da48 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -204,26 +204,15 @@ def test_overflow_on_construction(): Timedelta(timedelta(days=13 * 19999)) -@pytest.mark.parametrize( - "val, unit, name", - [ - (3508, "M", " months"), - (15251, "W", " weeks"), # 1 - (106752, "D", " days"), # change from previous: - (2562048, "h", " hours"), # 0 hours - (153722868, "m", " minutes"), # 13 minutes - (9223372037, "s", " seconds"), # 44 seconds - ], -) -def test_construction_out_of_bounds_td64(val, unit, name): +def test_construction_out_of_bounds_td64(): # TODO: parametrize over units just above/below the implementation bounds # once GH#38964 is resolved # Timedelta.max is just under 106752 days - td64 = np.timedelta64(val, unit) + td64 = np.timedelta64(106752, "D") assert td64.astype("m8[ns]").view("i8") < 0 # i.e. naive astype will be wrong - msg = str(val) + name + msg = "106752 days" with pytest.raises(OutOfBoundsTimedelta, match=msg): Timedelta(td64) @@ -233,7 +222,7 @@ def test_construction_out_of_bounds_td64(val, unit, name): td64 *= -1 assert td64.astype("m8[ns]").view("i8") > 0 # i.e. 
naive astype will be wrong - with pytest.raises(OutOfBoundsTimedelta, match="-" + msg): + with pytest.raises(OutOfBoundsTimedelta, match=msg): Timedelta(td64) # But just back in bounds and we are OK diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 4aa2f62fe85a0..8b42bca8b8a0c 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas._libs import lib from pandas._libs.tslibs import ( NaT, iNaT, @@ -392,7 +391,8 @@ def test_round_implementation_bounds(self): "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] ) def test_round_sanity(self, method, n, request): - val = np.random.randint(iNaT + 1, lib.i8max, dtype=np.int64) + iinfo = np.iinfo(np.int64) + val = np.random.randint(iinfo.min + 1, iinfo.max, dtype=np.int64) td = Timedelta(val) assert method(td, "ns") == td @@ -552,8 +552,8 @@ def test_implementation_limits(self): # GH 12727 # timedelta limits correspond to int64 boundaries - assert min_td.value == iNaT + 1 - assert max_td.value == lib.i8max + assert min_td.value == np.iinfo(np.int64).min + 1 + assert max_td.value == np.iinfo(np.int64).max # Beyond lower limit, a NAT before the Overflow assert (min_td - Timedelta(1, "ns")) is NaT diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index b3deb1a57e5c3..16ce51a88340e 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -17,6 +17,7 @@ Period, Timedelta, Timestamp, + compat, ) import pandas._testing as tm @@ -24,24 +25,6 @@ class TestTimestampConstructors: - def test_constructor_datetime64_with_tz(self): - # GH#42288, GH#24559 - dt = np.datetime64("1970-01-01 05:00:00") - tzstr = "UTC+05:00" - - msg = "interpreted as a wall time" - with tm.assert_produces_warning(FutureWarning, match=msg): - ts = Timestamp(dt, tz=tzstr) - - # Check that we match the old behavior - alt = Timestamp(dt).tz_localize("UTC").tz_convert(tzstr) - assert ts == alt - - # Check that we *don't* match the future behavior - assert ts.hour != 5 - expected_future = Timestamp(dt).tz_localize(tzstr) - assert ts != expected_future - def test_constructor(self): base_str = "2014-07-01 09:00" base_dt = datetime(2014, 7, 1, 9) @@ -586,6 +569,10 @@ class SubDatetime(datetime): expected = Timestamp(2000, 1, 1) assert result == expected + @pytest.mark.skipif( + not compat.PY38, + reason="datetime.fromisocalendar was added in Python version 3.8", + ) def test_constructor_fromisocalendar(self): # GH 30395 expected_timestamp = Timestamp("2000-01-03 00:00:00") diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index f2010b33538fb..e13242e60e3a3 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -6,7 +6,6 @@ timedelta, ) import locale -import pickle import unicodedata from dateutil.tz import tzutc @@ -441,17 +440,6 @@ def test_tz_conversion_freq(self, tz_naive_fixture): t2 = Timestamp("2019-01-02 12:00", tz="UTC", freq="T") assert t2.tz_convert(tz="UTC").freq == t2.freq - def test_pickle_freq_no_warning(self): - # GH#41949 we don't want a warning on unpickling - with tm.assert_produces_warning(FutureWarning, match="freq"): - ts = Timestamp("2019-01-01 10:00", freq="H") - - out = pickle.dumps(ts) - with 
tm.assert_produces_warning(None): - res = pickle.loads(out) - - assert res._freq == ts._freq - class TestTimestampNsOperations: def test_nanosecond_string_parsing(self): diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 366c0f7cf2f74..aab0b2e6d31ef 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -6,13 +6,11 @@ import pytz from pytz import utc -from pandas._libs import lib from pandas._libs.tslibs import ( NaT, Timedelta, Timestamp, conversion, - iNaT, to_offset, ) from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG @@ -281,7 +279,8 @@ def test_round_implementation_bounds(self): "method", [Timestamp.round, Timestamp.floor, Timestamp.ceil] ) def test_round_sanity(self, method, n): - val = np.random.randint(iNaT + 1, lib.i8max, dtype=np.int64) + iinfo = np.iinfo(np.int64) + val = np.random.randint(iinfo.min + 1, iinfo.max, dtype=np.int64) ts = Timestamp(val) def checker(res, ts, nanos): diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 61a22dad5d09b..076de881eaf96 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -33,7 +33,6 @@ ) import pandas._testing as tm from pandas.core.arrays import ( - DatetimeArray, PeriodArray, TimedeltaArray, ) @@ -48,7 +47,7 @@ def test_dt_namespace_accessor(self): ok_for_period = PeriodArray._datetimelike_ops ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"] - ok_for_dt = DatetimeArray._datetimelike_ops + ok_for_dt = DatetimeIndex._datetimelike_ops ok_for_dt_methods = [ "to_period", "to_pydatetime", diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index cfb617cd7098b..13054062defb4 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -10,7 +10,6 @@ Categorical, DatetimeIndex, Index, - IntervalIndex, MultiIndex, NaT, Series, @@ -907,41 +906,3 @@ def val(self): def is_inplace(self, obj): # This is specific to the 4 cases currently implemented for this class. 
return obj.dtype.kind != "i" - - -def test_setitem_int_as_positional_fallback_deprecation(): - # GH#42215 deprecated falling back to positional on __setitem__ with an - # int not contained in the index - ser = Series([1, 2, 3, 4], index=[1.1, 2.1, 3.0, 4.1]) - assert not ser.index._should_fallback_to_positional - # assert not ser.index.astype(object)._should_fallback_to_positional - - with tm.assert_produces_warning(None): - # 3.0 is in our index, so future behavior is unchanged - ser[3] = 10 - expected = Series([1, 2, 10, 4], index=ser.index) - tm.assert_series_equal(ser, expected) - - msg = "Treating integers as positional in Series.__setitem__" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pytest.raises(IndexError, match="index 5 is out of bounds"): - ser[5] = 5 - # Once the deprecation is enforced, we will have - # expected = Series([1, 2, 3, 4, 5], index=[1.1, 2.1, 3.0, 4.1, 5.0]) - - ii = IntervalIndex.from_breaks(range(10))[::2] - ser2 = Series(range(len(ii)), index=ii) - expected2 = ser2.copy() - expected2.iloc[-1] = 9 - with tm.assert_produces_warning(FutureWarning, match=msg): - ser2[4] = 9 - tm.assert_series_equal(ser2, expected2) - - mi = MultiIndex.from_product([ser.index, ["A", "B"]]) - ser3 = Series(range(len(mi)), index=mi) - expected3 = ser3.copy() - expected3.iloc[4] = 99 - - with tm.assert_produces_warning(FutureWarning, match=msg): - ser3[4] = 99 - tm.assert_series_equal(ser3, expected3) diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index 9c11b71e4bee6..381c733619c6b 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas import ( Series, @@ -29,7 +28,7 @@ def test_between_datetime_values(self): expected = ser[3:18].dropna() tm.assert_series_equal(result, expected) - result = ser[ser.between(ser[3], ser[17], inclusive="neither")] + result = ser[ser.between(ser[3], ser[17], inclusive=False)] expected = ser[5:16].dropna() tm.assert_series_equal(result, expected) @@ -39,48 +38,3 @@ def test_between_period_values(self): result = ser.between(left, right) expected = (ser >= left) & (ser <= right) tm.assert_series_equal(result, expected) - - def test_between_inclusive_string(self): # :issue:`40628` - series = Series(date_range("1/1/2000", periods=10)) - left, right = series[[2, 7]] - - result = series.between(left, right, inclusive="both") - expected = (series >= left) & (series <= right) - tm.assert_series_equal(result, expected) - - result = series.between(left, right, inclusive="left") - expected = (series >= left) & (series < right) - tm.assert_series_equal(result, expected) - - result = series.between(left, right, inclusive="right") - expected = (series > left) & (series <= right) - tm.assert_series_equal(result, expected) - - result = series.between(left, right, inclusive="neither") - expected = (series > left) & (series < right) - tm.assert_series_equal(result, expected) - - def test_between_error_args(self): # :issue:`40628` - series = Series(date_range("1/1/2000", periods=10)) - left, right = series[[2, 7]] - - value_error_msg = ( - "Inclusive has to be either string of 'both'," - "'left', 'right', or 'neither'." 
- ) - - with pytest.raises(ValueError, match=value_error_msg): - series = Series(date_range("1/1/2000", periods=10)) - series.between(left, right, inclusive="yes") - - def test_between_inclusive_warning(self): - series = Series(date_range("1/1/2000", periods=10)) - left, right = series[[2, 7]] - with tm.assert_produces_warning(FutureWarning): - result = series.between(left, right, inclusive=False) - expected = (series > left) & (series < right) - tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = series.between(left, right, inclusive=True) - expected = (series >= left) & (series <= right) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 03e126587ce1a..1aec2a5e5d726 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -677,14 +677,14 @@ def test_fillna_categorical_raises(self): cat = ser._values msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ser.fillna("d") msg2 = "Length of 'value' does not match." with pytest.raises(ValueError, match=msg2): cat.fillna(Series("d")) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ser.fillna({1: "d", 3: "a"}) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index d3a3434872826..898a769dfac48 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -156,27 +156,6 @@ def test_isin_float_in_int_series(self, values): expected = Series([True, False]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) - @pytest.mark.parametrize( - "data,values,expected", - [ - ([0, 1, 0], [1], [False, True, False]), - ([0, 1, 0], [1, pd.NA], [False, True, False]), - ([0, pd.NA, 0], [1, 0], [True, False, True]), - ([0, 1, pd.NA], [1, pd.NA], [False, True, True]), - ([0, 1, pd.NA], [1, np.nan], [False, True, False]), - ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]), - ], - ) - def test_isin_masked_types(self, dtype, data, values, expected): - # GH#42405 - ser = Series(data, dtype=dtype) - - result = ser.isin(values) - expected = Series(expected, dtype="boolean") - - tm.assert_series_equal(result, expected) - @pytest.mark.slow def test_isin_large_series_mixed_dtypes_and_nan(): diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index df270f3e0f85c..73684e300ed77 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -169,7 +169,7 @@ def test_shift_categorical_fill_value(self): tm.assert_equal(res, expected) # check for incorrect fill_value - msg = r"Cannot setitem on a Categorical with a new category \(f\)" + msg = "'fill_value=f' is not present in this Categorical's categories" with pytest.raises(TypeError, match=msg): ts.shift(1, fill_value="f") diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index a3a27a744b180..ca5c3e2639097 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -55,11 +55,3 @@ def test_truncate_one_element_series(self): # the input Series and the expected Series are 
the same tm.assert_series_equal(result, series) - - def test_truncate_index_only_one_unique_value(self): - # GH 42365 - obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) - - truncated = obj.truncate("2021-06-28", "2021-07-01") - - tm.assert_series_equal(truncated, obj) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b49c209a59a06..eddf57c1e88f3 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -101,7 +101,7 @@ def test_index_tab_completion(self, index): def test_not_hashable(self): s_empty = Series(dtype=object) s = Series([1]) - msg = "unhashable type: 'Series'" + msg = "'Series' objects are mutable, thus they cannot be hashed" with pytest.raises(TypeError, match=msg): hash(s_empty) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 4d1c75da72399..aac26c13c2a7c 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -874,7 +874,7 @@ def test_none_comparison(series_with_simple_index): # bug brought up by #1079 # changed from TypeError in 0.17.0 - series.iloc[0] = np.nan + series[0] = np.nan # noinspection PyComparisonWithNone result = series == None # noqa @@ -924,7 +924,7 @@ def test_series_varied_multiindex_alignment(): [1000 * i for i in range(1, 5)], index=pd.MultiIndex.from_product([list("xy"), [1, 2]], names=["xy", "num"]), ) - result = s1.loc[pd.IndexSlice[["a"], :, :]] + s2 + result = s1.loc[pd.IndexSlice["a", :, :]] + s2 expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b4836dffffa06..4df95d895e475 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1747,7 +1747,7 @@ def test_scipy_compat(self): def _check(arr): mask = ~np.isfinite(arr) arr = arr.copy() - result = libalgos.rank_1d(arr) + result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.intp)) arr[mask] = np.inf exp = rankdata(arr) exp[mask] = np.nan diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 5e9a53f32e0b7..93c95b3004876 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p18 + import pandas as pd from pandas import Series import pandas._testing as tm @@ -70,18 +72,19 @@ def test_random_state(): # Check BitGenerators # GH32503 - assert ( - com.random_state(npr.MT19937(3)).uniform() - == npr.RandomState(npr.MT19937(3)).uniform() - ) - assert ( - com.random_state(npr.PCG64(11)).uniform() - == npr.RandomState(npr.PCG64(11)).uniform() - ) + if not np_version_under1p18: + assert ( + com.random_state(npr.MT19937(3)).uniform() + == npr.RandomState(npr.MT19937(3)).uniform() + ) + assert ( + com.random_state(npr.PCG64(11)).uniform() + == npr.RandomState(npr.PCG64(11)).uniform() + ) # Error for floats or strings msg = ( - "random_state must be an integer, array-like, a BitGenerator, Generator, " + "random_state must be an integer, array-like, a BitGenerator, " "a numpy RandomState, or None" ) with pytest.raises(ValueError, match=msg): @@ -162,28 +165,3 @@ def test_non_bool_array_with_na(self): # in particular, this should not raise arr = np.array(["A", "B", np.nan], dtype=object) assert not com.is_bool_indexer(arr) - - def test_list_subclass(self): - # GH#42433 - - class MyList(list): - pass - - val = MyList(["a"]) - - assert 
not com.is_bool_indexer(val) - - val = MyList([True]) - assert com.is_bool_indexer(val) - - def test_frozenlist(self): - # GH#42461 - data = {"col1": [1, 2], "col2": [3, 4]} - df = pd.DataFrame(data=data) - - frozen = df.index.names[1:] - assert not com.is_bool_indexer(frozen) - - result = df[frozen] - expected = df[[]] - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 6a7d64235f11c..ea95f90d3a2cb 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -59,11 +59,7 @@ def test_xarray_cftimeindex_nearest(): import xarray times = xarray.cftime_range("0001", periods=2) - key = cftime.DatetimeGregorian(2000, 1, 1) - with tm.assert_produces_warning( - FutureWarning, match="deprecated", check_stacklevel=False - ): - result = times.get_loc(key, method="nearest") + result = times.get_loc(cftime.DatetimeGregorian(2000, 1, 1), method="nearest") expected = 1 assert result == expected diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 9da7951c199ca..121ca99785831 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -957,40 +957,18 @@ def test_to_datetime_cache_scalar(self): expected = Timestamp("20130101 00:00:00") assert result == expected - @pytest.mark.parametrize( - "datetimelikes,expected_values", - ( - ( - (None, np.nan) + (NaT,) * start_caching_at, - (NaT,) * (start_caching_at + 2), - ), - ( - (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, - (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, - ), - ( - (None,) - + (NaT,) * start_caching_at - + ("2012 July 26", Timestamp("2012-07-26")), - (NaT,) * (start_caching_at + 1) - + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), - ), - ), - ) - def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values - ): + def test_convert_object_to_datetime_with_cache(self): # GH#39882 ser = Series( - datetimelikes, + [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], dtype="object", ) - result_series = to_datetime(ser, errors="coerce") - expected_series = Series( - expected_values, + result = to_datetime(ser, errors="coerce") + expected = Series( + [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], dtype="datetime64[ns]", ) - tm.assert_series_equal(result_series, expected_series) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "date, format", diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py index 9721d7fbd9067..0ae94b6b57640 100644 --- a/pandas/tests/tseries/offsets/test_dst.py +++ b/pandas/tests/tseries/offsets/test_dst.py @@ -4,7 +4,6 @@ from datetime import timedelta import pytest -import pytz from pandas._libs.tslibs import Timestamp from pandas._libs.tslibs.offsets import ( @@ -16,7 +15,6 @@ BYearEnd, CBMonthBegin, CBMonthEnd, - CustomBusinessDay, DateOffset, Day, MonthBegin, @@ -175,51 +173,3 @@ def test_all_offset_classes(self, tup): first = Timestamp(test_values[0], tz="US/Eastern") + offset() second = Timestamp(test_values[1], tz="US/Eastern") assert first == second - - -@pytest.mark.xfail( - strict=False, reason="'Africa/Kinshasa' test case fails under pytz=2017.3" -) -@pytest.mark.parametrize( - "original_dt, target_dt, offset, tz", - [ - ( - Timestamp("1900-01-01"), - Timestamp("1905-07-01"), - MonthBegin(66), - "Africa/Kinshasa", - ), # GH41906 - ( - Timestamp("2021-10-01 01:15"), - Timestamp("2021-10-31 
01:15"), - MonthEnd(1), - "Europe/London", - ), - ( - Timestamp("2010-12-05 02:59"), - Timestamp("2010-10-31 02:59"), - SemiMonthEnd(-3), - "Europe/Paris", - ), - ( - Timestamp("2021-10-31 01:20"), - Timestamp("2021-11-07 01:20"), - CustomBusinessDay(2, weekmask="Sun Mon"), - "US/Eastern", - ), - ( - Timestamp("2020-04-03 01:30"), - Timestamp("2020-11-01 01:30"), - YearBegin(1, month=11), - "America/Chicago", - ), - ], -) -def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz): - # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt - # is dst-ambiguous - localized_dt = original_dt.tz_localize(tz) - - msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - localized_dt + offset diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 2d88f6690a794..8e0ace7775868 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -100,12 +100,9 @@ def test_on_offset_implementations(dt, offset): # (dt + offset) - offset == dt try: compare = (dt + offset) - offset - except (pytz.NonExistentTimeError, pytz.AmbiguousTimeError): - # When dt + offset does not exist or is DST-ambiguous, assume(False) to - # indicate to hypothesis that this is not a valid test case - # DST-ambiguous example (GH41906): - # dt = datetime.datetime(1900, 1, 1, tzinfo=pytz.timezone('Africa/Kinshasa')) - # offset = MonthBegin(66) + except pytz.NonExistentTimeError: + # dt + offset does not exist, assume(False) to indicate + # to hypothesis that this is not a valid test case assume(False) assert offset.is_on_offset(dt) == (compare == dt) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 8211b52fed650..1778b6fb9d832 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -58,30 +58,15 @@ def test_index_equal_length_mismatch(check_exact): tm.assert_index_equal(idx1, idx2, check_exact=check_exact) -@pytest.mark.parametrize("exact", [False, "equiv"]) -def test_index_equal_class(exact): - idx1 = Index([0, 1, 2]) - idx2 = RangeIndex(3) - - tm.assert_index_equal(idx1, idx2, exact=exact) - - -@pytest.mark.parametrize( - "idx_values, msg_str", - [ - [[1, 2, 3.0], "Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)"], - [range(3), "RangeIndex\\(start=0, stop=3, step=1\\)"], - ], -) -def test_index_equal_class_mismatch(check_exact, idx_values, msg_str): - msg = f"""Index are different +def test_index_equal_class_mismatch(check_exact): + msg = """Index are different Index classes are different \\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) -\\[right\\]: {msg_str}""" +\\[right\\]: Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)""" idx1 = Index([1, 2, 3]) - idx2 = Index(idx_values) + idx2 = Index([1, 2, 3.0]) with pytest.raises(AssertionError, match=msg): tm.assert_index_equal(idx1, idx2, exact=True, check_exact=check_exact) diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index e3eb083e1a383..45699fa1294d3 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -94,49 +94,19 @@ def test_catch_warning_category_and_match(category, message, match): warnings.warn(message, category) 
-def test_fail_to_match_runtime_warning(): - category = RuntimeWarning - match = "Did not see this warning" - unmatched = ( - r"Did not see warning 'RuntimeWarning' matching 'Did not see this warning'. " - r"The emitted warning messages are " - r"\[RuntimeWarning\('This is not a match.'\), " - r"RuntimeWarning\('Another unmatched warning.'\)\]" - ) - with pytest.raises(AssertionError, match=unmatched): - with tm.assert_produces_warning(category, match=match): - warnings.warn("This is not a match.", category) - warnings.warn("Another unmatched warning.", category) - - -def test_fail_to_match_future_warning(): - category = FutureWarning - match = "Warning" - unmatched = ( - r"Did not see warning 'FutureWarning' matching 'Warning'. " - r"The emitted warning messages are " - r"\[FutureWarning\('This is not a match.'\), " - r"FutureWarning\('Another unmatched warning.'\)\]" - ) - with pytest.raises(AssertionError, match=unmatched): - with tm.assert_produces_warning(category, match=match): - warnings.warn("This is not a match.", category) - warnings.warn("Another unmatched warning.", category) - - -def test_fail_to_match_resource_warning(): - category = ResourceWarning - match = r"\d+" - unmatched = ( - r"Did not see warning 'ResourceWarning' matching '\\d\+'. " - r"The emitted warning messages are " - r"\[ResourceWarning\('This is not a match.'\), " - r"ResourceWarning\('Another unmatched warning.'\)\]" - ) - with pytest.raises(AssertionError, match=unmatched): +@pytest.mark.parametrize( + "message, match", + [ + ("Warning message", "Not this message"), + ("Warning message", "warning"), + ("Warning message", r"\d+"), + ], +) +def test_fail_to_match(category, message, match): + msg = f"Did not see warning {repr(category.__name__)} matching" + with pytest.raises(AssertionError, match=msg): with tm.assert_produces_warning(category, match=match): - warnings.warn("This is not a match.", category) - warnings.warn("Another unmatched warning.", category) + warnings.warn(message, category) def test_fail_to_catch_actual_warning(pair_different_warnings): diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index e4a46de11ceb7..8ce24dc963dc5 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -255,32 +255,6 @@ def test_hash_keys(): assert (a != b).all() -def test_df_hash_keys(): - # DataFrame version of the test_hash_keys. - # https://github.com/pandas-dev/pandas/issues/41404 - obj = DataFrame({"x": np.arange(3), "y": list("abc")}) - - a = hash_pandas_object(obj, hash_key="9876543210123456") - b = hash_pandas_object(obj, hash_key="9876543210123465") - - assert (a != b).all() - - -def test_df_encoding(): - # Check that DataFrame recognizes optional encoding. - # https://github.com/pandas-dev/pandas/issues/41404 - # https://github.com/pandas-dev/pandas/pull/42049 - obj = DataFrame({"x": np.arange(3), "y": list("a+c")}) - - a = hash_pandas_object(obj, encoding="utf8") - b = hash_pandas_object(obj, encoding="utf7") - - # Note that the "+" is encoded as "+-" in utf-7. - assert a[0] == b[0] - assert a[1] != b[1] - assert a[2] == b[2] - - def test_invalid_key(): # This only matters for object dtypes. 
msg = "key should be a 16-byte string encoded" diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 30073bd55531f..24b28356a3099 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -84,29 +84,19 @@ def min_periods(request): return request.param -@pytest.fixture(params=["single", "table"]) -def method(request): - """method keyword in rolling/expanding/ewm constructor""" - return request.param - - @pytest.fixture(params=[True, False]) def parallel(request): """parallel keyword argument for numba.jit""" return request.param -# Can parameterize nogil & nopython over True | False, but limiting per -# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472 - - -@pytest.fixture(params=[False]) +@pytest.fixture(params=[True, False]) def nogil(request): """nogil keyword argument for numba.jit""" return request.param -@pytest.fixture(params=[True]) +@pytest.fixture(params=[True, False]) def nopython(request): """nopython keyword argument for numba.jit""" return request.param diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index a8ec9086e6b02..b79c367d482ae 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -121,34 +121,6 @@ def func_2(x): expected = roll.apply(func_1, engine="cython", raw=True) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "window,window_kwargs", - [ - ["rolling", {"window": 3, "min_periods": 0}], - ["expanding", {}], - ], - ) - def test_dont_cache_args( - self, window, window_kwargs, nogil, parallel, nopython, method - ): - # GH 42287 - - def add(values, x): - return np.sum(values) + x - - df = DataFrame({"value": [0, 0, 0]}) - result = getattr(df, window)(**window_kwargs).apply( - add, raw=True, engine="numba", args=(1,) - ) - expected = DataFrame({"value": [1.0, 1.0, 1.0]}) - tm.assert_frame_equal(result, expected) - - result = getattr(df, window)(**window_kwargs).apply( - add, raw=True, engine="numba", args=(2,) - ) - expected = DataFrame({"value": [2.0, 2.0, 2.0]}) - tm.assert_frame_equal(result, expected) - @td.skip_if_no("numba", "0.46.0") class TestEWMMean: @@ -332,17 +304,3 @@ def test_table_method_expanding_methods( engine_kwargs=engine_kwargs, engine="numba" ) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("data", [np.eye(3), np.ones((2, 3)), np.ones((3, 2))]) - def test_table_method_ewm(self, data, axis, nogil, parallel, nopython): - engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - - df = DataFrame(data) - - result = df.ewm(com=1, method="table", axis=axis).mean( - engine_kwargs=engine_kwargs, engine="numba" - ) - expected = df.ewm(com=1, method="single", axis=axis).mean( - engine_kwargs=engine_kwargs, engine="numba" - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py deleted file mode 100644 index 461c62c07326d..0000000000000 --- a/pandas/tests/window/test_online.py +++ /dev/null @@ -1,91 +0,0 @@ -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas import ( - DataFrame, - Series, -) -import pandas._testing as tm - - -@td.skip_if_no("numba", "0.46.0") -@pytest.mark.filterwarnings("ignore:\\nThe keyword argument") -class TestEWM: - def test_invalid_update(self): - df = DataFrame({"a": range(5), "b": range(5)}) - online_ewm = df.head(2).ewm(0.5).online() - with pytest.raises( - ValueError, - match="Must call mean 
with update=None first before passing update", - ): - online_ewm.mean(update=df.head(1)) - - @pytest.mark.slow - @pytest.mark.parametrize( - "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")] - ) - def test_online_vs_non_online_mean( - self, obj, nogil, parallel, nopython, adjust, ignore_na - ): - expected = obj.ewm(0.5, adjust=adjust, ignore_na=ignore_na).mean() - engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - - online_ewm = ( - obj.head(2) - .ewm(0.5, adjust=adjust, ignore_na=ignore_na) - .online(engine_kwargs=engine_kwargs) - ) - # Test resetting once - for _ in range(2): - result = online_ewm.mean() - tm.assert_equal(result, expected.head(2)) - - result = online_ewm.mean(update=obj.tail(3)) - tm.assert_equal(result, expected.tail(3)) - - online_ewm.reset() - - @pytest.mark.xfail(raises=NotImplementedError) - @pytest.mark.parametrize( - "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")] - ) - def test_update_times_mean( - self, obj, nogil, parallel, nopython, adjust, ignore_na, halflife_with_times - ): - times = Series( - np.array( - ["2020-01-01", "2020-01-05", "2020-01-07", "2020-01-17", "2020-01-21"], - dtype="datetime64", - ) - ) - expected = obj.ewm( - 0.5, - adjust=adjust, - ignore_na=ignore_na, - times=times, - halflife=halflife_with_times, - ).mean() - - engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - online_ewm = ( - obj.head(2) - .ewm( - 0.5, - adjust=adjust, - ignore_na=ignore_na, - times=times.head(2), - halflife=halflife_with_times, - ) - .online(engine_kwargs=engine_kwargs) - ) - # Test resetting once - for _ in range(2): - result = online_ewm.mean() - tm.assert_equal(result, expected.head(2)) - - result = online_ewm.mean(update=obj.tail(3), update_times=times.tail(3)) - tm.assert_equal(result, expected.tail(3)) - - online_ewm.reset() diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 77ca482936298..17a6d9216ca92 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas.compat import ( - is_platform_arm, - is_platform_mac, -) +from pandas.compat import is_platform_arm from pandas.errors import UnsupportedFunctionCall from pandas import ( @@ -1076,7 +1073,7 @@ def test_rolling_sem(frame_or_series): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(is_platform_arm() and not is_platform_mac(), reason="GH 38921") +@pytest.mark.xfail(is_platform_arm(), reason="GH 41740") @pytest.mark.parametrize( ("func", "third_value", "values"), [ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index c540c0e1c6721..0cbe5d8ff43b9 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -245,7 +245,7 @@ def _format_argument_list(allow_args: list[str]): return f" except for the argument '{allow_args[0]}'" else: last = allow_args[-1] - args = ", ".join(["'" + x + "'" for x in allow_args[:-1]]) + args = ", ".join("'" + x + "'" for x in allow_args[:-1]) return f" except for the arguments {args} and '{last}'" @@ -385,12 +385,10 @@ def decorator(decorated: F) -> F: # formatting templates and concatenating docstring decorated.__doc__ = "".join( - [ - component.format(**params) - if isinstance(component, str) - else dedent(component.__doc__ or "") - for component in docstring_components - ] + component.format(**params) + if isinstance(component, str) + else dedent(component.__doc__ or "") + for 
component in docstring_components ) # error: "F" has no attribute "_docstring_components" diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 289900c47375c..6c180f68395db 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -35,7 +35,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]: language_code, encoding = locale.getlocale() return { "commit": _get_commit_hash(), - "python": ".".join([str(i) for i in sys.version_info]), + "python": ".".join(str(i) for i in sys.version_info), "python-bits": struct.calcsize("P") * 8, "OS": uname_result.system, "OS-release": uname_result.release, diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index cd6a38f9e7ff1..3d59cef4d4f77 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -373,11 +373,11 @@ def __str__(self) -> str: parts.append(f"{self.epoch}!") # Release segment - parts.append(".".join([str(x) for x in self.release])) + parts.append(".".join(str(x) for x in self.release)) # Pre-release if self.pre is not None: - parts.append("".join([str(x) for x in self.pre])) + parts.append("".join(str(x) for x in self.pre)) # Post-release if self.post is not None: @@ -419,7 +419,7 @@ def dev(self) -> int | None: @property def local(self) -> str | None: if self._version.local: - return ".".join([str(x) for x in self._version.local]) + return ".".join(str(x) for x in self._version.local) else: return None @@ -436,7 +436,7 @@ def base_version(self) -> str: parts.append(f"{self.epoch}!") # Release segment - parts.append(".".join([str(x) for x in self.release])) + parts.append(".".join(str(x) for x in self.release)) return "".join(parts) diff --git a/pyproject.toml b/pyproject.toml index 5deb92281475b..3947856d94d01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,24 +5,16 @@ requires = [ "setuptools>=38.6.0", "wheel", "Cython>=0.29.21,<3", # Note: sync with setup.py - # Numpy requirements for different OS/architectures - # Copied from https://github.com/scipy/scipy/blob/master/pyproject.toml (which is also licensed under BSD) - "numpy==1.17.3; python_version=='3.7' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", - "numpy==1.18.3; python_version=='3.8' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", - "numpy==1.19.3; python_version>='3.9' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", - # Aarch64(Python 3.9 requirements are the same as AMD64) - "numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'", - "numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'", - # Darwin Arm64 - "numpy>=1.20.0; python_version=='3.8' and platform_machine=='arm64' and platform_system=='Darwin'", - "numpy>=1.20.0; python_version=='3.9' and platform_machine=='arm64' and platform_system=='Darwin'" + "numpy==1.17.3; python_version=='3.7'", + "numpy==1.18.3; python_version=='3.8'", + "numpy; python_version>='3.9'", ] # uncomment to enable pep517 after versioneer problem is fixed. 
# https://github.com/python-versioneer/python-versioneer/issues/193 # build-backend = "setuptools.build_meta" [tool.black] -target-version = ['py38', 'py39'] +target-version = ['py37', 'py38', 'py39'] exclude = ''' ( asv_bench/env diff --git a/requirements-dev.txt b/requirements-dev.txt index 3bf9084f55419..332059341df48 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,8 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. -numpy>=1.18.5 -python-dateutil>=2.8.1 +numpy>=1.17.3 +python-dateutil>=2.7.3 pytz asv cython>=0.29.21 @@ -12,7 +12,7 @@ flake8==3.9.2 flake8-bugbear==21.3.2 flake8-comprehensions==3.1.0 isort>=5.2.1 -mypy==0.910 +mypy==0.812 pre-commit>=2.9.2 pycodestyle pyupgrade @@ -33,30 +33,30 @@ pyyaml requests boto3 botocore>=1.11 -hypothesis>=5.5.3 +hypothesis>=3.82 moto flask -pytest>=6.0 +pytest>=5.0.1 pytest-cov -pytest-xdist>=1.31 +pytest-xdist>=1.21 pytest-asyncio pytest-instafail seaborn statsmodels ipywidgets nbformat -notebook>=6.0.3 +notebook>=5.7.5 pip blosc -bottleneck>=1.3.1 +bottleneck>=1.2.1 ipykernel ipython>=7.11.1 jinja2 -matplotlib>=3.3.2 -numexpr>=2.7.1 -scipy>=1.4.1 -numba>=0.50.1 -beautifulsoup4>=4.8.2 +matplotlib>=2.2.2 +numexpr>=2.7.0 +scipy>=1.2 +numba>=0.46.0 +beautifulsoup4>=4.6.0 html5lib lxml openpyxl @@ -64,12 +64,13 @@ xlrd xlsxwriter xlwt odfpy -fastparquet>=0.4.0 +fastparquet>=0.3.2 pyarrow>=0.17.0 python-snappy -tables>=3.6.1 +pyqt5>=5.9.2 +tables>=3.5.1 s3fs>=0.4.0 -fsspec>=0.7.4, <2021.6.0 +fsspec>=0.7.4 gcsfs>=0.6.0 sqlalchemy xarray @@ -80,7 +81,3 @@ natsort git+https://github.com/pydata/pydata-sphinx-theme.git@master numpydoc < 1.2 pandas-dev-flaker==0.2.0 -types-python-dateutil -types-PyMySQL -types-pytz -types-setuptools diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 6ebf9cedeb8e3..cbf3e84044d53 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -82,12 +82,6 @@ def missing_whitespace_after_comma(self): """ pass - def write_array_like_with_hyphen_not_underscore(self): - """ - In docstrings, use array-like over array_like - """ - pass - class TestValidator: def _import_path(self, klass=None, func=None): @@ -178,11 +172,6 @@ def test_bad_class(self, capsys): "missing_whitespace_after_comma", ("flake8 error: E231 missing whitespace after ',' (3 times)",), ), - ( - "BadDocstrings", - "write_array_like_with_hyphen_not_underscore", - ("Use 'array-like' rather than 'array_like' in docstrings",), - ), ], ) def test_bad_docstrings(self, capsys, klass, func, msgs): @@ -190,7 +179,7 @@ def test_bad_docstrings(self, capsys, klass, func, msgs): self._import_path(klass=klass, func=func) ) for msg in msgs: - assert msg in " ".join([err[1] for err in result["errors"]]) + assert msg in " ".join(err[1] for err in result["errors"]) def test_validate_all_ignore_deprecated(self, monkeypatch): monkeypatch.setattr( diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 7562895d9db3e..b77210e3d2bab 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -17,33 +17,43 @@ import argparse import doctest +import glob import importlib -import io import json -import pathlib +import os import subprocess import sys import tempfile -import matplotlib -import matplotlib.pyplot as plt -import numpy -from numpydoc.validate import ( - Docstring, - validate, -) +try: + from io import 
StringIO +except ImportError: + from cStringIO import StringIO -import pandas +# Template backend makes matplotlib to not plot anything. This is useful +# to avoid that plot windows are open from the doctests while running the +# script. Setting here before matplotlib is loaded. +# We don't warn for the number of open plots, as none is actually being opened +os.environ["MPLBACKEND"] = "Template" +import matplotlib # isort:skip -# With template backend, matplotlib plots nothing -matplotlib.use("template") +matplotlib.rc("figure", max_open_warning=10000) + +import numpy # isort:skip + +BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +sys.path.insert(0, os.path.join(BASE_PATH)) +import pandas # isort:skip + +sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) +from numpydoc.validate import validate, Docstring # isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] ERROR_MSGS = { "GL04": "Private classes ({mentioned_private_classes}) should not be " "mentioned in public docstrings", - "GL05": "Use 'array-like' rather than 'array_like' in docstrings.", "SA05": "{reference_name} in `See Also` section does not need `pandas` " "prefix, use {right_reference} instead.", "EX02": "Examples do not pass tests:\n{doctest_log}", @@ -146,7 +156,7 @@ def examples_errors(self): context = {"np": numpy, "pd": pandas} error_msgs = "" for test in finder.find(self.raw_doc, self.name, globs=context): - f = io.StringIO() + f = StringIO() runner.run(test, out=f.write) error_msgs += f.getvalue() return error_msgs @@ -186,9 +196,6 @@ def validate_pep8(self): error_count, error_code, message = error_message.split(maxsplit=2) yield error_code, message, int(error_count) - def non_hyphenated_array_like(self): - return "array_like" in self.raw_doc - def pandas_validate(func_name: str): """ @@ -249,10 +256,6 @@ def pandas_validate(func_name: str): pandas_error("EX04", imported_library=wrong_import) ) - if doc.non_hyphenated_array_like(): - result["errors"].append(pandas_error("GL05")) - - plt.close("all") return result @@ -278,14 +281,13 @@ def validate_all(prefix, ignore_deprecated=False): result = {} seen = {} - base_path = pathlib.Path(__file__).parent.parent - api_doc_fnames = pathlib.Path(base_path, "doc", "source", "reference") + api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst") api_items = [] - for api_doc_fname in api_doc_fnames.glob("*.rst"): + for api_doc_fname in glob.glob(api_doc_fnames): with open(api_doc_fname) as f: api_items += list(get_api_items(f)) - for func_name, _, section, subsection in api_items: + for func_name, func_obj, section, subsection in api_items: if prefix and not func_name.startswith(prefix): continue doc_info = pandas_validate(func_name) diff --git a/setup.cfg b/setup.cfg index 566248156cdc8..6ce66a6f2bdbd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ classifiers = Programming Language :: Python Programming Language :: Python :: 3 Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Topic :: Scientific/Engineering @@ -30,10 +31,10 @@ project_urls = [options] packages = find: install_requires = - numpy>=1.18.5 - python-dateutil>=2.8.1 - pytz>=2020.1 -python_requires = >=3.8 + numpy>=1.17.3 + python-dateutil>=2.7.3 + pytz>=2017.3 +python_requires = >=3.7.1 include_package_data = True zip_safe = False @@ -43,9 +44,9 @@ pandas_plotting_backends = [options.extras_require] test = - hypothesis>=5.5.3 + 
hypothesis>=3.58 pytest>=6.0 - pytest-xdist>=1.31 + pytest-xdist [options.package_data] * = templates/*, _libs/**/*.dll @@ -70,44 +71,28 @@ parentdir_prefix = pandas- [flake8] max-line-length = 88 ignore = - # space before : (needed for how black formats slicing) - E203, - # line break before binary operator - W503, - # line break after binary operator - W504, - # module level import not at top of file - E402, - # do not assign a lambda expression, use a def - E731, - # found modulo formatter (incorrect picks up mod operations) - S001, - # controversial - B005, - # controversial - B006, - # controversial - B007, - # controversial - B008, - # setattr is used to side-step mypy - B009, - # getattr is used to side-step mypy - B010, - # tests use assert False - B011, - # tests use comparisons but not their returned value - B015, - # false positives - B301 + E203, # space before : (needed for how black formats slicing) + W503, # line break before binary operator + W504, # line break after binary operator + E402, # module level import not at top of file + E731, # do not assign a lambda expression, use a def + S001, # found modulo formatter (incorrect picks up mod operations) + B005, # controversial + B006, # controversial + B007, # controversial + B008, # controversial + B009, # setattr is used to side-step mypy + B010, # getattr is used to side-step mypy + B011, # tests use assert False + B015, # tests use comparisons but not their returned value + B301 # false positives exclude = doc/sphinxext/*.py, doc/build/*.py, doc/temp/*.py, .eggs/*.py, versioneer.py, - # exclude asv benchmark environments from linting - env + env # exclude asv benchmark environments from linting per-file-ignores = # private import across modules pandas/tests/*:PDF020 @@ -124,27 +109,18 @@ max-line-length = 84 bootstrap = import numpy as np import pandas as pd - # avoiding error when importing again numpy or pandas - np - # (in some cases we want to do it to show users) - pd + np # avoiding error when importing again numpy or pandas + pd # (in some cases we want to do it to show users) ignore = - # space before : (needed for how black formats slicing) - E203, - # module level import not at top of file - E402, - # line break before binary operator - W503, + E203, # space before : (needed for how black formats slicing) + E402, # module level import not at top of file + W503, # line break before binary operator # Classes/functions in different blocks can generate those errors - # expected 2 blank lines, found 0 - E302, - # expected 2 blank lines after class or function definition, found 0 - E305, + E302, # expected 2 blank lines, found 0 + E305, # expected 2 blank lines after class or function definition, found 0 # We use semicolon at the end to avoid displaying plot objects - # statement ends with a semicolon - E703, - # comparison to none should be 'if cond is none:' - E711, + E703, # statement ends with a semicolon + E711, # comparison to none should be 'if cond is none:' exclude = doc/source/development/contributing_docstring.rst, # work around issue of undefined variable warnings @@ -225,6 +201,9 @@ check_untyped_defs = False [mypy-pandas.io.clipboard] check_untyped_defs = False +[mypy-pandas.io.formats.string] +ignore_errors = True + [mypy-pandas.tests.apply.test_series_apply] ignore_errors = True diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 52e06a9bec55b..023bfe9e26b78 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -12,10 +12,10 @@ 
pandas - Python Data Analysis Library - + {% for stylesheet in static.css %}